mistocr 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mistocr/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.3.1"
1
+ __version__ = "0.3.2"
mistocr/_modidx.py CHANGED
@@ -21,7 +21,7 @@ d = { 'settings': { 'branch': 'main',
21
21
  'mistocr.core.subset_pdf': ('core.html#subset_pdf', 'mistocr/core.py'),
22
22
  'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
23
23
  'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
24
- 'mistocr.pipeline': {},
24
+ 'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
25
25
  'mistocr.refine': { 'mistocr.refine.HeadingCorrection': ('refine.html#headingcorrection', 'mistocr/refine.py'),
26
26
  'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
27
27
  'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
mistocr/pipeline.py CHANGED
@@ -3,7 +3,7 @@
3
3
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
4
4
 
5
5
  # %% auto 0
6
- __all__ = []
6
+ __all__ = ['pdf_to_md']
7
7
 
8
8
  # %% ../nbs/02_pipeline.ipynb 3
9
9
  from fastcore.all import *
@@ -13,3 +13,29 @@ from pathlib import Path
13
13
  from asyncio import Semaphore, gather, sleep
14
14
  import tempfile
15
15
  import os, json, shutil
16
+
17
+ # %% ../nbs/02_pipeline.ipynb 4
18
@delegates(add_img_descs)
async def pdf_to_md(
    pdf_path:str, # Path to input PDF file
    dst:str, # Destination directory for output markdown
    ocr_dst:str=None, # Optional OCR output directory
    model:str='claude-sonnet-4-5', # Model to use for heading fixes and image descriptions
    add_img_desc:bool=True, # Whether to add image descriptions
    progress:bool=True, # Whether to show progress messages
    **kwargs
):
    """Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions

    Runs up to three steps in order: `ocr_pdf` on `pdf_path`, `fix_hdgs` on the
    resulting OCR directory, then either `add_img_descs` (writing to `dst`) or,
    when `add_img_desc` is False, a plain copy of the OCR directory into `dst`.

    When `ocr_dst` is None a temporary directory is created for the OCR output
    and removed before returning, including when one of the steps raises.
    Extra `**kwargs` are forwarded to `add_img_descs`. Returns None.
    """
    # Only delete the OCR directory if we created it ourselves.
    cleanup = ocr_dst is None
    if cleanup: ocr_dst = tempfile.mkdtemp()
    n_steps = 3 if add_img_desc else 2
    try:
        if progress: print(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
        # The first element of `ocr_pdf`'s result is used as the OCR directory.
        ocr_dir = ocr_pdf(pdf_path, ocr_dst)[0]
        if progress: print(f"Step 2/{n_steps}: Fixing heading hierarchy...")
        fix_hdgs(ocr_dir, model=model)
        if add_img_desc:
            if progress: print(f"Step 3/{n_steps}: Adding image descriptions...")
            await add_img_descs(ocr_dir, dst=dst, model=model, progress=progress, **kwargs)
        # Compare resolved paths so that a `Path` argument or an equivalent
        # spelling (e.g. "out" vs "./out") is not copied onto itself.
        elif Path(dst).resolve() != Path(ocr_dir).resolve():
            shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
    finally:
        # Runs on failure too, so a temporary OCR directory is never leaked.
        if cleanup: shutil.rmtree(ocr_dst)
    if progress: print("Done!")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.3.1
3
+ Version: 0.3.2
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -0,0 +1,11 @@
1
+ mistocr/__init__.py,sha256=vNiWJ14r_cw5t_7UDqDQIVZvladKFGyHH2avsLpN7Vg,22
2
+ mistocr/_modidx.py,sha256=2rHVTcz3A3BfDCmGqUNsJFW3_n3Ch1vxSorrPVyLvMI,4256
3
+ mistocr/core.py,sha256=EQYQgpnX2skgSX123u3dYaJHc1oDk5Nhgt5uBdXnCKs,8386
4
+ mistocr/pipeline.py,sha256=hVXpxRYtshaiUm9qXgfSLlyHCAxHZ6nAfPzoGXGmJMQ,1769
5
+ mistocr/refine.py,sha256=hQg4ZYRTkPFEiCNNBU2ykgXFfVLCNAp2IuwOwHBVQ2k,12983
6
+ mistocr-0.3.2.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
+ mistocr-0.3.2.dist-info/METADATA,sha256=igTgaDeBu00u_xJYtIcGlQswQCj2gIrdBi6NLiN5NNU,8438
8
+ mistocr-0.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
+ mistocr-0.3.2.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
+ mistocr-0.3.2.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
+ mistocr-0.3.2.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- mistocr/__init__.py,sha256=r4xAFihOf72W9TD-lpMi6ntWSTKTP2SlzKP1ytkjRbI,22
2
- mistocr/_modidx.py,sha256=4apaXSL_JCEHVA8B-tH3w_23jIZBzBflE3vnQ9TQDDo,4176
3
- mistocr/core.py,sha256=EQYQgpnX2skgSX123u3dYaJHc1oDk5Nhgt5uBdXnCKs,8386
4
- mistocr/pipeline.py,sha256=TDk8qMGFLpQReGelNyQ9mHnh2cIb-5P2YgpDHC0iqdI,438
5
- mistocr/refine.py,sha256=hQg4ZYRTkPFEiCNNBU2ykgXFfVLCNAp2IuwOwHBVQ2k,12983
6
- mistocr-0.3.1.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
- mistocr-0.3.1.dist-info/METADATA,sha256=O6X89wifAKyOWhee-7SN5SuDkuZZ0W5FIOEEGZ_8J3M,8438
8
- mistocr-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
- mistocr-0.3.1.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
- mistocr-0.3.1.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
- mistocr-0.3.1.dist-info/RECORD,,