mistocr 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mistocr/__init__.py +1 -1
- mistocr/_modidx.py +1 -1
- mistocr/pipeline.py +2 -24
- {mistocr-0.3.0.dist-info → mistocr-0.3.1.dist-info}/METADATA +1 -1
- mistocr-0.3.1.dist-info/RECORD +11 -0
- mistocr-0.3.0.dist-info/RECORD +0 -11
- {mistocr-0.3.0.dist-info → mistocr-0.3.1.dist-info}/WHEEL +0 -0
- {mistocr-0.3.0.dist-info → mistocr-0.3.1.dist-info}/entry_points.txt +0 -0
- {mistocr-0.3.0.dist-info → mistocr-0.3.1.dist-info}/licenses/LICENSE +0 -0
- {mistocr-0.3.0.dist-info → mistocr-0.3.1.dist-info}/top_level.txt +0 -0
mistocr/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.3.0"
|
|
1
|
+
__version__ = "0.3.1"
|
mistocr/_modidx.py
CHANGED
|
@@ -21,7 +21,7 @@ d = { 'settings': { 'branch': 'main',
|
|
|
21
21
|
'mistocr.core.subset_pdf': ('core.html#subset_pdf', 'mistocr/core.py'),
|
|
22
22
|
'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
|
|
23
23
|
'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
|
|
24
|
-
'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
|
|
24
|
+
'mistocr.pipeline': {},
|
|
25
25
|
'mistocr.refine': { 'mistocr.refine.HeadingCorrection': ('refine.html#headingcorrection', 'mistocr/refine.py'),
|
|
26
26
|
'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
|
|
27
27
|
'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
|
mistocr/pipeline.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
|
|
4
4
|
|
|
5
5
|
# %% auto 0
|
|
6
|
-
__all__ = ['pdf_to_md']
|
|
6
|
+
__all__ = []
|
|
7
7
|
|
|
8
8
|
# %% ../nbs/02_pipeline.ipynb 3
|
|
9
9
|
from fastcore.all import *
|
|
@@ -11,27 +11,5 @@ from .core import read_pgs, ocr_pdf
|
|
|
11
11
|
from .refine import add_img_descs, fix_hdgs
|
|
12
12
|
from pathlib import Path
|
|
13
13
|
from asyncio import Semaphore, gather, sleep
|
|
14
|
+
import tempfile
|
|
14
15
|
import os, json, shutil
|
|
15
|
-
|
|
16
|
-
# %% ../nbs/02_pipeline.ipynb 4
|
|
17
|
-
@delegates(add_img_descs)
|
|
18
|
-
async def pdf_to_md(
|
|
19
|
-
pdf_path:str, # Path to input PDF file
|
|
20
|
-
dst:str, # Destination directory for output markdown
|
|
21
|
-
ocr_output:str=None, # Optional OCR output directory (defaults to pdf_path stem)
|
|
22
|
-
model:str='claude-sonnet-4-5', # Model to use for heading fixes and image descriptions
|
|
23
|
-
add_img_desc:bool=True, # Whether to add image descriptions
|
|
24
|
-
progress:bool=True, # Whether to show progress messages
|
|
25
|
-
**kwargs):
|
|
26
|
-
"Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
|
|
27
|
-
n_steps = 3 if add_img_desc else 2
|
|
28
|
-
if progress: print(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
|
|
29
|
-
ocr_dirs = ocr_pdf(pdf_path, ocr_output or 'ocr_temp')
|
|
30
|
-
ocr_dir = ocr_dirs[0]
|
|
31
|
-
if progress: print(f"Step 2/{n_steps}: Fixing heading hierarchy...")
|
|
32
|
-
fix_hdgs(ocr_dir, model=model)
|
|
33
|
-
if add_img_desc:
|
|
34
|
-
if progress: print(f"Step 3/{n_steps}: Adding image descriptions...")
|
|
35
|
-
await add_img_descs(ocr_dir, dst=dst, model=model, progress=progress, **kwargs)
|
|
36
|
-
elif dst and Path(dst) != ocr_dir: shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
|
|
37
|
-
if progress: print("Done!")
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
mistocr/__init__.py,sha256=r4xAFihOf72W9TD-lpMi6ntWSTKTP2SlzKP1ytkjRbI,22
|
|
2
|
+
mistocr/_modidx.py,sha256=4apaXSL_JCEHVA8B-tH3w_23jIZBzBflE3vnQ9TQDDo,4176
|
|
3
|
+
mistocr/core.py,sha256=EQYQgpnX2skgSX123u3dYaJHc1oDk5Nhgt5uBdXnCKs,8386
|
|
4
|
+
mistocr/pipeline.py,sha256=TDk8qMGFLpQReGelNyQ9mHnh2cIb-5P2YgpDHC0iqdI,438
|
|
5
|
+
mistocr/refine.py,sha256=hQg4ZYRTkPFEiCNNBU2ykgXFfVLCNAp2IuwOwHBVQ2k,12983
|
|
6
|
+
mistocr-0.3.1.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
7
|
+
mistocr-0.3.1.dist-info/METADATA,sha256=O6X89wifAKyOWhee-7SN5SuDkuZZ0W5FIOEEGZ_8J3M,8438
|
|
8
|
+
mistocr-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
+
mistocr-0.3.1.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
10
|
+
mistocr-0.3.1.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
11
|
+
mistocr-0.3.1.dist-info/RECORD,,
|
mistocr-0.3.0.dist-info/RECORD
DELETED
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
mistocr/__init__.py,sha256=VrXpHDu3erkzwl_WXrqINBm9xWkcyUy53IQOj042dOs,22
|
|
2
|
-
mistocr/_modidx.py,sha256=2rHVTcz3A3BfDCmGqUNsJFW3_n3Ch1vxSorrPVyLvMI,4256
|
|
3
|
-
mistocr/core.py,sha256=EQYQgpnX2skgSX123u3dYaJHc1oDk5Nhgt5uBdXnCKs,8386
|
|
4
|
-
mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
|
|
5
|
-
mistocr/refine.py,sha256=hQg4ZYRTkPFEiCNNBU2ykgXFfVLCNAp2IuwOwHBVQ2k,12983
|
|
6
|
-
mistocr-0.3.0.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
7
|
-
mistocr-0.3.0.dist-info/METADATA,sha256=XTduA6AA_UcHYrjeJo4oOl9tQYnIfqfiP3oNJvy_nTI,8438
|
|
8
|
-
mistocr-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
-
mistocr-0.3.0.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
10
|
-
mistocr-0.3.0.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
11
|
-
mistocr-0.3.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|