mistocr 0.2.1__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mistocr/__init__.py +1 -1
- mistocr/_modidx.py +2 -1
- mistocr/refine.py +11 -38
- {mistocr-0.2.1.dist-info → mistocr-0.2.7.dist-info}/METADATA +21 -10
- mistocr-0.2.7.dist-info/RECORD +11 -0
- mistocr-0.2.1.dist-info/RECORD +0 -11
- {mistocr-0.2.1.dist-info → mistocr-0.2.7.dist-info}/WHEEL +0 -0
- {mistocr-0.2.1.dist-info → mistocr-0.2.7.dist-info}/entry_points.txt +0 -0
- {mistocr-0.2.1.dist-info → mistocr-0.2.7.dist-info}/licenses/LICENSE +0 -0
- {mistocr-0.2.1.dist-info → mistocr-0.2.7.dist-info}/top_level.txt +0 -0
mistocr/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.2.1"
|
|
1
|
+
__version__ = "0.2.7"
|
mistocr/_modidx.py
CHANGED
|
@@ -21,7 +21,8 @@ d = { 'settings': { 'branch': 'main',
|
|
|
21
21
|
'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
|
|
22
22
|
'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
|
|
23
23
|
'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
|
|
24
|
-
'mistocr.refine': { 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
|
|
24
|
+
'mistocr.refine': { 'mistocr.refine.HeadingCorrection': ('refine.html#headingcorrection', 'mistocr/refine.py'),
|
|
25
|
+
'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
|
|
25
26
|
'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
|
|
26
27
|
'mistocr.refine.add_descs_to_pg': ('refine.html#add_descs_to_pg', 'mistocr/refine.py'),
|
|
27
28
|
'mistocr.refine.add_descs_to_pgs': ('refine.html#add_descs_to_pgs', 'mistocr/refine.py'),
|
mistocr/refine.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
|
|
4
4
|
|
|
5
5
|
# %% auto 0
|
|
6
|
-
__all__ = ['
|
|
6
|
+
__all__ = ['describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx', 'HeadingCorrection',
|
|
7
7
|
'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
|
|
8
8
|
'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs', 'add_descs_to_pg', 'add_descs_to_pgs',
|
|
9
9
|
'add_img_descs']
|
|
@@ -59,40 +59,12 @@ def fmt_hdgs_idx(
|
|
|
59
59
|
|
|
60
60
|
|
|
61
61
|
# %% ../nbs/01_refine.ipynb 18
|
|
62
|
-
class
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
# %% ../nbs/01_refine.ipynb 20
|
|
66
|
-
prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
|
|
67
|
-
|
|
68
|
-
INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
|
|
69
|
-
|
|
70
|
-
RULES - Apply these fixes in order:
|
|
71
|
-
|
|
72
|
-
1. **Single H1 rule**: Documents must have exactly ONE # heading (typically the document title at the top)
|
|
73
|
-
- If index 0 is already #, then all subsequent headings (index 1+) must be ## or deeper
|
|
74
|
-
- If no H1 exists, the first major heading should be #, and all others ## or deeper
|
|
75
|
-
- NO exceptions: appendices, references, and all sections are ## or deeper after the title
|
|
76
|
-
|
|
77
|
-
2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
|
|
78
|
-
- Parent section (e.g., "1", "2", "A") should be shallower than child (e.g., "1.1", "2.a", "A.1")
|
|
79
|
-
- Child section should be one # deeper than parent
|
|
80
|
-
- Works with any numbering: "1/1.1/1.1.1", "A/A.1/A.1.a", "I/I.A/I.A.1", etc.
|
|
81
|
-
|
|
82
|
-
3. **Level jumps**: Headings can only increase by one # at a time when moving deeper
|
|
83
|
-
- Wrong: ## Section → ##### Subsection
|
|
84
|
-
- Fixed: ## Section → ### Subsection
|
|
85
|
-
|
|
86
|
-
4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
|
|
87
|
-
|
|
88
|
-
OUTPUT: Return a Python dictionary mapping index to corrected heading (without the index prefix).
|
|
89
|
-
IMPORTANT: Preserve the " ... page N" suffix in all corrected headings.
|
|
90
|
-
Only include entries that need changes.
|
|
91
|
-
|
|
92
|
-
Headings to analyze:
|
|
93
|
-
{headings_list}
|
|
94
|
-
"""
|
|
62
|
+
class HeadingCorrection(BaseModel):
|
|
63
|
+
index: int
|
|
64
|
+
corrected: str
|
|
95
65
|
|
|
66
|
+
class HeadingCorrections(BaseModel):
|
|
67
|
+
corrections: list[HeadingCorrection]
|
|
96
68
|
|
|
97
69
|
# %% ../nbs/01_refine.ipynb 22
|
|
98
70
|
def fix_hdg_hierarchy(
|
|
@@ -106,7 +78,8 @@ def fix_hdg_hierarchy(
|
|
|
106
78
|
if prompt is None: prompt = prompt_fix_hdgs
|
|
107
79
|
prompt = prompt.format(headings_list=fmt_hdgs_idx(hdgs))
|
|
108
80
|
r = completion(model=model, messages=[{"role": "user", "content": prompt}], response_format=HeadingCorrections, api_key=api_key)
|
|
109
|
-
|
|
81
|
+
fixes = json.loads(r.choices[0].message.content)['corrections']
|
|
82
|
+
return {o['index']: o['corrected'] for o in fixes}
|
|
110
83
|
|
|
111
84
|
|
|
112
85
|
# %% ../nbs/01_refine.ipynb 25
|
|
@@ -120,7 +93,7 @@ def mk_fixes_lut(
|
|
|
120
93
|
"Make a lookup table of fixes"
|
|
121
94
|
if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
|
|
122
95
|
fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
|
|
123
|
-
return {hdgs[
|
|
96
|
+
return {hdgs[k]:v for k,v in fixes.items()}
|
|
124
97
|
|
|
125
98
|
# %% ../nbs/01_refine.ipynb 28
|
|
126
99
|
def apply_hdg_fixes(
|
|
@@ -197,8 +170,8 @@ async def describe_imgs(
|
|
|
197
170
|
imgs: list[Path], # List of image file paths to describe
|
|
198
171
|
model: str = 'claude-sonnet-4-5', # Model to use for image description
|
|
199
172
|
prompt: str = describe_img_prompt, # Prompt template for description
|
|
200
|
-
semaphore: int =
|
|
201
|
-
delay: float = 1 # Delay in seconds between requests
|
|
173
|
+
semaphore: int = 10, # Max concurrent API requests
|
|
174
|
+
delay: float = 0.1 # Delay in seconds between requests
|
|
202
175
|
) -> dict[str, dict]: # Dict mapping filename to parsed description
|
|
203
176
|
"Describe multiple images in parallel with rate limiting"
|
|
204
177
|
sem = Semaphore(semaphore)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mistocr
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.2.7
|
|
4
4
|
Summary: Batch OCR for PDFs with heading restoration and visual content integration
|
|
5
5
|
Home-page: https://github.com/franckalbinet/mistocr
|
|
6
6
|
Author: Solveit
|
|
@@ -38,7 +38,7 @@ Dynamic: requires-dist
|
|
|
38
38
|
Dynamic: requires-python
|
|
39
39
|
Dynamic: summary
|
|
40
40
|
|
|
41
|
-
#
|
|
41
|
+
# Mistocr
|
|
42
42
|
|
|
43
43
|
|
|
44
44
|
<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->
|
|
@@ -69,12 +69,20 @@ fundamental challenges that raw OCR output leaves unsolved:
|
|
|
69
69
|
markdown. This makes visual information searchable and accessible for
|
|
70
70
|
downstream applications.
|
|
71
71
|
|
|
72
|
-
- **Cost-efficient batch processing**:
|
|
73
|
-
batch API,
|
|
74
|
-
while eliminating the boilerplate code typically required.
|
|
72
|
+
- **Cost-efficient batch processing**: The OCR step exclusively uses
|
|
73
|
+
Mistral’s batch API, cutting costs by 50% (\$0.50 vs \$1.00 per 1000
|
|
74
|
+
pages) while eliminating the boilerplate code typically required.
|
|
75
75
|
|
|
76
|
-
**In short**:
|
|
77
|
-
|
|
76
|
+
**In short**: Complete PDF OCR with heading hierarchy fixes and image
|
|
77
|
+
descriptions for RAG and LLM pipelines.
|
|
78
|
+
|
|
79
|
+
> [!NOTE]
|
|
80
|
+
>
|
|
81
|
+
> **Want to see mistocr in action?** This
|
|
82
|
+
> [tutorial](https://share.solve.it.com/d/97f75412ca949af76a5945b4dfc443c7)
|
|
83
|
+
> demonstrates real-world PDF processing and shows how clean markdown
|
|
84
|
+
> enables structure-aware navigation through long documents—letting you
|
|
85
|
+
> find exactly what you need, fast.
|
|
78
86
|
|
|
79
87
|
## Get Started
|
|
80
88
|
|
|
@@ -94,7 +102,10 @@ os.environ['ANTHROPIC_API_KEY'] = 'your-key-here' # for refine features (see Ad
|
|
|
94
102
|
|
|
95
103
|
### Complete Pipeline
|
|
96
104
|
|
|
97
|
-
|
|
105
|
+
#### Single File Processing
|
|
106
|
+
|
|
107
|
+
Process a single PDF with OCR (using Mistral’s batch API for cost
|
|
108
|
+
efficiency), heading fixes, and image descriptions:
|
|
98
109
|
|
|
99
110
|
``` python
|
|
100
111
|
from mistocr.pipeline import pdf_to_md
|
|
@@ -171,12 +182,12 @@ instead.
|
|
|
171
182
|
|
|
172
183
|
### Advanced Usage
|
|
173
184
|
|
|
174
|
-
**Batch
|
|
185
|
+
**Batch OCR for entire folders:**
|
|
175
186
|
|
|
176
187
|
``` python
|
|
177
188
|
from mistocr.core import ocr_pdf
|
|
178
189
|
|
|
179
|
-
#
|
|
190
|
+
# OCR all PDFs in a folder using Mistral's batch API
|
|
180
191
|
output_dirs = ocr_pdf('path/to/pdf_folder', dst='output_folder')
|
|
181
192
|
```
|
|
182
193
|
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
mistocr/__init__.py,sha256=XHypfHSPdgXFKmOdoewn7czU670gt8InhHhzlP5j_aA,22
|
|
2
|
+
mistocr/_modidx.py,sha256=55k_EmMPrRmLa04qCE-_8lukJPCax66q5MALfpa8T4A,4154
|
|
3
|
+
mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
|
|
4
|
+
mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
|
|
5
|
+
mistocr/refine.py,sha256=zSCF0gOtEKhhQTQgVq4Jh5Ujk8l8CGSO_rURhsQ09P8,10351
|
|
6
|
+
mistocr-0.2.7.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
7
|
+
mistocr-0.2.7.dist-info/METADATA,sha256=eyQ65s8HsoHUUINrGiijrC8e0RzO_Wvte3rk2OLU8QY,8416
|
|
8
|
+
mistocr-0.2.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
+
mistocr-0.2.7.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
10
|
+
mistocr-0.2.7.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
11
|
+
mistocr-0.2.7.dist-info/RECORD,,
|
mistocr-0.2.1.dist-info/RECORD
DELETED
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
mistocr/__init__.py,sha256=HfjVOrpTnmZ-xVFCYSVmX50EXaBQeJteUHG-PD6iQs8,22
|
|
2
|
-
mistocr/_modidx.py,sha256=JtXJb6ErVwyGFCWgeVz7N9JvybBp8Zyao4Ahn3smr90,4030
|
|
3
|
-
mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
|
|
4
|
-
mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
|
|
5
|
-
mistocr/refine.py,sha256=zsPoxWD63bk1rzRVO9OPsevWeMNORHgT_y8H7T7CxYs,11785
|
|
6
|
-
mistocr-0.2.1.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
7
|
-
mistocr-0.2.1.dist-info/METADATA,sha256=-y9Ze92RygrKGCfHbBjlGXlv-5iRYVAOyHtC9MHnplw,7990
|
|
8
|
-
mistocr-0.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
-
mistocr-0.2.1.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
10
|
-
mistocr-0.2.1.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
11
|
-
mistocr-0.2.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|