mistocr 0.2.1__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
mistocr/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.2.1"
1
+ __version__ = "0.2.7"
mistocr/_modidx.py CHANGED
@@ -21,7 +21,8 @@ d = { 'settings': { 'branch': 'main',
21
21
  'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
22
22
  'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
23
23
  'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
24
- 'mistocr.refine': { 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
24
+ 'mistocr.refine': { 'mistocr.refine.HeadingCorrection': ('refine.html#headingcorrection', 'mistocr/refine.py'),
25
+ 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
25
26
  'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
26
27
  'mistocr.refine.add_descs_to_pg': ('refine.html#add_descs_to_pg', 'mistocr/refine.py'),
27
28
  'mistocr.refine.add_descs_to_pgs': ('refine.html#add_descs_to_pgs', 'mistocr/refine.py'),
mistocr/refine.py CHANGED
@@ -3,7 +3,7 @@
3
3
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
4
4
 
5
5
  # %% auto 0
6
- __all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
6
+ __all__ = ['describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx', 'HeadingCorrection',
7
7
  'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
8
8
  'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs', 'add_descs_to_pg', 'add_descs_to_pgs',
9
9
  'add_img_descs']
@@ -59,40 +59,12 @@ def fmt_hdgs_idx(
59
59
 
60
60
 
61
61
  # %% ../nbs/01_refine.ipynb 18
62
- class HeadingCorrections(BaseModel):
63
- corrections: dict[int, str] # index → corrected heading
64
-
65
- # %% ../nbs/01_refine.ipynb 20
66
- prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
67
-
68
- INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
69
-
70
- RULES - Apply these fixes in order:
71
-
72
- 1. **Single H1 rule**: Documents must have exactly ONE # heading (typically the document title at the top)
73
- - If index 0 is already #, then all subsequent headings (index 1+) must be ## or deeper
74
- - If no H1 exists, the first major heading should be #, and all others ## or deeper
75
- - NO exceptions: appendices, references, and all sections are ## or deeper after the title
76
-
77
- 2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
78
- - Parent section (e.g., "1", "2", "A") should be shallower than child (e.g., "1.1", "2.a", "A.1")
79
- - Child section should be one # deeper than parent
80
- - Works with any numbering: "1/1.1/1.1.1", "A/A.1/A.1.a", "I/I.A/I.A.1", etc.
81
-
82
- 3. **Level jumps**: Headings can only increase by one # at a time when moving deeper
83
- - Wrong: ## Section → ##### Subsection
84
- - Fixed: ## Section → ### Subsection
85
-
86
- 4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
87
-
88
- OUTPUT: Return a Python dictionary mapping index to corrected heading (without the index prefix).
89
- IMPORTANT: Preserve the " ... page N" suffix in all corrected headings.
90
- Only include entries that need changes.
91
-
92
- Headings to analyze:
93
- {headings_list}
94
- """
62
+ class HeadingCorrection(BaseModel):
63
+ index: int
64
+ corrected: str
95
65
 
66
+ class HeadingCorrections(BaseModel):
67
+ corrections: list[HeadingCorrection]
96
68
 
97
69
  # %% ../nbs/01_refine.ipynb 22
98
70
  def fix_hdg_hierarchy(
@@ -106,7 +78,8 @@ def fix_hdg_hierarchy(
106
78
  if prompt is None: prompt = prompt_fix_hdgs
107
79
  prompt = prompt.format(headings_list=fmt_hdgs_idx(hdgs))
108
80
  r = completion(model=model, messages=[{"role": "user", "content": prompt}], response_format=HeadingCorrections, api_key=api_key)
109
- return json.loads(r.choices[0].message.content)['corrections']
81
+ fixes = json.loads(r.choices[0].message.content)['corrections']
82
+ return {o['index']: o['corrected'] for o in fixes}
110
83
 
111
84
 
112
85
  # %% ../nbs/01_refine.ipynb 25
@@ -120,7 +93,7 @@ def mk_fixes_lut(
120
93
  "Make a lookup table of fixes"
121
94
  if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
122
95
  fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
123
- return {hdgs[int(k)]:v for k,v in fixes.items()}
96
+ return {hdgs[k]:v for k,v in fixes.items()}
124
97
 
125
98
  # %% ../nbs/01_refine.ipynb 28
126
99
  def apply_hdg_fixes(
@@ -197,8 +170,8 @@ async def describe_imgs(
197
170
  imgs: list[Path], # List of image file paths to describe
198
171
  model: str = 'claude-sonnet-4-5', # Model to use for image description
199
172
  prompt: str = describe_img_prompt, # Prompt template for description
200
- semaphore: int = 2, # Max concurrent API requests
201
- delay: float = 1 # Delay in seconds between requests
173
+ semaphore: int = 10, # Max concurrent API requests
174
+ delay: float = 0.1 # Delay in seconds between requests
202
175
  ) -> dict[str, dict]: # Dict mapping filename to parsed description
203
176
  "Describe multiple images in parallel with rate limiting"
204
177
  sem = Semaphore(semaphore)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.2.1
3
+ Version: 0.2.7
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -38,7 +38,7 @@ Dynamic: requires-dist
38
38
  Dynamic: requires-python
39
39
  Dynamic: summary
40
40
 
41
- # mistocr
41
+ # Mistocr
42
42
 
43
43
 
44
44
  <!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->
@@ -69,12 +69,20 @@ fundamental challenges that raw OCR output leaves unsolved:
69
69
  markdown. This makes visual information searchable and accessible for
70
70
  downstream applications.
71
71
 
72
- - **Cost-efficient batch processing**: By exclusively using Mistral’s
73
- batch API, mistocr cuts costs by 50% (\$0.50 vs \$1.00 per 1000 pages)
74
- while eliminating the boilerplate code typically required.
72
+ - **Cost-efficient batch processing**: The OCR step exclusively uses
73
+ Mistral’s batch API, cutting costs by 50% (\$0.50 vs \$1.00 per 1000
74
+ pages) while eliminating the boilerplate code typically required.
75
75
 
76
- **In short**: Production-ready batch OCR with intelligent postprocessing
77
- that ensures your documents are actually usable for AI systems.
76
+ **In short**: Complete PDF OCR with heading hierarchy fixes and image
77
+ descriptions for RAG and LLM pipelines.
78
+
79
+ > [!NOTE]
80
+ >
81
+ > **Want to see mistocr in action?** This
82
+ > [tutorial](https://share.solve.it.com/d/97f75412ca949af76a5945b4dfc443c7)
83
+ > demonstrates real-world PDF processing and shows how clean markdown
84
+ > enables structure-aware navigation through long documents—letting you
85
+ > find exactly what you need, fast.
78
86
 
79
87
  ## Get Started
80
88
 
@@ -94,7 +102,10 @@ os.environ['ANTHROPIC_API_KEY'] = 'your-key-here' # for refine features (see Ad
94
102
 
95
103
  ### Complete Pipeline
96
104
 
97
- Full pipeline with all features:
105
+ #### Single File Processing
106
+
107
+ Process a single PDF with OCR (using Mistral’s batch API for cost
108
+ efficiency), heading fixes, and image descriptions:
98
109
 
99
110
  ``` python
100
111
  from mistocr.pipeline import pdf_to_md
@@ -171,12 +182,12 @@ instead.
171
182
 
172
183
  ### Advanced Usage
173
184
 
174
- **Batch process entire folders:**
185
+ **Batch OCR for entire folders:**
175
186
 
176
187
  ``` python
177
188
  from mistocr.core import ocr_pdf
178
189
 
179
- # Process all PDFs in a folder
190
+ # OCR all PDFs in a folder using Mistral's batch API
180
191
  output_dirs = ocr_pdf('path/to/pdf_folder', dst='output_folder')
181
192
  ```
182
193
 
@@ -0,0 +1,11 @@
1
+ mistocr/__init__.py,sha256=XHypfHSPdgXFKmOdoewn7czU670gt8InhHhzlP5j_aA,22
2
+ mistocr/_modidx.py,sha256=55k_EmMPrRmLa04qCE-_8lukJPCax66q5MALfpa8T4A,4154
3
+ mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
4
+ mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
5
+ mistocr/refine.py,sha256=zSCF0gOtEKhhQTQgVq4Jh5Ujk8l8CGSO_rURhsQ09P8,10351
6
+ mistocr-0.2.7.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
+ mistocr-0.2.7.dist-info/METADATA,sha256=eyQ65s8HsoHUUINrGiijrC8e0RzO_Wvte3rk2OLU8QY,8416
8
+ mistocr-0.2.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
+ mistocr-0.2.7.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
+ mistocr-0.2.7.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
+ mistocr-0.2.7.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- mistocr/__init__.py,sha256=HfjVOrpTnmZ-xVFCYSVmX50EXaBQeJteUHG-PD6iQs8,22
2
- mistocr/_modidx.py,sha256=JtXJb6ErVwyGFCWgeVz7N9JvybBp8Zyao4Ahn3smr90,4030
3
- mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
4
- mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
5
- mistocr/refine.py,sha256=zsPoxWD63bk1rzRVO9OPsevWeMNORHgT_y8H7T7CxYs,11785
6
- mistocr-0.2.1.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
- mistocr-0.2.1.dist-info/METADATA,sha256=-y9Ze92RygrKGCfHbBjlGXlv-5iRYVAOyHtC9MHnplw,7990
8
- mistocr-0.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
- mistocr-0.2.1.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
- mistocr-0.2.1.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
- mistocr-0.2.1.dist-info/RECORD,,