mistocr 0.2.1__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
mistocr/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.2.1"
1
+ __version__ = "0.2.5"
mistocr/refine.py CHANGED
@@ -197,8 +197,8 @@ async def describe_imgs(
197
197
  imgs: list[Path], # List of image file paths to describe
198
198
  model: str = 'claude-sonnet-4-5', # Model to use for image description
199
199
  prompt: str = describe_img_prompt, # Prompt template for description
200
- semaphore: int = 2, # Max concurrent API requests
201
- delay: float = 1 # Delay in seconds between requests
200
+ semaphore: int = 10, # Max concurrent API requests
201
+ delay: float = 0.1 # Delay in seconds between requests
202
202
  ) -> dict[str, dict]: # Dict mapping filename to parsed description
203
203
  "Describe multiple images in parallel with rate limiting"
204
204
  sem = Semaphore(semaphore)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.2.1
3
+ Version: 0.2.5
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -38,7 +38,7 @@ Dynamic: requires-dist
38
38
  Dynamic: requires-python
39
39
  Dynamic: summary
40
40
 
41
- # mistocr
41
+ # Mistocr
42
42
 
43
43
 
44
44
  <!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->
@@ -69,12 +69,20 @@ fundamental challenges that raw OCR output leaves unsolved:
69
69
  markdown. This makes visual information searchable and accessible for
70
70
  downstream applications.
71
71
 
72
- - **Cost-efficient batch processing**: By exclusively using Mistral’s
73
- batch API, mistocr cuts costs by 50% (\$0.50 vs \$1.00 per 1000 pages)
74
- while eliminating the boilerplate code typically required.
72
+ - **Cost-efficient batch processing**: The OCR step exclusively uses
73
+ Mistral’s batch API, cutting costs by 50% (\$0.50 vs \$1.00 per 1000
74
+ pages) while eliminating the boilerplate code typically required.
75
75
 
76
- **In short**: Production-ready batch OCR with intelligent postprocessing
77
- that ensures your documents are actually usable for AI systems.
76
+ **In short**: Complete PDF OCR with heading hierarchy fixes and image
77
+ descriptions for RAG and LLM pipelines.
78
+
79
+ > [!NOTE]
80
+ >
81
+ > **Want to see mistocr in action?** This
82
+ > [tutorial](https://share.solve.it.com/d/97f75412ca949af76a5945b4dfc443c7)
83
+ > demonstrates real-world PDF processing and shows how clean markdown
84
+ > enables structure-aware navigation through long documents—letting you
85
+ > find exactly what you need, fast.
78
86
 
79
87
  ## Get Started
80
88
 
@@ -94,7 +102,10 @@ os.environ['ANTHROPIC_API_KEY'] = 'your-key-here' # for refine features (see Ad
94
102
 
95
103
  ### Complete Pipeline
96
104
 
97
- Full pipeline with all features:
105
+ #### Single File Processing
106
+
107
+ Process a single PDF with OCR (using Mistral’s batch API for cost
108
+ efficiency), heading fixes, and image descriptions:
98
109
 
99
110
  ``` python
100
111
  from mistocr.pipeline import pdf_to_md
@@ -171,12 +182,12 @@ instead.
171
182
 
172
183
  ### Advanced Usage
173
184
 
174
- **Batch process entire folders:**
185
+ **Batch OCR for entire folders:**
175
186
 
176
187
  ``` python
177
188
  from mistocr.core import ocr_pdf
178
189
 
179
- # Process all PDFs in a folder
190
+ # OCR all PDFs in a folder using Mistral's batch API
180
191
  output_dirs = ocr_pdf('path/to/pdf_folder', dst='output_folder')
181
192
  ```
182
193
 
@@ -0,0 +1,11 @@
1
+ mistocr/__init__.py,sha256=Xsa3ayOMVkhUWm4t06YeyHE0apjpZefxLH4ylp0CDtU,22
2
+ mistocr/_modidx.py,sha256=JtXJb6ErVwyGFCWgeVz7N9JvybBp8Zyao4Ahn3smr90,4030
3
+ mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
4
+ mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
5
+ mistocr/refine.py,sha256=arJPOg1eP4MwtkD1zwnYY4EFrBfTTSP-mtR4AVnTiR8,11788
6
+ mistocr-0.2.5.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
+ mistocr-0.2.5.dist-info/METADATA,sha256=uGim0pZ4V3-oolsihRFr4aOWh3ZDOO7u3d8Mn0n-gmc,8416
8
+ mistocr-0.2.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
+ mistocr-0.2.5.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
+ mistocr-0.2.5.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
+ mistocr-0.2.5.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- mistocr/__init__.py,sha256=HfjVOrpTnmZ-xVFCYSVmX50EXaBQeJteUHG-PD6iQs8,22
2
- mistocr/_modidx.py,sha256=JtXJb6ErVwyGFCWgeVz7N9JvybBp8Zyao4Ahn3smr90,4030
3
- mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
4
- mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
5
- mistocr/refine.py,sha256=zsPoxWD63bk1rzRVO9OPsevWeMNORHgT_y8H7T7CxYs,11785
6
- mistocr-0.2.1.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
- mistocr-0.2.1.dist-info/METADATA,sha256=-y9Ze92RygrKGCfHbBjlGXlv-5iRYVAOyHtC9MHnplw,7990
8
- mistocr-0.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
- mistocr-0.2.1.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
- mistocr-0.2.1.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
- mistocr-0.2.1.dist-info/RECORD,,