mistocr 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
mistocr/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.2.1"
1
+ __version__ = "0.2.3"
mistocr/refine.py CHANGED
@@ -197,8 +197,8 @@ async def describe_imgs(
197
197
  imgs: list[Path], # List of image file paths to describe
198
198
  model: str = 'claude-sonnet-4-5', # Model to use for image description
199
199
  prompt: str = describe_img_prompt, # Prompt template for description
200
- semaphore: int = 2, # Max concurrent API requests
201
- delay: float = 1 # Delay in seconds between requests
200
+ semaphore: int = 10, # Max concurrent API requests
201
+ delay: float = 0.1 # Delay in seconds between requests
202
202
  ) -> dict[str, dict]: # Dict mapping filename to parsed description
203
203
  "Describe multiple images in parallel with rate limiting"
204
204
  sem = Semaphore(semaphore)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.2.1
3
+ Version: 0.2.3
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -38,7 +38,7 @@ Dynamic: requires-dist
38
38
  Dynamic: requires-python
39
39
  Dynamic: summary
40
40
 
41
- # mistocr
41
+ # Mistocr
42
42
 
43
43
 
44
44
  <!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->
@@ -69,12 +69,12 @@ fundamental challenges that raw OCR output leaves unsolved:
69
69
  markdown. This makes visual information searchable and accessible for
70
70
  downstream applications.
71
71
 
72
- - **Cost-efficient batch processing**: By exclusively using Mistral’s
73
- batch API, mistocr cuts costs by 50% (\$0.50 vs \$1.00 per 1000 pages)
74
- while eliminating the boilerplate code typically required.
72
+ - **Cost-efficient batch processing**: The OCR step exclusively uses
73
+ Mistral’s batch API, cutting costs by 50% (\$0.50 vs \$1.00 per 1000
74
+ pages) while eliminating the boilerplate code typically required.
75
75
 
76
- **In short**: Production-ready batch OCR with intelligent postprocessing
77
- that ensures your documents are actually usable for AI systems.
76
+ **In short**: Complete PDF OCR with heading hierarchy fixes and image
77
+ descriptions for RAG and LLM pipelines.
78
78
 
79
79
  ## Get Started
80
80
 
@@ -94,7 +94,10 @@ os.environ['ANTHROPIC_API_KEY'] = 'your-key-here' # for refine features (see Ad
94
94
 
95
95
  ### Complete Pipeline
96
96
 
97
- Full pipeline with all features:
97
+ #### Single File Processing
98
+
99
+ Process a single PDF with OCR (using Mistral’s batch API for cost
100
+ efficiency), heading fixes, and image descriptions:
98
101
 
99
102
  ``` python
100
103
  from mistocr.pipeline import pdf_to_md
@@ -171,12 +174,12 @@ instead.
171
174
 
172
175
  ### Advanced Usage
173
176
 
174
- **Batch process entire folders:**
177
+ **Batch OCR for entire folders:**
175
178
 
176
179
  ``` python
177
180
  from mistocr.core import ocr_pdf
178
181
 
179
- # Process all PDFs in a folder
182
+ # OCR all PDFs in a folder using Mistral's batch API
180
183
  output_dirs = ocr_pdf('path/to/pdf_folder', dst='output_folder')
181
184
  ```
182
185
 
@@ -0,0 +1,11 @@
1
+ mistocr/__init__.py,sha256=PNiDER4qM19h9zdsdfgKt2_dT4WgYK7EguJ8RU2qA_g,22
2
+ mistocr/_modidx.py,sha256=JtXJb6ErVwyGFCWgeVz7N9JvybBp8Zyao4Ahn3smr90,4030
3
+ mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
4
+ mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
5
+ mistocr/refine.py,sha256=arJPOg1eP4MwtkD1zwnYY4EFrBfTTSP-mtR4AVnTiR8,11788
6
+ mistocr-0.2.3.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
+ mistocr-0.2.3.dist-info/METADATA,sha256=73k28u9AtCL1xcoCbYvzDo_JRCDpmUYtomwQFX04KVE,8105
8
+ mistocr-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
+ mistocr-0.2.3.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
+ mistocr-0.2.3.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
+ mistocr-0.2.3.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- mistocr/__init__.py,sha256=HfjVOrpTnmZ-xVFCYSVmX50EXaBQeJteUHG-PD6iQs8,22
2
- mistocr/_modidx.py,sha256=JtXJb6ErVwyGFCWgeVz7N9JvybBp8Zyao4Ahn3smr90,4030
3
- mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
4
- mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
5
- mistocr/refine.py,sha256=zsPoxWD63bk1rzRVO9OPsevWeMNORHgT_y8H7T7CxYs,11785
6
- mistocr-0.2.1.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
- mistocr-0.2.1.dist-info/METADATA,sha256=-y9Ze92RygrKGCfHbBjlGXlv-5iRYVAOyHtC9MHnplw,7990
8
- mistocr-0.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
- mistocr-0.2.1.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
- mistocr-0.2.1.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
- mistocr-0.2.1.dist-info/RECORD,,