mistocr 0.2.1__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mistocr-0.2.1/mistocr.egg-info → mistocr-0.2.2}/PKG-INFO +13 -10
- {mistocr-0.2.1 → mistocr-0.2.2}/README.md +12 -9
- mistocr-0.2.2/mistocr/__init__.py +1 -0
- {mistocr-0.2.1 → mistocr-0.2.2}/mistocr/refine.py +2 -2
- {mistocr-0.2.1 → mistocr-0.2.2/mistocr.egg-info}/PKG-INFO +13 -10
- {mistocr-0.2.1 → mistocr-0.2.2}/settings.ini +1 -1
- mistocr-0.2.1/mistocr/__init__.py +0 -1
- {mistocr-0.2.1 → mistocr-0.2.2}/LICENSE +0 -0
- {mistocr-0.2.1 → mistocr-0.2.2}/MANIFEST.in +0 -0
- {mistocr-0.2.1 → mistocr-0.2.2}/mistocr/_modidx.py +0 -0
- {mistocr-0.2.1 → mistocr-0.2.2}/mistocr/core.py +0 -0
- {mistocr-0.2.1 → mistocr-0.2.2}/mistocr/pipeline.py +0 -0
- {mistocr-0.2.1 → mistocr-0.2.2}/mistocr.egg-info/SOURCES.txt +0 -0
- {mistocr-0.2.1 → mistocr-0.2.2}/mistocr.egg-info/dependency_links.txt +0 -0
- {mistocr-0.2.1 → mistocr-0.2.2}/mistocr.egg-info/entry_points.txt +0 -0
- {mistocr-0.2.1 → mistocr-0.2.2}/mistocr.egg-info/not-zip-safe +0 -0
- {mistocr-0.2.1 → mistocr-0.2.2}/mistocr.egg-info/requires.txt +0 -0
- {mistocr-0.2.1 → mistocr-0.2.2}/mistocr.egg-info/top_level.txt +0 -0
- {mistocr-0.2.1 → mistocr-0.2.2}/pyproject.toml +0 -0
- {mistocr-0.2.1 → mistocr-0.2.2}/setup.cfg +0 -0
- {mistocr-0.2.1 → mistocr-0.2.2}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mistocr
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: Batch OCR for PDFs with heading restoration and visual content integration
|
|
5
5
|
Home-page: https://github.com/franckalbinet/mistocr
|
|
6
6
|
Author: Solveit
|
|
@@ -38,7 +38,7 @@ Dynamic: requires-dist
|
|
|
38
38
|
Dynamic: requires-python
|
|
39
39
|
Dynamic: summary
|
|
40
40
|
|
|
41
|
-
#
|
|
41
|
+
# Mistocr
|
|
42
42
|
|
|
43
43
|
|
|
44
44
|
<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->
|
|
@@ -69,12 +69,12 @@ fundamental challenges that raw OCR output leaves unsolved:
|
|
|
69
69
|
markdown. This makes visual information searchable and accessible for
|
|
70
70
|
downstream applications.
|
|
71
71
|
|
|
72
|
-
- **Cost-efficient batch processing**:
|
|
73
|
-
batch API,
|
|
74
|
-
while eliminating the boilerplate code typically required.
|
|
72
|
+
- **Cost-efficient batch processing**: The OCR step exclusively uses
|
|
73
|
+
Mistral’s batch API, cutting costs by 50% (\$0.50 vs \$1.00 per 1000
|
|
74
|
+
pages) while eliminating the boilerplate code typically required.
|
|
75
75
|
|
|
76
|
-
**In short**:
|
|
77
|
-
|
|
76
|
+
**In short**: Complete PDF OCR with heading hierarchy fixes and image
|
|
77
|
+
descriptions for RAG and LLM pipelines.
|
|
78
78
|
|
|
79
79
|
## Get Started
|
|
80
80
|
|
|
@@ -94,7 +94,10 @@ os.environ['ANTHROPIC_API_KEY'] = 'your-key-here' # for refine features (see Ad
|
|
|
94
94
|
|
|
95
95
|
### Complete Pipeline
|
|
96
96
|
|
|
97
|
-
|
|
97
|
+
#### Single File Processing
|
|
98
|
+
|
|
99
|
+
Process a single PDF with OCR (using Mistral’s batch API for cost
|
|
100
|
+
efficiency), heading fixes, and image descriptions:
|
|
98
101
|
|
|
99
102
|
``` python
|
|
100
103
|
from mistocr.pipeline import pdf_to_md
|
|
@@ -171,12 +174,12 @@ instead.
|
|
|
171
174
|
|
|
172
175
|
### Advanced Usage
|
|
173
176
|
|
|
174
|
-
**Batch
|
|
177
|
+
**Batch OCR for entire folders:**
|
|
175
178
|
|
|
176
179
|
``` python
|
|
177
180
|
from mistocr.core import ocr_pdf
|
|
178
181
|
|
|
179
|
-
#
|
|
182
|
+
# OCR all PDFs in a folder using Mistral's batch API
|
|
180
183
|
output_dirs = ocr_pdf('path/to/pdf_folder', dst='output_folder')
|
|
181
184
|
```
|
|
182
185
|
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
#
|
|
1
|
+
# Mistocr
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->
|
|
@@ -29,12 +29,12 @@ fundamental challenges that raw OCR output leaves unsolved:
|
|
|
29
29
|
markdown. This makes visual information searchable and accessible for
|
|
30
30
|
downstream applications.
|
|
31
31
|
|
|
32
|
-
- **Cost-efficient batch processing**:
|
|
33
|
-
batch API,
|
|
34
|
-
while eliminating the boilerplate code typically required.
|
|
32
|
+
- **Cost-efficient batch processing**: The OCR step exclusively uses
|
|
33
|
+
Mistral’s batch API, cutting costs by 50% (\$0.50 vs \$1.00 per 1000
|
|
34
|
+
pages) while eliminating the boilerplate code typically required.
|
|
35
35
|
|
|
36
|
-
**In short**:
|
|
37
|
-
|
|
36
|
+
**In short**: Complete PDF OCR with heading hierarchy fixes and image
|
|
37
|
+
descriptions for RAG and LLM pipelines.
|
|
38
38
|
|
|
39
39
|
## Get Started
|
|
40
40
|
|
|
@@ -54,7 +54,10 @@ os.environ['ANTHROPIC_API_KEY'] = 'your-key-here' # for refine features (see Ad
|
|
|
54
54
|
|
|
55
55
|
### Complete Pipeline
|
|
56
56
|
|
|
57
|
-
|
|
57
|
+
#### Single File Processing
|
|
58
|
+
|
|
59
|
+
Process a single PDF with OCR (using Mistral’s batch API for cost
|
|
60
|
+
efficiency), heading fixes, and image descriptions:
|
|
58
61
|
|
|
59
62
|
``` python
|
|
60
63
|
from mistocr.pipeline import pdf_to_md
|
|
@@ -131,12 +134,12 @@ instead.
|
|
|
131
134
|
|
|
132
135
|
### Advanced Usage
|
|
133
136
|
|
|
134
|
-
**Batch
|
|
137
|
+
**Batch OCR for entire folders:**
|
|
135
138
|
|
|
136
139
|
``` python
|
|
137
140
|
from mistocr.core import ocr_pdf
|
|
138
141
|
|
|
139
|
-
#
|
|
142
|
+
# OCR all PDFs in a folder using Mistral's batch API
|
|
140
143
|
output_dirs = ocr_pdf('path/to/pdf_folder', dst='output_folder')
|
|
141
144
|
```
|
|
142
145
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.2"
|
|
@@ -197,8 +197,8 @@ async def describe_imgs(
|
|
|
197
197
|
imgs: list[Path], # List of image file paths to describe
|
|
198
198
|
model: str = 'claude-sonnet-4-5', # Model to use for image description
|
|
199
199
|
prompt: str = describe_img_prompt, # Prompt template for description
|
|
200
|
-
semaphore: int =
|
|
201
|
-
delay: float = 1 # Delay in seconds between requests
|
|
200
|
+
semaphore: int = 10, # Max concurrent API requests
|
|
201
|
+
delay: float = 0.1 # Delay in seconds between requests
|
|
202
202
|
) -> dict[str, dict]: # Dict mapping filename to parsed description
|
|
203
203
|
"Describe multiple images in parallel with rate limiting"
|
|
204
204
|
sem = Semaphore(semaphore)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mistocr
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: Batch OCR for PDFs with heading restoration and visual content integration
|
|
5
5
|
Home-page: https://github.com/franckalbinet/mistocr
|
|
6
6
|
Author: Solveit
|
|
@@ -38,7 +38,7 @@ Dynamic: requires-dist
|
|
|
38
38
|
Dynamic: requires-python
|
|
39
39
|
Dynamic: summary
|
|
40
40
|
|
|
41
|
-
#
|
|
41
|
+
# Mistocr
|
|
42
42
|
|
|
43
43
|
|
|
44
44
|
<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->
|
|
@@ -69,12 +69,12 @@ fundamental challenges that raw OCR output leaves unsolved:
|
|
|
69
69
|
markdown. This makes visual information searchable and accessible for
|
|
70
70
|
downstream applications.
|
|
71
71
|
|
|
72
|
-
- **Cost-efficient batch processing**:
|
|
73
|
-
batch API,
|
|
74
|
-
while eliminating the boilerplate code typically required.
|
|
72
|
+
- **Cost-efficient batch processing**: The OCR step exclusively uses
|
|
73
|
+
Mistral’s batch API, cutting costs by 50% (\$0.50 vs \$1.00 per 1000
|
|
74
|
+
pages) while eliminating the boilerplate code typically required.
|
|
75
75
|
|
|
76
|
-
**In short**:
|
|
77
|
-
|
|
76
|
+
**In short**: Complete PDF OCR with heading hierarchy fixes and image
|
|
77
|
+
descriptions for RAG and LLM pipelines.
|
|
78
78
|
|
|
79
79
|
## Get Started
|
|
80
80
|
|
|
@@ -94,7 +94,10 @@ os.environ['ANTHROPIC_API_KEY'] = 'your-key-here' # for refine features (see Ad
|
|
|
94
94
|
|
|
95
95
|
### Complete Pipeline
|
|
96
96
|
|
|
97
|
-
|
|
97
|
+
#### Single File Processing
|
|
98
|
+
|
|
99
|
+
Process a single PDF with OCR (using Mistral’s batch API for cost
|
|
100
|
+
efficiency), heading fixes, and image descriptions:
|
|
98
101
|
|
|
99
102
|
``` python
|
|
100
103
|
from mistocr.pipeline import pdf_to_md
|
|
@@ -171,12 +174,12 @@ instead.
|
|
|
171
174
|
|
|
172
175
|
### Advanced Usage
|
|
173
176
|
|
|
174
|
-
**Batch
|
|
177
|
+
**Batch OCR for entire folders:**
|
|
175
178
|
|
|
176
179
|
``` python
|
|
177
180
|
from mistocr.core import ocr_pdf
|
|
178
181
|
|
|
179
|
-
#
|
|
182
|
+
# OCR all PDFs in a folder using Mistral's batch API
|
|
180
183
|
output_dirs = ocr_pdf('path/to/pdf_folder', dst='output_folder')
|
|
181
184
|
```
|
|
182
185
|
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.2.1"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|