mistocr 0.1.5__tar.gz → 0.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mistocr-0.2.5/PKG-INFO +264 -0
- mistocr-0.2.5/README.md +224 -0
- mistocr-0.2.5/mistocr/__init__.py +1 -0
- {mistocr-0.1.5 → mistocr-0.2.5}/mistocr/_modidx.py +13 -3
- {mistocr-0.1.5 → mistocr-0.2.5}/mistocr/core.py +5 -4
- mistocr-0.2.5/mistocr/pipeline.py +37 -0
- mistocr-0.2.5/mistocr/refine.py +265 -0
- mistocr-0.2.5/mistocr.egg-info/PKG-INFO +264 -0
- {mistocr-0.1.5 → mistocr-0.2.5}/mistocr.egg-info/SOURCES.txt +1 -0
- {mistocr-0.1.5 → mistocr-0.2.5}/settings.ini +2 -2
- mistocr-0.1.5/PKG-INFO +0 -183
- mistocr-0.1.5/README.md +0 -143
- mistocr-0.1.5/mistocr/__init__.py +0 -1
- mistocr-0.1.5/mistocr/refine.py +0 -133
- mistocr-0.1.5/mistocr.egg-info/PKG-INFO +0 -183
- {mistocr-0.1.5 → mistocr-0.2.5}/LICENSE +0 -0
- {mistocr-0.1.5 → mistocr-0.2.5}/MANIFEST.in +0 -0
- {mistocr-0.1.5 → mistocr-0.2.5}/mistocr.egg-info/dependency_links.txt +0 -0
- {mistocr-0.1.5 → mistocr-0.2.5}/mistocr.egg-info/entry_points.txt +0 -0
- {mistocr-0.1.5 → mistocr-0.2.5}/mistocr.egg-info/not-zip-safe +0 -0
- {mistocr-0.1.5 → mistocr-0.2.5}/mistocr.egg-info/requires.txt +0 -0
- {mistocr-0.1.5 → mistocr-0.2.5}/mistocr.egg-info/top_level.txt +0 -0
- {mistocr-0.1.5 → mistocr-0.2.5}/pyproject.toml +0 -0
- {mistocr-0.1.5 → mistocr-0.2.5}/setup.cfg +0 -0
- {mistocr-0.1.5 → mistocr-0.2.5}/setup.py +0 -0
mistocr-0.2.5/PKG-INFO
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mistocr
|
|
3
|
+
Version: 0.2.5
|
|
4
|
+
Summary: Batch OCR for PDFs with heading restoration and visual content integration
|
|
5
|
+
Home-page: https://github.com/franckalbinet/mistocr
|
|
6
|
+
Author: Solveit
|
|
7
|
+
Author-email: nobody@fast.ai
|
|
8
|
+
License: Apache Software License 2.0
|
|
9
|
+
Keywords: nbdev jupyter notebook python
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Natural Language :: English
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
18
|
+
Requires-Python: >=3.9
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: fastcore
|
|
22
|
+
Requires-Dist: mistralai
|
|
23
|
+
Requires-Dist: pillow
|
|
24
|
+
Requires-Dist: dotenv
|
|
25
|
+
Requires-Dist: lisette
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Dynamic: author
|
|
28
|
+
Dynamic: author-email
|
|
29
|
+
Dynamic: classifier
|
|
30
|
+
Dynamic: description
|
|
31
|
+
Dynamic: description-content-type
|
|
32
|
+
Dynamic: home-page
|
|
33
|
+
Dynamic: keywords
|
|
34
|
+
Dynamic: license
|
|
35
|
+
Dynamic: license-file
|
|
36
|
+
Dynamic: provides-extra
|
|
37
|
+
Dynamic: requires-dist
|
|
38
|
+
Dynamic: requires-python
|
|
39
|
+
Dynamic: summary
|
|
40
|
+
|
|
41
|
+
# Mistocr
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->
|
|
45
|
+
|
|
46
|
+
**PDF OCR is a critical bottleneck in AI pipelines.** It’s often
|
|
47
|
+
mentioned in passing, as if it’s a trivial step. Practice shows it’s far
|
|
48
|
+
from it. Poorly converted PDFs mean garbage-in-garbage-out for
|
|
49
|
+
downstream AI systems (RAG, …).
|
|
50
|
+
|
|
51
|
+
When [Mistral AI](https://mistral.ai) released their [state-of-the-art
|
|
52
|
+
OCR model](https://mistral.ai/fr/news/mistral-ocr) in March 2025, it
|
|
53
|
+
opened new possibilities for large-scale document processing. While
|
|
54
|
+
alternatives like [datalab.to](https://www.datalab.to) and
|
|
55
|
+
[docling.ai](https://www.docling.ai) offer viable solutions, Mistral OCR
|
|
56
|
+
delivers exceptional accuracy at a compelling price point.
|
|
57
|
+
|
|
58
|
+
**mistocr** emerged from months of real-world usage across projects
|
|
59
|
+
requiring large-scale processing of niche-domain PDFs. It addresses three
|
|
60
|
+
fundamental challenges that raw OCR output leaves unsolved:
|
|
61
|
+
|
|
62
|
+
- **Heading hierarchy restoration**: Even state-of-the-art OCR sometimes
|
|
63
|
+
produces inconsistent heading levels in large documents—a complex task
|
|
64
|
+
to get right. mistocr uses LLM-based analysis to restore proper
|
|
65
|
+
document structure, essential for downstream AI tasks.
|
|
66
|
+
|
|
67
|
+
- **Visual content integration**: Charts, figures and diagrams are
|
|
68
|
+
automatically classified and described, then integrated into the
|
|
69
|
+
markdown. This makes visual information searchable and accessible for
|
|
70
|
+
downstream applications.
|
|
71
|
+
|
|
72
|
+
- **Cost-efficient batch processing**: The OCR step exclusively uses
|
|
73
|
+
Mistral’s batch API, cutting costs by 50% (\$0.50 vs \$1.00 per 1000
|
|
74
|
+
pages) while eliminating the boilerplate code typically required.
|
|
75
|
+
|
|
76
|
+
**In short**: Complete PDF OCR with heading hierarchy fixes and image
|
|
77
|
+
descriptions for RAG and LLM pipelines.
|
|
78
|
+
|
|
79
|
+
> [!NOTE]
|
|
80
|
+
>
|
|
81
|
+
> **Want to see mistocr in action?** This
|
|
82
|
+
> [tutorial](https://share.solve.it.com/d/97f75412ca949af76a5945b4dfc443c7)
|
|
83
|
+
> demonstrates real-world PDF processing and shows how clean markdown
|
|
84
|
+
> enables structure-aware navigation through long documents—letting you
|
|
85
|
+
> find exactly what you need, fast.
|
|
86
|
+
|
|
87
|
+
## Get Started
|
|
88
|
+
|
|
89
|
+
Install latest from [pypi](https://pypi.org/project/mistocr), then:
|
|
90
|
+
|
|
91
|
+
``` sh
|
|
92
|
+
$ pip install mistocr
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Set your API keys:
|
|
96
|
+
|
|
97
|
+
``` python
|
|
98
|
+
import os
|
|
99
|
+
os.environ['MISTRAL_API_KEY'] = 'your-key-here'
|
|
100
|
+
os.environ['ANTHROPIC_API_KEY'] = 'your-key-here' # for refine features (see Advanced Usage for other LLMs)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Complete Pipeline
|
|
104
|
+
|
|
105
|
+
#### Single File Processing
|
|
106
|
+
|
|
107
|
+
Process a single PDF with OCR (using Mistral’s batch API for cost
|
|
108
|
+
efficiency), heading fixes, and image descriptions:
|
|
109
|
+
|
|
110
|
+
``` python
|
|
111
|
+
from mistocr.pipeline import pdf_to_md
|
|
112
|
+
await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Step 1/3: Running OCR on files/test/resnet.pdf...
|
|
116
|
+
Mistral batch job status: QUEUED
|
|
117
|
+
Mistral batch job status: RUNNING
|
|
118
|
+
Mistral batch job status: RUNNING
|
|
119
|
+
Step 2/3: Fixing heading hierarchy...
|
|
120
|
+
Step 3/3: Adding image descriptions...
|
|
121
|
+
Describing 7 images...
|
|
122
|
+
Saved descriptions to ocr_temp/resnet/img_descriptions.json
|
|
123
|
+
Adding descriptions to 12 pages...
|
|
124
|
+
Done! Enriched pages saved to files/test/md_test
|
|
125
|
+
Done!
|
|
126
|
+
|
|
127
|
+
This will (as indicated by the output):
|
|
128
|
+
|
|
129
|
+
1. OCR the PDF using Mistral’s batch API
|
|
130
|
+
2. Fix heading hierarchy inconsistencies
|
|
131
|
+
3. Describe images (charts, diagrams) and add those descriptions into
|
|
132
|
+
the markdown, then save everything to `files/test/md_test`
|
|
133
|
+
|
|
134
|
+
The output structure will be:
|
|
135
|
+
|
|
136
|
+
files/test/md_test/
|
|
137
|
+
├── img/
|
|
138
|
+
│ ├── img-0.jpeg
|
|
139
|
+
│ ├── img-1.jpeg
|
|
140
|
+
│ └── ...
|
|
141
|
+
├── page_1.md
|
|
142
|
+
├── page_2.md
|
|
143
|
+
└── ...
|
|
144
|
+
|
|
145
|
+
Each page’s markdown will include inline image descriptions:
|
|
146
|
+
|
|
147
|
+
```` markdown
|
|
148
|
+
```markdown
|
|
149
|
+

|
|
150
|
+
AI-generated image description:
|
|
151
|
+
___
|
|
152
|
+
A residual learning block...
|
|
153
|
+
___
|
|
154
|
+
```
|
|
155
|
+
````
|
|
156
|
+
|
|
157
|
+
To print the processed markdown, you can use the
|
|
158
|
+
[`read_pgs`](https://franckalbinet.github.io/mistocr/core.html#read_pgs)
|
|
159
|
+
function. Here’s how:
|
|
160
|
+
|
|
161
|
+
Then to read the fully processed document:
|
|
162
|
+
|
|
163
|
+
``` python
|
|
164
|
+
from mistocr.pipeline import read_pgs
|
|
165
|
+
md = read_pgs('files/test/md_test')
|
|
166
|
+
print(md[:500])
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
# Deep Residual Learning for Image Recognition ... page 1
|
|
170
|
+
|
|
171
|
+
Kaiming He Xiangyu Zhang Shaoqing Ren Jian Sun<br>Microsoft Research<br>\{kahe, v-xiangz, v-shren, jiansun\}@microsoft.com
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
## Abstract ... page 1
|
|
175
|
+
|
|
176
|
+
Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, ins
|
|
177
|
+
|
|
178
|
+
By default,
|
|
179
|
+
[`read_pgs()`](https://franckalbinet.github.io/mistocr/core.html#read_pgs)
|
|
180
|
+
joins all pages. Pass `join=False` to get a list of individual pages
|
|
181
|
+
instead.
|
|
182
|
+
|
|
183
|
+
### Advanced Usage
|
|
184
|
+
|
|
185
|
+
**Batch OCR for entire folders:**
|
|
186
|
+
|
|
187
|
+
``` python
|
|
188
|
+
from mistocr.core import ocr_pdf
|
|
189
|
+
|
|
190
|
+
# OCR all PDFs in a folder using Mistral's batch API
|
|
191
|
+
output_dirs = ocr_pdf('path/to/pdf_folder', dst='output_folder')
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
**Custom models and prompts for heading fixes:**
|
|
195
|
+
|
|
196
|
+
``` python
|
|
197
|
+
from mistocr.refine import fix_hdgs
|
|
198
|
+
|
|
199
|
+
# Use a different model or custom prompt
|
|
200
|
+
fix_hdgs('ocr_output/doc1',
|
|
201
|
+
model='gpt-4o',
|
|
202
|
+
prompt=your_custom_prompt)
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
**Custom image description with rate limiting:**
|
|
206
|
+
|
|
207
|
+
``` python
|
|
208
|
+
from mistocr.refine import add_img_descs
|
|
209
|
+
|
|
210
|
+
# Control API usage and customize descriptions
|
|
211
|
+
await add_img_descs('ocr_output/doc1',
|
|
212
|
+
model='claude-opus-4',
|
|
213
|
+
semaphore=5, # More concurrent requests
|
|
214
|
+
delay=0.5) # Shorter delay between calls
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
For complete control over each pipeline step, see the
|
|
218
|
+
[core](https://fr.anckalbi.net/mistocr/core.html),
|
|
219
|
+
[refine](https://fr.anckalbi.net/mistocr/refine.html), and
|
|
220
|
+
[pipeline](https://fr.anckalbi.net/mistocr/pipeline.html) module
|
|
221
|
+
documentation.
|
|
222
|
+
|
|
223
|
+
## Known Limitations & Future Work
|
|
224
|
+
|
|
225
|
+
`mistocr` is under active development. Current limitations include:
|
|
226
|
+
|
|
227
|
+
- **No timeout on batch jobs**: Jobs poll indefinitely until completion.
|
|
228
|
+
If a job stalls, manual intervention is required.
|
|
229
|
+
- **Limited error handling**: When batch jobs fail, error reporting and
|
|
230
|
+
recovery options are minimal.
|
|
231
|
+
- **Progress monitoring**: Currently limited to periodic status prints.
|
|
232
|
+
Future versions will support callbacks or streaming updates for better
|
|
233
|
+
real-time monitoring.
|
|
234
|
+
|
|
235
|
+
Contributions are welcome! If you encounter issues or have ideas for
|
|
236
|
+
improvements, please open an issue or discussion on
|
|
237
|
+
[GitHub](https://github.com/franckalbinet/mistocr).
|
|
238
|
+
|
|
239
|
+
## Developer Guide
|
|
240
|
+
|
|
241
|
+
If you are new to using `nbdev` here are some useful pointers to get you
|
|
242
|
+
started.
|
|
243
|
+
|
|
244
|
+
### Install mistocr in Development mode
|
|
245
|
+
|
|
246
|
+
``` sh
|
|
247
|
+
# make sure mistocr package is installed in development mode
|
|
248
|
+
$ pip install -e .
|
|
249
|
+
|
|
250
|
+
# make changes under nbs/ directory
|
|
251
|
+
# ...
|
|
252
|
+
|
|
253
|
+
# compile to have changes apply to mistocr
|
|
254
|
+
$ nbdev_prepare
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
### Documentation
|
|
258
|
+
|
|
259
|
+
Documentation can be found hosted on this GitHub
|
|
260
|
+
[repository](https://github.com/franckalbinet/mistocr)’s
|
|
261
|
+
[pages](https://franckalbinet.github.io/mistocr/). Additionally you can
|
|
262
|
+
find package manager specific guidelines on
|
|
263
|
+
[conda](https://anaconda.org/franckalbinet/mistocr) and
|
|
264
|
+
[pypi](https://pypi.org/project/mistocr/) respectively.
|
mistocr-0.2.5/README.md
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
# Mistocr
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->
|
|
5
|
+
|
|
6
|
+
**PDF OCR is a critical bottleneck in AI pipelines.** It’s often
|
|
7
|
+
mentioned in passing, as if it’s a trivial step. Practice shows it’s far
|
|
8
|
+
from it. Poorly converted PDFs mean garbage-in-garbage-out for
|
|
9
|
+
downstream AI systems (RAG, …).
|
|
10
|
+
|
|
11
|
+
When [Mistral AI](https://mistral.ai) released their [state-of-the-art
|
|
12
|
+
OCR model](https://mistral.ai/fr/news/mistral-ocr) in March 2025, it
|
|
13
|
+
opened new possibilities for large-scale document processing. While
|
|
14
|
+
alternatives like [datalab.to](https://www.datalab.to) and
|
|
15
|
+
[docling.ai](https://www.docling.ai) offer viable solutions, Mistral OCR
|
|
16
|
+
delivers exceptional accuracy at a compelling price point.
|
|
17
|
+
|
|
18
|
+
**mistocr** emerged from months of real-world usage across projects
|
|
19
|
+
requiring large-scale processing of niche-domain PDFs. It addresses three
|
|
20
|
+
fundamental challenges that raw OCR output leaves unsolved:
|
|
21
|
+
|
|
22
|
+
- **Heading hierarchy restoration**: Even state-of-the-art OCR sometimes
|
|
23
|
+
produces inconsistent heading levels in large documents—a complex task
|
|
24
|
+
to get right. mistocr uses LLM-based analysis to restore proper
|
|
25
|
+
document structure, essential for downstream AI tasks.
|
|
26
|
+
|
|
27
|
+
- **Visual content integration**: Charts, figures and diagrams are
|
|
28
|
+
automatically classified and described, then integrated into the
|
|
29
|
+
markdown. This makes visual information searchable and accessible for
|
|
30
|
+
downstream applications.
|
|
31
|
+
|
|
32
|
+
- **Cost-efficient batch processing**: The OCR step exclusively uses
|
|
33
|
+
Mistral’s batch API, cutting costs by 50% (\$0.50 vs \$1.00 per 1000
|
|
34
|
+
pages) while eliminating the boilerplate code typically required.
|
|
35
|
+
|
|
36
|
+
**In short**: Complete PDF OCR with heading hierarchy fixes and image
|
|
37
|
+
descriptions for RAG and LLM pipelines.
|
|
38
|
+
|
|
39
|
+
> [!NOTE]
|
|
40
|
+
>
|
|
41
|
+
> **Want to see mistocr in action?** This
|
|
42
|
+
> [tutorial](https://share.solve.it.com/d/97f75412ca949af76a5945b4dfc443c7)
|
|
43
|
+
> demonstrates real-world PDF processing and shows how clean markdown
|
|
44
|
+
> enables structure-aware navigation through long documents—letting you
|
|
45
|
+
> find exactly what you need, fast.
|
|
46
|
+
|
|
47
|
+
## Get Started
|
|
48
|
+
|
|
49
|
+
Install latest from [pypi](https://pypi.org/project/mistocr), then:
|
|
50
|
+
|
|
51
|
+
``` sh
|
|
52
|
+
$ pip install mistocr
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Set your API keys:
|
|
56
|
+
|
|
57
|
+
``` python
|
|
58
|
+
import os
|
|
59
|
+
os.environ['MISTRAL_API_KEY'] = 'your-key-here'
|
|
60
|
+
os.environ['ANTHROPIC_API_KEY'] = 'your-key-here' # for refine features (see Advanced Usage for other LLMs)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Complete Pipeline
|
|
64
|
+
|
|
65
|
+
#### Single File Processing
|
|
66
|
+
|
|
67
|
+
Process a single PDF with OCR (using Mistral’s batch API for cost
|
|
68
|
+
efficiency), heading fixes, and image descriptions:
|
|
69
|
+
|
|
70
|
+
``` python
|
|
71
|
+
from mistocr.pipeline import pdf_to_md
|
|
72
|
+
await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Step 1/3: Running OCR on files/test/resnet.pdf...
|
|
76
|
+
Mistral batch job status: QUEUED
|
|
77
|
+
Mistral batch job status: RUNNING
|
|
78
|
+
Mistral batch job status: RUNNING
|
|
79
|
+
Step 2/3: Fixing heading hierarchy...
|
|
80
|
+
Step 3/3: Adding image descriptions...
|
|
81
|
+
Describing 7 images...
|
|
82
|
+
Saved descriptions to ocr_temp/resnet/img_descriptions.json
|
|
83
|
+
Adding descriptions to 12 pages...
|
|
84
|
+
Done! Enriched pages saved to files/test/md_test
|
|
85
|
+
Done!
|
|
86
|
+
|
|
87
|
+
This will (as indicated by the output):
|
|
88
|
+
|
|
89
|
+
1. OCR the PDF using Mistral’s batch API
|
|
90
|
+
2. Fix heading hierarchy inconsistencies
|
|
91
|
+
3. Describe images (charts, diagrams) and add those descriptions into
|
|
92
|
+
the markdown, then save everything to `files/test/md_test`
|
|
93
|
+
|
|
94
|
+
The output structure will be:
|
|
95
|
+
|
|
96
|
+
files/test/md_test/
|
|
97
|
+
├── img/
|
|
98
|
+
│ ├── img-0.jpeg
|
|
99
|
+
│ ├── img-1.jpeg
|
|
100
|
+
│ └── ...
|
|
101
|
+
├── page_1.md
|
|
102
|
+
├── page_2.md
|
|
103
|
+
└── ...
|
|
104
|
+
|
|
105
|
+
Each page’s markdown will include inline image descriptions:
|
|
106
|
+
|
|
107
|
+
```` markdown
|
|
108
|
+
```markdown
|
|
109
|
+

|
|
110
|
+
AI-generated image description:
|
|
111
|
+
___
|
|
112
|
+
A residual learning block...
|
|
113
|
+
___
|
|
114
|
+
```
|
|
115
|
+
````
|
|
116
|
+
|
|
117
|
+
To print the processed markdown, you can use the
|
|
118
|
+
[`read_pgs`](https://franckalbinet.github.io/mistocr/core.html#read_pgs)
|
|
119
|
+
function. Here’s how:
|
|
120
|
+
|
|
121
|
+
Then to read the fully processed document:
|
|
122
|
+
|
|
123
|
+
``` python
|
|
124
|
+
from mistocr.pipeline import read_pgs
|
|
125
|
+
md = read_pgs('files/test/md_test')
|
|
126
|
+
print(md[:500])
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
# Deep Residual Learning for Image Recognition ... page 1
|
|
130
|
+
|
|
131
|
+
Kaiming He Xiangyu Zhang Shaoqing Ren Jian Sun<br>Microsoft Research<br>\{kahe, v-xiangz, v-shren, jiansun\}@microsoft.com
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
## Abstract ... page 1
|
|
135
|
+
|
|
136
|
+
Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, ins
|
|
137
|
+
|
|
138
|
+
By default,
|
|
139
|
+
[`read_pgs()`](https://franckalbinet.github.io/mistocr/core.html#read_pgs)
|
|
140
|
+
joins all pages. Pass `join=False` to get a list of individual pages
|
|
141
|
+
instead.
|
|
142
|
+
|
|
143
|
+
### Advanced Usage
|
|
144
|
+
|
|
145
|
+
**Batch OCR for entire folders:**
|
|
146
|
+
|
|
147
|
+
``` python
|
|
148
|
+
from mistocr.core import ocr_pdf
|
|
149
|
+
|
|
150
|
+
# OCR all PDFs in a folder using Mistral's batch API
|
|
151
|
+
output_dirs = ocr_pdf('path/to/pdf_folder', dst='output_folder')
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
**Custom models and prompts for heading fixes:**
|
|
155
|
+
|
|
156
|
+
``` python
|
|
157
|
+
from mistocr.refine import fix_hdgs
|
|
158
|
+
|
|
159
|
+
# Use a different model or custom prompt
|
|
160
|
+
fix_hdgs('ocr_output/doc1',
|
|
161
|
+
model='gpt-4o',
|
|
162
|
+
prompt=your_custom_prompt)
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
**Custom image description with rate limiting:**
|
|
166
|
+
|
|
167
|
+
``` python
|
|
168
|
+
from mistocr.refine import add_img_descs
|
|
169
|
+
|
|
170
|
+
# Control API usage and customize descriptions
|
|
171
|
+
await add_img_descs('ocr_output/doc1',
|
|
172
|
+
model='claude-opus-4',
|
|
173
|
+
semaphore=5, # More concurrent requests
|
|
174
|
+
delay=0.5) # Shorter delay between calls
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
For complete control over each pipeline step, see the
|
|
178
|
+
[core](https://fr.anckalbi.net/mistocr/core.html),
|
|
179
|
+
[refine](https://fr.anckalbi.net/mistocr/refine.html), and
|
|
180
|
+
[pipeline](https://fr.anckalbi.net/mistocr/pipeline.html) module
|
|
181
|
+
documentation.
|
|
182
|
+
|
|
183
|
+
## Known Limitations & Future Work
|
|
184
|
+
|
|
185
|
+
`mistocr` is under active development. Current limitations include:
|
|
186
|
+
|
|
187
|
+
- **No timeout on batch jobs**: Jobs poll indefinitely until completion.
|
|
188
|
+
If a job stalls, manual intervention is required.
|
|
189
|
+
- **Limited error handling**: When batch jobs fail, error reporting and
|
|
190
|
+
recovery options are minimal.
|
|
191
|
+
- **Progress monitoring**: Currently limited to periodic status prints.
|
|
192
|
+
Future versions will support callbacks or streaming updates for better
|
|
193
|
+
real-time monitoring.
|
|
194
|
+
|
|
195
|
+
Contributions are welcome! If you encounter issues or have ideas for
|
|
196
|
+
improvements, please open an issue or discussion on
|
|
197
|
+
[GitHub](https://github.com/franckalbinet/mistocr).
|
|
198
|
+
|
|
199
|
+
## Developer Guide
|
|
200
|
+
|
|
201
|
+
If you are new to using `nbdev` here are some useful pointers to get you
|
|
202
|
+
started.
|
|
203
|
+
|
|
204
|
+
### Install mistocr in Development mode
|
|
205
|
+
|
|
206
|
+
``` sh
|
|
207
|
+
# make sure mistocr package is installed in development mode
|
|
208
|
+
$ pip install -e .
|
|
209
|
+
|
|
210
|
+
# make changes under nbs/ directory
|
|
211
|
+
# ...
|
|
212
|
+
|
|
213
|
+
# compile to have changes apply to mistocr
|
|
214
|
+
$ nbdev_prepare
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
### Documentation
|
|
218
|
+
|
|
219
|
+
Documentation can be found hosted on this GitHub
|
|
220
|
+
[repository](https://github.com/franckalbinet/mistocr)’s
|
|
221
|
+
[pages](https://franckalbinet.github.io/mistocr/). Additionally you can
|
|
222
|
+
find package manager specific guidelines on
|
|
223
|
+
[conda](https://anaconda.org/franckalbinet/mistocr) and
|
|
224
|
+
[pypi](https://pypi.org/project/mistocr/) respectively.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.5"
|
|
@@ -11,7 +11,7 @@ d = { 'settings': { 'branch': 'main',
|
|
|
11
11
|
'mistocr.core.create_batch_entry': ('core.html#create_batch_entry', 'mistocr/core.py'),
|
|
12
12
|
'mistocr.core.download_results': ('core.html#download_results', 'mistocr/core.py'),
|
|
13
13
|
'mistocr.core.get_api_key': ('core.html#get_api_key', 'mistocr/core.py'),
|
|
14
|
-
'mistocr.core.
|
|
14
|
+
'mistocr.core.ocr_pdf': ('core.html#ocr_pdf', 'mistocr/core.py'),
|
|
15
15
|
'mistocr.core.prep_pdf_batch': ('core.html#prep_pdf_batch', 'mistocr/core.py'),
|
|
16
16
|
'mistocr.core.read_pgs': ('core.html#read_pgs', 'mistocr/core.py'),
|
|
17
17
|
'mistocr.core.save_images': ('core.html#save_images', 'mistocr/core.py'),
|
|
@@ -20,12 +20,22 @@ d = { 'settings': { 'branch': 'main',
|
|
|
20
20
|
'mistocr.core.submit_batch': ('core.html#submit_batch', 'mistocr/core.py'),
|
|
21
21
|
'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
|
|
22
22
|
'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
|
|
23
|
+
'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
|
|
23
24
|
'mistocr.refine': { 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
|
|
25
|
+
'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
|
|
26
|
+
'mistocr.refine.add_descs_to_pg': ('refine.html#add_descs_to_pg', 'mistocr/refine.py'),
|
|
27
|
+
'mistocr.refine.add_descs_to_pgs': ('refine.html#add_descs_to_pgs', 'mistocr/refine.py'),
|
|
28
|
+
'mistocr.refine.add_img_descs': ('refine.html#add_img_descs', 'mistocr/refine.py'),
|
|
24
29
|
'mistocr.refine.add_pg_hdgs': ('refine.html#add_pg_hdgs', 'mistocr/refine.py'),
|
|
25
30
|
'mistocr.refine.apply_hdg_fixes': ('refine.html#apply_hdg_fixes', 'mistocr/refine.py'),
|
|
31
|
+
'mistocr.refine.describe_img': ('refine.html#describe_img', 'mistocr/refine.py'),
|
|
32
|
+
'mistocr.refine.describe_imgs': ('refine.html#describe_imgs', 'mistocr/refine.py'),
|
|
26
33
|
'mistocr.refine.fix_hdg_hierarchy': ('refine.html#fix_hdg_hierarchy', 'mistocr/refine.py'),
|
|
27
|
-
'mistocr.refine.
|
|
34
|
+
'mistocr.refine.fix_hdgs': ('refine.html#fix_hdgs', 'mistocr/refine.py'),
|
|
28
35
|
'mistocr.refine.fmt_hdgs_idx': ('refine.html#fmt_hdgs_idx', 'mistocr/refine.py'),
|
|
29
36
|
'mistocr.refine.get_hdgs': ('refine.html#get_hdgs', 'mistocr/refine.py'),
|
|
37
|
+
'mistocr.refine.limit': ('refine.html#limit', 'mistocr/refine.py'),
|
|
30
38
|
'mistocr.refine.mk_fixes_lut': ('refine.html#mk_fixes_lut', 'mistocr/refine.py'),
|
|
31
|
-
'mistocr.refine.
|
|
39
|
+
'mistocr.refine.parse_r': ('refine.html#parse_r', 'mistocr/refine.py'),
|
|
40
|
+
'mistocr.refine.read_pgs_pg': ('refine.html#read_pgs_pg', 'mistocr/refine.py'),
|
|
41
|
+
'mistocr.refine.save_img_descs': ('refine.html#save_img_descs', 'mistocr/refine.py')}}}
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
# %% auto 0
|
|
6
6
|
__all__ = ['ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch', 'submit_batch',
|
|
7
|
-
'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', '
|
|
7
|
+
'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf', 'read_pgs']
|
|
8
8
|
|
|
9
9
|
# %% ../nbs/00_core.ipynb 3
|
|
10
10
|
from fastcore.all import *
|
|
@@ -79,10 +79,11 @@ def submit_batch(
|
|
|
79
79
|
def wait_for_job(
|
|
80
80
|
job:dict, # Job dict,
|
|
81
81
|
c:Mistral=None, # Mistral client,
|
|
82
|
-
poll_interval:int=
|
|
82
|
+
poll_interval:int=1 # Poll interval in seconds
|
|
83
83
|
) -> dict: # Job dict (with status)
|
|
84
84
|
"Poll job until completion and return final job status"
|
|
85
85
|
while job.status in ["QUEUED", "RUNNING"]:
|
|
86
|
+
print(f'Mistral batch job status: {job.status}')
|
|
86
87
|
time.sleep(poll_interval)
|
|
87
88
|
job = c.batch.jobs.get(job_id=job.id)
|
|
88
89
|
return job
|
|
@@ -161,7 +162,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
|
|
|
161
162
|
return download_results(job, c)
|
|
162
163
|
|
|
163
164
|
# %% ../nbs/00_core.ipynb 43
|
|
164
|
-
def
|
|
165
|
+
def ocr_pdf(
|
|
165
166
|
path:str, # Path to PDF file or folder,
|
|
166
167
|
dst:str='md', # Directory to save markdown pages,
|
|
167
168
|
inc_img:bool=True, # Include image in response,
|
|
@@ -174,7 +175,7 @@ def ocr(
|
|
|
174
175
|
results = _run_batch(entries, c, poll_interval)
|
|
175
176
|
return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
|
|
176
177
|
|
|
177
|
-
# %% ../nbs/00_core.ipynb
|
|
178
|
+
# %% ../nbs/00_core.ipynb 47
|
|
178
179
|
def read_pgs(
|
|
179
180
|
path:str, # OCR output directory,
|
|
180
181
|
join:bool=True # Join pages into single string
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""End-to-End Pipeline: PDF OCR, Markdown Heading Correction, and AI Image Descriptions"""
|
|
2
|
+
|
|
3
|
+
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
|
|
4
|
+
|
|
5
|
+
# %% auto 0
|
|
6
|
+
__all__ = ['pdf_to_md']
|
|
7
|
+
|
|
8
|
+
# %% ../nbs/02_pipeline.ipynb 3
|
|
9
|
+
from fastcore.all import *
|
|
10
|
+
from .core import read_pgs, ocr_pdf
|
|
11
|
+
from .refine import add_img_descs, fix_hdgs
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from asyncio import Semaphore, gather, sleep
|
|
14
|
+
import os, json, shutil
|
|
15
|
+
|
|
16
|
+
# %% ../nbs/02_pipeline.ipynb 4
|
|
17
|
+
@delegates(add_img_descs)
|
|
18
|
+
async def pdf_to_md(
|
|
19
|
+
pdf_path:str, # Path to input PDF file
|
|
20
|
+
dst:str, # Destination directory for output markdown
|
|
21
|
+
ocr_output:str=None, # Optional OCR output directory (defaults to 'ocr_temp')
|
|
22
|
+
model:str='claude-sonnet-4-5', # Model to use for heading fixes and image descriptions
|
|
23
|
+
add_img_desc:bool=True, # Whether to add image descriptions
|
|
24
|
+
progress:bool=True, # Whether to show progress messages
|
|
25
|
+
**kwargs):
|
|
26
|
+
"Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
|
|
27
|
+
n_steps = 3 if add_img_desc else 2
|
|
28
|
+
if progress: print(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
|
|
29
|
+
ocr_dirs = ocr_pdf(pdf_path, ocr_output or 'ocr_temp')
|
|
30
|
+
ocr_dir = ocr_dirs[0]
|
|
31
|
+
if progress: print(f"Step 2/{n_steps}: Fixing heading hierarchy...")
|
|
32
|
+
fix_hdgs(ocr_dir, model=model)
|
|
33
|
+
if add_img_desc:
|
|
34
|
+
if progress: print(f"Step 3/{n_steps}: Adding image descriptions...")
|
|
35
|
+
await add_img_descs(ocr_dir, dst=dst, model=model, progress=progress, **kwargs)
|
|
36
|
+
elif dst and Path(dst) != ocr_dir: shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
|
|
37
|
+
if progress: print("Done!")
|