mistocr 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mistocr/__init__.py +1 -1
- mistocr/refine.py +35 -29
- {mistocr-0.4.0.dist-info → mistocr-0.4.1.dist-info}/METADATA +1 -13
- mistocr-0.4.1.dist-info/RECORD +11 -0
- mistocr-0.4.0.dist-info/RECORD +0 -11
- {mistocr-0.4.0.dist-info → mistocr-0.4.1.dist-info}/WHEEL +0 -0
- {mistocr-0.4.0.dist-info → mistocr-0.4.1.dist-info}/entry_points.txt +0 -0
- {mistocr-0.4.0.dist-info → mistocr-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {mistocr-0.4.0.dist-info → mistocr-0.4.1.dist-info}/top_level.txt +0 -0
mistocr/__init__.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.0"
+__version__ = "0.4.1"
mistocr/refine.py
CHANGED
@@ -3,7 +3,7 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
 
 # %% auto 0
-__all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
+__all__ = ['logger', 'prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
            'HeadingCorrection', 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes',
            'fix_hdgs', 'ImgDescription', 'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs',
            'add_descs_to_pg', 'add_descs_to_pgs', 'add_img_descs']
@@ -20,8 +20,14 @@ import os
 import json
 import shutil
 from asyncio import Semaphore, gather, sleep
+import logging
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 4
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.WARNING, format='%(name)s - %(levelname)s - %(message)s')
+logger.setLevel(logging.INFO)
+
+# %% ../nbs/01_refine.ipynb 8
 def get_hdgs(
     md:str # Markdown file string
 ) -> L: # L of strings
@@ -32,7 +38,7 @@ def get_hdgs(
 
 
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 9
 def add_pg_hdgs(
     md:str, # Markdown file string,
     n:int # Page number
@@ -42,7 +48,7 @@ def add_pg_hdgs(
     def repl(m): return m.group(0) + f' ... page {n}'
     return sub(r'^#{1,6} .+$', repl, md, flags=MULTILINE)
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 13
 def read_pgs_pg(
     path:str # Path to the markdown file
 ) -> L: # List of markdown pages
@@ -50,7 +56,7 @@ def read_pgs_pg(
     pgs = read_pgs(path, join=False)
     return L([add_pg_hdgs(pg, n) for n, pg in enumerate(pgs, 1)]).concat()
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 16
 def fmt_hdgs_idx(
     hdgs: list[str] # List of markdown headings
 ) -> str: # Formatted string with index
@@ -58,18 +64,18 @@ def fmt_hdgs_idx(
     return '\n'.join(f"{i}. {h}" for i, h in enumerate(hdgs))
 
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 19
 class HeadingCorrection(BaseModel):
     "A single heading correction mapping an index to its corrected markdown heading"
     index: int
     corrected: str
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 20
 class HeadingCorrections(BaseModel):
     "Collection of heading corrections returned by the LLM"
     corrections: list[HeadingCorrection]
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 22
 prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
 
 INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
@@ -109,7 +115,7 @@ Headings to analyze:
 {headings_list}
 """
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 24
 def fix_hdg_hierarchy(
     hdgs: list[str], # List of markdown headings
     prompt: str=None, # Prompt to use
@@ -125,7 +131,7 @@ def fix_hdg_hierarchy(
     return {o['index']: o['corrected'] for o in fixes}
 
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 27
 @delegates(fix_hdg_hierarchy)
 def mk_fixes_lut(
     hdgs: list[str], # List of markdown headings
@@ -138,7 +144,7 @@ def mk_fixes_lut(
     fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
     return {hdgs[k]:v for k,v in fixes.items()}
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 30
 def apply_hdg_fixes(
     p:str, # Page to fix
     lut_fixes: dict[str, str], # Lookup table of fixes
@@ -147,7 +153,7 @@ def apply_hdg_fixes(
     for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
     return p
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 33
 @delegates(mk_fixes_lut)
 def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:str='img', **kwargs):
     "Fix heading hierarchy in markdown document"
@@ -159,13 +165,13 @@ def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:st
     lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model, **kwargs)
     for i,p in enumerate(pgs_with_pg, 1): (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 39
 class ImgDescription(BaseModel):
     "Image classification and description for OCR'd documents"
     is_informative:bool # Whether image contains informative content (charts, diagrams, tables) vs decorative (logos, backgrounds)
     description:str # Detailed description of the image content for RAG and accessibility
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 42
 describe_img_prompt = """Analyze this image from an academic/technical document.
 
 Step 1: Determine if this image is informative for understanding the document content.
@@ -178,7 +184,7 @@ Step 2:
 
 Return your response as JSON with 'is_informative' (boolean) and 'description' (string) fields."""
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 43
 async def describe_img(
     img_path: Path, # Path to the image file
     model: str = 'claude-sonnet-4-5', # Model to use
@@ -189,7 +195,7 @@ async def describe_img(
     r = await chat([img_path.read_bytes(), prompt], response_format=ImgDescription)
     return r
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 47
 async def limit(
     semaphore, # Semaphore for concurrency control
     coro, # Coroutine to execute
@@ -201,14 +207,14 @@ async def limit(
         if delay: await sleep(delay)
         return r
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 49
 def parse_r(
     result # ModelResponse object from API call
 ): # Dictionary with 'is_informative' and 'description' keys
     "Extract and parse JSON content from model response"
     return json.loads(result.choices[0].message.content)
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 51
 async def describe_imgs(
     imgs: list[Path], # List of image file paths to describe
     model: str = 'claude-sonnet-4-5', # Model to use for image description
@@ -221,7 +227,7 @@ async def describe_imgs(
     results = await gather(*[limit(sem, describe_img(img, model, prompt), delay) for img in imgs])
     return {img.name: parse_r(r) for img, r in zip(imgs, results)}
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 53
 def save_img_descs(
     descs: dict, # Dictionary of image descriptions
     dst_fname: Path, # Path to save the JSON file
@@ -229,7 +235,7 @@ def save_img_descs(
     "Save image descriptions to JSON file"
     Path(dst_fname).write_text(json.dumps(descs, indent=2))
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 58
 def add_descs_to_pg(
     pg:str, # Page markdown content
     descs:dict # Dictionary mapping image filenames to their descriptions
@@ -240,7 +246,7 @@ def add_descs_to_pg(
         if fname in descs: pg = pg.replace(link, f"{link}\nAI-generated image description:\n___\n{descs[fname]['description']}\n___")
     return pg
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 63
 def add_descs_to_pgs(
     pgs:list, # List of page markdown strings
     descs:dict # Dictionary mapping image filenames to their descriptions
@@ -248,7 +254,7 @@ def add_descs_to_pgs(
     "Add AI-generated descriptions to images in all pages"
     return [add_descs_to_pg(pg, descs) for pg in pgs]
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 66
 async def add_img_descs(
     src:str, # Path to source markdown directory
     dst:str=None, # Destination directory (defaults to src if None)
@@ -257,7 +263,7 @@ async def add_img_descs(
     semaphore:int=2, # Max concurrent API requests
     delay:float=1, # Delay in seconds between API calls
     force:bool=False, # Force regeneration even if cache exists
-    progress:bool=True #
+    progress:bool=True # Log progress messages
 ):
     "Describe all images in markdown document and insert descriptions inline"
     src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
@@ -266,23 +272,23 @@ async def add_img_descs(
 
     # Check if image folder exists
     if not src_imgs.exists():
-        if progress:
+        if progress: logger.info(f"No images to describe in the document (no '{img_folder}' folder found)")
         return
 
     if src_imgs.exists() and dst_path != src_path: shutil.copytree(src_imgs, dst_path/img_folder, dirs_exist_ok=True)
     desc_file = src_path/'img_descriptions.json'
     if desc_file.exists() and not force:
-        if progress:
+        if progress: logger.info(f"Loading existing descriptions from {desc_file}")
         descs = json.loads(desc_file.read_text())
     else:
         imgs = (src_path/img_folder).ls(file_exts=['.jpeg', '.jpg', '.png'])
-        if progress:
+        if progress: logger.info(f"Describing {len(imgs)} images...")
         descs = await describe_imgs(imgs, model, semaphore=semaphore, delay=delay)
         save_img_descs(descs, desc_file)
-    if progress:
+    if progress: logger.info(f"Saved descriptions to {desc_file}")
     pgs = read_pgs(src_path, join=False)
-    if progress:
+    if progress: logger.info(f"Adding descriptions to {len(pgs)} pages...")
     enriched = [add_descs_to_pg(pg, descs) for pg in pgs]
     for i,pg in enumerate(enriched, 1): (dst_path/f'page_{i}.md').write_text(pg)
-    if progress:
+    if progress: logger.info(f"Done! Enriched pages saved to {dst_path}")
 
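
Note: the functional change in mistocr/refine.py is that progress reporting now goes through Python's standard logging module via a module-level logger (configured with basicConfig at import and set to INFO), rather than the behaviour it replaces. Below is a minimal consumer-side sketch of how those messages can be shown or silenced; the call to add_img_descs and its argument are illustrative placeholders, not part of the diff.

```python
import asyncio, logging
from mistocr.refine import add_img_descs  # function name taken from the diff above

# refine.py calls logging.basicConfig(level=WARNING) and sets its own logger to INFO,
# so messages such as "Describing N images..." appear by default. A caller can
# adjust verbosity per module with the standard logging API:
logging.getLogger("mistocr.refine").setLevel(logging.WARNING)  # hide progress messages
# logging.getLogger("mistocr.refine").setLevel(logging.INFO)   # show them again

# Illustrative invocation only; the source directory path is a placeholder.
# asyncio.run(add_img_descs("files/test/md_test"))
```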
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mistocr
|
|
3
|
-
Version: 0.4.
|
|
3
|
+
Version: 0.4.1
|
|
4
4
|
Summary: Batch OCR for PDFs with heading restoration and visual content integration
|
|
5
5
|
Home-page: https://github.com/franckalbinet/mistocr
|
|
6
6
|
Author: Solveit
|
|
@@ -113,18 +113,6 @@ from mistocr.pipeline import pdf_to_md
|
|
|
113
113
|
await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
|
|
114
114
|
```
|
|
115
115
|
|
|
116
|
-
Step 1/3: Running OCR on files/test/resnet.pdf...
|
|
117
|
-
Mistral batch job status: QUEUED
|
|
118
|
-
Mistral batch job status: RUNNING
|
|
119
|
-
Mistral batch job status: RUNNING
|
|
120
|
-
Step 2/3: Fixing heading hierarchy...
|
|
121
|
-
Step 3/3: Adding image descriptions...
|
|
122
|
-
Describing 7 images...
|
|
123
|
-
Saved descriptions to ocr_temp/resnet/img_descriptions.json
|
|
124
|
-
Adding descriptions to 12 pages...
|
|
125
|
-
Done! Enriched pages saved to files/test/md_test
|
|
126
|
-
Done!
|
|
127
|
-
|
|
128
116
|
This will (as indicated by the output):
|
|
129
117
|
|
|
130
118
|
1. OCR the PDF using Mistral’s batch API
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
mistocr/__init__.py,sha256=pMtTmSUht-XtbR_7Doz6bsQqopJJd8rZ8I8zy2HwwoA,22
|
|
2
|
+
mistocr/_modidx.py,sha256=LyFez7ndKOXQpF0unhg1imCrW7xcUwO106W82ttVnnk,4366
|
|
3
|
+
mistocr/core.py,sha256=-yXqEro_kTE66lXWBrewS73SRTl-Btt9uyKNxMnzjIw,9181
|
|
4
|
+
mistocr/pipeline.py,sha256=n8AHIHGZBXYy_k7LtEBZSiTL6HClT84-7K4QlTjOLvo,2107
|
|
5
|
+
mistocr/refine.py,sha256=Q14DhUUsT5FLMxP9oIJ2TGQ3qbxe7ulXfRMPKpsd4Wo,13232
|
|
6
|
+
mistocr-0.4.1.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
7
|
+
mistocr-0.4.1.dist-info/METADATA,sha256=cvASaYVhDfCJ9bzrosdmTRd5ECIAPAl84H7nN5P06zY,7992
|
|
8
|
+
mistocr-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
+
mistocr-0.4.1.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
10
|
+
mistocr-0.4.1.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
11
|
+
mistocr-0.4.1.dist-info/RECORD,,
|
mistocr-0.4.0.dist-info/RECORD
DELETED
@@ -1,11 +0,0 @@
-mistocr/__init__.py,sha256=42STGor_9nKYXumfeV5tiyD_M8VdcddX7CEexmibPBk,22
-mistocr/_modidx.py,sha256=LyFez7ndKOXQpF0unhg1imCrW7xcUwO106W82ttVnnk,4366
-mistocr/core.py,sha256=-yXqEro_kTE66lXWBrewS73SRTl-Btt9uyKNxMnzjIw,9181
-mistocr/pipeline.py,sha256=n8AHIHGZBXYy_k7LtEBZSiTL6HClT84-7K4QlTjOLvo,2107
-mistocr/refine.py,sha256=hQg4ZYRTkPFEiCNNBU2ykgXFfVLCNAp2IuwOwHBVQ2k,12983
-mistocr-0.4.0.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
-mistocr-0.4.0.dist-info/METADATA,sha256=c0LUM6UrwIIoeug8fA8H4dYvutdieBFLQ52Sho4uGgY,8438
-mistocr-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mistocr-0.4.0.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
-mistocr-0.4.0.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
-mistocr-0.4.0.dist-info/RECORD,,
{mistocr-0.4.0.dist-info → mistocr-0.4.1.dist-info}/WHEEL
File without changes
{mistocr-0.4.0.dist-info → mistocr-0.4.1.dist-info}/entry_points.txt
File without changes
{mistocr-0.4.0.dist-info → mistocr-0.4.1.dist-info}/licenses/LICENSE
File without changes
{mistocr-0.4.0.dist-info → mistocr-0.4.1.dist-info}/top_level.txt
File without changes