mistocr 0.2.9__py3-none-any.whl → 0.2.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mistocr/__init__.py +1 -1
- mistocr/_modidx.py +0 -1
- mistocr/core.py +7 -1
- mistocr/refine.py +19 -21
- {mistocr-0.2.9.dist-info → mistocr-0.2.11.dist-info}/METADATA +1 -1
- mistocr-0.2.11.dist-info/RECORD +11 -0
- mistocr-0.2.9.dist-info/RECORD +0 -11
- {mistocr-0.2.9.dist-info → mistocr-0.2.11.dist-info}/WHEEL +0 -0
- {mistocr-0.2.9.dist-info → mistocr-0.2.11.dist-info}/entry_points.txt +0 -0
- {mistocr-0.2.9.dist-info → mistocr-0.2.11.dist-info}/licenses/LICENSE +0 -0
- {mistocr-0.2.9.dist-info → mistocr-0.2.11.dist-info}/top_level.txt +0 -0
mistocr/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.2.9"
|
|
1
|
+
__version__ = "0.2.11"
|
mistocr/_modidx.py
CHANGED
|
@@ -22,7 +22,6 @@ d = { 'settings': { 'branch': 'main',
|
|
|
22
22
|
'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
|
|
23
23
|
'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
|
|
24
24
|
'mistocr.refine': { 'mistocr.refine.HeadingCorrection': ('refine.html#headingcorrection', 'mistocr/refine.py'),
|
|
25
|
-
'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
|
|
26
25
|
'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
|
|
27
26
|
'mistocr.refine.add_descs_to_pg': ('refine.html#add_descs_to_pg', 'mistocr/refine.py'),
|
|
28
27
|
'mistocr.refine.add_descs_to_pgs': ('refine.html#add_descs_to_pgs', 'mistocr/refine.py'),
|
mistocr/core.py
CHANGED
|
@@ -79,11 +79,17 @@ def submit_batch(
|
|
|
79
79
|
def wait_for_job(
|
|
80
80
|
job:dict, # Job dict,
|
|
81
81
|
c:Mistral=None, # Mistral client,
|
|
82
|
-
poll_interval:int=1 # Poll interval in seconds
|
|
82
|
+
poll_interval:int=1, # Poll interval in seconds
|
|
83
|
+
queued_timeout:int=300 # Timeout for QUEUED status in seconds
|
|
83
84
|
) -> dict: # Job dict (with status)
|
|
84
85
|
"Poll job until completion and return final job status"
|
|
86
|
+
queued_time = 0
|
|
85
87
|
while job.status in ["QUEUED", "RUNNING"]:
|
|
86
88
|
print(f'Mistral batch job status: {job.status}')
|
|
89
|
+
if job.status == "QUEUED":
|
|
90
|
+
queued_time += poll_interval
|
|
91
|
+
if queued_time >= queued_timeout:
|
|
92
|
+
raise TimeoutError(f"Job stayed in QUEUED status for {queued_time}s, exceeding timeout of {queued_timeout}s. Check your balance or Mistral Status.")
|
|
87
93
|
time.sleep(poll_interval)
|
|
88
94
|
job = c.batch.jobs.get(job_id=job.id)
|
|
89
95
|
return job
|
mistocr/refine.py
CHANGED
|
@@ -4,9 +4,9 @@
|
|
|
4
4
|
|
|
5
5
|
# %% auto 0
|
|
6
6
|
__all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
|
|
7
|
-
'HeadingCorrection', '
|
|
8
|
-
'
|
|
9
|
-
'
|
|
7
|
+
'HeadingCorrection', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
|
|
8
|
+
'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs', 'add_descs_to_pg', 'add_descs_to_pgs',
|
|
9
|
+
'add_img_descs']
|
|
10
10
|
|
|
11
11
|
# %% ../nbs/01_refine.ipynb 3
|
|
12
12
|
from fastcore.all import *
|
|
@@ -60,13 +60,11 @@ def fmt_hdgs_idx(
|
|
|
60
60
|
|
|
61
61
|
# %% ../nbs/01_refine.ipynb 18
|
|
62
62
|
class HeadingCorrection(BaseModel):
|
|
63
|
+
"A single heading correction mapping an index to its corrected markdown heading"
|
|
63
64
|
index: int
|
|
64
65
|
corrected: str
|
|
65
66
|
|
|
66
|
-
|
|
67
|
-
corrections: list[HeadingCorrection]
|
|
68
|
-
|
|
69
|
-
# %% ../nbs/01_refine.ipynb 20
|
|
67
|
+
# %% ../nbs/01_refine.ipynb 21
|
|
70
68
|
prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
|
|
71
69
|
|
|
72
70
|
INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
|
|
@@ -106,7 +104,7 @@ Headings to analyze:
|
|
|
106
104
|
{headings_list}
|
|
107
105
|
"""
|
|
108
106
|
|
|
109
|
-
# %% ../nbs/01_refine.ipynb
|
|
107
|
+
# %% ../nbs/01_refine.ipynb 23
|
|
110
108
|
def fix_hdg_hierarchy(
|
|
111
109
|
hdgs: list[str], # List of markdown headings
|
|
112
110
|
prompt: str=None, # Prompt to use
|
|
@@ -122,7 +120,7 @@ def fix_hdg_hierarchy(
|
|
|
122
120
|
return {o['index']: o['corrected'] for o in fixes}
|
|
123
121
|
|
|
124
122
|
|
|
125
|
-
# %% ../nbs/01_refine.ipynb
|
|
123
|
+
# %% ../nbs/01_refine.ipynb 26
|
|
126
124
|
@delegates(fix_hdg_hierarchy)
|
|
127
125
|
def mk_fixes_lut(
|
|
128
126
|
hdgs: list[str], # List of markdown headings
|
|
@@ -135,7 +133,7 @@ def mk_fixes_lut(
|
|
|
135
133
|
fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
|
|
136
134
|
return {hdgs[k]:v for k,v in fixes.items()}
|
|
137
135
|
|
|
138
|
-
# %% ../nbs/01_refine.ipynb
|
|
136
|
+
# %% ../nbs/01_refine.ipynb 29
|
|
139
137
|
def apply_hdg_fixes(
|
|
140
138
|
p:str, # Page to fix
|
|
141
139
|
lut_fixes: dict[str, str], # Lookup table of fixes
|
|
@@ -144,7 +142,7 @@ def apply_hdg_fixes(
|
|
|
144
142
|
for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
|
|
145
143
|
return p
|
|
146
144
|
|
|
147
|
-
# %% ../nbs/01_refine.ipynb
|
|
145
|
+
# %% ../nbs/01_refine.ipynb 32
|
|
148
146
|
@delegates(mk_fixes_lut)
|
|
149
147
|
def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:str='img', **kwargs):
|
|
150
148
|
"Fix heading hierarchy in markdown document"
|
|
@@ -156,13 +154,13 @@ def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:st
|
|
|
156
154
|
lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model, **kwargs)
|
|
157
155
|
for i,p in enumerate(pgs_with_pg, 1): (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))
|
|
158
156
|
|
|
159
|
-
# %% ../nbs/01_refine.ipynb
|
|
157
|
+
# %% ../nbs/01_refine.ipynb 38
|
|
160
158
|
class ImgDescription(BaseModel):
|
|
161
159
|
"Image classification and description for OCR'd documents"
|
|
162
160
|
is_informative:bool # Whether image contains informative content (charts, diagrams, tables) vs decorative (logos, backgrounds)
|
|
163
161
|
description:str # Detailed description of the image content for RAG and accessibility
|
|
164
162
|
|
|
165
|
-
# %% ../nbs/01_refine.ipynb
|
|
163
|
+
# %% ../nbs/01_refine.ipynb 41
|
|
166
164
|
describe_img_prompt = """Analyze this image from an academic/technical document.
|
|
167
165
|
|
|
168
166
|
Step 1: Determine if this image is informative for understanding the document content.
|
|
@@ -175,7 +173,7 @@ Step 2:
|
|
|
175
173
|
|
|
176
174
|
Return your response as JSON with 'is_informative' (boolean) and 'description' (string) fields."""
|
|
177
175
|
|
|
178
|
-
# %% ../nbs/01_refine.ipynb
|
|
176
|
+
# %% ../nbs/01_refine.ipynb 42
|
|
179
177
|
async def describe_img(
|
|
180
178
|
img_path: Path, # Path to the image file
|
|
181
179
|
model: str = 'claude-sonnet-4-5', # Model to use
|
|
@@ -186,7 +184,7 @@ async def describe_img(
|
|
|
186
184
|
r = await chat([img_path.read_bytes(), prompt], response_format=ImgDescription)
|
|
187
185
|
return r
|
|
188
186
|
|
|
189
|
-
# %% ../nbs/01_refine.ipynb
|
|
187
|
+
# %% ../nbs/01_refine.ipynb 46
|
|
190
188
|
async def limit(
|
|
191
189
|
semaphore, # Semaphore for concurrency control
|
|
192
190
|
coro, # Coroutine to execute
|
|
@@ -198,14 +196,14 @@ async def limit(
|
|
|
198
196
|
if delay: await sleep(delay)
|
|
199
197
|
return r
|
|
200
198
|
|
|
201
|
-
# %% ../nbs/01_refine.ipynb
|
|
199
|
+
# %% ../nbs/01_refine.ipynb 48
|
|
202
200
|
def parse_r(
|
|
203
201
|
result # ModelResponse object from API call
|
|
204
202
|
): # Dictionary with 'is_informative' and 'description' keys
|
|
205
203
|
"Extract and parse JSON content from model response"
|
|
206
204
|
return json.loads(result.choices[0].message.content)
|
|
207
205
|
|
|
208
|
-
# %% ../nbs/01_refine.ipynb
|
|
206
|
+
# %% ../nbs/01_refine.ipynb 50
|
|
209
207
|
async def describe_imgs(
|
|
210
208
|
imgs: list[Path], # List of image file paths to describe
|
|
211
209
|
model: str = 'claude-sonnet-4-5', # Model to use for image description
|
|
@@ -218,7 +216,7 @@ async def describe_imgs(
|
|
|
218
216
|
results = await gather(*[limit(sem, describe_img(img, model, prompt), delay) for img in imgs])
|
|
219
217
|
return {img.name: parse_r(r) for img, r in zip(imgs, results)}
|
|
220
218
|
|
|
221
|
-
# %% ../nbs/01_refine.ipynb
|
|
219
|
+
# %% ../nbs/01_refine.ipynb 52
|
|
222
220
|
def save_img_descs(
|
|
223
221
|
descs: dict, # Dictionary of image descriptions
|
|
224
222
|
dst_fname: Path, # Path to save the JSON file
|
|
@@ -226,7 +224,7 @@ def save_img_descs(
|
|
|
226
224
|
"Save image descriptions to JSON file"
|
|
227
225
|
Path(dst_fname).write_text(json.dumps(descs, indent=2))
|
|
228
226
|
|
|
229
|
-
# %% ../nbs/01_refine.ipynb
|
|
227
|
+
# %% ../nbs/01_refine.ipynb 57
|
|
230
228
|
def add_descs_to_pg(
|
|
231
229
|
pg:str, # Page markdown content
|
|
232
230
|
descs:dict # Dictionary mapping image filenames to their descriptions
|
|
@@ -237,7 +235,7 @@ def add_descs_to_pg(
|
|
|
237
235
|
if fname in descs: pg = pg.replace(link, f"{link}\nAI-generated image description:\n___\n{descs[fname]['description']}\n___")
|
|
238
236
|
return pg
|
|
239
237
|
|
|
240
|
-
# %% ../nbs/01_refine.ipynb
|
|
238
|
+
# %% ../nbs/01_refine.ipynb 62
|
|
241
239
|
def add_descs_to_pgs(
|
|
242
240
|
pgs:list, # List of page markdown strings
|
|
243
241
|
descs:dict # Dictionary mapping image filenames to their descriptions
|
|
@@ -245,7 +243,7 @@ def add_descs_to_pgs(
|
|
|
245
243
|
"Add AI-generated descriptions to images in all pages"
|
|
246
244
|
return [add_descs_to_pg(pg, descs) for pg in pgs]
|
|
247
245
|
|
|
248
|
-
# %% ../nbs/01_refine.ipynb
|
|
246
|
+
# %% ../nbs/01_refine.ipynb 65
|
|
249
247
|
async def add_img_descs(
|
|
250
248
|
src:str, # Path to source markdown directory
|
|
251
249
|
dst:str=None, # Destination directory (defaults to src if None)
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
mistocr/__init__.py,sha256=_MLx4ac1juJPWEEiC9kMQISX3x3jFBr507jM2P_hxMg,23
|
|
2
|
+
mistocr/_modidx.py,sha256=WTS9JpZdbrp2LghjhOV-CK0JYChHE4PzttgKfh7pTy4,4028
|
|
3
|
+
mistocr/core.py,sha256=ohh2ru05gUKbIQCRHPMz_hw4ui39FtpoV3_S3n4bl_c,7592
|
|
4
|
+
mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
|
|
5
|
+
mistocr/refine.py,sha256=H_IAF02k6CwBQdDJm9txknzUcTlz245zXitaHELX-P4,12791
|
|
6
|
+
mistocr-0.2.11.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
7
|
+
mistocr-0.2.11.dist-info/METADATA,sha256=CZ9TO24sY0OT9B19KGPIuy3MF9T7B1G9qHaVBoEMRIk,8417
|
|
8
|
+
mistocr-0.2.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
+
mistocr-0.2.11.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
10
|
+
mistocr-0.2.11.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
11
|
+
mistocr-0.2.11.dist-info/RECORD,,
|
mistocr-0.2.9.dist-info/RECORD
DELETED
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
mistocr/__init__.py,sha256=F8OVhAhMXSkvvXYgZtbPn2SG1AQC3joK4yu-FrHt81Y,22
|
|
2
|
-
mistocr/_modidx.py,sha256=55k_EmMPrRmLa04qCE-_8lukJPCax66q5MALfpa8T4A,4154
|
|
3
|
-
mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
|
|
4
|
-
mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
|
|
5
|
-
mistocr/refine.py,sha256=wtfS_bHlD39R8T2RbITgNX8cDCIPXI9gRrJ4y9nI_rM,12807
|
|
6
|
-
mistocr-0.2.9.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
7
|
-
mistocr-0.2.9.dist-info/METADATA,sha256=n9pFeWhh-Vzd7KR7s6s3R0mdJ3xvpexujEH-0iQsRQY,8416
|
|
8
|
-
mistocr-0.2.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
-
mistocr-0.2.9.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
10
|
-
mistocr-0.2.9.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
11
|
-
mistocr-0.2.9.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|