mistocr 0.2.5__py3-none-any.whl → 0.2.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mistocr/__init__.py +1 -1
- mistocr/_modidx.py +1 -1
- mistocr/refine.py +43 -25
- {mistocr-0.2.5.dist-info → mistocr-0.2.10.dist-info}/METADATA +1 -1
- mistocr-0.2.10.dist-info/RECORD +11 -0
- mistocr-0.2.5.dist-info/RECORD +0 -11
- {mistocr-0.2.5.dist-info → mistocr-0.2.10.dist-info}/WHEEL +0 -0
- {mistocr-0.2.5.dist-info → mistocr-0.2.10.dist-info}/entry_points.txt +0 -0
- {mistocr-0.2.5.dist-info → mistocr-0.2.10.dist-info}/licenses/LICENSE +0 -0
- {mistocr-0.2.5.dist-info → mistocr-0.2.10.dist-info}/top_level.txt +0 -0
mistocr/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.2.5"
|
|
1
|
+
__version__ = "0.2.10"
|
mistocr/_modidx.py
CHANGED
|
@@ -21,7 +21,7 @@ d = { 'settings': { 'branch': 'main',
|
|
|
21
21
|
'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
|
|
22
22
|
'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
|
|
23
23
|
'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
|
|
24
|
-
'mistocr.refine': { 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
|
|
24
|
+
'mistocr.refine': { 'mistocr.refine.HeadingCorrection': ('refine.html#headingcorrection', 'mistocr/refine.py'),
|
|
25
25
|
'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
|
|
26
26
|
'mistocr.refine.add_descs_to_pg': ('refine.html#add_descs_to_pg', 'mistocr/refine.py'),
|
|
27
27
|
'mistocr.refine.add_descs_to_pgs': ('refine.html#add_descs_to_pgs', 'mistocr/refine.py'),
|
mistocr/refine.py
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
# %% auto 0
|
|
6
6
|
__all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
|
|
7
|
-
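'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',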
|
|
7
|
+
'HeadingCorrection', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
|
|
8
8
|
'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs', 'add_descs_to_pg', 'add_descs_to_pgs',
|
|
9
9
|
'add_img_descs']
|
|
10
10
|
|
|
@@ -59,14 +59,21 @@ def fmt_hdgs_idx(
|
|
|
59
59
|
|
|
60
60
|
|
|
61
61
|
# %% ../nbs/01_refine.ipynb 18
|
|
62
|
-
class HeadingCorrections(BaseModel):
|
|
63
|
-
|
|
62
|
+
class HeadingCorrection(BaseModel):
|
|
63
|
+
"A single heading correction mapping an index to its corrected markdown heading"
|
|
64
|
+
index: int
|
|
65
|
+
corrected: str
|
|
64
66
|
|
|
65
|
-
# %% ../nbs/01_refine.ipynb
|
|
67
|
+
# %% ../nbs/01_refine.ipynb 21
|
|
66
68
|
prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
|
|
67
69
|
|
|
68
70
|
INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
|
|
69
71
|
|
|
72
|
+
ANALYSIS STEPS (think through these before outputting corrections):
|
|
73
|
+
1. For each numbered heading (e.g., "4.1", "2.a", "A.1"), identify its parent heading (e.g., "4", "2", "A")
|
|
74
|
+
2. Verify the child heading is exactly one # deeper than its parent
|
|
75
|
+
3. If not, mark it for correction
|
|
76
|
+
|
|
70
77
|
RULES - Apply these fixes in order:
|
|
71
78
|
|
|
72
79
|
1. **Single H1 rule**: Documents must have exactly ONE # heading (typically the document title at the top)
|
|
@@ -75,8 +82,8 @@ RULES - Apply these fixes in order:
|
|
|
75
82
|
- NO exceptions: appendices, references, and all sections are ## or deeper after the title
|
|
76
83
|
|
|
77
84
|
2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
|
|
78
|
-
- Parent section (e.g., "1", "2", "A")
|
|
79
|
-
- Child section
|
|
85
|
+
- Parent section (e.g., "1", "2", "A") MUST be shallower than child (e.g., "1.1", "2.a", "A.1")
|
|
86
|
+
- Child section MUST be exactly one # deeper than parent
|
|
80
87
|
- Works with any numbering: "1/1.1/1.1.1", "A/A.1/A.1.a", "I/I.A/I.A.1", etc.
|
|
81
88
|
|
|
82
89
|
3. **Level jumps**: Headings can only increase by one # at a time when moving deeper
|
|
@@ -85,16 +92,19 @@ RULES - Apply these fixes in order:
|
|
|
85
92
|
|
|
86
93
|
4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
|
|
87
94
|
|
|
88
|
-
|
|
95
|
+
5. **Unnumbered headings in numbered documents**: If the document uses numbered headings consistently, any unnumbered heading appearing within that structure is likely misclassified bold text and should be converted to regular text (output the heading text without any # symbols in the corrected field)
|
|
96
|
+
|
|
97
|
+
OUTPUT: Return a list of corrections, where each correction has:
|
|
98
|
+
- index: the heading's index number
|
|
99
|
+
- corrected: the fixed heading text (without the index prefix), or empty string "" to remove the heading entirely
|
|
89
100
|
IMPORTANT: Preserve the " ... page N" suffix in all corrected headings.
|
|
90
|
-
Only include
|
|
101
|
+
Only include headings that need changes.
|
|
91
102
|
|
|
92
103
|
Headings to analyze:
|
|
93
104
|
{headings_list}
|
|
94
105
|
"""
|
|
95
106
|
|
|
96
|
-
|
|
97
|
-
# %% ../nbs/01_refine.ipynb 22
|
|
107
|
+
# %% ../nbs/01_refine.ipynb 23
|
|
98
108
|
def fix_hdg_hierarchy(
|
|
99
109
|
hdgs: list[str], # List of markdown headings
|
|
100
110
|
prompt: str=None, # Prompt to use
|
|
@@ -106,10 +116,11 @@ def fix_hdg_hierarchy(
|
|
|
106
116
|
if prompt is None: prompt = prompt_fix_hdgs
|
|
107
117
|
prompt = prompt.format(headings_list=fmt_hdgs_idx(hdgs))
|
|
108
118
|
r = completion(model=model, messages=[{"role": "user", "content": prompt}], response_format=HeadingCorrections, api_key=api_key)
|
|
109
|
-
|
|
119
|
+
fixes = json.loads(r.choices[0].message.content)['corrections']
|
|
120
|
+
return {o['index']: o['corrected'] for o in fixes}
|
|
110
121
|
|
|
111
122
|
|
|
112
|
-
# %% ../nbs/01_refine.ipynb
|
|
123
|
+
# %% ../nbs/01_refine.ipynb 26
|
|
113
124
|
@delegates(fix_hdg_hierarchy)
|
|
114
125
|
def mk_fixes_lut(
|
|
115
126
|
hdgs: list[str], # List of markdown headings
|
|
@@ -120,9 +131,9 @@ def mk_fixes_lut(
|
|
|
120
131
|
"Make a lookup table of fixes"
|
|
121
132
|
if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
|
|
122
133
|
fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
|
|
123
|
-
return {hdgs[
|
|
134
|
+
return {hdgs[k]:v for k,v in fixes.items()}
|
|
124
135
|
|
|
125
|
-
# %% ../nbs/01_refine.ipynb
|
|
136
|
+
# %% ../nbs/01_refine.ipynb 29
|
|
126
137
|
def apply_hdg_fixes(
|
|
127
138
|
p:str, # Page to fix
|
|
128
139
|
lut_fixes: dict[str, str], # Lookup table of fixes
|
|
@@ -131,7 +142,7 @@ def apply_hdg_fixes(
|
|
|
131
142
|
for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
|
|
132
143
|
return p
|
|
133
144
|
|
|
134
|
-
# %% ../nbs/01_refine.ipynb
|
|
145
|
+
# %% ../nbs/01_refine.ipynb 32
|
|
135
146
|
@delegates(mk_fixes_lut)
|
|
136
147
|
def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:str='img', **kwargs):
|
|
137
148
|
"Fix heading hierarchy in markdown document"
|
|
@@ -143,13 +154,13 @@ def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:st
|
|
|
143
154
|
lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model, **kwargs)
|
|
144
155
|
for i,p in enumerate(pgs_with_pg, 1): (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))
|
|
145
156
|
|
|
146
|
-
# %% ../nbs/01_refine.ipynb
|
|
157
|
+
# %% ../nbs/01_refine.ipynb 38
|
|
147
158
|
class ImgDescription(BaseModel):
|
|
148
159
|
"Image classification and description for OCR'd documents"
|
|
149
160
|
is_informative:bool # Whether image contains informative content (charts, diagrams, tables) vs decorative (logos, backgrounds)
|
|
150
161
|
description:str # Detailed description of the image content for RAG and accessibility
|
|
151
162
|
|
|
152
|
-
# %% ../nbs/01_refine.ipynb
|
|
163
|
+
# %% ../nbs/01_refine.ipynb 41
|
|
153
164
|
describe_img_prompt = """Analyze this image from an academic/technical document.
|
|
154
165
|
|
|
155
166
|
Step 1: Determine if this image is informative for understanding the document content.
|
|
@@ -162,7 +173,7 @@ Step 2:
|
|
|
162
173
|
|
|
163
174
|
Return your response as JSON with 'is_informative' (boolean) and 'description' (string) fields."""
|
|
164
175
|
|
|
165
|
-
# %% ../nbs/01_refine.ipynb
|
|
176
|
+
# %% ../nbs/01_refine.ipynb 42
|
|
166
177
|
async def describe_img(
|
|
167
178
|
img_path: Path, # Path to the image file
|
|
168
179
|
model: str = 'claude-sonnet-4-5', # Model to use
|
|
@@ -173,7 +184,7 @@ async def describe_img(
|
|
|
173
184
|
r = await chat([img_path.read_bytes(), prompt], response_format=ImgDescription)
|
|
174
185
|
return r
|
|
175
186
|
|
|
176
|
-
# %% ../nbs/01_refine.ipynb
|
|
187
|
+
# %% ../nbs/01_refine.ipynb 46
|
|
177
188
|
async def limit(
|
|
178
189
|
semaphore, # Semaphore for concurrency control
|
|
179
190
|
coro, # Coroutine to execute
|
|
@@ -185,14 +196,14 @@ async def limit(
|
|
|
185
196
|
if delay: await sleep(delay)
|
|
186
197
|
return r
|
|
187
198
|
|
|
188
|
-
# %% ../nbs/01_refine.ipynb
|
|
199
|
+
# %% ../nbs/01_refine.ipynb 48
|
|
189
200
|
def parse_r(
|
|
190
201
|
result # ModelResponse object from API call
|
|
191
202
|
): # Dictionary with 'is_informative' and 'description' keys
|
|
192
203
|
"Extract and parse JSON content from model response"
|
|
193
204
|
return json.loads(result.choices[0].message.content)
|
|
194
205
|
|
|
195
|
-
# %% ../nbs/01_refine.ipynb
|
|
206
|
+
# %% ../nbs/01_refine.ipynb 50
|
|
196
207
|
async def describe_imgs(
|
|
197
208
|
imgs: list[Path], # List of image file paths to describe
|
|
198
209
|
model: str = 'claude-sonnet-4-5', # Model to use for image description
|
|
@@ -205,7 +216,7 @@ async def describe_imgs(
|
|
|
205
216
|
results = await gather(*[limit(sem, describe_img(img, model, prompt), delay) for img in imgs])
|
|
206
217
|
return {img.name: parse_r(r) for img, r in zip(imgs, results)}
|
|
207
218
|
|
|
208
|
-
# %% ../nbs/01_refine.ipynb
|
|
219
|
+
# %% ../nbs/01_refine.ipynb 52
|
|
209
220
|
def save_img_descs(
|
|
210
221
|
descs: dict, # Dictionary of image descriptions
|
|
211
222
|
dst_fname: Path, # Path to save the JSON file
|
|
@@ -213,7 +224,7 @@ def save_img_descs(
|
|
|
213
224
|
"Save image descriptions to JSON file"
|
|
214
225
|
Path(dst_fname).write_text(json.dumps(descs, indent=2))
|
|
215
226
|
|
|
216
|
-
# %% ../nbs/01_refine.ipynb
|
|
227
|
+
# %% ../nbs/01_refine.ipynb 57
|
|
217
228
|
def add_descs_to_pg(
|
|
218
229
|
pg:str, # Page markdown content
|
|
219
230
|
descs:dict # Dictionary mapping image filenames to their descriptions
|
|
@@ -224,7 +235,7 @@ def add_descs_to_pg(
|
|
|
224
235
|
if fname in descs: pg = pg.replace(link, f"{link}\nAI-generated image description:\n___\n{descs[fname]['description']}\n___")
|
|
225
236
|
return pg
|
|
226
237
|
|
|
227
|
-
# %% ../nbs/01_refine.ipynb
|
|
238
|
+
# %% ../nbs/01_refine.ipynb 62
|
|
228
239
|
def add_descs_to_pgs(
|
|
229
240
|
pgs:list, # List of page markdown strings
|
|
230
241
|
descs:dict # Dictionary mapping image filenames to their descriptions
|
|
@@ -232,7 +243,7 @@ def add_descs_to_pgs(
|
|
|
232
243
|
"Add AI-generated descriptions to images in all pages"
|
|
233
244
|
return [add_descs_to_pg(pg, descs) for pg in pgs]
|
|
234
245
|
|
|
235
|
-
# %% ../nbs/01_refine.ipynb
|
|
246
|
+
# %% ../nbs/01_refine.ipynb 65
|
|
236
247
|
async def add_img_descs(
|
|
237
248
|
src:str, # Path to source markdown directory
|
|
238
249
|
dst:str=None, # Destination directory (defaults to src if None)
|
|
@@ -247,6 +258,12 @@ async def add_img_descs(
|
|
|
247
258
|
src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
|
|
248
259
|
if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
|
|
249
260
|
src_imgs = src_path/img_folder
|
|
261
|
+
|
|
262
|
+
# Check if image folder exists
|
|
263
|
+
if not src_imgs.exists():
|
|
264
|
+
if progress: print(f"No images to describe in the document (no '{img_folder}' folder found)")
|
|
265
|
+
return
|
|
266
|
+
|
|
250
267
|
if src_imgs.exists() and dst_path != src_path: shutil.copytree(src_imgs, dst_path/img_folder, dirs_exist_ok=True)
|
|
251
268
|
desc_file = src_path/'img_descriptions.json'
|
|
252
269
|
if desc_file.exists() and not force:
|
|
@@ -263,3 +280,4 @@ async def add_img_descs(
|
|
|
263
280
|
enriched = [add_descs_to_pg(pg, descs) for pg in pgs]
|
|
264
281
|
for i,pg in enumerate(enriched, 1): (dst_path/f'page_{i}.md').write_text(pg)
|
|
265
282
|
if progress: print(f"Done! Enriched pages saved to {dst_path}")
|
|
283
|
+
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
mistocr/__init__.py,sha256=waXgc7p-jgGCsUjdVfO_KjlVZblnCvrzf4A0dsBj_lg,23
|
|
2
|
+
mistocr/_modidx.py,sha256=WTS9JpZdbrp2LghjhOV-CK0JYChHE4PzttgKfh7pTy4,4028
|
|
3
|
+
mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
|
|
4
|
+
mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
|
|
5
|
+
mistocr/refine.py,sha256=H_IAF02k6CwBQdDJm9txknzUcTlz245zXitaHELX-P4,12791
|
|
6
|
+
mistocr-0.2.10.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
7
|
+
mistocr-0.2.10.dist-info/METADATA,sha256=mkMu_9nYAXZ5jFdJd01AZqK3t93_Rt0xkkD0rRnl9Ew,8417
|
|
8
|
+
mistocr-0.2.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
+
mistocr-0.2.10.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
10
|
+
mistocr-0.2.10.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
11
|
+
mistocr-0.2.10.dist-info/RECORD,,
|
mistocr-0.2.5.dist-info/RECORD
DELETED
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
mistocr/__init__.py,sha256=Xsa3ayOMVkhUWm4t06YeyHE0apjpZefxLH4ylp0CDtU,22
|
|
2
|
-
mistocr/_modidx.py,sha256=JtXJb6ErVwyGFCWgeVz7N9JvybBp8Zyao4Ahn3smr90,4030
|
|
3
|
-
mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
|
|
4
|
-
mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
|
|
5
|
-
mistocr/refine.py,sha256=arJPOg1eP4MwtkD1zwnYY4EFrBfTTSP-mtR4AVnTiR8,11788
|
|
6
|
-
mistocr-0.2.5.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
7
|
-
mistocr-0.2.5.dist-info/METADATA,sha256=uGim0pZ4V3-oolsihRFr4aOWh3ZDOO7u3d8Mn0n-gmc,8416
|
|
8
|
-
mistocr-0.2.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
-
mistocr-0.2.5.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
10
|
-
mistocr-0.2.5.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
11
|
-
mistocr-0.2.5.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|