mistocr 0.2.8__tar.gz → 0.2.10__tar.gz
This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
- {mistocr-0.2.8/mistocr.egg-info → mistocr-0.2.10}/PKG-INFO +1 -1
- mistocr-0.2.10/mistocr/__init__.py +1 -0
- {mistocr-0.2.8 → mistocr-0.2.10}/mistocr/_modidx.py +0 -1
- {mistocr-0.2.8 → mistocr-0.2.10}/mistocr/refine.py +56 -18
- {mistocr-0.2.8 → mistocr-0.2.10/mistocr.egg-info}/PKG-INFO +1 -1
- {mistocr-0.2.8 → mistocr-0.2.10}/settings.ini +1 -1
- mistocr-0.2.8/mistocr/__init__.py +0 -1
- {mistocr-0.2.8 → mistocr-0.2.10}/LICENSE +0 -0
- {mistocr-0.2.8 → mistocr-0.2.10}/MANIFEST.in +0 -0
- {mistocr-0.2.8 → mistocr-0.2.10}/README.md +0 -0
- {mistocr-0.2.8 → mistocr-0.2.10}/mistocr/core.py +0 -0
- {mistocr-0.2.8 → mistocr-0.2.10}/mistocr/pipeline.py +0 -0
- {mistocr-0.2.8 → mistocr-0.2.10}/mistocr.egg-info/SOURCES.txt +0 -0
- {mistocr-0.2.8 → mistocr-0.2.10}/mistocr.egg-info/dependency_links.txt +0 -0
- {mistocr-0.2.8 → mistocr-0.2.10}/mistocr.egg-info/entry_points.txt +0 -0
- {mistocr-0.2.8 → mistocr-0.2.10}/mistocr.egg-info/not-zip-safe +0 -0
- {mistocr-0.2.8 → mistocr-0.2.10}/mistocr.egg-info/requires.txt +0 -0
- {mistocr-0.2.8 → mistocr-0.2.10}/mistocr.egg-info/top_level.txt +0 -0
- {mistocr-0.2.8 → mistocr-0.2.10}/pyproject.toml +0 -0
- {mistocr-0.2.8 → mistocr-0.2.10}/setup.cfg +0 -0
- {mistocr-0.2.8 → mistocr-0.2.10}/setup.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.10"
|
|
@@ -22,7 +22,6 @@ d = { 'settings': { 'branch': 'main',
|
|
|
22
22
|
'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
|
|
23
23
|
'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
|
|
24
24
|
'mistocr.refine': { 'mistocr.refine.HeadingCorrection': ('refine.html#headingcorrection', 'mistocr/refine.py'),
|
|
25
|
-
'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
|
|
26
25
|
'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
|
|
27
26
|
'mistocr.refine.add_descs_to_pg': ('refine.html#add_descs_to_pg', 'mistocr/refine.py'),
|
|
28
27
|
'mistocr.refine.add_descs_to_pgs': ('refine.html#add_descs_to_pgs', 'mistocr/refine.py'),
|
|
@@ -3,8 +3,8 @@
|
|
|
3
3
|
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
|
|
4
4
|
|
|
5
5
|
# %% auto 0
|
|
6
|
-
__all__ = ['describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
|
|
7
|
-
'
|
|
6
|
+
__all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
|
|
7
|
+
'HeadingCorrection', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
|
|
8
8
|
'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs', 'add_descs_to_pg', 'add_descs_to_pgs',
|
|
9
9
|
'add_img_descs']
|
|
10
10
|
|
|
@@ -60,13 +60,51 @@ def fmt_hdgs_idx(
|
|
|
60
60
|
|
|
61
61
|
# %% ../nbs/01_refine.ipynb 18
|
|
62
62
|
class HeadingCorrection(BaseModel):
|
|
63
|
+
"A single heading correction mapping an index to its corrected markdown heading"
|
|
63
64
|
index: int
|
|
64
65
|
corrected: str
|
|
65
66
|
|
|
66
|
-
|
|
67
|
-
|
|
67
|
+
# %% ../nbs/01_refine.ipynb 21
|
|
68
|
+
prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
|
|
68
69
|
|
|
69
|
-
|
|
70
|
+
INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
|
|
71
|
+
|
|
72
|
+
ANALYSIS STEPS (think through these before outputting corrections):
|
|
73
|
+
1. For each numbered heading (e.g., "4.1", "2.a", "A.1"), identify its parent heading (e.g., "4", "2", "A")
|
|
74
|
+
2. Verify the child heading is exactly one # deeper than its parent
|
|
75
|
+
3. If not, mark it for correction
|
|
76
|
+
|
|
77
|
+
RULES - Apply these fixes in order:
|
|
78
|
+
|
|
79
|
+
1. **Single H1 rule**: Documents must have exactly ONE # heading (typically the document title at the top)
|
|
80
|
+
- If index 0 is already #, then all subsequent headings (index 1+) must be ## or deeper
|
|
81
|
+
- If no H1 exists, the first major heading should be #, and all others ## or deeper
|
|
82
|
+
- NO exceptions: appendices, references, and all sections are ## or deeper after the title
|
|
83
|
+
|
|
84
|
+
2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
|
|
85
|
+
- Parent section (e.g., "1", "2", "A") MUST be shallower than child (e.g., "1.1", "2.a", "A.1")
|
|
86
|
+
- Child section MUST be exactly one # deeper than parent
|
|
87
|
+
- Works with any numbering: "1/1.1/1.1.1", "A/A.1/A.1.a", "I/I.A/I.A.1", etc.
|
|
88
|
+
|
|
89
|
+
3. **Level jumps**: Headings can only increase by one # at a time when moving deeper
|
|
90
|
+
- Wrong: ## Section → ##### Subsection
|
|
91
|
+
- Fixed: ## Section → ### Subsection
|
|
92
|
+
|
|
93
|
+
4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
|
|
94
|
+
|
|
95
|
+
5. **Unnumbered headings in numbered documents**: If the document uses numbered headings consistently, any unnumbered heading appearing within that structure is likely misclassified bold text and should be converted to regular text (output the heading text without any # symbols in the corrected field)
|
|
96
|
+
|
|
97
|
+
OUTPUT: Return a list of corrections, where each correction has:
|
|
98
|
+
- index: the heading's index number
|
|
99
|
+
- corrected: the fixed heading text (without the index prefix), or empty string "" to remove the heading entirely
|
|
100
|
+
IMPORTANT: Preserve the " ... page N" suffix in all corrected headings.
|
|
101
|
+
Only include headings that need changes.
|
|
102
|
+
|
|
103
|
+
Headings to analyze:
|
|
104
|
+
{headings_list}
|
|
105
|
+
"""
|
|
106
|
+
|
|
107
|
+
# %% ../nbs/01_refine.ipynb 23
|
|
70
108
|
def fix_hdg_hierarchy(
|
|
71
109
|
hdgs: list[str], # List of markdown headings
|
|
72
110
|
prompt: str=None, # Prompt to use
|
|
@@ -82,7 +120,7 @@ def fix_hdg_hierarchy(
|
|
|
82
120
|
return {o['index']: o['corrected'] for o in fixes}
|
|
83
121
|
|
|
84
122
|
|
|
85
|
-
# %% ../nbs/01_refine.ipynb
|
|
123
|
+
# %% ../nbs/01_refine.ipynb 26
|
|
86
124
|
@delegates(fix_hdg_hierarchy)
|
|
87
125
|
def mk_fixes_lut(
|
|
88
126
|
hdgs: list[str], # List of markdown headings
|
|
@@ -95,7 +133,7 @@ def mk_fixes_lut(
|
|
|
95
133
|
fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
|
|
96
134
|
return {hdgs[k]:v for k,v in fixes.items()}
|
|
97
135
|
|
|
98
|
-
# %% ../nbs/01_refine.ipynb
|
|
136
|
+
# %% ../nbs/01_refine.ipynb 29
|
|
99
137
|
def apply_hdg_fixes(
|
|
100
138
|
p:str, # Page to fix
|
|
101
139
|
lut_fixes: dict[str, str], # Lookup table of fixes
|
|
@@ -104,7 +142,7 @@ def apply_hdg_fixes(
|
|
|
104
142
|
for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
|
|
105
143
|
return p
|
|
106
144
|
|
|
107
|
-
# %% ../nbs/01_refine.ipynb
|
|
145
|
+
# %% ../nbs/01_refine.ipynb 32
|
|
108
146
|
@delegates(mk_fixes_lut)
|
|
109
147
|
def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:str='img', **kwargs):
|
|
110
148
|
"Fix heading hierarchy in markdown document"
|
|
@@ -116,13 +154,13 @@ def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:st
|
|
|
116
154
|
lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model, **kwargs)
|
|
117
155
|
for i,p in enumerate(pgs_with_pg, 1): (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))
|
|
118
156
|
|
|
119
|
-
# %% ../nbs/01_refine.ipynb
|
|
157
|
+
# %% ../nbs/01_refine.ipynb 38
|
|
120
158
|
class ImgDescription(BaseModel):
|
|
121
159
|
"Image classification and description for OCR'd documents"
|
|
122
160
|
is_informative:bool # Whether image contains informative content (charts, diagrams, tables) vs decorative (logos, backgrounds)
|
|
123
161
|
description:str # Detailed description of the image content for RAG and accessibility
|
|
124
162
|
|
|
125
|
-
# %% ../nbs/01_refine.ipynb
|
|
163
|
+
# %% ../nbs/01_refine.ipynb 41
|
|
126
164
|
describe_img_prompt = """Analyze this image from an academic/technical document.
|
|
127
165
|
|
|
128
166
|
Step 1: Determine if this image is informative for understanding the document content.
|
|
@@ -135,7 +173,7 @@ Step 2:
|
|
|
135
173
|
|
|
136
174
|
Return your response as JSON with 'is_informative' (boolean) and 'description' (string) fields."""
|
|
137
175
|
|
|
138
|
-
# %% ../nbs/01_refine.ipynb
|
|
176
|
+
# %% ../nbs/01_refine.ipynb 42
|
|
139
177
|
async def describe_img(
|
|
140
178
|
img_path: Path, # Path to the image file
|
|
141
179
|
model: str = 'claude-sonnet-4-5', # Model to use
|
|
@@ -146,7 +184,7 @@ async def describe_img(
|
|
|
146
184
|
r = await chat([img_path.read_bytes(), prompt], response_format=ImgDescription)
|
|
147
185
|
return r
|
|
148
186
|
|
|
149
|
-
# %% ../nbs/01_refine.ipynb
|
|
187
|
+
# %% ../nbs/01_refine.ipynb 46
|
|
150
188
|
async def limit(
|
|
151
189
|
semaphore, # Semaphore for concurrency control
|
|
152
190
|
coro, # Coroutine to execute
|
|
@@ -158,14 +196,14 @@ async def limit(
|
|
|
158
196
|
if delay: await sleep(delay)
|
|
159
197
|
return r
|
|
160
198
|
|
|
161
|
-
# %% ../nbs/01_refine.ipynb
|
|
199
|
+
# %% ../nbs/01_refine.ipynb 48
|
|
162
200
|
def parse_r(
|
|
163
201
|
result # ModelResponse object from API call
|
|
164
202
|
): # Dictionary with 'is_informative' and 'description' keys
|
|
165
203
|
"Extract and parse JSON content from model response"
|
|
166
204
|
return json.loads(result.choices[0].message.content)
|
|
167
205
|
|
|
168
|
-
# %% ../nbs/01_refine.ipynb
|
|
206
|
+
# %% ../nbs/01_refine.ipynb 50
|
|
169
207
|
async def describe_imgs(
|
|
170
208
|
imgs: list[Path], # List of image file paths to describe
|
|
171
209
|
model: str = 'claude-sonnet-4-5', # Model to use for image description
|
|
@@ -178,7 +216,7 @@ async def describe_imgs(
|
|
|
178
216
|
results = await gather(*[limit(sem, describe_img(img, model, prompt), delay) for img in imgs])
|
|
179
217
|
return {img.name: parse_r(r) for img, r in zip(imgs, results)}
|
|
180
218
|
|
|
181
|
-
# %% ../nbs/01_refine.ipynb
|
|
219
|
+
# %% ../nbs/01_refine.ipynb 52
|
|
182
220
|
def save_img_descs(
|
|
183
221
|
descs: dict, # Dictionary of image descriptions
|
|
184
222
|
dst_fname: Path, # Path to save the JSON file
|
|
@@ -186,7 +224,7 @@ def save_img_descs(
|
|
|
186
224
|
"Save image descriptions to JSON file"
|
|
187
225
|
Path(dst_fname).write_text(json.dumps(descs, indent=2))
|
|
188
226
|
|
|
189
|
-
# %% ../nbs/01_refine.ipynb
|
|
227
|
+
# %% ../nbs/01_refine.ipynb 57
|
|
190
228
|
def add_descs_to_pg(
|
|
191
229
|
pg:str, # Page markdown content
|
|
192
230
|
descs:dict # Dictionary mapping image filenames to their descriptions
|
|
@@ -197,7 +235,7 @@ def add_descs_to_pg(
|
|
|
197
235
|
if fname in descs: pg = pg.replace(link, f"{link}\nAI-generated image description:\n___\n{descs[fname]['description']}\n___")
|
|
198
236
|
return pg
|
|
199
237
|
|
|
200
|
-
# %% ../nbs/01_refine.ipynb
|
|
238
|
+
# %% ../nbs/01_refine.ipynb 62
|
|
201
239
|
def add_descs_to_pgs(
|
|
202
240
|
pgs:list, # List of page markdown strings
|
|
203
241
|
descs:dict # Dictionary mapping image filenames to their descriptions
|
|
@@ -205,7 +243,7 @@ def add_descs_to_pgs(
|
|
|
205
243
|
"Add AI-generated descriptions to images in all pages"
|
|
206
244
|
return [add_descs_to_pg(pg, descs) for pg in pgs]
|
|
207
245
|
|
|
208
|
-
# %% ../nbs/01_refine.ipynb
|
|
246
|
+
# %% ../nbs/01_refine.ipynb 65
|
|
209
247
|
async def add_img_descs(
|
|
210
248
|
src:str, # Path to source markdown directory
|
|
211
249
|
dst:str=None, # Destination directory (defaults to src if None)
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.2.8"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|