mistocr 0.2.5__py3-none-any.whl → 0.2.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mistocr/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.2.5"
1
+ __version__ = "0.2.10"
mistocr/_modidx.py CHANGED
@@ -21,7 +21,7 @@ d = { 'settings': { 'branch': 'main',
21
21
  'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
22
22
  'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
23
23
  'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
24
- 'mistocr.refine': { 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
24
+ 'mistocr.refine': { 'mistocr.refine.HeadingCorrection': ('refine.html#headingcorrection', 'mistocr/refine.py'),
25
25
  'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
26
26
  'mistocr.refine.add_descs_to_pg': ('refine.html#add_descs_to_pg', 'mistocr/refine.py'),
27
27
  'mistocr.refine.add_descs_to_pgs': ('refine.html#add_descs_to_pgs', 'mistocr/refine.py'),
mistocr/refine.py CHANGED
@@ -4,7 +4,7 @@
4
4
 
5
5
  # %% auto 0
6
6
  __all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
7
- 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
7
+ 'HeadingCorrection', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
8
8
  'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs', 'add_descs_to_pg', 'add_descs_to_pgs',
9
9
  'add_img_descs']
10
10
 
@@ -59,14 +59,21 @@ def fmt_hdgs_idx(
59
59
 
60
60
 
61
61
  # %% ../nbs/01_refine.ipynb 18
62
- class HeadingCorrections(BaseModel):
63
- corrections: dict[int, str] # index corrected heading
62
+ class HeadingCorrection(BaseModel):
63
+ "A single heading correction mapping an index to its corrected markdown heading"
64
+ index: int
65
+ corrected: str
64
66
 
65
- # %% ../nbs/01_refine.ipynb 20
67
+ # %% ../nbs/01_refine.ipynb 21
66
68
  prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
67
69
 
68
70
  INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
69
71
 
72
+ ANALYSIS STEPS (think through these before outputting corrections):
73
+ 1. For each numbered heading (e.g., "4.1", "2.a", "A.1"), identify its parent heading (e.g., "4", "2", "A")
74
+ 2. Verify the child heading is exactly one # deeper than its parent
75
+ 3. If not, mark it for correction
76
+
70
77
  RULES - Apply these fixes in order:
71
78
 
72
79
  1. **Single H1 rule**: Documents must have exactly ONE # heading (typically the document title at the top)
@@ -75,8 +82,8 @@ RULES - Apply these fixes in order:
75
82
  - NO exceptions: appendices, references, and all sections are ## or deeper after the title
76
83
 
77
84
  2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
78
- - Parent section (e.g., "1", "2", "A") should be shallower than child (e.g., "1.1", "2.a", "A.1")
79
- - Child section should be one # deeper than parent
85
+ - Parent section (e.g., "1", "2", "A") MUST be shallower than child (e.g., "1.1", "2.a", "A.1")
86
+ - Child section MUST be exactly one # deeper than parent
80
87
  - Works with any numbering: "1/1.1/1.1.1", "A/A.1/A.1.a", "I/I.A/I.A.1", etc.
81
88
 
82
89
  3. **Level jumps**: Headings can only increase by one # at a time when moving deeper
@@ -85,16 +92,19 @@ RULES - Apply these fixes in order:
85
92
 
86
93
  4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
87
94
 
88
- OUTPUT: Return a Python dictionary mapping index to corrected heading (without the index prefix).
95
+ 5. **Unnumbered headings in numbered documents**: If the document uses numbered headings consistently, any unnumbered heading appearing within that structure is likely misclassified bold text and should be converted to regular text (output the heading text without any # symbols in the corrected field)
96
+
97
+ OUTPUT: Return a list of corrections, where each correction has:
98
+ - index: the heading's index number
99
+ - corrected: the fixed heading text (without the index prefix), or empty string "" to remove the heading entirely
89
100
  IMPORTANT: Preserve the " ... page N" suffix in all corrected headings.
90
- Only include entries that need changes.
101
+ Only include headings that need changes.
91
102
 
92
103
  Headings to analyze:
93
104
  {headings_list}
94
105
  """
95
106
 
96
-
97
- # %% ../nbs/01_refine.ipynb 22
107
+ # %% ../nbs/01_refine.ipynb 23
98
108
  def fix_hdg_hierarchy(
99
109
  hdgs: list[str], # List of markdown headings
100
110
  prompt: str=None, # Prompt to use
@@ -106,10 +116,11 @@ def fix_hdg_hierarchy(
106
116
  if prompt is None: prompt = prompt_fix_hdgs
107
117
  prompt = prompt.format(headings_list=fmt_hdgs_idx(hdgs))
108
118
  r = completion(model=model, messages=[{"role": "user", "content": prompt}], response_format=HeadingCorrections, api_key=api_key)
109
- return json.loads(r.choices[0].message.content)['corrections']
119
+ fixes = json.loads(r.choices[0].message.content)['corrections']
120
+ return {o['index']: o['corrected'] for o in fixes}
110
121
 
111
122
 
112
- # %% ../nbs/01_refine.ipynb 25
123
+ # %% ../nbs/01_refine.ipynb 26
113
124
  @delegates(fix_hdg_hierarchy)
114
125
  def mk_fixes_lut(
115
126
  hdgs: list[str], # List of markdown headings
@@ -120,9 +131,9 @@ def mk_fixes_lut(
120
131
  "Make a lookup table of fixes"
121
132
  if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
122
133
  fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
123
- return {hdgs[int(k)]:v for k,v in fixes.items()}
134
+ return {hdgs[k]:v for k,v in fixes.items()}
124
135
 
125
- # %% ../nbs/01_refine.ipynb 28
136
+ # %% ../nbs/01_refine.ipynb 29
126
137
  def apply_hdg_fixes(
127
138
  p:str, # Page to fix
128
139
  lut_fixes: dict[str, str], # Lookup table of fixes
@@ -131,7 +142,7 @@ def apply_hdg_fixes(
131
142
  for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
132
143
  return p
133
144
 
134
- # %% ../nbs/01_refine.ipynb 31
145
+ # %% ../nbs/01_refine.ipynb 32
135
146
  @delegates(mk_fixes_lut)
136
147
  def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:str='img', **kwargs):
137
148
  "Fix heading hierarchy in markdown document"
@@ -143,13 +154,13 @@ def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:st
143
154
  lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model, **kwargs)
144
155
  for i,p in enumerate(pgs_with_pg, 1): (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))
145
156
 
146
- # %% ../nbs/01_refine.ipynb 37
157
+ # %% ../nbs/01_refine.ipynb 38
147
158
  class ImgDescription(BaseModel):
148
159
  "Image classification and description for OCR'd documents"
149
160
  is_informative:bool # Whether image contains informative content (charts, diagrams, tables) vs decorative (logos, backgrounds)
150
161
  description:str # Detailed description of the image content for RAG and accessibility
151
162
 
152
- # %% ../nbs/01_refine.ipynb 40
163
+ # %% ../nbs/01_refine.ipynb 41
153
164
  describe_img_prompt = """Analyze this image from an academic/technical document.
154
165
 
155
166
  Step 1: Determine if this image is informative for understanding the document content.
@@ -162,7 +173,7 @@ Step 2:
162
173
 
163
174
  Return your response as JSON with 'is_informative' (boolean) and 'description' (string) fields."""
164
175
 
165
- # %% ../nbs/01_refine.ipynb 41
176
+ # %% ../nbs/01_refine.ipynb 42
166
177
  async def describe_img(
167
178
  img_path: Path, # Path to the image file
168
179
  model: str = 'claude-sonnet-4-5', # Model to use
@@ -173,7 +184,7 @@ async def describe_img(
173
184
  r = await chat([img_path.read_bytes(), prompt], response_format=ImgDescription)
174
185
  return r
175
186
 
176
- # %% ../nbs/01_refine.ipynb 45
187
+ # %% ../nbs/01_refine.ipynb 46
177
188
  async def limit(
178
189
  semaphore, # Semaphore for concurrency control
179
190
  coro, # Coroutine to execute
@@ -185,14 +196,14 @@ async def limit(
185
196
  if delay: await sleep(delay)
186
197
  return r
187
198
 
188
- # %% ../nbs/01_refine.ipynb 47
199
+ # %% ../nbs/01_refine.ipynb 48
189
200
  def parse_r(
190
201
  result # ModelResponse object from API call
191
202
  ): # Dictionary with 'is_informative' and 'description' keys
192
203
  "Extract and parse JSON content from model response"
193
204
  return json.loads(result.choices[0].message.content)
194
205
 
195
- # %% ../nbs/01_refine.ipynb 49
206
+ # %% ../nbs/01_refine.ipynb 50
196
207
  async def describe_imgs(
197
208
  imgs: list[Path], # List of image file paths to describe
198
209
  model: str = 'claude-sonnet-4-5', # Model to use for image description
@@ -205,7 +216,7 @@ async def describe_imgs(
205
216
  results = await gather(*[limit(sem, describe_img(img, model, prompt), delay) for img in imgs])
206
217
  return {img.name: parse_r(r) for img, r in zip(imgs, results)}
207
218
 
208
- # %% ../nbs/01_refine.ipynb 51
219
+ # %% ../nbs/01_refine.ipynb 52
209
220
  def save_img_descs(
210
221
  descs: dict, # Dictionary of image descriptions
211
222
  dst_fname: Path, # Path to save the JSON file
@@ -213,7 +224,7 @@ def save_img_descs(
213
224
  "Save image descriptions to JSON file"
214
225
  Path(dst_fname).write_text(json.dumps(descs, indent=2))
215
226
 
216
- # %% ../nbs/01_refine.ipynb 56
227
+ # %% ../nbs/01_refine.ipynb 57
217
228
  def add_descs_to_pg(
218
229
  pg:str, # Page markdown content
219
230
  descs:dict # Dictionary mapping image filenames to their descriptions
@@ -224,7 +235,7 @@ def add_descs_to_pg(
224
235
  if fname in descs: pg = pg.replace(link, f"{link}\nAI-generated image description:\n___\n{descs[fname]['description']}\n___")
225
236
  return pg
226
237
 
227
- # %% ../nbs/01_refine.ipynb 61
238
+ # %% ../nbs/01_refine.ipynb 62
228
239
  def add_descs_to_pgs(
229
240
  pgs:list, # List of page markdown strings
230
241
  descs:dict # Dictionary mapping image filenames to their descriptions
@@ -232,7 +243,7 @@ def add_descs_to_pgs(
232
243
  "Add AI-generated descriptions to images in all pages"
233
244
  return [add_descs_to_pg(pg, descs) for pg in pgs]
234
245
 
235
- # %% ../nbs/01_refine.ipynb 64
246
+ # %% ../nbs/01_refine.ipynb 65
236
247
  async def add_img_descs(
237
248
  src:str, # Path to source markdown directory
238
249
  dst:str=None, # Destination directory (defaults to src if None)
@@ -247,6 +258,12 @@ async def add_img_descs(
247
258
  src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
248
259
  if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
249
260
  src_imgs = src_path/img_folder
261
+
262
+ # Check if image folder exists
263
+ if not src_imgs.exists():
264
+ if progress: print(f"No images to describe in the document (no '{img_folder}' folder found)")
265
+ return
266
+
250
267
  if src_imgs.exists() and dst_path != src_path: shutil.copytree(src_imgs, dst_path/img_folder, dirs_exist_ok=True)
251
268
  desc_file = src_path/'img_descriptions.json'
252
269
  if desc_file.exists() and not force:
@@ -263,3 +280,4 @@ async def add_img_descs(
263
280
  enriched = [add_descs_to_pg(pg, descs) for pg in pgs]
264
281
  for i,pg in enumerate(enriched, 1): (dst_path/f'page_{i}.md').write_text(pg)
265
282
  if progress: print(f"Done! Enriched pages saved to {dst_path}")
283
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.2.5
3
+ Version: 0.2.10
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -0,0 +1,11 @@
1
+ mistocr/__init__.py,sha256=waXgc7p-jgGCsUjdVfO_KjlVZblnCvrzf4A0dsBj_lg,23
2
+ mistocr/_modidx.py,sha256=WTS9JpZdbrp2LghjhOV-CK0JYChHE4PzttgKfh7pTy4,4028
3
+ mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
4
+ mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
5
+ mistocr/refine.py,sha256=H_IAF02k6CwBQdDJm9txknzUcTlz245zXitaHELX-P4,12791
6
+ mistocr-0.2.10.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
+ mistocr-0.2.10.dist-info/METADATA,sha256=mkMu_9nYAXZ5jFdJd01AZqK3t93_Rt0xkkD0rRnl9Ew,8417
8
+ mistocr-0.2.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
+ mistocr-0.2.10.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
+ mistocr-0.2.10.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
+ mistocr-0.2.10.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- mistocr/__init__.py,sha256=Xsa3ayOMVkhUWm4t06YeyHE0apjpZefxLH4ylp0CDtU,22
2
- mistocr/_modidx.py,sha256=JtXJb6ErVwyGFCWgeVz7N9JvybBp8Zyao4Ahn3smr90,4030
3
- mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
4
- mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
5
- mistocr/refine.py,sha256=arJPOg1eP4MwtkD1zwnYY4EFrBfTTSP-mtR4AVnTiR8,11788
6
- mistocr-0.2.5.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
- mistocr-0.2.5.dist-info/METADATA,sha256=uGim0pZ4V3-oolsihRFr4aOWh3ZDOO7u3d8Mn0n-gmc,8416
8
- mistocr-0.2.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
- mistocr-0.2.5.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
- mistocr-0.2.5.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
- mistocr-0.2.5.dist-info/RECORD,,