mistocr 0.2.8__tar.gz → 0.2.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.2.8
3
+ Version: 0.2.10
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -0,0 +1 @@
1
+ __version__ = "0.2.10"
@@ -22,7 +22,6 @@ d = { 'settings': { 'branch': 'main',
22
22
  'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
23
23
  'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
24
24
  'mistocr.refine': { 'mistocr.refine.HeadingCorrection': ('refine.html#headingcorrection', 'mistocr/refine.py'),
25
- 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
26
25
  'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
27
26
  'mistocr.refine.add_descs_to_pg': ('refine.html#add_descs_to_pg', 'mistocr/refine.py'),
28
27
  'mistocr.refine.add_descs_to_pgs': ('refine.html#add_descs_to_pgs', 'mistocr/refine.py'),
@@ -3,8 +3,8 @@
3
3
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
4
4
 
5
5
  # %% auto 0
6
- __all__ = ['describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx', 'HeadingCorrection',
7
- 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
6
+ __all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
7
+ 'HeadingCorrection', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
8
8
  'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs', 'add_descs_to_pg', 'add_descs_to_pgs',
9
9
  'add_img_descs']
10
10
 
@@ -60,13 +60,51 @@ def fmt_hdgs_idx(
60
60
 
61
61
  # %% ../nbs/01_refine.ipynb 18
62
62
  class HeadingCorrection(BaseModel):
63
+ "A single heading correction mapping an index to its corrected markdown heading"
63
64
  index: int
64
65
  corrected: str
65
66
 
66
- class HeadingCorrections(BaseModel):
67
- corrections: list[HeadingCorrection]
67
+ # %% ../nbs/01_refine.ipynb 21
68
+ prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
68
69
 
69
- # %% ../nbs/01_refine.ipynb 22
70
+ INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
71
+
72
+ ANALYSIS STEPS (think through these before outputting corrections):
73
+ 1. For each numbered heading (e.g., "4.1", "2.a", "A.1"), identify its parent heading (e.g., "4", "2", "A")
74
+ 2. Verify the child heading is exactly one # deeper than its parent
75
+ 3. If not, mark it for correction
76
+
77
+ RULES - Apply these fixes in order:
78
+
79
+ 1. **Single H1 rule**: Documents must have exactly ONE # heading (typically the document title at the top)
80
+ - If index 0 is already #, then all subsequent headings (index 1+) must be ## or deeper
81
+ - If no H1 exists, the first major heading should be #, and all others ## or deeper
82
+ - NO exceptions: appendices, references, and all sections are ## or deeper after the title
83
+
84
+ 2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
85
+ - Parent section (e.g., "1", "2", "A") MUST be shallower than child (e.g., "1.1", "2.a", "A.1")
86
+ - Child section MUST be exactly one # deeper than parent
87
+ - Works with any numbering: "1/1.1/1.1.1", "A/A.1/A.1.a", "I/I.A/I.A.1", etc.
88
+
89
+ 3. **Level jumps**: Headings can only increase by one # at a time when moving deeper
90
+ - Wrong: ## Section → ##### Subsection
91
+ - Fixed: ## Section → ### Subsection
92
+
93
+ 4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
94
+
95
+ 5. **Unnumbered headings in numbered documents**: If the document uses numbered headings consistently, any unnumbered heading appearing within that structure is likely misclassified bold text and should be converted to regular text (output the heading text without any # symbols in the corrected field)
96
+
97
+ OUTPUT: Return a list of corrections, where each correction has:
98
+ - index: the heading's index number
99
+ - corrected: the fixed heading text (without the index prefix), or empty string "" to remove the heading entirely
100
+ IMPORTANT: Preserve the " ... page N" suffix in all corrected headings.
101
+ Only include headings that need changes.
102
+
103
+ Headings to analyze:
104
+ {headings_list}
105
+ """
106
+
107
+ # %% ../nbs/01_refine.ipynb 23
70
108
  def fix_hdg_hierarchy(
71
109
  hdgs: list[str], # List of markdown headings
72
110
  prompt: str=None, # Prompt to use
@@ -82,7 +120,7 @@ def fix_hdg_hierarchy(
82
120
  return {o['index']: o['corrected'] for o in fixes}
83
121
 
84
122
 
85
- # %% ../nbs/01_refine.ipynb 25
123
+ # %% ../nbs/01_refine.ipynb 26
86
124
  @delegates(fix_hdg_hierarchy)
87
125
  def mk_fixes_lut(
88
126
  hdgs: list[str], # List of markdown headings
@@ -95,7 +133,7 @@ def mk_fixes_lut(
95
133
  fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
96
134
  return {hdgs[k]:v for k,v in fixes.items()}
97
135
 
98
- # %% ../nbs/01_refine.ipynb 28
136
+ # %% ../nbs/01_refine.ipynb 29
99
137
  def apply_hdg_fixes(
100
138
  p:str, # Page to fix
101
139
  lut_fixes: dict[str, str], # Lookup table of fixes
@@ -104,7 +142,7 @@ def apply_hdg_fixes(
104
142
  for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
105
143
  return p
106
144
 
107
- # %% ../nbs/01_refine.ipynb 31
145
+ # %% ../nbs/01_refine.ipynb 32
108
146
  @delegates(mk_fixes_lut)
109
147
  def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:str='img', **kwargs):
110
148
  "Fix heading hierarchy in markdown document"
@@ -116,13 +154,13 @@ def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:st
116
154
  lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model, **kwargs)
117
155
  for i,p in enumerate(pgs_with_pg, 1): (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))
118
156
 
119
- # %% ../nbs/01_refine.ipynb 37
157
+ # %% ../nbs/01_refine.ipynb 38
120
158
  class ImgDescription(BaseModel):
121
159
  "Image classification and description for OCR'd documents"
122
160
  is_informative:bool # Whether image contains informative content (charts, diagrams, tables) vs decorative (logos, backgrounds)
123
161
  description:str # Detailed description of the image content for RAG and accessibility
124
162
 
125
- # %% ../nbs/01_refine.ipynb 40
163
+ # %% ../nbs/01_refine.ipynb 41
126
164
  describe_img_prompt = """Analyze this image from an academic/technical document.
127
165
 
128
166
  Step 1: Determine if this image is informative for understanding the document content.
@@ -135,7 +173,7 @@ Step 2:
135
173
 
136
174
  Return your response as JSON with 'is_informative' (boolean) and 'description' (string) fields."""
137
175
 
138
- # %% ../nbs/01_refine.ipynb 41
176
+ # %% ../nbs/01_refine.ipynb 42
139
177
  async def describe_img(
140
178
  img_path: Path, # Path to the image file
141
179
  model: str = 'claude-sonnet-4-5', # Model to use
@@ -146,7 +184,7 @@ async def describe_img(
146
184
  r = await chat([img_path.read_bytes(), prompt], response_format=ImgDescription)
147
185
  return r
148
186
 
149
- # %% ../nbs/01_refine.ipynb 45
187
+ # %% ../nbs/01_refine.ipynb 46
150
188
  async def limit(
151
189
  semaphore, # Semaphore for concurrency control
152
190
  coro, # Coroutine to execute
@@ -158,14 +196,14 @@ async def limit(
158
196
  if delay: await sleep(delay)
159
197
  return r
160
198
 
161
- # %% ../nbs/01_refine.ipynb 47
199
+ # %% ../nbs/01_refine.ipynb 48
162
200
  def parse_r(
163
201
  result # ModelResponse object from API call
164
202
  ): # Dictionary with 'is_informative' and 'description' keys
165
203
  "Extract and parse JSON content from model response"
166
204
  return json.loads(result.choices[0].message.content)
167
205
 
168
- # %% ../nbs/01_refine.ipynb 49
206
+ # %% ../nbs/01_refine.ipynb 50
169
207
  async def describe_imgs(
170
208
  imgs: list[Path], # List of image file paths to describe
171
209
  model: str = 'claude-sonnet-4-5', # Model to use for image description
@@ -178,7 +216,7 @@ async def describe_imgs(
178
216
  results = await gather(*[limit(sem, describe_img(img, model, prompt), delay) for img in imgs])
179
217
  return {img.name: parse_r(r) for img, r in zip(imgs, results)}
180
218
 
181
- # %% ../nbs/01_refine.ipynb 51
219
+ # %% ../nbs/01_refine.ipynb 52
182
220
  def save_img_descs(
183
221
  descs: dict, # Dictionary of image descriptions
184
222
  dst_fname: Path, # Path to save the JSON file
@@ -186,7 +224,7 @@ def save_img_descs(
186
224
  "Save image descriptions to JSON file"
187
225
  Path(dst_fname).write_text(json.dumps(descs, indent=2))
188
226
 
189
- # %% ../nbs/01_refine.ipynb 56
227
+ # %% ../nbs/01_refine.ipynb 57
190
228
  def add_descs_to_pg(
191
229
  pg:str, # Page markdown content
192
230
  descs:dict # Dictionary mapping image filenames to their descriptions
@@ -197,7 +235,7 @@ def add_descs_to_pg(
197
235
  if fname in descs: pg = pg.replace(link, f"{link}\nAI-generated image description:\n___\n{descs[fname]['description']}\n___")
198
236
  return pg
199
237
 
200
- # %% ../nbs/01_refine.ipynb 61
238
+ # %% ../nbs/01_refine.ipynb 62
201
239
  def add_descs_to_pgs(
202
240
  pgs:list, # List of page markdown strings
203
241
  descs:dict # Dictionary mapping image filenames to their descriptions
@@ -205,7 +243,7 @@ def add_descs_to_pgs(
205
243
  "Add AI-generated descriptions to images in all pages"
206
244
  return [add_descs_to_pg(pg, descs) for pg in pgs]
207
245
 
208
- # %% ../nbs/01_refine.ipynb 64
246
+ # %% ../nbs/01_refine.ipynb 65
209
247
  async def add_img_descs(
210
248
  src:str, # Path to source markdown directory
211
249
  dst:str=None, # Destination directory (defaults to src if None)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.2.8
3
+ Version: 0.2.10
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -1,7 +1,7 @@
1
1
  [DEFAULT]
2
2
  repo = mistocr
3
3
  lib_name = mistocr
4
- version = 0.2.8
4
+ version = 0.2.10
5
5
  min_python = 3.9
6
6
  license = apache2
7
7
  black_formatting = False
@@ -1 +0,0 @@
1
- __version__ = "0.2.8"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes