mistocr 0.2.9__py3-none-any.whl → 0.2.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mistocr/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.2.9"
1
+ __version__ = "0.2.11"
mistocr/_modidx.py CHANGED
@@ -22,7 +22,6 @@ d = { 'settings': { 'branch': 'main',
22
22
  'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
23
23
  'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
24
24
  'mistocr.refine': { 'mistocr.refine.HeadingCorrection': ('refine.html#headingcorrection', 'mistocr/refine.py'),
25
- 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
26
25
  'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
27
26
  'mistocr.refine.add_descs_to_pg': ('refine.html#add_descs_to_pg', 'mistocr/refine.py'),
28
27
  'mistocr.refine.add_descs_to_pgs': ('refine.html#add_descs_to_pgs', 'mistocr/refine.py'),
mistocr/core.py CHANGED
@@ -79,11 +79,17 @@ def submit_batch(
79
79
  def wait_for_job(
80
80
  job:dict, # Job dict,
81
81
  c:Mistral=None, # Mistral client,
82
- poll_interval:int=1 # Poll interval in seconds
82
+ poll_interval:int=1, # Poll interval in seconds
83
+ queued_timeout:int=300 # Timeout for QUEUED status in seconds
83
84
  ) -> dict: # Job dict (with status)
84
85
  "Poll job until completion and return final job status"
86
+ queued_time = 0
85
87
  while job.status in ["QUEUED", "RUNNING"]:
86
88
  print(f'Mistral batch job status: {job.status}')
89
+ if job.status == "QUEUED":
90
+ queued_time += poll_interval
91
+ if queued_time >= queued_timeout:
92
+ raise TimeoutError(f"Job stayed in QUEUED status for {queued_time}s, exceeding timeout of {queued_timeout}s. Check your balance or Mistral Status.")
87
93
  time.sleep(poll_interval)
88
94
  job = c.batch.jobs.get(job_id=job.id)
89
95
  return job
mistocr/refine.py CHANGED
@@ -4,9 +4,9 @@
4
4
 
5
5
  # %% auto 0
6
6
  __all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
7
- 'HeadingCorrection', 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes',
8
- 'fix_hdgs', 'ImgDescription', 'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs',
9
- 'add_descs_to_pg', 'add_descs_to_pgs', 'add_img_descs']
7
+ 'HeadingCorrection', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
8
+ 'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs', 'add_descs_to_pg', 'add_descs_to_pgs',
9
+ 'add_img_descs']
10
10
 
11
11
  # %% ../nbs/01_refine.ipynb 3
12
12
  from fastcore.all import *
@@ -60,13 +60,11 @@ def fmt_hdgs_idx(
60
60
 
61
61
  # %% ../nbs/01_refine.ipynb 18
62
62
  class HeadingCorrection(BaseModel):
63
+ "A single heading correction mapping an index to its corrected markdown heading"
63
64
  index: int
64
65
  corrected: str
65
66
 
66
- class HeadingCorrections(BaseModel):
67
- corrections: list[HeadingCorrection]
68
-
69
- # %% ../nbs/01_refine.ipynb 20
67
+ # %% ../nbs/01_refine.ipynb 21
70
68
  prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
71
69
 
72
70
  INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
@@ -106,7 +104,7 @@ Headings to analyze:
106
104
  {headings_list}
107
105
  """
108
106
 
109
- # %% ../nbs/01_refine.ipynb 22
107
+ # %% ../nbs/01_refine.ipynb 23
110
108
  def fix_hdg_hierarchy(
111
109
  hdgs: list[str], # List of markdown headings
112
110
  prompt: str=None, # Prompt to use
@@ -122,7 +120,7 @@ def fix_hdg_hierarchy(
122
120
  return {o['index']: o['corrected'] for o in fixes}
123
121
 
124
122
 
125
- # %% ../nbs/01_refine.ipynb 25
123
+ # %% ../nbs/01_refine.ipynb 26
126
124
  @delegates(fix_hdg_hierarchy)
127
125
  def mk_fixes_lut(
128
126
  hdgs: list[str], # List of markdown headings
@@ -135,7 +133,7 @@ def mk_fixes_lut(
135
133
  fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
136
134
  return {hdgs[k]:v for k,v in fixes.items()}
137
135
 
138
- # %% ../nbs/01_refine.ipynb 28
136
+ # %% ../nbs/01_refine.ipynb 29
139
137
  def apply_hdg_fixes(
140
138
  p:str, # Page to fix
141
139
  lut_fixes: dict[str, str], # Lookup table of fixes
@@ -144,7 +142,7 @@ def apply_hdg_fixes(
144
142
  for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
145
143
  return p
146
144
 
147
- # %% ../nbs/01_refine.ipynb 31
145
+ # %% ../nbs/01_refine.ipynb 32
148
146
  @delegates(mk_fixes_lut)
149
147
  def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:str='img', **kwargs):
150
148
  "Fix heading hierarchy in markdown document"
@@ -156,13 +154,13 @@ def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:st
156
154
  lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model, **kwargs)
157
155
  for i,p in enumerate(pgs_with_pg, 1): (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))
158
156
 
159
- # %% ../nbs/01_refine.ipynb 37
157
+ # %% ../nbs/01_refine.ipynb 38
160
158
  class ImgDescription(BaseModel):
161
159
  "Image classification and description for OCR'd documents"
162
160
  is_informative:bool # Whether image contains informative content (charts, diagrams, tables) vs decorative (logos, backgrounds)
163
161
  description:str # Detailed description of the image content for RAG and accessibility
164
162
 
165
- # %% ../nbs/01_refine.ipynb 40
163
+ # %% ../nbs/01_refine.ipynb 41
166
164
  describe_img_prompt = """Analyze this image from an academic/technical document.
167
165
 
168
166
  Step 1: Determine if this image is informative for understanding the document content.
@@ -175,7 +173,7 @@ Step 2:
175
173
 
176
174
  Return your response as JSON with 'is_informative' (boolean) and 'description' (string) fields."""
177
175
 
178
- # %% ../nbs/01_refine.ipynb 41
176
+ # %% ../nbs/01_refine.ipynb 42
179
177
  async def describe_img(
180
178
  img_path: Path, # Path to the image file
181
179
  model: str = 'claude-sonnet-4-5', # Model to use
@@ -186,7 +184,7 @@ async def describe_img(
186
184
  r = await chat([img_path.read_bytes(), prompt], response_format=ImgDescription)
187
185
  return r
188
186
 
189
- # %% ../nbs/01_refine.ipynb 45
187
+ # %% ../nbs/01_refine.ipynb 46
190
188
  async def limit(
191
189
  semaphore, # Semaphore for concurrency control
192
190
  coro, # Coroutine to execute
@@ -198,14 +196,14 @@ async def limit(
198
196
  if delay: await sleep(delay)
199
197
  return r
200
198
 
201
- # %% ../nbs/01_refine.ipynb 47
199
+ # %% ../nbs/01_refine.ipynb 48
202
200
  def parse_r(
203
201
  result # ModelResponse object from API call
204
202
  ): # Dictionary with 'is_informative' and 'description' keys
205
203
  "Extract and parse JSON content from model response"
206
204
  return json.loads(result.choices[0].message.content)
207
205
 
208
- # %% ../nbs/01_refine.ipynb 49
206
+ # %% ../nbs/01_refine.ipynb 50
209
207
  async def describe_imgs(
210
208
  imgs: list[Path], # List of image file paths to describe
211
209
  model: str = 'claude-sonnet-4-5', # Model to use for image description
@@ -218,7 +216,7 @@ async def describe_imgs(
218
216
  results = await gather(*[limit(sem, describe_img(img, model, prompt), delay) for img in imgs])
219
217
  return {img.name: parse_r(r) for img, r in zip(imgs, results)}
220
218
 
221
- # %% ../nbs/01_refine.ipynb 51
219
+ # %% ../nbs/01_refine.ipynb 52
222
220
  def save_img_descs(
223
221
  descs: dict, # Dictionary of image descriptions
224
222
  dst_fname: Path, # Path to save the JSON file
@@ -226,7 +224,7 @@ def save_img_descs(
226
224
  "Save image descriptions to JSON file"
227
225
  Path(dst_fname).write_text(json.dumps(descs, indent=2))
228
226
 
229
- # %% ../nbs/01_refine.ipynb 56
227
+ # %% ../nbs/01_refine.ipynb 57
230
228
  def add_descs_to_pg(
231
229
  pg:str, # Page markdown content
232
230
  descs:dict # Dictionary mapping image filenames to their descriptions
@@ -237,7 +235,7 @@ def add_descs_to_pg(
237
235
  if fname in descs: pg = pg.replace(link, f"{link}\nAI-generated image description:\n___\n{descs[fname]['description']}\n___")
238
236
  return pg
239
237
 
240
- # %% ../nbs/01_refine.ipynb 61
238
+ # %% ../nbs/01_refine.ipynb 62
241
239
  def add_descs_to_pgs(
242
240
  pgs:list, # List of page markdown strings
243
241
  descs:dict # Dictionary mapping image filenames to their descriptions
@@ -245,7 +243,7 @@ def add_descs_to_pgs(
245
243
  "Add AI-generated descriptions to images in all pages"
246
244
  return [add_descs_to_pg(pg, descs) for pg in pgs]
247
245
 
248
- # %% ../nbs/01_refine.ipynb 64
246
+ # %% ../nbs/01_refine.ipynb 65
249
247
  async def add_img_descs(
250
248
  src:str, # Path to source markdown directory
251
249
  dst:str=None, # Destination directory (defaults to src if None)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.2.9
3
+ Version: 0.2.11
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -0,0 +1,11 @@
1
+ mistocr/__init__.py,sha256=_MLx4ac1juJPWEEiC9kMQISX3x3jFBr507jM2P_hxMg,23
2
+ mistocr/_modidx.py,sha256=WTS9JpZdbrp2LghjhOV-CK0JYChHE4PzttgKfh7pTy4,4028
3
+ mistocr/core.py,sha256=ohh2ru05gUKbIQCRHPMz_hw4ui39FtpoV3_S3n4bl_c,7592
4
+ mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
5
+ mistocr/refine.py,sha256=H_IAF02k6CwBQdDJm9txknzUcTlz245zXitaHELX-P4,12791
6
+ mistocr-0.2.11.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
+ mistocr-0.2.11.dist-info/METADATA,sha256=CZ9TO24sY0OT9B19KGPIuy3MF9T7B1G9qHaVBoEMRIk,8417
8
+ mistocr-0.2.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
+ mistocr-0.2.11.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
+ mistocr-0.2.11.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
+ mistocr-0.2.11.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- mistocr/__init__.py,sha256=F8OVhAhMXSkvvXYgZtbPn2SG1AQC3joK4yu-FrHt81Y,22
2
- mistocr/_modidx.py,sha256=55k_EmMPrRmLa04qCE-_8lukJPCax66q5MALfpa8T4A,4154
3
- mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
4
- mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
5
- mistocr/refine.py,sha256=wtfS_bHlD39R8T2RbITgNX8cDCIPXI9gRrJ4y9nI_rM,12807
6
- mistocr-0.2.9.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
- mistocr-0.2.9.dist-info/METADATA,sha256=n9pFeWhh-Vzd7KR7s6s3R0mdJ3xvpexujEH-0iQsRQY,8416
8
- mistocr-0.2.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
- mistocr-0.2.9.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
- mistocr-0.2.9.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
- mistocr-0.2.9.dist-info/RECORD,,