mistocr 0.3.2__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mistocr/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.3.2"
1
+ __version__ = "0.4.1"
mistocr/_modidx.py CHANGED
@@ -5,7 +5,8 @@ d = { 'settings': { 'branch': 'main',
5
5
  'doc_host': 'https://franckalbinet.github.io',
6
6
  'git_url': 'https://github.com/franckalbinet/mistocr',
7
7
  'lib_path': 'mistocr'},
8
- 'syms': { 'mistocr.core': { 'mistocr.core._get_paths': ('core.html#_get_paths', 'mistocr/core.py'),
8
+ 'syms': { 'mistocr.core': { 'mistocr.core._check_timeout': ('core.html#_check_timeout', 'mistocr/core.py'),
9
+ 'mistocr.core._get_paths': ('core.html#_get_paths', 'mistocr/core.py'),
9
10
  'mistocr.core._prep_batch': ('core.html#_prep_batch', 'mistocr/core.py'),
10
11
  'mistocr.core._run_batch': ('core.html#_run_batch', 'mistocr/core.py'),
11
12
  'mistocr.core.create_batch_entry': ('core.html#create_batch_entry', 'mistocr/core.py'),
mistocr/core.py CHANGED
@@ -3,9 +3,9 @@
3
3
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb.
4
4
 
5
5
  # %% auto 0
6
- __all__ = ['ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch', 'submit_batch',
7
- 'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf', 'read_pgs',
8
- 'subset_pdf']
6
+ __all__ = ['logger', 'ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch',
7
+ 'submit_batch', 'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf',
8
+ 'read_pgs', 'subset_pdf']
9
9
 
10
10
  # %% ../nbs/00_core.ipynb 3
11
11
  from fastcore.all import *
@@ -15,8 +15,14 @@ from pathlib import Path
15
15
  from PIL import Image
16
16
  from mistralai import Mistral
17
17
  import PyPDF2
18
+ import logging
18
19
 
19
- # %% ../nbs/00_core.ipynb 6
20
+ # %% ../nbs/00_core.ipynb 4
21
+ logger = logging.getLogger(__name__)
22
+ logging.basicConfig(level=logging.WARNING, format='%(name)s - %(levelname)s - %(message)s')
23
+ logger.setLevel(logging.DEBUG)
24
+
25
+ # %% ../nbs/00_core.ipynb 7
20
26
  def get_api_key(
21
27
  key:str=None # Mistral API key
22
28
  ):
@@ -25,11 +31,11 @@ def get_api_key(
25
31
  if not key: raise ValueError("MISTRAL_API_KEY not found")
26
32
  return key
27
33
 
28
- # %% ../nbs/00_core.ipynb 7
34
+ # %% ../nbs/00_core.ipynb 8
29
35
  ocr_model = "mistral-ocr-latest"
30
36
  ocr_endpoint = "/v1/ocr"
31
37
 
32
- # %% ../nbs/00_core.ipynb 10
38
+ # %% ../nbs/00_core.ipynb 11
33
39
  def upload_pdf(
34
40
  path:str, # Path to PDF file
35
41
  key:str=None # Mistral API key
@@ -40,7 +46,7 @@ def upload_pdf(
40
46
  uploaded = c.files.upload(file=dict(file_name=path.stem, content=path.read_bytes()), purpose="ocr")
41
47
  return c.files.get_signed_url(file_id=uploaded.id).url, c
42
48
 
43
- # %% ../nbs/00_core.ipynb 15
49
+ # %% ../nbs/00_core.ipynb 16
44
50
  def create_batch_entry(
45
51
  path:str, # Path to PDF file,
46
52
  url:str, # Mistral signed URL
@@ -52,7 +58,7 @@ def create_batch_entry(
52
58
  if not cid: cid = path.stem
53
59
  return dict(custom_id=cid, body=dict(document=dict(type="document_url", document_url=url), include_image_base64=inc_img))
54
60
 
55
- # %% ../nbs/00_core.ipynb 17
61
+ # %% ../nbs/00_core.ipynb 18
56
62
  def prep_pdf_batch(
57
63
  path:str, # Path to PDF file,
58
64
  cid:str=None, # Custom ID (by default using the file name without extension)
@@ -63,7 +69,7 @@ def prep_pdf_batch(
63
69
  url, c = upload_pdf(path, key)
64
70
  return create_batch_entry(path, url, cid, inc_img), c
65
71
 
66
- # %% ../nbs/00_core.ipynb 21
72
+ # %% ../nbs/00_core.ipynb 22
67
73
  def submit_batch(
68
74
  entries:list[dict], # List of batch entries,
69
75
  c:Mistral=None, # Mistral client,
@@ -77,26 +83,35 @@ def submit_batch(
77
83
  batch_data = c.files.upload(file=dict(file_name="batch.jsonl", content=open(f.name, "rb")), purpose="batch")
78
84
  return c.batch.jobs.create(input_files=[batch_data.id], model=model, endpoint=endpoint)
79
85
 
80
- # %% ../nbs/00_core.ipynb 24
86
+ # %% ../nbs/00_core.ipynb 25
87
+ def _check_timeout(
88
+ queued_time:int, # Time spent in QUEUED state (seconds)
89
+ timeout:int, # Maximum allowed QUEUED time (seconds)
90
+ job_id:str # Batch job ID
91
+ ):
92
+ "Raise TimeoutError if job has been queued longer than timeout"
93
+ if queued_time >= timeout: raise TimeoutError(f"Job {job_id} stayed in QUEUED for {queued_time}s, exceeding timeout of {timeout}s. Check your balance or Mistral Status.")
94
+
95
+ # %% ../nbs/00_core.ipynb 26
81
96
  def wait_for_job(
82
- job:dict, # Job dict,
83
- c:Mistral=None, # Mistral client,
84
- poll_interval:int=1, # Poll interval in seconds
85
- queued_timeout:int=300 # Timeout for QUEUED status in seconds
86
- ) -> dict: # Job dict (with status)
97
+ job:dict, # Batch job from submit_batch
98
+ c:Mistral=None, # Mistral client
99
+ poll_interval:int=1, # Seconds between status checks
100
+ queued_timeout:int=300 # Max seconds in QUEUED before timeout
101
+ ) -> dict: # Completed job dict
87
102
  "Poll job until completion and return final job status"
103
+ logger.info(f"Waiting for batch job {job.id} (initial status: {job.status})")
88
104
  queued_time = 0
89
105
  while job.status in ["QUEUED", "RUNNING"]:
90
- print(f'Mistral batch job status: {job.status}')
91
- if job.status == "QUEUED":
92
- queued_time += poll_interval
93
- if queued_time >= queued_timeout:
94
- raise TimeoutError(f"Job stayed in QUEUED status for {queued_time}s, exceeding timeout of {queued_timeout}s. Check your balance or Mistral Status.")
106
+ logger.debug(f"Job {job.id} status: {job.status} (elapsed: {queued_time}s)")
107
+ if job.status == "QUEUED": queued_time += poll_interval; _check_timeout(queued_time, queued_timeout, job.id)
95
108
  time.sleep(poll_interval)
96
109
  job = c.batch.jobs.get(job_id=job.id)
110
+ logger.info(f"Job {job.id} completed with status: {job.status}")
111
+ if job.status != "SUCCESS": logger.warning(f"Job {job.id} finished with non-success status: {job.status}")
97
112
  return job
98
113
 
99
- # %% ../nbs/00_core.ipynb 26
114
+ # %% ../nbs/00_core.ipynb 28
100
115
  def download_results(
101
116
  job:dict, # Job dict,
102
117
  c:Mistral=None # Mistral client
@@ -105,7 +120,7 @@ def download_results(
105
120
  content = c.files.download(file_id=job.output_file).read().decode('utf-8')
106
121
  return [json.loads(line) for line in content.strip().split('\n') if line]
107
122
 
108
- # %% ../nbs/00_core.ipynb 31
123
+ # %% ../nbs/00_core.ipynb 33
109
124
  def save_images(
110
125
  page:dict, # Page dict,
111
126
  img_dir:str='img' # Directory to save images
@@ -116,7 +131,7 @@ def save_images(
116
131
  img_bytes = base64.b64decode(img['image_base64'].split(',')[1])
117
132
  Image.open(BytesIO(img_bytes)).save(img_dir / img['id'])
118
133
 
119
- # %% ../nbs/00_core.ipynb 32
134
+ # %% ../nbs/00_core.ipynb 34
120
135
  def save_page(
121
136
  page:dict, # Page dict,
122
137
  dst:str, # Directory to save page
@@ -128,7 +143,7 @@ def save_page(
128
143
  img_dir.mkdir(exist_ok=True)
129
144
  save_images(page, img_dir)
130
145
 
131
- # %% ../nbs/00_core.ipynb 34
146
+ # %% ../nbs/00_core.ipynb 36
132
147
  def save_pages(
133
148
  ocr_resp:dict, # OCR response,
134
149
  dst:str, # Directory to save pages,
@@ -141,7 +156,7 @@ def save_pages(
141
156
  for page in ocr_resp['pages']: save_page(page, dst, img_dir)
142
157
  return dst
143
158
 
144
- # %% ../nbs/00_core.ipynb 40
159
+ # %% ../nbs/00_core.ipynb 42
145
160
  def _get_paths(path:str) -> list[Path]:
146
161
  "Get list of PDFs from file or folder"
147
162
  path = Path(path)
@@ -152,7 +167,7 @@ def _get_paths(path:str) -> list[Path]:
152
167
  return pdfs
153
168
  raise ValueError(f"Path not found: {path}")
154
169
 
155
- # %% ../nbs/00_core.ipynb 41
170
+ # %% ../nbs/00_core.ipynb 43
156
171
  def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[dict], Mistral]:
157
172
  "Prepare batch entries for list of PDFs"
158
173
  entries, c = [], None
@@ -161,7 +176,7 @@ def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[
161
176
  entries.append(entry)
162
177
  return entries, c
163
178
 
164
- # %% ../nbs/00_core.ipynb 42
179
+ # %% ../nbs/00_core.ipynb 44
165
180
  def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]:
166
181
  "Submit batch, wait for completion, and download results"
167
182
  job = submit_batch(entries, c)
@@ -169,7 +184,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
169
184
  if job.status != 'SUCCESS': raise Exception(f"Job failed with status: {job.status}")
170
185
  return download_results(job, c)
171
186
 
172
- # %% ../nbs/00_core.ipynb 43
187
+ # %% ../nbs/00_core.ipynb 45
173
188
  def ocr_pdf(
174
189
  path:str, # Path to PDF file or folder,
175
190
  dst:str='md', # Directory to save markdown pages,
@@ -183,7 +198,7 @@ def ocr_pdf(
183
198
  results = _run_batch(entries, c, poll_interval)
184
199
  return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
185
200
 
186
- # %% ../nbs/00_core.ipynb 50
201
+ # %% ../nbs/00_core.ipynb 52
187
202
  def read_pgs(
188
203
  path:str, # OCR output directory,
189
204
  join:bool=True # Join pages into single string
@@ -194,7 +209,7 @@ def read_pgs(
194
209
  contents = L([p.read_text() for p in pgs])
195
210
  return '\n\n'.join(contents) if join else contents
196
211
 
197
- # %% ../nbs/00_core.ipynb 57
212
+ # %% ../nbs/00_core.ipynb 59
198
213
  def subset_pdf(
199
214
  path:str, # Path to PDF file
200
215
  start:int=1, # Start page (1-based)
mistocr/pipeline.py CHANGED
@@ -3,7 +3,7 @@
3
3
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
4
4
 
5
5
  # %% auto 0
6
- __all__ = ['pdf_to_md']
6
+ __all__ = ['logger', 'pdf_to_md']
7
7
 
8
8
  # %% ../nbs/02_pipeline.ipynb 3
9
9
  from fastcore.all import *
@@ -13,8 +13,14 @@ from pathlib import Path
13
13
  from asyncio import Semaphore, gather, sleep
14
14
  import tempfile
15
15
  import os, json, shutil
16
+ import logging
16
17
 
17
18
  # %% ../nbs/02_pipeline.ipynb 4
19
+ logger = logging.getLogger(__name__)
20
+ logging.basicConfig(level=logging.WARNING, format='%(name)s - %(levelname)s - %(message)s')
21
+ logger.setLevel(logging.INFO)
22
+
23
+ # %% ../nbs/02_pipeline.ipynb 5
18
24
  @delegates(add_img_descs)
19
25
  async def pdf_to_md(
20
26
  pdf_path:str, # Path to input PDF file
@@ -26,16 +32,17 @@ async def pdf_to_md(
26
32
  **kwargs
27
33
  ):
28
34
  "Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
35
+ "Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
29
36
  cleanup = ocr_dst is None
30
37
  if cleanup: ocr_dst = tempfile.mkdtemp()
31
38
  n_steps = 3 if add_img_desc else 2
32
- if progress: print(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
39
+ if progress: logger.info(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
33
40
  ocr_dir = ocr_pdf(pdf_path, ocr_dst)[0]
34
- if progress: print(f"Step 2/{n_steps}: Fixing heading hierarchy...")
41
+ if progress: logger.info(f"Step 2/{n_steps}: Fixing heading hierarchy...")
35
42
  fix_hdgs(ocr_dir, model=model)
36
43
  if add_img_desc:
37
- if progress: print(f"Step 3/{n_steps}: Adding image descriptions...")
44
+ if progress: logger.info(f"Step 3/{n_steps}: Adding image descriptions...")
38
45
  await add_img_descs(ocr_dir, dst=dst, model=model, progress=progress, **kwargs)
39
46
  elif dst != str(ocr_dir): shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
40
47
  if cleanup: shutil.rmtree(ocr_dst)
41
- if progress: print("Done!")
48
+ if progress: logger.info("Done!")
mistocr/refine.py CHANGED
@@ -3,7 +3,7 @@
3
3
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
4
4
 
5
5
  # %% auto 0
6
- __all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
6
+ __all__ = ['logger', 'prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
7
7
  'HeadingCorrection', 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes',
8
8
  'fix_hdgs', 'ImgDescription', 'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs',
9
9
  'add_descs_to_pg', 'add_descs_to_pgs', 'add_img_descs']
@@ -20,8 +20,14 @@ import os
20
20
  import json
21
21
  import shutil
22
22
  from asyncio import Semaphore, gather, sleep
23
+ import logging
23
24
 
24
- # %% ../nbs/01_refine.ipynb 7
25
+ # %% ../nbs/01_refine.ipynb 4
26
+ logger = logging.getLogger(__name__)
27
+ logging.basicConfig(level=logging.WARNING, format='%(name)s - %(levelname)s - %(message)s')
28
+ logger.setLevel(logging.INFO)
29
+
30
+ # %% ../nbs/01_refine.ipynb 8
25
31
  def get_hdgs(
26
32
  md:str # Markdown file string
27
33
  ) -> L: # L of strings
@@ -32,7 +38,7 @@ def get_hdgs(
32
38
 
33
39
 
34
40
 
35
- # %% ../nbs/01_refine.ipynb 8
41
+ # %% ../nbs/01_refine.ipynb 9
36
42
  def add_pg_hdgs(
37
43
  md:str, # Markdown file string,
38
44
  n:int # Page number
@@ -42,7 +48,7 @@ def add_pg_hdgs(
42
48
  def repl(m): return m.group(0) + f' ... page {n}'
43
49
  return sub(r'^#{1,6} .+$', repl, md, flags=MULTILINE)
44
50
 
45
- # %% ../nbs/01_refine.ipynb 12
51
+ # %% ../nbs/01_refine.ipynb 13
46
52
  def read_pgs_pg(
47
53
  path:str # Path to the markdown file
48
54
  ) -> L: # List of markdown pages
@@ -50,7 +56,7 @@ def read_pgs_pg(
50
56
  pgs = read_pgs(path, join=False)
51
57
  return L([add_pg_hdgs(pg, n) for n, pg in enumerate(pgs, 1)]).concat()
52
58
 
53
- # %% ../nbs/01_refine.ipynb 15
59
+ # %% ../nbs/01_refine.ipynb 16
54
60
  def fmt_hdgs_idx(
55
61
  hdgs: list[str] # List of markdown headings
56
62
  ) -> str: # Formatted string with index
@@ -58,18 +64,18 @@ def fmt_hdgs_idx(
58
64
  return '\n'.join(f"{i}. {h}" for i, h in enumerate(hdgs))
59
65
 
60
66
 
61
- # %% ../nbs/01_refine.ipynb 18
67
+ # %% ../nbs/01_refine.ipynb 19
62
68
  class HeadingCorrection(BaseModel):
63
69
  "A single heading correction mapping an index to its corrected markdown heading"
64
70
  index: int
65
71
  corrected: str
66
72
 
67
- # %% ../nbs/01_refine.ipynb 19
73
+ # %% ../nbs/01_refine.ipynb 20
68
74
  class HeadingCorrections(BaseModel):
69
75
  "Collection of heading corrections returned by the LLM"
70
76
  corrections: list[HeadingCorrection]
71
77
 
72
- # %% ../nbs/01_refine.ipynb 21
78
+ # %% ../nbs/01_refine.ipynb 22
73
79
  prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
74
80
 
75
81
  INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
@@ -109,7 +115,7 @@ Headings to analyze:
109
115
  {headings_list}
110
116
  """
111
117
 
112
- # %% ../nbs/01_refine.ipynb 23
118
+ # %% ../nbs/01_refine.ipynb 24
113
119
  def fix_hdg_hierarchy(
114
120
  hdgs: list[str], # List of markdown headings
115
121
  prompt: str=None, # Prompt to use
@@ -125,7 +131,7 @@ def fix_hdg_hierarchy(
125
131
  return {o['index']: o['corrected'] for o in fixes}
126
132
 
127
133
 
128
- # %% ../nbs/01_refine.ipynb 26
134
+ # %% ../nbs/01_refine.ipynb 27
129
135
  @delegates(fix_hdg_hierarchy)
130
136
  def mk_fixes_lut(
131
137
  hdgs: list[str], # List of markdown headings
@@ -138,7 +144,7 @@ def mk_fixes_lut(
138
144
  fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
139
145
  return {hdgs[k]:v for k,v in fixes.items()}
140
146
 
141
- # %% ../nbs/01_refine.ipynb 29
147
+ # %% ../nbs/01_refine.ipynb 30
142
148
  def apply_hdg_fixes(
143
149
  p:str, # Page to fix
144
150
  lut_fixes: dict[str, str], # Lookup table of fixes
@@ -147,7 +153,7 @@ def apply_hdg_fixes(
147
153
  for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
148
154
  return p
149
155
 
150
- # %% ../nbs/01_refine.ipynb 32
156
+ # %% ../nbs/01_refine.ipynb 33
151
157
  @delegates(mk_fixes_lut)
152
158
  def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:str='img', **kwargs):
153
159
  "Fix heading hierarchy in markdown document"
@@ -159,13 +165,13 @@ def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:st
159
165
  lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model, **kwargs)
160
166
  for i,p in enumerate(pgs_with_pg, 1): (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))
161
167
 
162
- # %% ../nbs/01_refine.ipynb 38
168
+ # %% ../nbs/01_refine.ipynb 39
163
169
  class ImgDescription(BaseModel):
164
170
  "Image classification and description for OCR'd documents"
165
171
  is_informative:bool # Whether image contains informative content (charts, diagrams, tables) vs decorative (logos, backgrounds)
166
172
  description:str # Detailed description of the image content for RAG and accessibility
167
173
 
168
- # %% ../nbs/01_refine.ipynb 41
174
+ # %% ../nbs/01_refine.ipynb 42
169
175
  describe_img_prompt = """Analyze this image from an academic/technical document.
170
176
 
171
177
  Step 1: Determine if this image is informative for understanding the document content.
@@ -178,7 +184,7 @@ Step 2:
178
184
 
179
185
  Return your response as JSON with 'is_informative' (boolean) and 'description' (string) fields."""
180
186
 
181
- # %% ../nbs/01_refine.ipynb 42
187
+ # %% ../nbs/01_refine.ipynb 43
182
188
  async def describe_img(
183
189
  img_path: Path, # Path to the image file
184
190
  model: str = 'claude-sonnet-4-5', # Model to use
@@ -189,7 +195,7 @@ async def describe_img(
189
195
  r = await chat([img_path.read_bytes(), prompt], response_format=ImgDescription)
190
196
  return r
191
197
 
192
- # %% ../nbs/01_refine.ipynb 46
198
+ # %% ../nbs/01_refine.ipynb 47
193
199
  async def limit(
194
200
  semaphore, # Semaphore for concurrency control
195
201
  coro, # Coroutine to execute
@@ -201,14 +207,14 @@ async def limit(
201
207
  if delay: await sleep(delay)
202
208
  return r
203
209
 
204
- # %% ../nbs/01_refine.ipynb 48
210
+ # %% ../nbs/01_refine.ipynb 49
205
211
  def parse_r(
206
212
  result # ModelResponse object from API call
207
213
  ): # Dictionary with 'is_informative' and 'description' keys
208
214
  "Extract and parse JSON content from model response"
209
215
  return json.loads(result.choices[0].message.content)
210
216
 
211
- # %% ../nbs/01_refine.ipynb 50
217
+ # %% ../nbs/01_refine.ipynb 51
212
218
  async def describe_imgs(
213
219
  imgs: list[Path], # List of image file paths to describe
214
220
  model: str = 'claude-sonnet-4-5', # Model to use for image description
@@ -221,7 +227,7 @@ async def describe_imgs(
221
227
  results = await gather(*[limit(sem, describe_img(img, model, prompt), delay) for img in imgs])
222
228
  return {img.name: parse_r(r) for img, r in zip(imgs, results)}
223
229
 
224
- # %% ../nbs/01_refine.ipynb 52
230
+ # %% ../nbs/01_refine.ipynb 53
225
231
  def save_img_descs(
226
232
  descs: dict, # Dictionary of image descriptions
227
233
  dst_fname: Path, # Path to save the JSON file
@@ -229,7 +235,7 @@ def save_img_descs(
229
235
  "Save image descriptions to JSON file"
230
236
  Path(dst_fname).write_text(json.dumps(descs, indent=2))
231
237
 
232
- # %% ../nbs/01_refine.ipynb 57
238
+ # %% ../nbs/01_refine.ipynb 58
233
239
  def add_descs_to_pg(
234
240
  pg:str, # Page markdown content
235
241
  descs:dict # Dictionary mapping image filenames to their descriptions
@@ -240,7 +246,7 @@ def add_descs_to_pg(
240
246
  if fname in descs: pg = pg.replace(link, f"{link}\nAI-generated image description:\n___\n{descs[fname]['description']}\n___")
241
247
  return pg
242
248
 
243
- # %% ../nbs/01_refine.ipynb 62
249
+ # %% ../nbs/01_refine.ipynb 63
244
250
  def add_descs_to_pgs(
245
251
  pgs:list, # List of page markdown strings
246
252
  descs:dict # Dictionary mapping image filenames to their descriptions
@@ -248,7 +254,7 @@ def add_descs_to_pgs(
248
254
  "Add AI-generated descriptions to images in all pages"
249
255
  return [add_descs_to_pg(pg, descs) for pg in pgs]
250
256
 
251
- # %% ../nbs/01_refine.ipynb 65
257
+ # %% ../nbs/01_refine.ipynb 66
252
258
  async def add_img_descs(
253
259
  src:str, # Path to source markdown directory
254
260
  dst:str=None, # Destination directory (defaults to src if None)
@@ -257,7 +263,7 @@ async def add_img_descs(
257
263
  semaphore:int=2, # Max concurrent API requests
258
264
  delay:float=1, # Delay in seconds between API calls
259
265
  force:bool=False, # Force regeneration even if cache exists
260
- progress:bool=True # Print progress messages
266
+ progress:bool=True # Log progress messages
261
267
  ):
262
268
  "Describe all images in markdown document and insert descriptions inline"
263
269
  src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
@@ -266,23 +272,23 @@ async def add_img_descs(
266
272
 
267
273
  # Check if image folder exists
268
274
  if not src_imgs.exists():
269
- if progress: print(f"No images to describe in the document (no '{img_folder}' folder found)")
275
+ if progress: logger.info(f"No images to describe in the document (no '{img_folder}' folder found)")
270
276
  return
271
277
 
272
278
  if src_imgs.exists() and dst_path != src_path: shutil.copytree(src_imgs, dst_path/img_folder, dirs_exist_ok=True)
273
279
  desc_file = src_path/'img_descriptions.json'
274
280
  if desc_file.exists() and not force:
275
- if progress: print(f"Loading existing descriptions from {desc_file}")
281
+ if progress: logger.info(f"Loading existing descriptions from {desc_file}")
276
282
  descs = json.loads(desc_file.read_text())
277
283
  else:
278
284
  imgs = (src_path/img_folder).ls(file_exts=['.jpeg', '.jpg', '.png'])
279
- if progress: print(f"Describing {len(imgs)} images...")
285
+ if progress: logger.info(f"Describing {len(imgs)} images...")
280
286
  descs = await describe_imgs(imgs, model, semaphore=semaphore, delay=delay)
281
287
  save_img_descs(descs, desc_file)
282
- if progress: print(f"Saved descriptions to {desc_file}")
288
+ if progress: logger.info(f"Saved descriptions to {desc_file}")
283
289
  pgs = read_pgs(src_path, join=False)
284
- if progress: print(f"Adding descriptions to {len(pgs)} pages...")
290
+ if progress: logger.info(f"Adding descriptions to {len(pgs)} pages...")
285
291
  enriched = [add_descs_to_pg(pg, descs) for pg in pgs]
286
292
  for i,pg in enumerate(enriched, 1): (dst_path/f'page_{i}.md').write_text(pg)
287
- if progress: print(f"Done! Enriched pages saved to {dst_path}")
293
+ if progress: logger.info(f"Done! Enriched pages saved to {dst_path}")
288
294
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.3.2
3
+ Version: 0.4.1
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -113,18 +113,6 @@ from mistocr.pipeline import pdf_to_md
113
113
  await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
114
114
  ```
115
115
 
116
- Step 1/3: Running OCR on files/test/resnet.pdf...
117
- Mistral batch job status: QUEUED
118
- Mistral batch job status: RUNNING
119
- Mistral batch job status: RUNNING
120
- Step 2/3: Fixing heading hierarchy...
121
- Step 3/3: Adding image descriptions...
122
- Describing 7 images...
123
- Saved descriptions to ocr_temp/resnet/img_descriptions.json
124
- Adding descriptions to 12 pages...
125
- Done! Enriched pages saved to files/test/md_test
126
- Done!
127
-
128
116
  This will:
129
117
 
130
118
  1. OCR the PDF using Mistral’s batch API
@@ -0,0 +1,11 @@
1
+ mistocr/__init__.py,sha256=pMtTmSUht-XtbR_7Doz6bsQqopJJd8rZ8I8zy2HwwoA,22
2
+ mistocr/_modidx.py,sha256=LyFez7ndKOXQpF0unhg1imCrW7xcUwO106W82ttVnnk,4366
3
+ mistocr/core.py,sha256=-yXqEro_kTE66lXWBrewS73SRTl-Btt9uyKNxMnzjIw,9181
4
+ mistocr/pipeline.py,sha256=n8AHIHGZBXYy_k7LtEBZSiTL6HClT84-7K4QlTjOLvo,2107
5
+ mistocr/refine.py,sha256=Q14DhUUsT5FLMxP9oIJ2TGQ3qbxe7ulXfRMPKpsd4Wo,13232
6
+ mistocr-0.4.1.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
+ mistocr-0.4.1.dist-info/METADATA,sha256=cvASaYVhDfCJ9bzrosdmTRd5ECIAPAl84H7nN5P06zY,7992
8
+ mistocr-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
+ mistocr-0.4.1.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
+ mistocr-0.4.1.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
+ mistocr-0.4.1.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- mistocr/__init__.py,sha256=vNiWJ14r_cw5t_7UDqDQIVZvladKFGyHH2avsLpN7Vg,22
2
- mistocr/_modidx.py,sha256=2rHVTcz3A3BfDCmGqUNsJFW3_n3Ch1vxSorrPVyLvMI,4256
3
- mistocr/core.py,sha256=EQYQgpnX2skgSX123u3dYaJHc1oDk5Nhgt5uBdXnCKs,8386
4
- mistocr/pipeline.py,sha256=hVXpxRYtshaiUm9qXgfSLlyHCAxHZ6nAfPzoGXGmJMQ,1769
5
- mistocr/refine.py,sha256=hQg4ZYRTkPFEiCNNBU2ykgXFfVLCNAp2IuwOwHBVQ2k,12983
6
- mistocr-0.3.2.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
- mistocr-0.3.2.dist-info/METADATA,sha256=igTgaDeBu00u_xJYtIcGlQswQCj2gIrdBi6NLiN5NNU,8438
8
- mistocr-0.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
- mistocr-0.3.2.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
- mistocr-0.3.2.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
- mistocr-0.3.2.dist-info/RECORD,,