mistocr 0.2.10__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mistocr/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.2.10"
1
+ __version__ = "0.4.0"
mistocr/_modidx.py CHANGED
@@ -5,7 +5,8 @@ d = { 'settings': { 'branch': 'main',
5
5
  'doc_host': 'https://franckalbinet.github.io',
6
6
  'git_url': 'https://github.com/franckalbinet/mistocr',
7
7
  'lib_path': 'mistocr'},
8
- 'syms': { 'mistocr.core': { 'mistocr.core._get_paths': ('core.html#_get_paths', 'mistocr/core.py'),
8
+ 'syms': { 'mistocr.core': { 'mistocr.core._check_timeout': ('core.html#_check_timeout', 'mistocr/core.py'),
9
+ 'mistocr.core._get_paths': ('core.html#_get_paths', 'mistocr/core.py'),
9
10
  'mistocr.core._prep_batch': ('core.html#_prep_batch', 'mistocr/core.py'),
10
11
  'mistocr.core._run_batch': ('core.html#_run_batch', 'mistocr/core.py'),
11
12
  'mistocr.core.create_batch_entry': ('core.html#create_batch_entry', 'mistocr/core.py'),
@@ -18,10 +19,12 @@ d = { 'settings': { 'branch': 'main',
18
19
  'mistocr.core.save_page': ('core.html#save_page', 'mistocr/core.py'),
19
20
  'mistocr.core.save_pages': ('core.html#save_pages', 'mistocr/core.py'),
20
21
  'mistocr.core.submit_batch': ('core.html#submit_batch', 'mistocr/core.py'),
22
+ 'mistocr.core.subset_pdf': ('core.html#subset_pdf', 'mistocr/core.py'),
21
23
  'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
22
24
  'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
23
25
  'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
24
26
  'mistocr.refine': { 'mistocr.refine.HeadingCorrection': ('refine.html#headingcorrection', 'mistocr/refine.py'),
27
+ 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
25
28
  'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
26
29
  'mistocr.refine.add_descs_to_pg': ('refine.html#add_descs_to_pg', 'mistocr/refine.py'),
27
30
  'mistocr.refine.add_descs_to_pgs': ('refine.html#add_descs_to_pgs', 'mistocr/refine.py'),
mistocr/core.py CHANGED
@@ -3,8 +3,9 @@
3
3
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb.
4
4
 
5
5
  # %% auto 0
6
- __all__ = ['ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch', 'submit_batch',
7
- 'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf', 'read_pgs']
6
+ __all__ = ['logger', 'ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch',
7
+ 'submit_batch', 'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf',
8
+ 'read_pgs', 'subset_pdf']
8
9
 
9
10
  # %% ../nbs/00_core.ipynb 3
10
11
  from fastcore.all import *
@@ -13,8 +14,15 @@ from io import BytesIO
13
14
  from pathlib import Path
14
15
  from PIL import Image
15
16
  from mistralai import Mistral
17
+ import PyPDF2
18
+ import logging
16
19
 
17
- # %% ../nbs/00_core.ipynb 6
20
+ # %% ../nbs/00_core.ipynb 4
21
+ logger = logging.getLogger(__name__)
22
+ logging.basicConfig(level=logging.WARNING, format='%(name)s - %(levelname)s - %(message)s')
23
+ logger.setLevel(logging.DEBUG)
24
+
25
+ # %% ../nbs/00_core.ipynb 7
18
26
  def get_api_key(
19
27
  key:str=None # Mistral API key
20
28
  ):
@@ -23,11 +31,11 @@ def get_api_key(
23
31
  if not key: raise ValueError("MISTRAL_API_KEY not found")
24
32
  return key
25
33
 
26
- # %% ../nbs/00_core.ipynb 7
34
+ # %% ../nbs/00_core.ipynb 8
27
35
  ocr_model = "mistral-ocr-latest"
28
36
  ocr_endpoint = "/v1/ocr"
29
37
 
30
- # %% ../nbs/00_core.ipynb 10
38
+ # %% ../nbs/00_core.ipynb 11
31
39
  def upload_pdf(
32
40
  path:str, # Path to PDF file
33
41
  key:str=None # Mistral API key
@@ -38,7 +46,7 @@ def upload_pdf(
38
46
  uploaded = c.files.upload(file=dict(file_name=path.stem, content=path.read_bytes()), purpose="ocr")
39
47
  return c.files.get_signed_url(file_id=uploaded.id).url, c
40
48
 
41
- # %% ../nbs/00_core.ipynb 15
49
+ # %% ../nbs/00_core.ipynb 16
42
50
  def create_batch_entry(
43
51
  path:str, # Path to PDF file,
44
52
  url:str, # Mistral signed URL
@@ -50,7 +58,7 @@ def create_batch_entry(
50
58
  if not cid: cid = path.stem
51
59
  return dict(custom_id=cid, body=dict(document=dict(type="document_url", document_url=url), include_image_base64=inc_img))
52
60
 
53
- # %% ../nbs/00_core.ipynb 17
61
+ # %% ../nbs/00_core.ipynb 18
54
62
  def prep_pdf_batch(
55
63
  path:str, # Path to PDF file,
56
64
  cid:str=None, # Custom ID (by default using the file name without extention)
@@ -61,7 +69,7 @@ def prep_pdf_batch(
61
69
  url, c = upload_pdf(path, key)
62
70
  return create_batch_entry(path, url, cid, inc_img), c
63
71
 
64
- # %% ../nbs/00_core.ipynb 21
72
+ # %% ../nbs/00_core.ipynb 22
65
73
  def submit_batch(
66
74
  entries:list[dict], # List of batch entries,
67
75
  c:Mistral=None, # Mistral client,
@@ -75,20 +83,35 @@ def submit_batch(
75
83
  batch_data = c.files.upload(file=dict(file_name="batch.jsonl", content=open(f.name, "rb")), purpose="batch")
76
84
  return c.batch.jobs.create(input_files=[batch_data.id], model=model, endpoint=endpoint)
77
85
 
78
- # %% ../nbs/00_core.ipynb 24
86
+ # %% ../nbs/00_core.ipynb 25
87
+ def _check_timeout(
88
+ queued_time:int, # Time spent in QUEUED state (seconds)
89
+ timeout:int, # Maximum allowed QUEUED time (seconds)
90
+ job_id:str # Batch job ID
91
+ ):
92
+ "Raise TimeoutError if job has been queued longer than timeout"
93
+ if queued_time >= timeout: raise TimeoutError(f"Job {job_id} stayed in QUEUED for {queued_time}s, exceeding timeout of {timeout}s. Check your balance or Mistral Status.")
94
+
95
+ # %% ../nbs/00_core.ipynb 26
79
96
  def wait_for_job(
80
- job:dict, # Job dict,
81
- c:Mistral=None, # Mistral client,
82
- poll_interval:int=1 # Poll interval in seconds
83
- ) -> dict: # Job dict (with status)
97
+ job:dict, # Batch job from submit_batch
98
+ c:Mistral=None, # Mistral client
99
+ poll_interval:int=1, # Seconds between status checks
100
+ queued_timeout:int=300 # Max seconds in QUEUED before timeout
101
+ ) -> dict: # Completed job dict
84
102
  "Poll job until completion and return final job status"
103
+ logger.info(f"Waiting for batch job {job.id} (initial status: {job.status})")
104
+ queued_time = 0
85
105
  while job.status in ["QUEUED", "RUNNING"]:
86
- print(f'Mistral batch job status: {job.status}')
106
+ logger.debug(f"Job {job.id} status: {job.status} (elapsed: {queued_time}s)")
107
+ if job.status == "QUEUED": queued_time += poll_interval; _check_timeout(queued_time, queued_timeout, job.id)
87
108
  time.sleep(poll_interval)
88
109
  job = c.batch.jobs.get(job_id=job.id)
110
+ logger.info(f"Job {job.id} completed with status: {job.status}")
111
+ if job.status != "SUCCESS": logger.warning(f"Job {job.id} finished with non-success status: {job.status}")
89
112
  return job
90
113
 
91
- # %% ../nbs/00_core.ipynb 26
114
+ # %% ../nbs/00_core.ipynb 28
92
115
  def download_results(
93
116
  job:dict, # Job dict,
94
117
  c:Mistral=None # Mistral client
@@ -97,7 +120,7 @@ def download_results(
97
120
  content = c.files.download(file_id=job.output_file).read().decode('utf-8')
98
121
  return [json.loads(line) for line in content.strip().split('\n') if line]
99
122
 
100
- # %% ../nbs/00_core.ipynb 31
123
+ # %% ../nbs/00_core.ipynb 33
101
124
  def save_images(
102
125
  page:dict, # Page dict,
103
126
  img_dir:str='img' # Directory to save images
@@ -108,7 +131,7 @@ def save_images(
108
131
  img_bytes = base64.b64decode(img['image_base64'].split(',')[1])
109
132
  Image.open(BytesIO(img_bytes)).save(img_dir / img['id'])
110
133
 
111
- # %% ../nbs/00_core.ipynb 32
134
+ # %% ../nbs/00_core.ipynb 34
112
135
  def save_page(
113
136
  page:dict, # Page dict,
114
137
  dst:str, # Directory to save page
@@ -120,7 +143,7 @@ def save_page(
120
143
  img_dir.mkdir(exist_ok=True)
121
144
  save_images(page, img_dir)
122
145
 
123
- # %% ../nbs/00_core.ipynb 34
146
+ # %% ../nbs/00_core.ipynb 36
124
147
  def save_pages(
125
148
  ocr_resp:dict, # OCR response,
126
149
  dst:str, # Directory to save pages,
@@ -133,7 +156,7 @@ def save_pages(
133
156
  for page in ocr_resp['pages']: save_page(page, dst, img_dir)
134
157
  return dst
135
158
 
136
- # %% ../nbs/00_core.ipynb 40
159
+ # %% ../nbs/00_core.ipynb 42
137
160
  def _get_paths(path:str) -> list[Path]:
138
161
  "Get list of PDFs from file or folder"
139
162
  path = Path(path)
@@ -144,7 +167,7 @@ def _get_paths(path:str) -> list[Path]:
144
167
  return pdfs
145
168
  raise ValueError(f"Path not found: {path}")
146
169
 
147
- # %% ../nbs/00_core.ipynb 41
170
+ # %% ../nbs/00_core.ipynb 43
148
171
  def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[dict], Mistral]:
149
172
  "Prepare batch entries for list of PDFs"
150
173
  entries, c = [], None
@@ -153,7 +176,7 @@ def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[
153
176
  entries.append(entry)
154
177
  return entries, c
155
178
 
156
- # %% ../nbs/00_core.ipynb 42
179
+ # %% ../nbs/00_core.ipynb 44
157
180
  def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]:
158
181
  "Submit batch, wait for completion, and download results"
159
182
  job = submit_batch(entries, c)
@@ -161,7 +184,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
161
184
  if job.status != 'SUCCESS': raise Exception(f"Job failed with status: {job.status}")
162
185
  return download_results(job, c)
163
186
 
164
- # %% ../nbs/00_core.ipynb 43
187
+ # %% ../nbs/00_core.ipynb 45
165
188
  def ocr_pdf(
166
189
  path:str, # Path to PDF file or folder,
167
190
  dst:str='md', # Directory to save markdown pages,
@@ -175,7 +198,7 @@ def ocr_pdf(
175
198
  results = _run_batch(entries, c, poll_interval)
176
199
  return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
177
200
 
178
- # %% ../nbs/00_core.ipynb 47
201
+ # %% ../nbs/00_core.ipynb 52
179
202
  def read_pgs(
180
203
  path:str, # OCR output directory,
181
204
  join:bool=True # Join pages into single string
@@ -185,3 +208,24 @@ def read_pgs(
185
208
  pgs = sorted(path.glob('page_*.md'), key=lambda p: int(p.stem.split('_')[1]))
186
209
  contents = L([p.read_text() for p in pgs])
187
210
  return '\n\n'.join(contents) if join else contents
211
+
212
+ # %% ../nbs/00_core.ipynb 59
213
+ def subset_pdf(
214
+ path:str, # Path to PDF file
215
+ start:int=1, # Start page (1-based)
216
+ end:int=None, # End page (1-based, inclusive)
217
+ dst:str='.' # Output directory
218
+ ) -> Path: # Path to subset PDF
219
+ "Extract page range from PDF and save with range suffix"
220
+ path = Path(path)
221
+ writer = PyPDF2.PdfWriter()
222
+ with open(path, 'rb') as f:
223
+ reader = PyPDF2.PdfReader(f)
224
+ n = len(reader.pages)
225
+ end = end or n
226
+ s, e = max(0, start-1), min(n, end) - 1
227
+ for i in range(s, e+1): writer.add_page(reader.pages[i])
228
+ suffix = f"_p{s+1}-{e+1}" if s>0 or e<n-1 else ""
229
+ out = Path(dst) / f"{path.stem}{suffix}.pdf"
230
+ with open(out, 'wb') as f: writer.write(f)
231
+ return out
mistocr/pipeline.py CHANGED
@@ -3,7 +3,7 @@
3
3
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
4
4
 
5
5
  # %% auto 0
6
- __all__ = ['pdf_to_md']
6
+ __all__ = ['logger', 'pdf_to_md']
7
7
 
8
8
  # %% ../nbs/02_pipeline.ipynb 3
9
9
  from fastcore.all import *
@@ -11,27 +11,38 @@ from .core import read_pgs, ocr_pdf
11
11
  from .refine import add_img_descs, fix_hdgs
12
12
  from pathlib import Path
13
13
  from asyncio import Semaphore, gather, sleep
14
+ import tempfile
14
15
  import os, json, shutil
16
+ import logging
15
17
 
16
18
  # %% ../nbs/02_pipeline.ipynb 4
19
+ logger = logging.getLogger(__name__)
20
+ logging.basicConfig(level=logging.WARNING, format='%(name)s - %(levelname)s - %(message)s')
21
+ logger.setLevel(logging.INFO)
22
+
23
+ # %% ../nbs/02_pipeline.ipynb 5
17
24
  @delegates(add_img_descs)
18
25
  async def pdf_to_md(
19
- pdf_path:str, # Path to input PDF file
20
- dst:str, # Destination directory for output markdown
21
- ocr_output:str=None, # Optional OCR output directory (defaults to pdf_path stem)
22
- model:str='claude-sonnet-4-5', # Model to use for heading fixes and image descriptions
23
- add_img_desc:bool=True, # Whether to add image descriptions
24
- progress:bool=True, # Whether to show progress messages
25
- **kwargs):
26
+ pdf_path:str, # Path to input PDF file
27
+ dst:str, # Destination directory for output markdown
28
+ ocr_dst:str=None, # Optional OCR output directory
29
+ model:str='claude-sonnet-4-5', # Model to use for heading fixes and image descriptions
30
+ add_img_desc:bool=True, # Whether to add image descriptions
31
+ progress:bool=True, # Whether to show progress messages
32
+ **kwargs
33
+ ):
34
+ "Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
26
35
  "Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
36
+ cleanup = ocr_dst is None
37
+ if cleanup: ocr_dst = tempfile.mkdtemp()
27
38
  n_steps = 3 if add_img_desc else 2
28
- if progress: print(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
29
- ocr_dirs = ocr_pdf(pdf_path, ocr_output or 'ocr_temp')
30
- ocr_dir = ocr_dirs[0]
31
- if progress: print(f"Step 2/{n_steps}: Fixing heading hierarchy...")
39
+ if progress: logger.info(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
40
+ ocr_dir = ocr_pdf(pdf_path, ocr_dst)[0]
41
+ if progress: logger.info(f"Step 2/{n_steps}: Fixing heading hierarchy...")
32
42
  fix_hdgs(ocr_dir, model=model)
33
43
  if add_img_desc:
34
- if progress: print(f"Step 3/{n_steps}: Adding image descriptions...")
44
+ if progress: logger.info(f"Step 3/{n_steps}: Adding image descriptions...")
35
45
  await add_img_descs(ocr_dir, dst=dst, model=model, progress=progress, **kwargs)
36
- elif dst and Path(dst) != ocr_dir: shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
37
- if progress: print("Done!")
46
+ elif dst != str(ocr_dir): shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
47
+ if cleanup: shutil.rmtree(ocr_dst)
48
+ if progress: logger.info("Done!")
mistocr/refine.py CHANGED
@@ -4,9 +4,9 @@
4
4
 
5
5
  # %% auto 0
6
6
  __all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
7
- 'HeadingCorrection', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
8
- 'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs', 'add_descs_to_pg', 'add_descs_to_pgs',
9
- 'add_img_descs']
7
+ 'HeadingCorrection', 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes',
8
+ 'fix_hdgs', 'ImgDescription', 'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs',
9
+ 'add_descs_to_pg', 'add_descs_to_pgs', 'add_img_descs']
10
10
 
11
11
  # %% ../nbs/01_refine.ipynb 3
12
12
  from fastcore.all import *
@@ -64,6 +64,11 @@ class HeadingCorrection(BaseModel):
64
64
  index: int
65
65
  corrected: str
66
66
 
67
+ # %% ../nbs/01_refine.ipynb 19
68
+ class HeadingCorrections(BaseModel):
69
+ "Collection of heading corrections returned by the LLM"
70
+ corrections: list[HeadingCorrection]
71
+
67
72
  # %% ../nbs/01_refine.ipynb 21
68
73
  prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
69
74
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.2.10
3
+ Version: 0.4.0
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -23,6 +23,7 @@ Requires-Dist: mistralai
23
23
  Requires-Dist: pillow
24
24
  Requires-Dist: dotenv
25
25
  Requires-Dist: lisette
26
+ Requires-Dist: PyPDF2
26
27
  Provides-Extra: dev
27
28
  Dynamic: author
28
29
  Dynamic: author-email
@@ -0,0 +1,11 @@
1
+ mistocr/__init__.py,sha256=42STGor_9nKYXumfeV5tiyD_M8VdcddX7CEexmibPBk,22
2
+ mistocr/_modidx.py,sha256=LyFez7ndKOXQpF0unhg1imCrW7xcUwO106W82ttVnnk,4366
3
+ mistocr/core.py,sha256=-yXqEro_kTE66lXWBrewS73SRTl-Btt9uyKNxMnzjIw,9181
4
+ mistocr/pipeline.py,sha256=n8AHIHGZBXYy_k7LtEBZSiTL6HClT84-7K4QlTjOLvo,2107
5
+ mistocr/refine.py,sha256=hQg4ZYRTkPFEiCNNBU2ykgXFfVLCNAp2IuwOwHBVQ2k,12983
6
+ mistocr-0.4.0.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
+ mistocr-0.4.0.dist-info/METADATA,sha256=c0LUM6UrwIIoeug8fA8H4dYvutdieBFLQ52Sho4uGgY,8438
8
+ mistocr-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
+ mistocr-0.4.0.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
+ mistocr-0.4.0.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
+ mistocr-0.4.0.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- mistocr/__init__.py,sha256=waXgc7p-jgGCsUjdVfO_KjlVZblnCvrzf4A0dsBj_lg,23
2
- mistocr/_modidx.py,sha256=WTS9JpZdbrp2LghjhOV-CK0JYChHE4PzttgKfh7pTy4,4028
3
- mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
4
- mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
5
- mistocr/refine.py,sha256=H_IAF02k6CwBQdDJm9txknzUcTlz245zXitaHELX-P4,12791
6
- mistocr-0.2.10.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
- mistocr-0.2.10.dist-info/METADATA,sha256=mkMu_9nYAXZ5jFdJd01AZqK3t93_Rt0xkkD0rRnl9Ew,8417
8
- mistocr-0.2.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
- mistocr-0.2.10.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
- mistocr-0.2.10.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
- mistocr-0.2.10.dist-info/RECORD,,