PyPI - mistocr - Versions diffs - 0.2.10__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

mistocr 0.2.10__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

mistocr/__init__.py +1 -1
mistocr/_modidx.py +4 -1
mistocr/core.py +67 -23
mistocr/pipeline.py +26 -15
mistocr/refine.py +8 -3
{mistocr-0.2.10.dist-info → mistocr-0.4.0.dist-info}/METADATA +2 -1
mistocr-0.4.0.dist-info/RECORD +11 -0
mistocr-0.2.10.dist-info/RECORD +0 -11
{mistocr-0.2.10.dist-info → mistocr-0.4.0.dist-info}/WHEEL +0 -0
{mistocr-0.2.10.dist-info → mistocr-0.4.0.dist-info}/entry_points.txt +0 -0
{mistocr-0.2.10.dist-info → mistocr-0.4.0.dist-info}/licenses/LICENSE +0 -0
{mistocr-0.2.10.dist-info → mistocr-0.4.0.dist-info}/top_level.txt +0 -0

mistocr/__init__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.2.10"
1	+ __version__ = "0.4.0"

mistocr/_modidx.py CHANGED Viewed

@@ -5,7 +5,8 @@ d = { 'settings': { 'branch': 'main',
                 'doc_host': 'https://franckalbinet.github.io',
                 'git_url': 'https://github.com/franckalbinet/mistocr',
                 'lib_path': 'mistocr'},
-  'syms': { 'mistocr.core': { 'mistocr.core._get_paths': ('core.html#_get_paths', 'mistocr/core.py'),
+  'syms': { 'mistocr.core': { 'mistocr.core._check_timeout': ('core.html#_check_timeout', 'mistocr/core.py'),
+                              'mistocr.core._get_paths': ('core.html#_get_paths', 'mistocr/core.py'),
                               'mistocr.core._prep_batch': ('core.html#_prep_batch', 'mistocr/core.py'),
                               'mistocr.core._run_batch': ('core.html#_run_batch', 'mistocr/core.py'),
                               'mistocr.core.create_batch_entry': ('core.html#create_batch_entry', 'mistocr/core.py'),
@@ -18,10 +19,12 @@ d = { 'settings': { 'branch': 'main',
                               'mistocr.core.save_page': ('core.html#save_page', 'mistocr/core.py'),
                               'mistocr.core.save_pages': ('core.html#save_pages', 'mistocr/core.py'),
                               'mistocr.core.submit_batch': ('core.html#submit_batch', 'mistocr/core.py'),
+                              'mistocr.core.subset_pdf': ('core.html#subset_pdf', 'mistocr/core.py'),
                               'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
                               'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
             'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
             'mistocr.refine': { 'mistocr.refine.HeadingCorrection': ('refine.html#headingcorrection', 'mistocr/refine.py'),
+                                'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
                                 'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
                                 'mistocr.refine.add_descs_to_pg': ('refine.html#add_descs_to_pg', 'mistocr/refine.py'),
                                 'mistocr.refine.add_descs_to_pgs': ('refine.html#add_descs_to_pgs', 'mistocr/refine.py'),

mistocr/core.py CHANGED Viewed

@@ -3,8 +3,9 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb.
 # %% auto 0
-__all__ = ['ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch', 'submit_batch',
-           'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf', 'read_pgs']
+__all__ = ['logger', 'ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch',
+           'submit_batch', 'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf',
+           'read_pgs', 'subset_pdf']
 # %% ../nbs/00_core.ipynb 3
 from fastcore.all import *
@@ -13,8 +14,15 @@ from io import BytesIO
 from pathlib import Path
 from PIL import Image
 from mistralai import Mistral
+import PyPDF2
+import logging
-# %% ../nbs/00_core.ipynb 6
+# %% ../nbs/00_core.ipynb 4
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.WARNING, format='%(name)s - %(levelname)s - %(message)s')
+logger.setLevel(logging.DEBUG)
+# %% ../nbs/00_core.ipynb 7
 def get_api_key(
     key:str=None # Mistral API key
     ):
@@ -23,11 +31,11 @@ def get_api_key(
     if not key: raise ValueError("MISTRAL_API_KEY not found")
     return key
-# %% ../nbs/00_core.ipynb 7
+# %% ../nbs/00_core.ipynb 8
 ocr_model = "mistral-ocr-latest"
 ocr_endpoint = "/v1/ocr"
-# %% ../nbs/00_core.ipynb 10
+# %% ../nbs/00_core.ipynb 11
 def upload_pdf(
     path:str, # Path to PDF file
     key:str=None # Mistral API key
@@ -38,7 +46,7 @@ def upload_pdf(
     uploaded = c.files.upload(file=dict(file_name=path.stem, content=path.read_bytes()), purpose="ocr")
     return c.files.get_signed_url(file_id=uploaded.id).url, c
-# %% ../nbs/00_core.ipynb 15
+# %% ../nbs/00_core.ipynb 16
 def create_batch_entry(
     path:str, # Path to PDF file,
     url:str, # Mistral signed URL
@@ -50,7 +58,7 @@ def create_batch_entry(
     if not cid: cid = path.stem
     return dict(custom_id=cid, body=dict(document=dict(type="document_url", document_url=url), include_image_base64=inc_img))
-# %% ../nbs/00_core.ipynb 17
+# %% ../nbs/00_core.ipynb 18
 def prep_pdf_batch(
     path:str, # Path to PDF file,
     cid:str=None, # Custom ID (by default using the file name without extention)
@@ -61,7 +69,7 @@ def prep_pdf_batch(
     url, c = upload_pdf(path, key)
     return create_batch_entry(path, url, cid, inc_img), c
-# %% ../nbs/00_core.ipynb 21
+# %% ../nbs/00_core.ipynb 22
 def submit_batch(
     entries:list[dict], # List of batch entries,
     c:Mistral=None, # Mistral client,
@@ -75,20 +83,35 @@ def submit_batch(
         batch_data = c.files.upload(file=dict(file_name="batch.jsonl", content=open(f.name, "rb")), purpose="batch")
     return c.batch.jobs.create(input_files=[batch_data.id], model=model, endpoint=endpoint)
-# %% ../nbs/00_core.ipynb 24
+# %% ../nbs/00_core.ipynb 25
+def _check_timeout(
+    queued_time:int, # Time spent in QUEUED state (seconds)
+    timeout:int, # Maximum allowed QUEUED time (seconds)
+    job_id:str # Batch job ID
+    ):
+    "Raise TimeoutError if job has been queued longer than timeout"
+    if queued_time >= timeout: raise TimeoutError(f"Job {job_id} stayed in QUEUED for {queued_time}s, exceeding timeout of {timeout}s. Check your balance or Mistral Status.")
+# %% ../nbs/00_core.ipynb 26
 def wait_for_job(
-    job:dict, # Job dict,
-    c:Mistral=None, # Mistral client,
-    poll_interval:int=1 # Poll interval in seconds
-    ) -> dict: # Job dict (with status)
+    job:dict, # Batch job from submit_batch
+    c:Mistral=None, # Mistral client
+    poll_interval:int=1, # Seconds between status checks
+    queued_timeout:int=300 # Max seconds in QUEUED before timeout
+    ) -> dict: # Completed job dict
     "Poll job until completion and return final job status"
+    logger.info(f"Waiting for batch job {job.id} (initial status: {job.status})")
+    queued_time = 0
     while job.status in ["QUEUED", "RUNNING"]:
-        print(f'Mistral batch job status: {job.status}')
+        logger.debug(f"Job {job.id} status: {job.status} (elapsed: {queued_time}s)")
+        if job.status == "QUEUED": queued_time += poll_interval; _check_timeout(queued_time, queued_timeout, job.id)
         time.sleep(poll_interval)
         job = c.batch.jobs.get(job_id=job.id)
+    logger.info(f"Job {job.id} completed with status: {job.status}")
+    if job.status != "SUCCESS": logger.warning(f"Job {job.id} finished with non-success status: {job.status}")
     return job
-# %% ../nbs/00_core.ipynb 26
+# %% ../nbs/00_core.ipynb 28
 def download_results(
     job:dict, # Job dict,
     c:Mistral=None # Mistral client
@@ -97,7 +120,7 @@ def download_results(
     content = c.files.download(file_id=job.output_file).read().decode('utf-8')
     return [json.loads(line) for line in content.strip().split('\n') if line]
-# %% ../nbs/00_core.ipynb 31
+# %% ../nbs/00_core.ipynb 33
 def save_images(
     page:dict, # Page dict,
     img_dir:str='img' # Directory to save images
@@ -108,7 +131,7 @@ def save_images(
             img_bytes = base64.b64decode(img['image_base64'].split(',')[1])
             Image.open(BytesIO(img_bytes)).save(img_dir / img['id'])
-# %% ../nbs/00_core.ipynb 32
+# %% ../nbs/00_core.ipynb 34
 def save_page(
     page:dict, # Page dict,
     dst:str, # Directory to save page
@@ -120,7 +143,7 @@ def save_page(
         img_dir.mkdir(exist_ok=True)
         save_images(page, img_dir)
-# %% ../nbs/00_core.ipynb 34
+# %% ../nbs/00_core.ipynb 36
 def save_pages(
     ocr_resp:dict, # OCR response,
     dst:str, # Directory to save pages,
@@ -133,7 +156,7 @@ def save_pages(
     for page in ocr_resp['pages']: save_page(page, dst, img_dir)
     return dst
-# %% ../nbs/00_core.ipynb 40
+# %% ../nbs/00_core.ipynb 42
 def _get_paths(path:str) -> list[Path]:
     "Get list of PDFs from file or folder"
     path = Path(path)
@@ -144,7 +167,7 @@ def _get_paths(path:str) -> list[Path]:
         return pdfs
     raise ValueError(f"Path not found: {path}")
-# %% ../nbs/00_core.ipynb 41
+# %% ../nbs/00_core.ipynb 43
 def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[dict], Mistral]:
     "Prepare batch entries for list of PDFs"
     entries, c = [], None
@@ -153,7 +176,7 @@ def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[
         entries.append(entry)
     return entries, c
-# %% ../nbs/00_core.ipynb 42
+# %% ../nbs/00_core.ipynb 44
 def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]:
     "Submit batch, wait for completion, and download results"
     job = submit_batch(entries, c)
@@ -161,7 +184,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
     if job.status != 'SUCCESS': raise Exception(f"Job failed with status: {job.status}")
     return download_results(job, c)
-# %% ../nbs/00_core.ipynb 43
+# %% ../nbs/00_core.ipynb 45
 def ocr_pdf(
     path:str, # Path to PDF file or folder,
     dst:str='md', # Directory to save markdown pages,
@@ -175,7 +198,7 @@ def ocr_pdf(
     results = _run_batch(entries, c, poll_interval)
     return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
-# %% ../nbs/00_core.ipynb 47
+# %% ../nbs/00_core.ipynb 52
 def read_pgs(
     path:str, # OCR output directory,
     join:bool=True # Join pages into single string
@@ -185,3 +208,24 @@ def read_pgs(
     pgs = sorted(path.glob('page_*.md'), key=lambda p: int(p.stem.split('_')[1]))
     contents = L([p.read_text() for p in pgs])
     return '\n\n'.join(contents) if join else contents
+# %% ../nbs/00_core.ipynb 59
+def subset_pdf(
+    path:str, # Path to PDF file
+    start:int=1, # Start page (1-based)
+    end:int=None, # End page (1-based, inclusive)
+    dst:str='.' # Output directory
+    ) -> Path: # Path to subset PDF
+    "Extract page range from PDF and save with range suffix"
+    path = Path(path)
+    writer = PyPDF2.PdfWriter()
+    with open(path, 'rb') as f:
+        reader = PyPDF2.PdfReader(f)
+        n = len(reader.pages)
+        end = end or n
+        s, e = max(0, start-1), min(n, end) - 1
+        for i in range(s, e+1): writer.add_page(reader.pages[i])
+    suffix = f"_p{s+1}-{e+1}" if s>0 or e<n-1 else ""
+    out = Path(dst) / f"{path.stem}{suffix}.pdf"
+    with open(out, 'wb') as f: writer.write(f)
+    return out

mistocr/pipeline.py CHANGED Viewed

@@ -3,7 +3,7 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
 # %% auto 0
-__all__ = ['pdf_to_md']
+__all__ = ['logger', 'pdf_to_md']
 # %% ../nbs/02_pipeline.ipynb 3
 from fastcore.all import *
@@ -11,27 +11,38 @@ from .core import read_pgs, ocr_pdf
 from .refine import add_img_descs, fix_hdgs
 from pathlib import Path
 from asyncio import Semaphore, gather, sleep
+import tempfile
 import os, json, shutil
+import logging
 # %% ../nbs/02_pipeline.ipynb 4
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.WARNING, format='%(name)s - %(levelname)s - %(message)s')
+logger.setLevel(logging.INFO)
+# %% ../nbs/02_pipeline.ipynb 5
 @delegates(add_img_descs)
 async def pdf_to_md(
-    pdf_path:str, # Path to input PDF file
-    dst:str, # Destination directory for output markdown
-    ocr_output:str=None, # Optional OCR output directory (defaults to pdf_path stem)
-    model:str='claude-sonnet-4-5', # Model to use for heading fixes and image descriptions
-    add_img_desc:bool=True, # Whether to add image descriptions
-    progress:bool=True, # Whether to show progress messages
-    **kwargs):
+    pdf_path:str,                   # Path to input PDF file
+    dst:str,                        # Destination directory for output markdown
+    ocr_dst:str=None,               # Optional OCR output directory
+    model:str='claude-sonnet-4-5',  # Model to use for heading fixes and image descriptions
+    add_img_desc:bool=True,         # Whether to add image descriptions
+    progress:bool=True,             # Whether to show progress messages
+    **kwargs
+    ):
+    "Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
     "Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
+    cleanup = ocr_dst is None
+    if cleanup: ocr_dst = tempfile.mkdtemp()
     n_steps = 3 if add_img_desc else 2
-    if progress: print(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
-    ocr_dirs = ocr_pdf(pdf_path, ocr_output or 'ocr_temp')
-    ocr_dir = ocr_dirs[0]
-    if progress: print(f"Step 2/{n_steps}: Fixing heading hierarchy...")
+    if progress: logger.info(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
+    ocr_dir = ocr_pdf(pdf_path, ocr_dst)[0]
+    if progress: logger.info(f"Step 2/{n_steps}: Fixing heading hierarchy...")
     fix_hdgs(ocr_dir, model=model)
     if add_img_desc:
-        if progress: print(f"Step 3/{n_steps}: Adding image descriptions...")
+        if progress: logger.info(f"Step 3/{n_steps}: Adding image descriptions...")
         await add_img_descs(ocr_dir, dst=dst, model=model, progress=progress, **kwargs)
-    elif dst and Path(dst) != ocr_dir: shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
-    if progress: print("Done!")
+    elif dst != str(ocr_dir): shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
+    if cleanup: shutil.rmtree(ocr_dst)
+    if progress: logger.info("Done!")

mistocr/refine.py CHANGED Viewed

@@ -4,9 +4,9 @@
 # %% auto 0
 __all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
-           'HeadingCorrection', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
-           'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs', 'add_descs_to_pg', 'add_descs_to_pgs',
-           'add_img_descs']
+           'HeadingCorrection', 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes',
+           'fix_hdgs', 'ImgDescription', 'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs',
+           'add_descs_to_pg', 'add_descs_to_pgs', 'add_img_descs']
 # %% ../nbs/01_refine.ipynb 3
 from fastcore.all import *
@@ -64,6 +64,11 @@ class HeadingCorrection(BaseModel):
     index: int
     corrected: str
+# %% ../nbs/01_refine.ipynb 19
+class HeadingCorrections(BaseModel):
+    "Collection of heading corrections returned by the LLM"
+    corrections: list[HeadingCorrection]
 # %% ../nbs/01_refine.ipynb 21
 prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.

{mistocr-0.2.10.dist-info → mistocr-0.4.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mistocr
-Version: 0.2.10
+Version: 0.4.0
 Summary: Batch OCR for PDFs with heading restoration and visual content integration
 Home-page: https://github.com/franckalbinet/mistocr
 Author: Solveit
@@ -23,6 +23,7 @@ Requires-Dist: mistralai
 Requires-Dist: pillow
 Requires-Dist: dotenv
 Requires-Dist: lisette
+Requires-Dist: PyPDF2
 Provides-Extra: dev
 Dynamic: author
 Dynamic: author-email

mistocr-0.4.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,11 @@
+mistocr/__init__.py,sha256=42STGor_9nKYXumfeV5tiyD_M8VdcddX7CEexmibPBk,22
+mistocr/_modidx.py,sha256=LyFez7ndKOXQpF0unhg1imCrW7xcUwO106W82ttVnnk,4366
+mistocr/core.py,sha256=-yXqEro_kTE66lXWBrewS73SRTl-Btt9uyKNxMnzjIw,9181
+mistocr/pipeline.py,sha256=n8AHIHGZBXYy_k7LtEBZSiTL6HClT84-7K4QlTjOLvo,2107
+mistocr/refine.py,sha256=hQg4ZYRTkPFEiCNNBU2ykgXFfVLCNAp2IuwOwHBVQ2k,12983
+mistocr-0.4.0.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
+mistocr-0.4.0.dist-info/METADATA,sha256=c0LUM6UrwIIoeug8fA8H4dYvutdieBFLQ52Sho4uGgY,8438
+mistocr-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mistocr-0.4.0.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
+mistocr-0.4.0.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
+mistocr-0.4.0.dist-info/RECORD,,

mistocr-0.2.10.dist-info/RECORD DELETED Viewed

@@ -1,11 +0,0 @@
-mistocr/__init__.py,sha256=waXgc7p-jgGCsUjdVfO_KjlVZblnCvrzf4A0dsBj_lg,23
-mistocr/_modidx.py,sha256=WTS9JpZdbrp2LghjhOV-CK0JYChHE4PzttgKfh7pTy4,4028
-mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
-mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
-mistocr/refine.py,sha256=H_IAF02k6CwBQdDJm9txknzUcTlz245zXitaHELX-P4,12791
-mistocr-0.2.10.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
-mistocr-0.2.10.dist-info/METADATA,sha256=mkMu_9nYAXZ5jFdJd01AZqK3t93_Rt0xkkD0rRnl9Ew,8417
-mistocr-0.2.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mistocr-0.2.10.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
-mistocr-0.2.10.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
-mistocr-0.2.10.dist-info/RECORD,,

{mistocr-0.2.10.dist-info → mistocr-0.4.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{mistocr-0.2.10.dist-info → mistocr-0.4.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{mistocr-0.2.10.dist-info → mistocr-0.4.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{mistocr-0.2.10.dist-info → mistocr-0.4.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

mistocr 0.2.10__py3-none-any.whl → 0.4.0__py3-none-any.whl

mistocr 0.2.10__py3-none-any.whl → 0.4.0__py3-none-any.whl