mistocr 0.2.7__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mistocr
- Version: 0.2.7
+ Version: 0.4.1
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
  Home-page: https://github.com/franckalbinet/mistocr
  Author: Solveit
@@ -23,6 +23,7 @@ Requires-Dist: mistralai
  Requires-Dist: pillow
  Requires-Dist: dotenv
  Requires-Dist: lisette
+ Requires-Dist: PyPDF2
  Provides-Extra: dev
  Dynamic: author
  Dynamic: author-email
@@ -112,18 +113,6 @@ from mistocr.pipeline import pdf_to_md
  await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
  ```

- Step 1/3: Running OCR on files/test/resnet.pdf...
- Mistral batch job status: QUEUED
- Mistral batch job status: RUNNING
- Mistral batch job status: RUNNING
- Step 2/3: Fixing heading hierarchy...
- Step 3/3: Adding image descriptions...
- Describing 7 images...
- Saved descriptions to ocr_temp/resnet/img_descriptions.json
- Adding descriptions to 12 pages...
- Done! Enriched pages saved to files/test/md_test
- Done!
-
  This will (as indicated by the output):

  1. OCR the PDF using Mistral’s batch API
@@ -72,18 +72,6 @@ from mistocr.pipeline import pdf_to_md
  await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
  ```

- Step 1/3: Running OCR on files/test/resnet.pdf...
- Mistral batch job status: QUEUED
- Mistral batch job status: RUNNING
- Mistral batch job status: RUNNING
- Step 2/3: Fixing heading hierarchy...
- Step 3/3: Adding image descriptions...
- Describing 7 images...
- Saved descriptions to ocr_temp/resnet/img_descriptions.json
- Adding descriptions to 12 pages...
- Done! Enriched pages saved to files/test/md_test
- Done!
-
  This will (as indicated by the output):

  1. OCR the PDF using Mistral’s batch API
@@ -0,0 +1 @@
+ __version__ = "0.4.1"
@@ -5,7 +5,8 @@ d = { 'settings': { 'branch': 'main',
  'doc_host': 'https://franckalbinet.github.io',
  'git_url': 'https://github.com/franckalbinet/mistocr',
  'lib_path': 'mistocr'},
- 'syms': { 'mistocr.core': { 'mistocr.core._get_paths': ('core.html#_get_paths', 'mistocr/core.py'),
+ 'syms': { 'mistocr.core': { 'mistocr.core._check_timeout': ('core.html#_check_timeout', 'mistocr/core.py'),
+ 'mistocr.core._get_paths': ('core.html#_get_paths', 'mistocr/core.py'),
  'mistocr.core._prep_batch': ('core.html#_prep_batch', 'mistocr/core.py'),
  'mistocr.core._run_batch': ('core.html#_run_batch', 'mistocr/core.py'),
  'mistocr.core.create_batch_entry': ('core.html#create_batch_entry', 'mistocr/core.py'),
@@ -18,6 +19,7 @@ d = { 'settings': { 'branch': 'main',
  'mistocr.core.save_page': ('core.html#save_page', 'mistocr/core.py'),
  'mistocr.core.save_pages': ('core.html#save_pages', 'mistocr/core.py'),
  'mistocr.core.submit_batch': ('core.html#submit_batch', 'mistocr/core.py'),
+ 'mistocr.core.subset_pdf': ('core.html#subset_pdf', 'mistocr/core.py'),
  'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
  'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
  'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
@@ -3,8 +3,9 @@
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb.

  # %% auto 0
- __all__ = ['ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch', 'submit_batch',
- 'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf', 'read_pgs']
+ __all__ = ['logger', 'ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch',
+ 'submit_batch', 'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf',
+ 'read_pgs', 'subset_pdf']

  # %% ../nbs/00_core.ipynb 3
  from fastcore.all import *
@@ -13,8 +14,15 @@ from io import BytesIO
  from pathlib import Path
  from PIL import Image
  from mistralai import Mistral
+ import PyPDF2
+ import logging

- # %% ../nbs/00_core.ipynb 6
+ # %% ../nbs/00_core.ipynb 4
+ logger = logging.getLogger(__name__)
+ logging.basicConfig(level=logging.WARNING, format='%(name)s - %(levelname)s - %(message)s')
+ logger.setLevel(logging.DEBUG)
+
+ # %% ../nbs/00_core.ipynb 7
  def get_api_key(
  key:str=None # Mistral API key
  ):
@@ -23,11 +31,11 @@
  if not key: raise ValueError("MISTRAL_API_KEY not found")
  return key

- # %% ../nbs/00_core.ipynb 7
+ # %% ../nbs/00_core.ipynb 8
  ocr_model = "mistral-ocr-latest"
  ocr_endpoint = "/v1/ocr"

- # %% ../nbs/00_core.ipynb 10
+ # %% ../nbs/00_core.ipynb 11
  def upload_pdf(
  path:str, # Path to PDF file
  key:str=None # Mistral API key
@@ -38,7 +46,7 @@
  uploaded = c.files.upload(file=dict(file_name=path.stem, content=path.read_bytes()), purpose="ocr")
  return c.files.get_signed_url(file_id=uploaded.id).url, c

- # %% ../nbs/00_core.ipynb 15
+ # %% ../nbs/00_core.ipynb 16
  def create_batch_entry(
  path:str, # Path to PDF file,
  url:str, # Mistral signed URL
@@ -50,7 +58,7 @@
  if not cid: cid = path.stem
  return dict(custom_id=cid, body=dict(document=dict(type="document_url", document_url=url), include_image_base64=inc_img))

- # %% ../nbs/00_core.ipynb 17
+ # %% ../nbs/00_core.ipynb 18
  def prep_pdf_batch(
  path:str, # Path to PDF file,
  cid:str=None, # Custom ID (by default using the file name without extention)
@@ -61,7 +69,7 @@
  url, c = upload_pdf(path, key)
  return create_batch_entry(path, url, cid, inc_img), c

- # %% ../nbs/00_core.ipynb 21
+ # %% ../nbs/00_core.ipynb 22
  def submit_batch(
  entries:list[dict], # List of batch entries,
  c:Mistral=None, # Mistral client,
@@ -75,20 +83,35 @@
  batch_data = c.files.upload(file=dict(file_name="batch.jsonl", content=open(f.name, "rb")), purpose="batch")
  return c.batch.jobs.create(input_files=[batch_data.id], model=model, endpoint=endpoint)

- # %% ../nbs/00_core.ipynb 24
+ # %% ../nbs/00_core.ipynb 25
+ def _check_timeout(
+ queued_time:int, # Time spent in QUEUED state (seconds)
+ timeout:int, # Maximum allowed QUEUED time (seconds)
+ job_id:str # Batch job ID
+ ):
+ "Raise TimeoutError if job has been queued longer than timeout"
+ if queued_time >= timeout: raise TimeoutError(f"Job {job_id} stayed in QUEUED for {queued_time}s, exceeding timeout of {timeout}s. Check your balance or Mistral Status.")
+
+ # %% ../nbs/00_core.ipynb 26
  def wait_for_job(
- job:dict, # Job dict,
- c:Mistral=None, # Mistral client,
- poll_interval:int=1 # Poll interval in seconds
- ) -> dict: # Job dict (with status)
+ job:dict, # Batch job from submit_batch
+ c:Mistral=None, # Mistral client
+ poll_interval:int=1, # Seconds between status checks
+ queued_timeout:int=300 # Max seconds in QUEUED before timeout
+ ) -> dict: # Completed job dict
  "Poll job until completion and return final job status"
+ logger.info(f"Waiting for batch job {job.id} (initial status: {job.status})")
+ queued_time = 0
  while job.status in ["QUEUED", "RUNNING"]:
- print(f'Mistral batch job status: {job.status}')
+ logger.debug(f"Job {job.id} status: {job.status} (elapsed: {queued_time}s)")
+ if job.status == "QUEUED": queued_time += poll_interval; _check_timeout(queued_time, queued_timeout, job.id)
  time.sleep(poll_interval)
  job = c.batch.jobs.get(job_id=job.id)
+ logger.info(f"Job {job.id} completed with status: {job.status}")
+ if job.status != "SUCCESS": logger.warning(f"Job {job.id} finished with non-success status: {job.status}")
  return job

- # %% ../nbs/00_core.ipynb 26
+ # %% ../nbs/00_core.ipynb 28
  def download_results(
  job:dict, # Job dict,
  c:Mistral=None # Mistral client
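Note on the queued-timeout change above: `wait_for_job` now takes a `queued_timeout` (default 300 s) and `_check_timeout` raises `TimeoutError` once a job has sat in QUEUED that long. A minimal sketch of how a caller might drive it, using only the function names and parameters shown in this diff; the PDF path is a placeholder, not from this release:

    from mistocr.core import prep_pdf_batch, submit_batch, wait_for_job, download_results

    entry, c = prep_pdf_batch('paper.pdf')  # hypothetical input file
    job = submit_batch([entry], c)
    try:
        # allow up to 10 minutes in QUEUED before giving up
        job = wait_for_job(job, c, poll_interval=2, queued_timeout=600)
        results = download_results(job, c)
    except TimeoutError as e:
        print(e)  # per the error message: check balance or Mistral status, then resubmit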
@@ -97,7 +120,7 @@
  content = c.files.download(file_id=job.output_file).read().decode('utf-8')
  return [json.loads(line) for line in content.strip().split('\n') if line]

- # %% ../nbs/00_core.ipynb 31
+ # %% ../nbs/00_core.ipynb 33
  def save_images(
  page:dict, # Page dict,
  img_dir:str='img' # Directory to save images
@@ -108,7 +131,7 @@
  img_bytes = base64.b64decode(img['image_base64'].split(',')[1])
  Image.open(BytesIO(img_bytes)).save(img_dir / img['id'])

- # %% ../nbs/00_core.ipynb 32
+ # %% ../nbs/00_core.ipynb 34
  def save_page(
  page:dict, # Page dict,
  dst:str, # Directory to save page
@@ -120,7 +143,7 @@
  img_dir.mkdir(exist_ok=True)
  save_images(page, img_dir)

- # %% ../nbs/00_core.ipynb 34
+ # %% ../nbs/00_core.ipynb 36
  def save_pages(
  ocr_resp:dict, # OCR response,
  dst:str, # Directory to save pages,
@@ -133,7 +156,7 @@
  for page in ocr_resp['pages']: save_page(page, dst, img_dir)
  return dst

- # %% ../nbs/00_core.ipynb 40
+ # %% ../nbs/00_core.ipynb 42
  def _get_paths(path:str) -> list[Path]:
  "Get list of PDFs from file or folder"
  path = Path(path)
@@ -144,7 +167,7 @@ def _get_paths(path:str) -> list[Path]:
  return pdfs
  raise ValueError(f"Path not found: {path}")

- # %% ../nbs/00_core.ipynb 41
+ # %% ../nbs/00_core.ipynb 43
  def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[dict], Mistral]:
  "Prepare batch entries for list of PDFs"
  entries, c = [], None
@@ -153,7 +176,7 @@ def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[
  entries.append(entry)
  return entries, c

- # %% ../nbs/00_core.ipynb 42
+ # %% ../nbs/00_core.ipynb 44
  def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]:
  "Submit batch, wait for completion, and download results"
  job = submit_batch(entries, c)
@@ -161,7 +184,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
  if job.status != 'SUCCESS': raise Exception(f"Job failed with status: {job.status}")
  return download_results(job, c)

- # %% ../nbs/00_core.ipynb 43
+ # %% ../nbs/00_core.ipynb 45
  def ocr_pdf(
  path:str, # Path to PDF file or folder,
  dst:str='md', # Directory to save markdown pages,
@@ -175,7 +198,7 @@
  results = _run_batch(entries, c, poll_interval)
  return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])

- # %% ../nbs/00_core.ipynb 47
+ # %% ../nbs/00_core.ipynb 52
  def read_pgs(
  path:str, # OCR output directory,
  join:bool=True # Join pages into single string
@@ -185,3 +208,24 @@ def read_pgs(
  pgs = sorted(path.glob('page_*.md'), key=lambda p: int(p.stem.split('_')[1]))
  contents = L([p.read_text() for p in pgs])
  return '\n\n'.join(contents) if join else contents
+
+ # %% ../nbs/00_core.ipynb 59
+ def subset_pdf(
+ path:str, # Path to PDF file
+ start:int=1, # Start page (1-based)
+ end:int=None, # End page (1-based, inclusive)
+ dst:str='.' # Output directory
+ ) -> Path: # Path to subset PDF
+ "Extract page range from PDF and save with range suffix"
+ path = Path(path)
+ writer = PyPDF2.PdfWriter()
+ with open(path, 'rb') as f:
+ reader = PyPDF2.PdfReader(f)
+ n = len(reader.pages)
+ end = end or n
+ s, e = max(0, start-1), min(n, end) - 1
+ for i in range(s, e+1): writer.add_page(reader.pages[i])
+ suffix = f"_p{s+1}-{e+1}" if s>0 or e<n-1 else ""
+ out = Path(dst) / f"{path.stem}{suffix}.pdf"
+ with open(out, 'wb') as f: writer.write(f)
+ return out
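The new `subset_pdf` helper (hence the added PyPDF2 dependency) extracts a 1-based, inclusive page range and appends a `_p{start}-{end}` suffix to the output name unless the range spans the whole document. A hedged usage sketch with an illustrative file name:

    from mistocr.core import subset_pdf

    # Keep pages 3-7 of a hypothetical report.pdf; writes ./report_p3-7.pdf
    out = subset_pdf('report.pdf', start=3, end=7, dst='.')

Note that a full-range call produces no suffix, so with the default `dst='.'` the output name matches the input name.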
@@ -0,0 +1,48 @@
+ """End-to-End Pipeline: PDF OCR, Markdown Heading Correction, and AI Image Descriptions"""
+
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
+
+ # %% auto 0
+ __all__ = ['logger', 'pdf_to_md']
+
+ # %% ../nbs/02_pipeline.ipynb 3
+ from fastcore.all import *
+ from .core import read_pgs, ocr_pdf
+ from .refine import add_img_descs, fix_hdgs
+ from pathlib import Path
+ from asyncio import Semaphore, gather, sleep
+ import tempfile
+ import os, json, shutil
+ import logging
+
+ # %% ../nbs/02_pipeline.ipynb 4
+ logger = logging.getLogger(__name__)
+ logging.basicConfig(level=logging.WARNING, format='%(name)s - %(levelname)s - %(message)s')
+ logger.setLevel(logging.INFO)
+
+ # %% ../nbs/02_pipeline.ipynb 5
+ @delegates(add_img_descs)
+ async def pdf_to_md(
+ pdf_path:str, # Path to input PDF file
+ dst:str, # Destination directory for output markdown
+ ocr_dst:str=None, # Optional OCR output directory
+ model:str='claude-sonnet-4-5', # Model to use for heading fixes and image descriptions
+ add_img_desc:bool=True, # Whether to add image descriptions
+ progress:bool=True, # Whether to show progress messages
+ **kwargs
+ ):
+ "Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
+ "Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
+ cleanup = ocr_dst is None
+ if cleanup: ocr_dst = tempfile.mkdtemp()
+ n_steps = 3 if add_img_desc else 2
+ if progress: logger.info(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
+ ocr_dir = ocr_pdf(pdf_path, ocr_dst)[0]
+ if progress: logger.info(f"Step 2/{n_steps}: Fixing heading hierarchy...")
+ fix_hdgs(ocr_dir, model=model)
+ if add_img_desc:
+ if progress: logger.info(f"Step 3/{n_steps}: Adding image descriptions...")
+ await add_img_descs(ocr_dir, dst=dst, model=model, progress=progress, **kwargs)
+ elif dst != str(ocr_dir): shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
+ if cleanup: shutil.rmtree(ocr_dst)
+ if progress: logger.info("Done!")
@@ -3,10 +3,10 @@
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.

  # %% auto 0
- __all__ = ['describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx', 'HeadingCorrection',
- 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
- 'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs', 'add_descs_to_pg', 'add_descs_to_pgs',
- 'add_img_descs']
+ __all__ = ['logger', 'prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
+ 'HeadingCorrection', 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes',
+ 'fix_hdgs', 'ImgDescription', 'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs',
+ 'add_descs_to_pg', 'add_descs_to_pgs', 'add_img_descs']

  # %% ../nbs/01_refine.ipynb 3
  from fastcore.all import *
@@ -20,8 +20,14 @@ import os
  import json
  import shutil
  from asyncio import Semaphore, gather, sleep
+ import logging

- # %% ../nbs/01_refine.ipynb 7
+ # %% ../nbs/01_refine.ipynb 4
+ logger = logging.getLogger(__name__)
+ logging.basicConfig(level=logging.WARNING, format='%(name)s - %(levelname)s - %(message)s')
+ logger.setLevel(logging.INFO)
+
+ # %% ../nbs/01_refine.ipynb 8
  def get_hdgs(
  md:str # Markdown file string
  ) -> L: # L of strings
@@ -32,7 +38,7 @@ def get_hdgs(



- # %% ../nbs/01_refine.ipynb 8
+ # %% ../nbs/01_refine.ipynb 9
  def add_pg_hdgs(
  md:str, # Markdown file string,
  n:int # Page number
@@ -42,7 +48,7 @@
  def repl(m): return m.group(0) + f' ... page {n}'
  return sub(r'^#{1,6} .+$', repl, md, flags=MULTILINE)

- # %% ../nbs/01_refine.ipynb 12
+ # %% ../nbs/01_refine.ipynb 13
  def read_pgs_pg(
  path:str # Path to the markdown file
  ) -> L: # List of markdown pages
@@ -50,7 +56,7 @@
  pgs = read_pgs(path, join=False)
  return L([add_pg_hdgs(pg, n) for n, pg in enumerate(pgs, 1)]).concat()

- # %% ../nbs/01_refine.ipynb 15
+ # %% ../nbs/01_refine.ipynb 16
  def fmt_hdgs_idx(
  hdgs: list[str] # List of markdown headings
  ) -> str: # Formatted string with index
@@ -58,15 +64,58 @@ def fmt_hdgs_idx(
  return '\n'.join(f"{i}. {h}" for i, h in enumerate(hdgs))


- # %% ../nbs/01_refine.ipynb 18
+ # %% ../nbs/01_refine.ipynb 19
  class HeadingCorrection(BaseModel):
+ "A single heading correction mapping an index to its corrected markdown heading"
  index: int
  corrected: str

+ # %% ../nbs/01_refine.ipynb 20
  class HeadingCorrections(BaseModel):
+ "Collection of heading corrections returned by the LLM"
  corrections: list[HeadingCorrection]

  # %% ../nbs/01_refine.ipynb 22
+ prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
+
+ INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
+
+ ANALYSIS STEPS (think through these before outputting corrections):
+ 1. For each numbered heading (e.g., "4.1", "2.a", "A.1"), identify its parent heading (e.g., "4", "2", "A")
+ 2. Verify the child heading is exactly one # deeper than its parent
+ 3. If not, mark it for correction
+
+ RULES - Apply these fixes in order:
+
+ 1. **Single H1 rule**: Documents must have exactly ONE # heading (typically the document title at the top)
+ - If index 0 is already #, then all subsequent headings (index 1+) must be ## or deeper
+ - If no H1 exists, the first major heading should be #, and all others ## or deeper
+ - NO exceptions: appendices, references, and all sections are ## or deeper after the title
+
+ 2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
+ - Parent section (e.g., "1", "2", "A") MUST be shallower than child (e.g., "1.1", "2.a", "A.1")
+ - Child section MUST be exactly one # deeper than parent
+ - Works with any numbering: "1/1.1/1.1.1", "A/A.1/A.1.a", "I/I.A/I.A.1", etc.
+
+ 3. **Level jumps**: Headings can only increase by one # at a time when moving deeper
+ - Wrong: ## Section → ##### Subsection
+ - Fixed: ## Section → ### Subsection
+
+ 4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
+
+ 5. **Unnumbered headings in numbered documents**: If the document uses numbered headings consistently, any unnumbered heading appearing within that structure is likely misclassified bold text and should be converted to regular text (output the heading text without any # symbols in the corrected field)
+
+ OUTPUT: Return a list of corrections, where each correction has:
+ - index: the heading's index number
+ - corrected: the fixed heading text (without the index prefix), or empty string "" to remove the heading entirely
+ IMPORTANT: Preserve the " ... page N" suffix in all corrected headings.
+ Only include headings that need changes.
+
+ Headings to analyze:
+ {headings_list}
+ """
+
+ # %% ../nbs/01_refine.ipynb 24
  def fix_hdg_hierarchy(
  hdgs: list[str], # List of markdown headings
  prompt: str=None, # Prompt to use
@@ -82,7 +131,7 @@ def fix_hdg_hierarchy(
  return {o['index']: o['corrected'] for o in fixes}


- # %% ../nbs/01_refine.ipynb 25
+ # %% ../nbs/01_refine.ipynb 27
  @delegates(fix_hdg_hierarchy)
  def mk_fixes_lut(
  hdgs: list[str], # List of markdown headings
@@ -95,7 +144,7 @@ def mk_fixes_lut(
  fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
  return {hdgs[k]:v for k,v in fixes.items()}

- # %% ../nbs/01_refine.ipynb 28
+ # %% ../nbs/01_refine.ipynb 30
  def apply_hdg_fixes(
  p:str, # Page to fix
  lut_fixes: dict[str, str], # Lookup table of fixes
@@ -104,7 +153,7 @@
  for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
  return p

- # %% ../nbs/01_refine.ipynb 31
+ # %% ../nbs/01_refine.ipynb 33
  @delegates(mk_fixes_lut)
  def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:str='img', **kwargs):
  "Fix heading hierarchy in markdown document"
@@ -116,13 +165,13 @@ def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:st
  lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model, **kwargs)
  for i,p in enumerate(pgs_with_pg, 1): (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))

- # %% ../nbs/01_refine.ipynb 37
+ # %% ../nbs/01_refine.ipynb 39
  class ImgDescription(BaseModel):
  "Image classification and description for OCR'd documents"
  is_informative:bool # Whether image contains informative content (charts, diagrams, tables) vs decorative (logos, backgrounds)
  description:str # Detailed description of the image content for RAG and accessibility

- # %% ../nbs/01_refine.ipynb 40
+ # %% ../nbs/01_refine.ipynb 42
  describe_img_prompt = """Analyze this image from an academic/technical document.

  Step 1: Determine if this image is informative for understanding the document content.
@@ -135,7 +184,7 @@ Step 2:

  Return your response as JSON with 'is_informative' (boolean) and 'description' (string) fields."""

- # %% ../nbs/01_refine.ipynb 41
+ # %% ../nbs/01_refine.ipynb 43
  async def describe_img(
  img_path: Path, # Path to the image file
  model: str = 'claude-sonnet-4-5', # Model to use
@@ -146,7 +195,7 @@ async def describe_img(
  r = await chat([img_path.read_bytes(), prompt], response_format=ImgDescription)
  return r

- # %% ../nbs/01_refine.ipynb 45
+ # %% ../nbs/01_refine.ipynb 47
  async def limit(
  semaphore, # Semaphore for concurrency control
  coro, # Coroutine to execute
@@ -158,14 +207,14 @@ async def limit(
  if delay: await sleep(delay)
  return r

- # %% ../nbs/01_refine.ipynb 47
+ # %% ../nbs/01_refine.ipynb 49
  def parse_r(
  result # ModelResponse object from API call
  ): # Dictionary with 'is_informative' and 'description' keys
  "Extract and parse JSON content from model response"
  return json.loads(result.choices[0].message.content)

- # %% ../nbs/01_refine.ipynb 49
+ # %% ../nbs/01_refine.ipynb 51
  async def describe_imgs(
  imgs: list[Path], # List of image file paths to describe
  model: str = 'claude-sonnet-4-5', # Model to use for image description
@@ -178,7 +227,7 @@ async def describe_imgs(
  results = await gather(*[limit(sem, describe_img(img, model, prompt), delay) for img in imgs])
  return {img.name: parse_r(r) for img, r in zip(imgs, results)}

- # %% ../nbs/01_refine.ipynb 51
+ # %% ../nbs/01_refine.ipynb 53
  def save_img_descs(
  descs: dict, # Dictionary of image descriptions
  dst_fname: Path, # Path to save the JSON file
@@ -186,7 +235,7 @@
  "Save image descriptions to JSON file"
  Path(dst_fname).write_text(json.dumps(descs, indent=2))

- # %% ../nbs/01_refine.ipynb 56
+ # %% ../nbs/01_refine.ipynb 58
  def add_descs_to_pg(
  pg:str, # Page markdown content
  descs:dict # Dictionary mapping image filenames to their descriptions
@@ -197,7 +246,7 @@
  if fname in descs: pg = pg.replace(link, f"{link}\nAI-generated image description:\n___\n{descs[fname]['description']}\n___")
  return pg

- # %% ../nbs/01_refine.ipynb 61
+ # %% ../nbs/01_refine.ipynb 63
  def add_descs_to_pgs(
  pgs:list, # List of page markdown strings
  descs:dict # Dictionary mapping image filenames to their descriptions
@@ -205,7 +254,7 @@
  "Add AI-generated descriptions to images in all pages"
  return [add_descs_to_pg(pg, descs) for pg in pgs]

- # %% ../nbs/01_refine.ipynb 64
+ # %% ../nbs/01_refine.ipynb 66
  async def add_img_descs(
  src:str, # Path to source markdown directory
  dst:str=None, # Destination directory (defaults to src if None)
@@ -214,25 +263,32 @@
  semaphore:int=2, # Max concurrent API requests
  delay:float=1, # Delay in seconds between API calls
  force:bool=False, # Force regeneration even if cache exists
- progress:bool=True # Print progress messages
+ progress:bool=True # Log progress messages
  ):
  "Describe all images in markdown document and insert descriptions inline"
  src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
  if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
  src_imgs = src_path/img_folder
+
+ # Check if image folder exists
+ if not src_imgs.exists():
+ if progress: logger.info(f"No images to describe in the document (no '{img_folder}' folder found)")
+ return
+
  if src_imgs.exists() and dst_path != src_path: shutil.copytree(src_imgs, dst_path/img_folder, dirs_exist_ok=True)
  desc_file = src_path/'img_descriptions.json'
  if desc_file.exists() and not force:
- if progress: print(f"Loading existing descriptions from {desc_file}")
+ if progress: logger.info(f"Loading existing descriptions from {desc_file}")
  descs = json.loads(desc_file.read_text())
  else:
  imgs = (src_path/img_folder).ls(file_exts=['.jpeg', '.jpg', '.png'])
- if progress: print(f"Describing {len(imgs)} images...")
+ if progress: logger.info(f"Describing {len(imgs)} images...")
  descs = await describe_imgs(imgs, model, semaphore=semaphore, delay=delay)
  save_img_descs(descs, desc_file)
- if progress: print(f"Saved descriptions to {desc_file}")
+ if progress: logger.info(f"Saved descriptions to {desc_file}")
  pgs = read_pgs(src_path, join=False)
- if progress: print(f"Adding descriptions to {len(pgs)} pages...")
+ if progress: logger.info(f"Adding descriptions to {len(pgs)} pages...")
  enriched = [add_descs_to_pg(pg, descs) for pg in pgs]
  for i,pg in enumerate(enriched, 1): (dst_path/f'page_{i}.md').write_text(pg)
- if progress: print(f"Done! Enriched pages saved to {dst_path}")
+ if progress: logger.info(f"Done! Enriched pages saved to {dst_path}")
+
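`add_img_descs` caches its results in `img_descriptions.json` under the source directory, and this release adds an early return (with an INFO message) when the image folder is missing. A hedged sketch of re-running it over an existing OCR output directory; the paths are illustrative and the keyword values follow the signature shown in this diff:

    import asyncio
    from mistocr.refine import add_img_descs

    # Reuses img_descriptions.json if present; force=True would regenerate,
    # while semaphore/delay throttle the concurrent description calls.
    asyncio.run(add_img_descs('md_out', dst='md_enriched', model='claude-sonnet-4-5',
                              semaphore=2, delay=1, force=False))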
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mistocr
- Version: 0.2.7
+ Version: 0.4.1
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
  Home-page: https://github.com/franckalbinet/mistocr
  Author: Solveit
@@ -23,6 +23,7 @@ Requires-Dist: mistralai
  Requires-Dist: pillow
  Requires-Dist: dotenv
  Requires-Dist: lisette
+ Requires-Dist: PyPDF2
  Provides-Extra: dev
  Dynamic: author
  Dynamic: author-email
@@ -112,18 +113,6 @@ from mistocr.pipeline import pdf_to_md
  await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
  ```

- Step 1/3: Running OCR on files/test/resnet.pdf...
- Mistral batch job status: QUEUED
- Mistral batch job status: RUNNING
- Mistral batch job status: RUNNING
- Step 2/3: Fixing heading hierarchy...
- Step 3/3: Adding image descriptions...
- Describing 7 images...
- Saved descriptions to ocr_temp/resnet/img_descriptions.json
- Adding descriptions to 12 pages...
- Done! Enriched pages saved to files/test/md_test
- Done!
-
  This will (as indicated by the output):

  1. OCR the PDF using Mistral’s batch API
@@ -3,5 +3,6 @@ mistralai
  pillow
  dotenv
  lisette
+ PyPDF2

  [dev]
@@ -1,7 +1,7 @@
  [DEFAULT]
  repo = mistocr
  lib_name = mistocr
- version = 0.2.7
+ version = 0.4.1
  min_python = 3.9
  license = apache2
  black_formatting = False
@@ -27,7 +27,7 @@ keywords = nbdev jupyter notebook python
  language = English
  status = 3
  user = franckalbinet
- requirements = fastcore mistralai pillow dotenv lisette
+ requirements = fastcore mistralai pillow dotenv lisette PyPDF2
  readme_nb = index.ipynb
  allowed_metadata_keys =
  allowed_cell_metadata_keys =
@@ -1 +0,0 @@
- __version__ = "0.2.7"
@@ -1,37 +0,0 @@
- """End-to-End Pipeline: PDF OCR, Markdown Heading Correction, and AI Image Descriptions"""
-
- # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
-
- # %% auto 0
- __all__ = ['pdf_to_md']
-
- # %% ../nbs/02_pipeline.ipynb 3
- from fastcore.all import *
- from .core import read_pgs, ocr_pdf
- from .refine import add_img_descs, fix_hdgs
- from pathlib import Path
- from asyncio import Semaphore, gather, sleep
- import os, json, shutil
-
- # %% ../nbs/02_pipeline.ipynb 4
- @delegates(add_img_descs)
- async def pdf_to_md(
- pdf_path:str, # Path to input PDF file
- dst:str, # Destination directory for output markdown
- ocr_output:str=None, # Optional OCR output directory (defaults to pdf_path stem)
- model:str='claude-sonnet-4-5', # Model to use for heading fixes and image descriptions
- add_img_desc:bool=True, # Whether to add image descriptions
- progress:bool=True, # Whether to show progress messages
- **kwargs):
- "Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
- n_steps = 3 if add_img_desc else 2
- if progress: print(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
- ocr_dirs = ocr_pdf(pdf_path, ocr_output or 'ocr_temp')
- ocr_dir = ocr_dirs[0]
- if progress: print(f"Step 2/{n_steps}: Fixing heading hierarchy...")
- fix_hdgs(ocr_dir, model=model)
- if add_img_desc:
- if progress: print(f"Step 3/{n_steps}: Adding image descriptions...")
- await add_img_descs(ocr_dir, dst=dst, model=model, progress=progress, **kwargs)
- elif dst and Path(dst) != ocr_dir: shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
- if progress: print("Done!")
5 files without changes