PyPI - mistocr - Versions diffs - 0.1.3__py3-none-any.whl → 0.2.1__py3-none-any.whl - Mend

mistocr 0.1.3py3-none-any.whl → 0.2.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

mistocr/__init__.py +1 -1
mistocr/_modidx.py +15 -3
mistocr/core.py +5 -4
mistocr/pipeline.py +37 -0
mistocr/refine.py +183 -35
mistocr-0.2.1.dist-info/METADATA +253 -0
mistocr-0.2.1.dist-info/RECORD +11 -0
mistocr-0.1.3.dist-info/METADATA +0 -183
mistocr-0.1.3.dist-info/RECORD +0 -10
{mistocr-0.1.3.dist-info → mistocr-0.2.1.dist-info}/WHEEL +0 -0
{mistocr-0.1.3.dist-info → mistocr-0.2.1.dist-info}/entry_points.txt +0 -0
{mistocr-0.1.3.dist-info → mistocr-0.2.1.dist-info}/licenses/LICENSE +0 -0
{mistocr-0.1.3.dist-info → mistocr-0.2.1.dist-info}/top_level.txt +0 -0

mistocr/__init__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.1.3"
1	+ __version__ = "0.2.1"

mistocr/_modidx.py CHANGED Viewed

@@ -11,7 +11,7 @@ d = { 'settings': { 'branch': 'main',
                               'mistocr.core.create_batch_entry': ('core.html#create_batch_entry', 'mistocr/core.py'),
                               'mistocr.core.download_results': ('core.html#download_results', 'mistocr/core.py'),
                               'mistocr.core.get_api_key': ('core.html#get_api_key', 'mistocr/core.py'),
-                              'mistocr.core.ocr': ('core.html#ocr', 'mistocr/core.py'),
+                              'mistocr.core.ocr_pdf': ('core.html#ocr_pdf', 'mistocr/core.py'),
                               'mistocr.core.prep_pdf_batch': ('core.html#prep_pdf_batch', 'mistocr/core.py'),
                               'mistocr.core.read_pgs': ('core.html#read_pgs', 'mistocr/core.py'),
                               'mistocr.core.save_images': ('core.html#save_images', 'mistocr/core.py'),
@@ -20,10 +20,22 @@ d = { 'settings': { 'branch': 'main',
                               'mistocr.core.submit_batch': ('core.html#submit_batch', 'mistocr/core.py'),
                               'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
                               'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
+            'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
             'mistocr.refine': { 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
+                                'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
+                                'mistocr.refine.add_descs_to_pg': ('refine.html#add_descs_to_pg', 'mistocr/refine.py'),
+                                'mistocr.refine.add_descs_to_pgs': ('refine.html#add_descs_to_pgs', 'mistocr/refine.py'),
+                                'mistocr.refine.add_img_descs': ('refine.html#add_img_descs', 'mistocr/refine.py'),
+                                'mistocr.refine.add_pg_hdgs': ('refine.html#add_pg_hdgs', 'mistocr/refine.py'),
                                 'mistocr.refine.apply_hdg_fixes': ('refine.html#apply_hdg_fixes', 'mistocr/refine.py'),
+                                'mistocr.refine.describe_img': ('refine.html#describe_img', 'mistocr/refine.py'),
+                                'mistocr.refine.describe_imgs': ('refine.html#describe_imgs', 'mistocr/refine.py'),
                                 'mistocr.refine.fix_hdg_hierarchy': ('refine.html#fix_hdg_hierarchy', 'mistocr/refine.py'),
-                                'mistocr.refine.fix_md_hdgs': ('refine.html#fix_md_hdgs', 'mistocr/refine.py'),
+                                'mistocr.refine.fix_hdgs': ('refine.html#fix_hdgs', 'mistocr/refine.py'),
                                 'mistocr.refine.fmt_hdgs_idx': ('refine.html#fmt_hdgs_idx', 'mistocr/refine.py'),
                                 'mistocr.refine.get_hdgs': ('refine.html#get_hdgs', 'mistocr/refine.py'),
-                                'mistocr.refine.mk_fixes_lut': ('refine.html#mk_fixes_lut', 'mistocr/refine.py')}}}
+                                'mistocr.refine.limit': ('refine.html#limit', 'mistocr/refine.py'),
+                                'mistocr.refine.mk_fixes_lut': ('refine.html#mk_fixes_lut', 'mistocr/refine.py'),
+                                'mistocr.refine.parse_r': ('refine.html#parse_r', 'mistocr/refine.py'),
+                                'mistocr.refine.read_pgs_pg': ('refine.html#read_pgs_pg', 'mistocr/refine.py'),
+                                'mistocr.refine.save_img_descs': ('refine.html#save_img_descs', 'mistocr/refine.py')}}}

mistocr/core.py CHANGED Viewed

@@ -4,7 +4,7 @@
 # %% auto 0
 __all__ = ['ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch', 'submit_batch',
-           'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr', 'read_pgs']
+           'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf', 'read_pgs']
 # %% ../nbs/00_core.ipynb 3
 from fastcore.all import *
@@ -79,10 +79,11 @@ def submit_batch(
 def wait_for_job(
     job:dict, # Job dict,
     c:Mistral=None, # Mistral client,
-    poll_interval:int=10 # Poll interval in seconds
+    poll_interval:int=1 # Poll interval in seconds
     ) -> dict: # Job dict (with status)
     "Poll job until completion and return final job status"
     while job.status in ["QUEUED", "RUNNING"]:
+        print(f'Mistral batch job status: {job.status}')
         time.sleep(poll_interval)
         job = c.batch.jobs.get(job_id=job.id)
     return job
@@ -161,7 +162,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
     return download_results(job, c)
 # %% ../nbs/00_core.ipynb 43
-def ocr(
+def ocr_pdf(
     path:str, # Path to PDF file or folder,
     dst:str='md', # Directory to save markdown pages,
     inc_img:bool=True, # Include image in response,
@@ -174,7 +175,7 @@ def ocr(
     results = _run_batch(entries, c, poll_interval)
     return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
-# %% ../nbs/00_core.ipynb 48
+# %% ../nbs/00_core.ipynb 47
 def read_pgs(
     path:str, # OCR output directory,
     join:bool=True # Join pages into single string

mistocr/pipeline.py ADDED Viewed

@@ -0,0 +1,37 @@
+"""End-to-End Pipeline: PDF OCR, Markdown Heading Correction, and AI Image Descriptions"""
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
+# %% auto 0
+__all__ = ['pdf_to_md']
+# %% ../nbs/02_pipeline.ipynb 3
+from fastcore.all import *
+from .core import read_pgs, ocr_pdf
+from .refine import add_img_descs, fix_hdgs
+from pathlib import Path
+from asyncio import Semaphore, gather, sleep
+import os, json, shutil
+# %% ../nbs/02_pipeline.ipynb 4
+@delegates(add_img_descs)
+async def pdf_to_md(
+    pdf_path:str, # Path to input PDF file
+    dst:str, # Destination directory for output markdown
+    ocr_output:str=None, # Optional OCR output directory (defaults to pdf_path stem)
+    model:str='claude-sonnet-4-5', # Model to use for heading fixes and image descriptions
+    add_img_desc:bool=True, # Whether to add image descriptions
+    progress:bool=True, # Whether to show progress messages
+    **kwargs):
+    "Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
+    n_steps = 3 if add_img_desc else 2
+    if progress: print(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
+    ocr_dirs = ocr_pdf(pdf_path, ocr_output or 'ocr_temp')
+    ocr_dir = ocr_dirs[0]
+    if progress: print(f"Step 2/{n_steps}: Fixing heading hierarchy...")
+    fix_hdgs(ocr_dir, model=model)
+    if add_img_desc:
+        if progress: print(f"Step 3/{n_steps}: Adding image descriptions...")
+        await add_img_descs(ocr_dir, dst=dst, model=model, progress=progress, **kwargs)
+    elif dst and Path(dst) != ocr_dir: shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
+    if progress: print("Done!")

mistocr/refine.py CHANGED Viewed

@@ -1,24 +1,30 @@
-"""Postprocess markdown files by fixing heading hierarchy and describint images"""
+"""Fix heading hierarchy and describe images in OCR'd markdown documents"""
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
 # %% auto 0
-__all__ = ['prompt_fix_hdgs', 'get_hdgs', 'fmt_hdgs_idx', 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut',
-           'apply_hdg_fixes', 'fix_md_hdgs']
+__all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
+           'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
+           'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs', 'add_descs_to_pg', 'add_descs_to_pgs',
+           'add_img_descs']
 # %% ../nbs/01_refine.ipynb 3
 from fastcore.all import *
 from .core import read_pgs
 from re import sub, findall, MULTILINE
 from pydantic import BaseModel
+from lisette import *
 from lisette.core import completion
+from typing import Callable
 import os
 import json
+import shutil
+from asyncio import Semaphore, gather, sleep
 # %% ../nbs/01_refine.ipynb 7
 def get_hdgs(
     md:str # Markdown file string
-    ):
+    ) -> L: # L of strings
     "Return the markdown headings"
     # Sanitize removing '#' in python snippet if any
     md = sub(r'```[\s\S]*?```', '', md)
@@ -26,7 +32,25 @@ def get_hdgs(
-# %% ../nbs/01_refine.ipynb 10
+# %% ../nbs/01_refine.ipynb 8
+def add_pg_hdgs(
+    md:str, # Markdown file string,
+    n:int # Page number
+    ) -> str: # Markdown file string
+    "Add page number to all headings in page markdown"
+    md = sub(r'```[\s\S]*?```', '', md)
+    def repl(m): return m.group(0) + f' ... page {n}'
+    return sub(r'^#{1,6} .+$', repl, md, flags=MULTILINE)
+# %% ../nbs/01_refine.ipynb 12
+def read_pgs_pg(
+    path:str # Path to the markdown file
+    ) -> L: # List of markdown pages
+    "Read all pages of a markdown file and add page numbers to all headings"
+    pgs = read_pgs(path, join=False)
+    return L([add_pg_hdgs(pg, n) for n, pg in enumerate(pgs, 1)]).concat()
+# %% ../nbs/01_refine.ipynb 15
 def fmt_hdgs_idx(
     hdgs: list[str] # List of markdown headings
     ) -> str: # Formatted string with index
@@ -34,19 +58,21 @@ def fmt_hdgs_idx(
     return '\n'.join(f"{i}. {h}" for i, h in enumerate(hdgs))
-# %% ../nbs/01_refine.ipynb 13
+# %% ../nbs/01_refine.ipynb 18
 class HeadingCorrections(BaseModel):
     corrections: dict[int, str]  # index → corrected heading
-# %% ../nbs/01_refine.ipynb 15
+# %% ../nbs/01_refine.ipynb 20
 prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
-INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title")
+INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
 RULES - Apply these fixes in order:
-1. **Single H1 rule**: Documents must have exactly ONE # heading (the title/main heading)
-   - All other headings should be ## or deeper
+1. **Single H1 rule**: Documents must have exactly ONE # heading (typically the document title at the top)
+   - If index 0 is already #, then all subsequent headings (index 1+) must be ## or deeper
+   - If no H1 exists, the first major heading should be #, and all others ## or deeper
+   - NO exceptions: appendices, references, and all sections are ## or deeper after the title
 2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
    - Parent section (e.g., "1", "2", "A") should be shallower than child (e.g., "1.1", "2.a", "A.1")
@@ -60,58 +86,180 @@ RULES - Apply these fixes in order:
 4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
 OUTPUT: Return a Python dictionary mapping index to corrected heading (without the index prefix).
+IMPORTANT: Preserve the " ... page N" suffix in all corrected headings.
 Only include entries that need changes.
 Headings to analyze:
 {headings_list}
 """
-# %% ../nbs/01_refine.ipynb 16
+# %% ../nbs/01_refine.ipynb 22
 def fix_hdg_hierarchy(
     hdgs: list[str], # List of markdown headings
-    prompt: str=prompt_fix_hdgs, # Prompt to use
+    prompt: str=None, # Prompt to use
     model: str='claude-sonnet-4-5', # Model to use
-    api_key: str=os.getenv('ANTHROPIC_API_KEY') # API key
+    api_key: str=None # API key
     ) -> dict[int, str]: # Dictionary of index → corrected heading
     "Fix the heading hierarchy"
-    r = completion(
-        model=model,
-        messages=[{"role": "user", "content": prompt_fix_hdgs.format(headings_list=fmt_hdgs_idx(hdgs))}],
-        response_format=HeadingCorrections,
-        api_key=api_key
-        )
+    if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
+    if prompt is None: prompt = prompt_fix_hdgs
+    prompt = prompt.format(headings_list=fmt_hdgs_idx(hdgs))
+    r = completion(model=model, messages=[{"role": "user", "content": prompt}], response_format=HeadingCorrections, api_key=api_key)
     return json.loads(r.choices[0].message.content)['corrections']
-# %% ../nbs/01_refine.ipynb 19
+# %% ../nbs/01_refine.ipynb 25
+@delegates(fix_hdg_hierarchy)
 def mk_fixes_lut(
     hdgs: list[str], # List of markdown headings
     model: str='claude-sonnet-4-5', # Model to use
-    api_key: str=os.getenv('ANTHROPIC_API_KEY') # API key
+    api_key: str=None, # API key
+    **kwargs
     ) -> dict[str, str]: # Dictionary of old → new heading
     "Make a lookup table of fixes"
-    fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key)
+    if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
+    fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
     return {hdgs[int(k)]:v for k,v in fixes.items()}
-# %% ../nbs/01_refine.ipynb 22
+# %% ../nbs/01_refine.ipynb 28
 def apply_hdg_fixes(
     p:str, # Page to fix
     lut_fixes: dict[str, str], # Lookup table of fixes
-    pg: int=None, # Optionnaly specify the page number to append to original heading
     ) -> str: # Page with fixes applied
     "Apply the fixes to the page"
-    for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old) + (f' .... page {pg}' if pg else ''))
+    for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
     return p
-# %% ../nbs/01_refine.ipynb 25
-def fix_md_hdgs(
-    src:str, # Source directory with markdown pages
-    model:str='claude-sonnet-4-5', # Model
-    dst:str=None, # Destination directory (None=overwrite)
-    pg_nums:bool=True # Add page numbers
-):
+# %% ../nbs/01_refine.ipynb 31
+@delegates(mk_fixes_lut)
+def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:str='img', **kwargs):
     "Fix heading hierarchy in markdown document"
     src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
     if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
-    lut = mk_fixes_lut(get_hdgs(read_pgs(src_path)), model)
-    for i,p in enumerate(read_pgs(src_path, join=False), 1):
-        (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut, pg=i if pg_nums else None))
+    src_imgs = src_path/img_folder
+    if src_imgs.exists() and dst_path != src_path: shutil.copytree(src_imgs, dst_path/img_folder, dirs_exist_ok=True)
+    pgs_with_pg = read_pgs_pg(src_path)
+    lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model, **kwargs)
+    for i,p in enumerate(pgs_with_pg, 1): (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))
+# %% ../nbs/01_refine.ipynb 37
+class ImgDescription(BaseModel):
+    "Image classification and description for OCR'd documents"
+    is_informative:bool # Whether image contains informative content (charts, diagrams, tables) vs decorative (logos, backgrounds)
+    description:str # Detailed description of the image content for RAG and accessibility
+# %% ../nbs/01_refine.ipynb 40
+describe_img_prompt = """Analyze this image from an academic/technical document.
+Step 1: Determine if this image is informative for understanding the document content.
+- Informative: charts, diagrams, tables, technical illustrations, experimental results, architectural diagrams
+- Non-informative: logos, decorative images, generic photos, page backgrounds
+Step 2:
+- If informative: Provide a detailed description including the type of visualization, key elements and their relationships, important data or patterns, and relevant technical details.
+- If non-informative: Provide a brief label (e.g., "Company logo", "Decorative header image")
+Return your response as JSON with 'is_informative' (boolean) and 'description' (string) fields."""
+# %% ../nbs/01_refine.ipynb 41
+async def describe_img(
+    img_path: Path,  # Path to the image file
+    model: str = 'claude-sonnet-4-5',  # Model to use
+    prompt: str = describe_img_prompt  # Prompt for description
+) -> ImgDescription:
+    "Describe a single image using AsyncChat"
+    chat = AsyncChat(model=model)
+    r = await chat([img_path.read_bytes(), prompt], response_format=ImgDescription)
+    return r
+# %% ../nbs/01_refine.ipynb 45
+async def limit(
+    semaphore, # Semaphore for concurrency control
+    coro, # Coroutine to execute
+    delay:float=None # Optional delay in seconds after execution
+):
+    "Execute coroutine with semaphore-based rate limiting and optional delay"
+    async with semaphore:
+        r = await coro
+        if delay: await sleep(delay)
+        return r
+# %% ../nbs/01_refine.ipynb 47
+def parse_r(
+    result # ModelResponse object from API call
+): # Dictionary with 'is_informative' and 'description' keys
+    "Extract and parse JSON content from model response"
+    return json.loads(result.choices[0].message.content)
+# %% ../nbs/01_refine.ipynb 49
+async def describe_imgs(
+    imgs: list[Path], # List of image file paths to describe
+    model: str = 'claude-sonnet-4-5', # Model to use for image description
+    prompt: str = describe_img_prompt, # Prompt template for description
+    semaphore: int = 2, # Max concurrent API requests
+    delay: float = 1 # Delay in seconds between requests
+) -> dict[str, dict]: # Dict mapping filename to parsed description
+    "Describe multiple images in parallel with rate limiting"
+    sem = Semaphore(semaphore)
+    results = await gather(*[limit(sem, describe_img(img, model, prompt), delay) for img in imgs])
+    return {img.name: parse_r(r) for img, r in zip(imgs, results)}
+# %% ../nbs/01_refine.ipynb 51
+def save_img_descs(
+    descs: dict, # Dictionary of image descriptions
+    dst_fname: Path, # Path to save the JSON file
+    ) -> None:
+    "Save image descriptions to JSON file"
+    Path(dst_fname).write_text(json.dumps(descs, indent=2))
+# %% ../nbs/01_refine.ipynb 56
+def add_descs_to_pg(
+    pg:str, # Page markdown content
+    descs:dict # Dictionary mapping image filenames to their descriptions
+) -> str: # Page markdown with descriptions added
+    "Add AI-generated descriptions to images in page"
+    for link in re.findall(r'!\[[^\]]*\]\([^)]+\)', pg):
+        fname = re.findall(r'\(([^)]+)\)', link)[0]
+        if fname in descs: pg = pg.replace(link, f"{link}\nAI-generated image description:\n___\n{descs[fname]['description']}\n___")
+    return pg
+# %% ../nbs/01_refine.ipynb 61
+def add_descs_to_pgs(
+    pgs:list, # List of page markdown strings
+    descs:dict # Dictionary mapping image filenames to their descriptions
+) -> list: # List of pages with descriptions added
+    "Add AI-generated descriptions to images in all pages"
+    return [add_descs_to_pg(pg, descs) for pg in pgs]
+# %% ../nbs/01_refine.ipynb 64
+async def add_img_descs(
+    src:str, # Path to source markdown directory
+    dst:str=None, # Destination directory (defaults to src if None)
+    model:str='claude-sonnet-4-5', # Vision model for image description
+    img_folder:str='img', # Name of folder containing images
+    semaphore:int=2, # Max concurrent API requests
+    delay:float=1, # Delay in seconds between API calls
+    force:bool=False, # Force regeneration even if cache exists
+    progress:bool=True # Print progress messages
+):
+    "Describe all images in markdown document and insert descriptions inline"
+    src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
+    if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
+    src_imgs = src_path/img_folder
+    if src_imgs.exists() and dst_path != src_path: shutil.copytree(src_imgs, dst_path/img_folder, dirs_exist_ok=True)
+    desc_file = src_path/'img_descriptions.json'
+    if desc_file.exists() and not force:
+        if progress: print(f"Loading existing descriptions from {desc_file}")
+        descs = json.loads(desc_file.read_text())
+    else:
+        imgs = (src_path/img_folder).ls(file_exts=['.jpeg', '.jpg', '.png'])
+        if progress: print(f"Describing {len(imgs)} images...")
+        descs = await describe_imgs(imgs, model, semaphore=semaphore, delay=delay)
+        save_img_descs(descs, desc_file)
+        if progress: print(f"Saved descriptions to {desc_file}")
+    pgs = read_pgs(src_path, join=False)
+    if progress: print(f"Adding descriptions to {len(pgs)} pages...")
+    enriched = [add_descs_to_pg(pg, descs) for pg in pgs]
+    for i,pg in enumerate(enriched, 1): (dst_path/f'page_{i}.md').write_text(pg)
+    if progress: print(f"Done! Enriched pages saved to {dst_path}")

mistocr-0.2.1.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,253 @@
+Metadata-Version: 2.4
+Name: mistocr
+Version: 0.2.1
+Summary: Batch OCR for PDFs with heading restoration and visual content integration
+Home-page: https://github.com/franckalbinet/mistocr
+Author: Solveit
+Author-email: nobody@fast.ai
+License: Apache Software License 2.0
+Keywords: nbdev jupyter notebook python
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Natural Language :: English
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: License :: OSI Approved :: Apache Software License
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: fastcore
+Requires-Dist: mistralai
+Requires-Dist: pillow
+Requires-Dist: dotenv
+Requires-Dist: lisette
+Provides-Extra: dev
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: license-file
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+# mistocr
+<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->
+**PDF OCR is a critical bottleneck in AI pipelines.** It’s often
+mentioned in passing, as if it’s a trivial step. Practice shows it’s far
+from it. Poorly converted PDFs mean garbage-in-garbage-out for
+downstream AI systems (RAG, …).
+When [Mistral AI](https://mistral.ai) released their [state-of-the-art
+OCR model](https://mistral.ai/fr/news/mistral-ocr) in March 2025, it
+opened new possibilities for large-scale document processing. While
+alternatives like [datalab.to](https://www.datalab.to) and
+[docling.ai](https://www.docling.ai) offer viable solutions, Mistral OCR
+delivers exceptional accuracy at a compelling price point.
+**mistocr** emerged from months of real-world usage across projects
+requiring large-scale processing of niche-domain PDFs. It addresses three
+fundamental challenges that raw OCR output leaves unsolved:
+- **Heading hierarchy restoration**: Even state-of-the-art OCR sometimes
+  produces inconsistent heading levels in large documents—a complex task
+  to get right. mistocr uses LLM-based analysis to restore proper
+  document structure, essential for downstream AI tasks.
+- **Visual content integration**: Charts, figures and diagrams are
+  automatically classified and described, then integrated into the
+  markdown. This makes visual information searchable and accessible for
+  downstream applications.
+- **Cost-efficient batch processing**: By exclusively using Mistral’s
+  batch API, mistocr cuts costs by 50% (\$0.50 vs \$1.00 per 1000 pages)
+  while eliminating the boilerplate code typically required.
+**In short**: Production-ready batch OCR with intelligent postprocessing
+that ensures your documents are actually usable for AI systems.
+## Get Started
+Install latest from [pypi](https://pypi.org/project/mistocr), then:
+``` sh
+$ pip install mistocr
+```
+Set your API keys:
+``` python
+import os
+os.environ['MISTRAL_API_KEY'] = 'your-key-here'
+os.environ['ANTHROPIC_API_KEY'] = 'your-key-here'  # for refine features (see Advanced Usage for other LLMs)
+```
+### Complete Pipeline
+Full pipeline with all features:
+``` python
+from mistocr.pipeline import pdf_to_md
+await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
+```
+    Step 1/3: Running OCR on files/test/resnet.pdf...
+    Mistral batch job status: QUEUED
+    Mistral batch job status: RUNNING
+    Mistral batch job status: RUNNING
+    Step 2/3: Fixing heading hierarchy...
+    Step 3/3: Adding image descriptions...
+    Describing 7 images...
+    Saved descriptions to ocr_temp/resnet/img_descriptions.json
+    Adding descriptions to 12 pages...
+    Done! Enriched pages saved to files/test/md_test
+    Done!
+This will (as indicated by the output):
+1.  OCR the PDF using Mistral’s batch API
+2.  Fix heading hierarchy inconsistencies
+3.  Describe images (charts, diagrams) and add those descriptions into the markdown
+4.  Save everything to `files/test/md_test`
+The output structure will be:
+    files/test/md_test/
+    ├── img/
+    │   ├── img-0.jpeg
+    │   ├── img-1.jpeg
+    │   └── ...
+    ├── page_1.md
+    ├── page_2.md
+    └── ...
+Each page’s markdown will include inline image descriptions:
+```` markdown
+```markdown
+![Figure 1](img/img-0.jpeg)
+AI-generated image description:
+___
+A residual learning block...
+___
+```
+````
+To print the processed markdown, you can use the
+[`read_pgs`](https://franckalbinet.github.io/mistocr/core.html#read_pgs)
+function. Here’s how:
+Then to read the fully processed document:
+``` python
+from mistocr.pipeline import read_pgs
+md = read_pgs('files/test/md_test')
+print(md[:500])
+```
+    # Deep Residual Learning for Image Recognition  ... page 1
+    Kaiming He Xiangyu Zhang Shaoqing Ren Jian Sun<br>Microsoft Research<br>\{kahe, v-xiangz, v-shren, jiansun\}@microsoft.com
+    ## Abstract ... page 1
+    Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, ins
+By default,
+[`read_pgs()`](https://franckalbinet.github.io/mistocr/core.html#read_pgs)
+joins all pages. Pass `join=False` to get a list of individual pages
+instead.
+### Advanced Usage
+**Batch process entire folders:**
+``` python
+from mistocr.core import ocr_pdf
+# Process all PDFs in a folder
+output_dirs = ocr_pdf('path/to/pdf_folder', dst='output_folder')
+```
+**Custom models and prompts for heading fixes:**
+``` python
+from mistocr.refine import fix_hdgs
+# Use a different model or custom prompt
+fix_hdgs('ocr_output/doc1',
+         model='gpt-4o',
+         prompt=your_custom_prompt)
+```
+**Custom image description with rate limiting:**
+``` python
+from mistocr.refine import add_img_descs
+# Control API usage and customize descriptions
+await add_img_descs('ocr_output/doc1',
+                    model='claude-opus-4',
+                    semaphore=5,  # More concurrent requests
+                    delay=0.5)    # Shorter delay between calls
+```
+For complete control over each pipeline step, see the
+[core](https://fr.anckalbi.net/mistocr/core.html),
+[refine](https://fr.anckalbi.net/mistocr/refine.html), and
+[pipeline](https://fr.anckalbi.net/mistocr/pipeline.html) module
+documentation.
+## Known Limitations & Future Work
+`mistocr` is under active development. Current limitations include:
+- **No timeout on batch jobs**: Jobs poll indefinitely until completion.
+  If a job stalls, manual intervention is required.
+- **Limited error handling**: When batch jobs fail, error reporting and
+  recovery options are minimal.
+- **Progress monitoring**: Currently limited to periodic status prints.
+  Future versions will support callbacks or streaming updates for better
+  real-time monitoring.
+Contributions are welcome! If you encounter issues or have ideas for
+improvements, please open an issue or discussion on
+[GitHub](https://github.com/franckalbinet/mistocr).
+## Developer Guide
+If you are new to using `nbdev` here are some useful pointers to get you
+started.
+### Install mistocr in Development mode
+``` sh
+# make sure mistocr package is installed in development mode
+$ pip install -e .
+# make changes under nbs/ directory
+# ...
+# compile to have changes apply to mistocr
+$ nbdev_prepare
+```
+### Documentation
+Documentation can be found hosted on this GitHub
+[repository](https://github.com/franckalbinet/mistocr)’s
+[pages](https://franckalbinet.github.io/mistocr/). Additionally you can
+find package manager specific guidelines on
+[conda](https://anaconda.org/franckalbinet/mistocr) and
+[pypi](https://pypi.org/project/mistocr/) respectively.

mistocr-0.2.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,11 @@
+mistocr/__init__.py,sha256=HfjVOrpTnmZ-xVFCYSVmX50EXaBQeJteUHG-PD6iQs8,22
+mistocr/_modidx.py,sha256=JtXJb6ErVwyGFCWgeVz7N9JvybBp8Zyao4Ahn3smr90,4030
+mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
+mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
+mistocr/refine.py,sha256=zsPoxWD63bk1rzRVO9OPsevWeMNORHgT_y8H7T7CxYs,11785
+mistocr-0.2.1.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
+mistocr-0.2.1.dist-info/METADATA,sha256=-y9Ze92RygrKGCfHbBjlGXlv-5iRYVAOyHtC9MHnplw,7990
+mistocr-0.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mistocr-0.2.1.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
+mistocr-0.2.1.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
+mistocr-0.2.1.dist-info/RECORD,,

mistocr-0.1.3.dist-info/METADATA DELETED Viewed

@@ -1,183 +0,0 @@
-Metadata-Version: 2.4
-Name: mistocr
-Version: 0.1.3
-Summary: Simple batch OCR for PDFs using Mistral's state-of-the-art vision model
-Home-page: https://github.com/franckalbinet/mistocr
-Author: Solveit
-Author-email: nobody@fast.ai
-License: Apache Software License 2.0
-Keywords: nbdev jupyter notebook python
-Classifier: Development Status :: 4 - Beta
-Classifier: Intended Audience :: Developers
-Classifier: Natural Language :: English
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: License :: OSI Approved :: Apache Software License
-Requires-Python: >=3.9
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: fastcore
-Requires-Dist: mistralai
-Requires-Dist: pillow
-Requires-Dist: dotenv
-Requires-Dist: lisette
-Provides-Extra: dev
-Dynamic: author
-Dynamic: author-email
-Dynamic: classifier
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: keywords
-Dynamic: license
-Dynamic: license-file
-Dynamic: provides-extra
-Dynamic: requires-dist
-Dynamic: requires-python
-Dynamic: summary
-# mistocr
-<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->
-## Why mistocr?
-**Performance**: Mistral’s OCR delivers state-of-the-art accuracy on
-complex documents including tables, charts, and multi-column layouts.
-**Scale**: Process entire folders of PDFs in a single batch job. Upload
-once, process asynchronously, and retrieve results when ready - perfect
-for large document sets.
-**Cost savings**: Batch OCR mode reduces costs from \$1/1000 pages to
-\$0.50/1000 pages - a 50% reduction compared to synchronous processing.
-**Simplicity**: A single
-[`ocr()`](https://franckalbinet.github.io/mistocr/core.html#ocr)
-function handles everything - uploading, batch submission, polling for
-completion, and saving results as markdown with extracted images.
-Process one PDF or an entire folder with the same simple interface.
-**Organized output**: Each PDF is automatically saved to its own folder
-with pages as separate markdown files and images in an `img` subfolder,
-making results easy to navigate and process further.
-## Installation
-Install latest from the GitHub
-[repository](https://github.com/franckalbinet/mistocr):
-``` sh
-$ pip install git+https://github.com/franckalbinet/mistocr.git
-```
-or from [pypi](https://pypi.org/project/mistocr/)
-``` sh
-$ pip install mistocr
-```
-## How to use
-### Basic usage
-Process a single PDF:
-``` python
-from mistocr.core import ocr
-fname = 'files/test/attention-is-all-you-need.pdf'
-result = ocr(fname)
-```
-Or process an entire folder:
-``` python
-results = ocr('files/test')
-```
-### Output structure
-Each PDF is saved to its own folder with pages as separate markdown
-files and images in an `img` subfolder:
-    files/test/md/
-    ├── attention-is-all-you-need/
-    │   ├── img/
-    │   │   ├── img-0.jpeg
-    │   │   ├── img-1.jpeg
-    │   │   └── ...
-    │   ├── page_1.md
-    │   ├── page_2.md
-    │   └── ...
-    └── resnet/
-        ├── img/
-        └── ...
-### Reading results
-Read all pages from a processed PDF:
-``` python
-from mistocr.core import read_pgs
-text = read_pgs('files/test/md/attention-is-all-you-need')
-```
-Or read a specific page:
-``` python
-text = read_pgs('files/test/md/attention-is-all-you-need', 10)
-```
-### Customization
-Customize output directory, image inclusion, and polling interval:
-``` python
-results = ocr('files/test', out_dir='output', inc_img=False, poll_interval=5)
-```
-**Parameters:**
-- **`path`**: A single PDF file or folder containing multiple PDFs
-- **`out_dir`**: Directory name for saving markdown output (default:
-  `'md'`)
-- **`inc_img`**: Include extracted images in the output (default:
-  `True`)
-- **`key`**: Your Mistral API key (uses `MISTRAL_API_KEY` environment
-  variable if not provided)
-- **`poll_interval`**: Seconds between batch job status checks (default:
-  `2`)
-**Returns:** List of paths to the generated markdown files
-## Developer Guide
-If you are new to using `nbdev` here are some useful pointers to get you
-started.
-### Install mistocr in Development mode
-``` sh
-# make sure mistocr package is installed in development mode
-$ pip install -e .
-# make changes under nbs/ directory
-# ...
-# compile to have changes apply to mistocr
-$ nbdev_prepare
-```
-### Documentation
-Documentation can be found hosted on this GitHub
-[repository](https://github.com/franckalbinet/mistocr)’s
-[pages](https://franckalbinet.github.io/mistocr/). Additionally you can
-find package manager specific guidelines on
-[conda](https://anaconda.org/franckalbinet/mistocr) and
-[pypi](https://pypi.org/project/mistocr/) respectively.

mistocr-0.1.3.dist-info/RECORD DELETED Viewed

@@ -1,10 +0,0 @@
-mistocr/__init__.py,sha256=XEqb2aiIn8fzGE68Mph4ck1FtQqsR_am0wRWvrYPffQ,22
-mistocr/_modidx.py,sha256=R9zVMv4dKz2sLStoB5wBoKRqjza216_z8xPXszoplU4,2660
-mistocr/core.py,sha256=wtaYZ_Fz0dXIb1cYLAtymwR9Z7_KBI4ULy-UpM3YTX4,7154
-mistocr/refine.py,sha256=572SDG8vhGjNMiET5eZhgVemNpUIHNFqi0ZSSl4eKCM,4545
-mistocr-0.1.3.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
-mistocr-0.1.3.dist-info/METADATA,sha256=jHRc6nm_uk7V-03y6Bd268hUWmkkOFNdt4s5cH3YPu0,4848
-mistocr-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mistocr-0.1.3.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
-mistocr-0.1.3.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
-mistocr-0.1.3.dist-info/RECORD,,

{mistocr-0.1.3.dist-info → mistocr-0.2.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{mistocr-0.1.3.dist-info → mistocr-0.2.1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{mistocr-0.1.3.dist-info → mistocr-0.2.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{mistocr-0.1.3.dist-info → mistocr-0.2.1.dist-info}/top_level.txt RENAMED Viewed

File without changes

mistocr 0.1.3__py3-none-any.whl → 0.2.1__py3-none-any.whl

mistocr 0.1.3-py3-none-any.whl → 0.2.1-py3-none-any.whl