PyPI - mistocr - Versions diffs - 0.0.4__tar.gz → 0.1.0__tar.gz - Mend

mistocr 0.0.4__tar.gz → 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

{mistocr-0.0.4/mistocr.egg-info → mistocr-0.1.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mistocr
-Version: 0.0.4
+Version: 0.1.0
 Summary: Simple batch OCR for PDFs using Mistral's state-of-the-art vision model
 Home-page: https://github.com/franckalbinet/mistocr
 Author: Solveit
@@ -22,6 +22,7 @@ Requires-Dist: fastcore
 Requires-Dist: mistralai
 Requires-Dist: pillow
 Requires-Dist: dotenv
+Requires-Dist: lisette
 Provides-Extra: dev
 Dynamic: author
 Dynamic: author-email

mistocr-0.1.0/mistocr/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "0.1.0"

{mistocr-0.0.4 → mistocr-0.1.0}/mistocr/_modidx.py RENAMED Viewed

@@ -19,4 +19,11 @@ d = { 'settings': { 'branch': 'main',
                               'mistocr.core.save_pages': ('core.html#save_pages', 'mistocr/core.py'),
                               'mistocr.core.submit_batch': ('core.html#submit_batch', 'mistocr/core.py'),
                               'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
-                              'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')}}}
+                              'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
+            'mistocr.refine': { 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
+                                'mistocr.refine.apply_hdg_fixes': ('refine.html#apply_hdg_fixes', 'mistocr/refine.py'),
+                                'mistocr.refine.fix_hdg_hierarchy': ('refine.html#fix_hdg_hierarchy', 'mistocr/refine.py'),
+                                'mistocr.refine.fix_md_hdgs': ('refine.html#fix_md_hdgs', 'mistocr/refine.py'),
+                                'mistocr.refine.fmt_hdgs_idx': ('refine.html#fmt_hdgs_idx', 'mistocr/refine.py'),
+                                'mistocr.refine.get_hdgs': ('refine.html#get_hdgs', 'mistocr/refine.py'),
+                                'mistocr.refine.mk_fixes_lut': ('refine.html#mk_fixes_lut', 'mistocr/refine.py')}}}

{mistocr-0.0.4 → mistocr-0.1.0}/mistocr/core.py RENAMED Viewed

@@ -110,11 +110,11 @@ def save_images(
 # %% ../nbs/00_core.ipynb 32
 def save_page(
     page:dict, # Page dict,
-    out_dir:str, # Directory to save page
+    dst:str, # Directory to save page
     img_dir:str='img' # Directory to save images
     ) -> None:
     "Save single page markdown and images"
-    (out_dir / f"page_{page['index']+1}.md").write_text(page['markdown'])
+    (dst / f"page_{page['index']+1}.md").write_text(page['markdown'])
     if page.get('images'):
         img_dir.mkdir(exist_ok=True)
         save_images(page, img_dir)
@@ -122,15 +122,15 @@ def save_page(
 # %% ../nbs/00_core.ipynb 34
 def save_pages(
     ocr_resp:dict, # OCR response,
-    out_dir:str, # Directory to save pages,
+    dst:str, # Directory to save pages,
     cid:str # Custom ID
     ) -> Path: # Output directory
     "Save markdown pages and images from OCR response to output directory"
-    out_dir = Path(out_dir) / cid
-    out_dir.mkdir(parents=True, exist_ok=True)
-    img_dir = out_dir / 'img'
-    for page in ocr_resp['pages']: save_page(page, out_dir, img_dir)
-    return out_dir
+    dst = Path(dst) / cid
+    dst.mkdir(parents=True, exist_ok=True)
+    img_dir = dst / 'img'
+    for page in ocr_resp['pages']: save_page(page, dst, img_dir)
+    return dst
 # %% ../nbs/00_core.ipynb 40
 def _get_paths(path:str) -> list[Path]:
@@ -163,7 +163,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
 # %% ../nbs/00_core.ipynb 43
 def ocr(
     path:str, # Path to PDF file or folder,
-    out_dir:str='md', # Directory to save markdown pages,
+    dst:str='md', # Directory to save markdown pages,
     inc_img:bool=True, # Include image in response,
     key:str=None, # API key,
     poll_interval:int=2 # Poll interval in seconds
@@ -172,18 +172,15 @@ def ocr(
     pdfs = _get_paths(path)
     entries, c = _prep_batch(pdfs, inc_img, key)
     results = _run_batch(entries, c, poll_interval)
-    return L([save_pages(r['response']['body'], out_dir, r['custom_id']) for r in results])
+    return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
 # %% ../nbs/00_core.ipynb 48
 def read_pgs(
     path:str, # OCR output directory,
-    pg:int=None, # Page number
-    ) -> str:
+    join:bool=True # Join pages into single string
+    ) -> str|list[str]: # Joined string or list of page contents
     "Read specific page or all pages from OCR output directory"
     path = Path(path)
-    if pg:
-        pg_path = path / f'page_{pg}.md'
-        if not pg_path.exists(): raise ValueError(f"Page {pg} not found")
-        return pg_path.read_text()
     pgs = sorted(path.glob('page_*.md'), key=lambda p: int(p.stem.split('_')[1]))
-    return '\n\n'.join([p.read_text() for p in pgs])
+    contents = L([p.read_text() for p in pgs])
+    return '\n\n'.join(contents) if join else contents

mistocr-0.1.0/mistocr/refine.py ADDED Viewed

@@ -0,0 +1,113 @@
+"""Postprocess markdown files by fixing heading hierarchy and describing images"""
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
+# %% auto 0
+__all__ = ['prompt_fix_hdgs', 'get_hdgs', 'fmt_hdgs_idx', 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut',
+           'apply_hdg_fixes', 'fix_md_hdgs']
+# %% ../nbs/01_refine.ipynb 3
+from fastcore.all import *
+from .core import read_pgs
+from re import sub, findall, MULTILINE
+from pydantic import BaseModel
+from lisette.core import completion
+import os
+import json
+# %% ../nbs/01_refine.ipynb 7
+def get_hdgs(
+    md:str # Markdown file string
+    ):
+    "Return the markdown headings"
+    # Strip fenced code blocks first so '#' lines inside them (e.g. Python comments) are not matched as headings
+    md = sub(r'```[\s\S]*?```', '', md)
+    return L(findall(r'^#{1,6} .+$', md, MULTILINE))
+# %% ../nbs/01_refine.ipynb 10
+def fmt_hdgs_idx(
+    hdgs: list[str] # List of markdown headings
+    ) -> str: # Formatted string with index
+    "Format the headings with index"
+    return '\n'.join(f"{i}. {h}" for i, h in enumerate(hdgs))
+# %% ../nbs/01_refine.ipynb 13
+class HeadingCorrections(BaseModel):
+    corrections: dict[int, str]  # index → corrected heading
+# %% ../nbs/01_refine.ipynb 15
+prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
+INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title")
+RULES - Only fix these errors:
+1. **Level jumps**: Headings can only increase by one # at a time
+   - Wrong: 0. # Title → 1. #### Abstract
+   - Fixed: 0. # Title → 1. ## Abstract
+2. **Numbering inconsistency**: Subsection numbers must be one level deeper
+   - Wrong: 4. ## 3. Section → 5. ## 3.1 Subsection
+   - Fixed: 4. ## 3. Section → 5. ### 3.1 Subsection
+3. **Preserve working structure**: If sections are consistently marked, keep it
+4. **Decreasing levels is OK**: Going from ### to ## is valid for new sections
+OUTPUT: Return a Python dictionary mapping index to corrected heading (without the index prefix).
+Only include entries that need changes. Example: {{1: '## Abstract', 15: '### PASCAL VOC'}}
+Headings to analyze:
+{headings_list}
+"""
+# %% ../nbs/01_refine.ipynb 16
+def fix_hdg_hierarchy(
+    hdgs: list[str], # List of markdown headings
+    model: str='claude-sonnet-4-5', # Model to use
+    api_key: str=os.getenv('ANTHROPIC_API_KEY') # API key
+    ) -> dict[int, str]: # Dictionary of index → corrected heading
+    "Fix the heading hierarchy"
+    r = completion(
+        model=model,
+        messages=[{"role": "user", "content": prompt_fix_hdgs.format(headings_list=fmt_hdgs_idx(hdgs))}],
+        response_format=HeadingCorrections,
+        api_key=api_key
+        )
+    return json.loads(r.choices[0].message.content)['corrections']
+# %% ../nbs/01_refine.ipynb 19
+def mk_fixes_lut(
+    hdgs: list[str], # List of markdown headings
+    model: str='claude-sonnet-4-5', # Model to use
+    api_key: str=os.getenv('ANTHROPIC_API_KEY') # API key
+    ) -> dict[str, str]: # Dictionary of old → new heading
+    "Make a lookup table of fixes"
+    fixes = fix_hdg_hierarchy(hdgs, model, api_key)
+    return {hdgs[int(k)]:v for k,v in fixes.items()}
+# %% ../nbs/01_refine.ipynb 22
+def apply_hdg_fixes(
+    p:str, # Page to fix
+    lut_fixes: dict[str, str], # Lookup table of fixes
+    pg: int=None, # Optionally specify the page number to append to original heading
+    ) -> str: # Page with fixes applied
+    "Apply the fixes to the page"
+    for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old) + (f' .... page {pg}' if pg else ''))
+    return p
+# %% ../nbs/01_refine.ipynb 25
+def fix_md_hdgs(
+    src:str, # Source directory with markdown pages
+    model:str='claude-sonnet-4-5', # Model
+    dst:str=None, # Destination directory (None=overwrite)
+    pg_nums:bool=True # Add page numbers
+):
+    "Fix heading hierarchy in markdown document"
+    src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
+    if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
+    lut = mk_fixes_lut(get_hdgs(read_pgs(src_path)), model)
+    for i,p in enumerate(read_pgs(src_path, join=False), 1):
+        (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut, pg=i if pg_nums else None))

{mistocr-0.0.4 → mistocr-0.1.0/mistocr.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mistocr
-Version: 0.0.4
+Version: 0.1.0
 Summary: Simple batch OCR for PDFs using Mistral's state-of-the-art vision model
 Home-page: https://github.com/franckalbinet/mistocr
 Author: Solveit
@@ -22,6 +22,7 @@ Requires-Dist: fastcore
 Requires-Dist: mistralai
 Requires-Dist: pillow
 Requires-Dist: dotenv
+Requires-Dist: lisette
 Provides-Extra: dev
 Dynamic: author
 Dynamic: author-email

{mistocr-0.0.4 → mistocr-0.1.0}/mistocr.egg-info/SOURCES.txt RENAMED Viewed

@@ -7,6 +7,7 @@ setup.py
 mistocr/__init__.py
 mistocr/_modidx.py
 mistocr/core.py
+mistocr/refine.py
 mistocr.egg-info/PKG-INFO
 mistocr.egg-info/SOURCES.txt
 mistocr.egg-info/dependency_links.txt

{mistocr-0.0.4 → mistocr-0.1.0}/mistocr.egg-info/requires.txt RENAMED Viewed

@@ -2,5 +2,6 @@ fastcore
 mistralai
 pillow
 dotenv
+lisette
 [dev]

{mistocr-0.0.4 → mistocr-0.1.0}/settings.ini RENAMED Viewed

@@ -1,7 +1,7 @@
 [DEFAULT]
 repo = mistocr
 lib_name = mistocr
-version = 0.0.4
+version = 0.1.0
 min_python = 3.9
 license = apache2
 black_formatting = False
@@ -27,7 +27,7 @@ keywords = nbdev jupyter notebook python
 language = English
 status = 3
 user = franckalbinet
-requirements = fastcore mistralai pillow dotenv
+requirements = fastcore mistralai pillow dotenv lisette
 readme_nb = index.ipynb
 allowed_metadata_keys =
 allowed_cell_metadata_keys =