PyPI - mistocr - Versions diffs - 0.2.12__tar.gz → 0.3.1__tar.gz - Mend

mistocr 0.2.12.tar.gz → 0.3.1.tar.gz

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

{mistocr-0.2.12/mistocr.egg-info → mistocr-0.3.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mistocr
-Version: 0.2.12
+Version: 0.3.1
 Summary: Batch OCR for PDFs with heading restoration and visual content integration
 Home-page: https://github.com/franckalbinet/mistocr
 Author: Solveit
@@ -23,6 +23,7 @@ Requires-Dist: mistralai
 Requires-Dist: pillow
 Requires-Dist: dotenv
 Requires-Dist: lisette
+Requires-Dist: PyPDF2
 Provides-Extra: dev
 Dynamic: author
 Dynamic: author-email

mistocr-0.3.1/mistocr/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "0.3.1"

{mistocr-0.2.12 → mistocr-0.3.1}/mistocr/_modidx.py RENAMED Viewed

@@ -18,9 +18,10 @@ d = { 'settings': { 'branch': 'main',
                               'mistocr.core.save_page': ('core.html#save_page', 'mistocr/core.py'),
                               'mistocr.core.save_pages': ('core.html#save_pages', 'mistocr/core.py'),
                               'mistocr.core.submit_batch': ('core.html#submit_batch', 'mistocr/core.py'),
+                              'mistocr.core.subset_pdf': ('core.html#subset_pdf', 'mistocr/core.py'),
                               'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
                               'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
-            'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
+            'mistocr.pipeline': {},
             'mistocr.refine': { 'mistocr.refine.HeadingCorrection': ('refine.html#headingcorrection', 'mistocr/refine.py'),
                                 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
                                 'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),

{mistocr-0.2.12 → mistocr-0.3.1}/mistocr/core.py RENAMED Viewed

@@ -4,7 +4,8 @@
 # %% auto 0
 __all__ = ['ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch', 'submit_batch',
-           'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf', 'read_pgs']
+           'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf', 'read_pgs',
+           'subset_pdf']
 # %% ../nbs/00_core.ipynb 3
 from fastcore.all import *
@@ -13,6 +14,7 @@ from io import BytesIO
 from pathlib import Path
 from PIL import Image
 from mistralai import Mistral
+import PyPDF2
 # %% ../nbs/00_core.ipynb 6
 def get_api_key(
@@ -181,7 +183,7 @@ def ocr_pdf(
     results = _run_batch(entries, c, poll_interval)
     return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
-# %% ../nbs/00_core.ipynb 47
+# %% ../nbs/00_core.ipynb 50
 def read_pgs(
     path:str, # OCR output directory,
     join:bool=True # Join pages into single string
@@ -191,3 +193,24 @@ def read_pgs(
     pgs = sorted(path.glob('page_*.md'), key=lambda p: int(p.stem.split('_')[1]))
     contents = L([p.read_text() for p in pgs])
     return '\n\n'.join(contents) if join else contents
+# %% ../nbs/00_core.ipynb 57
+def subset_pdf(
+    path:str, # Path to PDF file
+    start:int=1, # Start page (1-based)
+    end:int=None, # End page (1-based, inclusive)
+    dst:str='.' # Output directory
+    ) -> Path: # Path to subset PDF
+    "Extract page range from PDF and save with range suffix"
+    path = Path(path)
+    writer = PyPDF2.PdfWriter()
+    with open(path, 'rb') as f:
+        reader = PyPDF2.PdfReader(f)
+        n = len(reader.pages)
+        end = end or n
+        s, e = max(0, start-1), min(n, end) - 1
+        for i in range(s, e+1): writer.add_page(reader.pages[i])
+    suffix = f"_p{s+1}-{e+1}" if s>0 or e<n-1 else ""
+    out = Path(dst) / f"{path.stem}{suffix}.pdf"
+    with open(out, 'wb') as f: writer.write(f)
+    return out

mistocr-0.3.1/mistocr/pipeline.py ADDED Viewed

@@ -0,0 +1,15 @@
+"""End-to-End Pipeline: PDF OCR, Markdown Heading Correction, and AI Image Descriptions"""
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
+# %% auto 0
+__all__ = []
+# %% ../nbs/02_pipeline.ipynb 3
+from fastcore.all import *
+from .core import read_pgs, ocr_pdf
+from .refine import add_img_descs, fix_hdgs
+from pathlib import Path
+from asyncio import Semaphore, gather, sleep
+import tempfile
+import os, json, shutil

{mistocr-0.2.12 → mistocr-0.3.1/mistocr.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mistocr
-Version: 0.2.12
+Version: 0.3.1
 Summary: Batch OCR for PDFs with heading restoration and visual content integration
 Home-page: https://github.com/franckalbinet/mistocr
 Author: Solveit
@@ -23,6 +23,7 @@ Requires-Dist: mistralai
 Requires-Dist: pillow
 Requires-Dist: dotenv
 Requires-Dist: lisette
+Requires-Dist: PyPDF2
 Provides-Extra: dev
 Dynamic: author
 Dynamic: author-email

{mistocr-0.2.12 → mistocr-0.3.1}/mistocr.egg-info/requires.txt RENAMED Viewed

@@ -3,5 +3,6 @@ mistralai
 pillow
 dotenv
 lisette
+PyPDF2
 [dev]

{mistocr-0.2.12 → mistocr-0.3.1}/settings.ini RENAMED Viewed

@@ -1,7 +1,7 @@
 [DEFAULT]
 repo = mistocr
 lib_name = mistocr
-version = 0.2.12
+version = 0.3.1
 min_python = 3.9
 license = apache2
 black_formatting = False
@@ -27,7 +27,7 @@ keywords = nbdev jupyter notebook python
 language = English
 status = 3
 user = franckalbinet
-requirements = fastcore mistralai pillow dotenv lisette
+requirements = fastcore mistralai pillow dotenv lisette PyPDF2
 readme_nb = index.ipynb
 allowed_metadata_keys =
 allowed_cell_metadata_keys =

mistocr-0.2.12/mistocr/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- __version__ = "0.2.12"

mistocr-0.2.12/mistocr/pipeline.py DELETED Viewed

@@ -1,37 +0,0 @@
-"""End-to-End Pipeline: PDF OCR, Markdown Heading Correction, and AI Image Descriptions"""
-# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
-# %% auto 0
-__all__ = ['pdf_to_md']
-# %% ../nbs/02_pipeline.ipynb 3
-from fastcore.all import *
-from .core import read_pgs, ocr_pdf
-from .refine import add_img_descs, fix_hdgs
-from pathlib import Path
-from asyncio import Semaphore, gather, sleep
-import os, json, shutil
-# %% ../nbs/02_pipeline.ipynb 4
-@delegates(add_img_descs)
-async def pdf_to_md(
-    pdf_path:str, # Path to input PDF file
-    dst:str, # Destination directory for output markdown
-    ocr_output:str=None, # Optional OCR output directory (defaults to pdf_path stem)
-    model:str='claude-sonnet-4-5', # Model to use for heading fixes and image descriptions
-    add_img_desc:bool=True, # Whether to add image descriptions
-    progress:bool=True, # Whether to show progress messages
-    **kwargs):
-    "Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
-    n_steps = 3 if add_img_desc else 2
-    if progress: print(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
-    ocr_dirs = ocr_pdf(pdf_path, ocr_output or 'ocr_temp')
-    ocr_dir = ocr_dirs[0]
-    if progress: print(f"Step 2/{n_steps}: Fixing heading hierarchy...")
-    fix_hdgs(ocr_dir, model=model)
-    if add_img_desc:
-        if progress: print(f"Step 3/{n_steps}: Adding image descriptions...")
-        await add_img_descs(ocr_dir, dst=dst, model=model, progress=progress, **kwargs)
-    elif dst and Path(dst) != ocr_dir: shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
-    if progress: print("Done!")