mistocr 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mistocr/__init__.py +1 -1
- mistocr/refine.py +35 -29
- {mistocr-0.4.0.dist-info → mistocr-0.4.1.dist-info}/METADATA +1 -13
- mistocr-0.4.1.dist-info/RECORD +11 -0
- mistocr-0.4.0.dist-info/RECORD +0 -11
- {mistocr-0.4.0.dist-info → mistocr-0.4.1.dist-info}/WHEEL +0 -0
- {mistocr-0.4.0.dist-info → mistocr-0.4.1.dist-info}/entry_points.txt +0 -0
- {mistocr-0.4.0.dist-info → mistocr-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {mistocr-0.4.0.dist-info → mistocr-0.4.1.dist-info}/top_level.txt +0 -0
mistocr/__init__.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.0"
+__version__ = "0.4.1"
mistocr/refine.py
CHANGED
@@ -3,7 +3,7 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
 
 # %% auto 0
-__all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
+__all__ = ['logger', 'prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
            'HeadingCorrection', 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes',
            'fix_hdgs', 'ImgDescription', 'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs',
            'add_descs_to_pg', 'add_descs_to_pgs', 'add_img_descs']
@@ -20,8 +20,14 @@ import os
 import json
 import shutil
 from asyncio import Semaphore, gather, sleep
+import logging
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 4
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.WARNING, format='%(name)s - %(levelname)s - %(message)s')
+logger.setLevel(logging.INFO)
+
+# %% ../nbs/01_refine.ipynb 8
 def get_hdgs(
     md:str # Markdown file string
 ) -> L: # L of strings
@@ -32,7 +38,7 @@ def get_hdgs(
 
 
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 9
 def add_pg_hdgs(
     md:str, # Markdown file string,
     n:int # Page number
@@ -42,7 +48,7 @@ def add_pg_hdgs(
     def repl(m): return m.group(0) + f' ... page {n}'
     return sub(r'^#{1,6} .+$', repl, md, flags=MULTILINE)
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 13
 def read_pgs_pg(
     path:str # Path to the markdown file
 ) -> L: # List of markdown pages
@@ -50,7 +56,7 @@ def read_pgs_pg(
     pgs = read_pgs(path, join=False)
     return L([add_pg_hdgs(pg, n) for n, pg in enumerate(pgs, 1)]).concat()
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 16
 def fmt_hdgs_idx(
     hdgs: list[str] # List of markdown headings
 ) -> str: # Formatted string with index
@@ -58,18 +64,18 @@ def fmt_hdgs_idx(
     return '\n'.join(f"{i}. {h}" for i, h in enumerate(hdgs))
 
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 19
 class HeadingCorrection(BaseModel):
     "A single heading correction mapping an index to its corrected markdown heading"
     index: int
     corrected: str
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 20
 class HeadingCorrections(BaseModel):
     "Collection of heading corrections returned by the LLM"
     corrections: list[HeadingCorrection]
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 22
 prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
 
 INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
@@ -109,7 +115,7 @@ Headings to analyze:
 {headings_list}
 """
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 24
 def fix_hdg_hierarchy(
     hdgs: list[str], # List of markdown headings
     prompt: str=None, # Prompt to use
@@ -125,7 +131,7 @@ def fix_hdg_hierarchy(
     return {o['index']: o['corrected'] for o in fixes}
 
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 27
 @delegates(fix_hdg_hierarchy)
 def mk_fixes_lut(
     hdgs: list[str], # List of markdown headings
@@ -138,7 +144,7 @@ def mk_fixes_lut(
     fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
     return {hdgs[k]:v for k,v in fixes.items()}
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 30
 def apply_hdg_fixes(
     p:str, # Page to fix
     lut_fixes: dict[str, str], # Lookup table of fixes
@@ -147,7 +153,7 @@ def apply_hdg_fixes(
     for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
     return p
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 33
 @delegates(mk_fixes_lut)
 def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:str='img', **kwargs):
     "Fix heading hierarchy in markdown document"
@@ -159,13 +165,13 @@ def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:st
     lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model, **kwargs)
     for i,p in enumerate(pgs_with_pg, 1): (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 39
 class ImgDescription(BaseModel):
     "Image classification and description for OCR'd documents"
     is_informative:bool # Whether image contains informative content (charts, diagrams, tables) vs decorative (logos, backgrounds)
     description:str # Detailed description of the image content for RAG and accessibility
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 42
 describe_img_prompt = """Analyze this image from an academic/technical document.
 
 Step 1: Determine if this image is informative for understanding the document content.
@@ -178,7 +184,7 @@ Step 2:
 
 Return your response as JSON with 'is_informative' (boolean) and 'description' (string) fields."""
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 43
 async def describe_img(
     img_path: Path, # Path to the image file
     model: str = 'claude-sonnet-4-5', # Model to use
@@ -189,7 +195,7 @@ async def describe_img(
     r = await chat([img_path.read_bytes(), prompt], response_format=ImgDescription)
     return r
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 47
 async def limit(
     semaphore, # Semaphore for concurrency control
     coro, # Coroutine to execute
@@ -201,14 +207,14 @@ async def limit(
         if delay: await sleep(delay)
         return r
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 49
 def parse_r(
     result # ModelResponse object from API call
 ): # Dictionary with 'is_informative' and 'description' keys
     "Extract and parse JSON content from model response"
     return json.loads(result.choices[0].message.content)
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 51
 async def describe_imgs(
     imgs: list[Path], # List of image file paths to describe
     model: str = 'claude-sonnet-4-5', # Model to use for image description
@@ -221,7 +227,7 @@ async def describe_imgs(
     results = await gather(*[limit(sem, describe_img(img, model, prompt), delay) for img in imgs])
     return {img.name: parse_r(r) for img, r in zip(imgs, results)}
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 53
 def save_img_descs(
     descs: dict, # Dictionary of image descriptions
     dst_fname: Path, # Path to save the JSON file
@@ -229,7 +235,7 @@ def save_img_descs(
     "Save image descriptions to JSON file"
     Path(dst_fname).write_text(json.dumps(descs, indent=2))
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 58
 def add_descs_to_pg(
     pg:str, # Page markdown content
     descs:dict # Dictionary mapping image filenames to their descriptions
@@ -240,7 +246,7 @@ def add_descs_to_pg(
         if fname in descs: pg = pg.replace(link, f"{link}\nAI-generated image description:\n___\n{descs[fname]['description']}\n___")
     return pg
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 63
 def add_descs_to_pgs(
     pgs:list, # List of page markdown strings
     descs:dict # Dictionary mapping image filenames to their descriptions
@@ -248,7 +254,7 @@ def add_descs_to_pgs(
     "Add AI-generated descriptions to images in all pages"
     return [add_descs_to_pg(pg, descs) for pg in pgs]
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 66
 async def add_img_descs(
     src:str, # Path to source markdown directory
     dst:str=None, # Destination directory (defaults to src if None)
@@ -257,7 +263,7 @@ async def add_img_descs(
     semaphore:int=2, # Max concurrent API requests
     delay:float=1, # Delay in seconds between API calls
     force:bool=False, # Force regeneration even if cache exists
-    progress:bool=True #
+    progress:bool=True # Log progress messages
 ):
     "Describe all images in markdown document and insert descriptions inline"
     src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
@@ -266,23 +272,23 @@ async def add_img_descs(
 
     # Check if image folder exists
     if not src_imgs.exists():
-        if progress:
+        if progress: logger.info(f"No images to describe in the document (no '{img_folder}' folder found)")
         return
 
     if src_imgs.exists() and dst_path != src_path: shutil.copytree(src_imgs, dst_path/img_folder, dirs_exist_ok=True)
     desc_file = src_path/'img_descriptions.json'
     if desc_file.exists() and not force:
-        if progress:
+        if progress: logger.info(f"Loading existing descriptions from {desc_file}")
         descs = json.loads(desc_file.read_text())
     else:
         imgs = (src_path/img_folder).ls(file_exts=['.jpeg', '.jpg', '.png'])
-        if progress:
+        if progress: logger.info(f"Describing {len(imgs)} images...")
         descs = await describe_imgs(imgs, model, semaphore=semaphore, delay=delay)
         save_img_descs(descs, desc_file)
-    if progress:
+    if progress: logger.info(f"Saved descriptions to {desc_file}")
     pgs = read_pgs(src_path, join=False)
-    if progress:
+    if progress: logger.info(f"Adding descriptions to {len(pgs)} pages...")
     enriched = [add_descs_to_pg(pg, descs) for pg in pgs]
     for i,pg in enumerate(enriched, 1): (dst_path/f'page_{i}.md').write_text(pg)
-    if progress:
+    if progress: logger.info(f"Done! Enriched pages saved to {dst_path}")
 
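
Note: the functional change in mistocr/refine.py is that progress reporting now goes through Python's standard logging module via a module-level logger (configured with basicConfig at import and set to INFO), rather than the behaviour it replaces. Below is a minimal consumer-side sketch of how those messages can be shown or silenced; the call to add_img_descs and its argument are illustrative placeholders, not part of the diff.

```python
import asyncio, logging
from mistocr.refine import add_img_descs  # function name taken from the diff above

# refine.py calls logging.basicConfig(level=WARNING) and sets its own logger to INFO,
# so messages such as "Describing N images..." appear by default. A caller can
# adjust verbosity per module with the standard logging API:
logging.getLogger("mistocr.refine").setLevel(logging.WARNING)  # hide progress messages
# logging.getLogger("mistocr.refine").setLevel(logging.INFO)   # show them again

# Illustrative invocation only; the source directory path is a placeholder.
# asyncio.run(add_img_descs("files/test/md_test"))
```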
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mistocr
|
|
3
|
-
Version: 0.4.
|
|
3
|
+
Version: 0.4.1
|
|
4
4
|
Summary: Batch OCR for PDFs with heading restoration and visual content integration
|
|
5
5
|
Home-page: https://github.com/franckalbinet/mistocr
|
|
6
6
|
Author: Solveit
|
|
@@ -113,18 +113,6 @@ from mistocr.pipeline import pdf_to_md
|
|
|
113
113
|
await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
|
|
114
114
|
```
|
|
115
115
|
|
|
116
|
-
Step 1/3: Running OCR on files/test/resnet.pdf...
|
|
117
|
-
Mistral batch job status: QUEUED
|
|
118
|
-
Mistral batch job status: RUNNING
|
|
119
|
-
Mistral batch job status: RUNNING
|
|
120
|
-
Step 2/3: Fixing heading hierarchy...
|
|
121
|
-
Step 3/3: Adding image descriptions...
|
|
122
|
-
Describing 7 images...
|
|
123
|
-
Saved descriptions to ocr_temp/resnet/img_descriptions.json
|
|
124
|
-
Adding descriptions to 12 pages...
|
|
125
|
-
Done! Enriched pages saved to files/test/md_test
|
|
126
|
-
Done!
|
|
127
|
-
|
|
128
116
|
This will (as indicated by the output):
|
|
129
117
|
|
|
130
118
|
1. OCR the PDF using Mistral’s batch API
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
mistocr/__init__.py,sha256=pMtTmSUht-XtbR_7Doz6bsQqopJJd8rZ8I8zy2HwwoA,22
|
|
2
|
+
mistocr/_modidx.py,sha256=LyFez7ndKOXQpF0unhg1imCrW7xcUwO106W82ttVnnk,4366
|
|
3
|
+
mistocr/core.py,sha256=-yXqEro_kTE66lXWBrewS73SRTl-Btt9uyKNxMnzjIw,9181
|
|
4
|
+
mistocr/pipeline.py,sha256=n8AHIHGZBXYy_k7LtEBZSiTL6HClT84-7K4QlTjOLvo,2107
|
|
5
|
+
mistocr/refine.py,sha256=Q14DhUUsT5FLMxP9oIJ2TGQ3qbxe7ulXfRMPKpsd4Wo,13232
|
|
6
|
+
mistocr-0.4.1.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
7
|
+
mistocr-0.4.1.dist-info/METADATA,sha256=cvASaYVhDfCJ9bzrosdmTRd5ECIAPAl84H7nN5P06zY,7992
|
|
8
|
+
mistocr-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
+
mistocr-0.4.1.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
10
|
+
mistocr-0.4.1.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
11
|
+
mistocr-0.4.1.dist-info/RECORD,,
|
mistocr-0.4.0.dist-info/RECORD
DELETED
@@ -1,11 +0,0 @@
-mistocr/__init__.py,sha256=42STGor_9nKYXumfeV5tiyD_M8VdcddX7CEexmibPBk,22
-mistocr/_modidx.py,sha256=LyFez7ndKOXQpF0unhg1imCrW7xcUwO106W82ttVnnk,4366
-mistocr/core.py,sha256=-yXqEro_kTE66lXWBrewS73SRTl-Btt9uyKNxMnzjIw,9181
-mistocr/pipeline.py,sha256=n8AHIHGZBXYy_k7LtEBZSiTL6HClT84-7K4QlTjOLvo,2107
-mistocr/refine.py,sha256=hQg4ZYRTkPFEiCNNBU2ykgXFfVLCNAp2IuwOwHBVQ2k,12983
-mistocr-0.4.0.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
-mistocr-0.4.0.dist-info/METADATA,sha256=c0LUM6UrwIIoeug8fA8H4dYvutdieBFLQ52Sho4uGgY,8438
-mistocr-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mistocr-0.4.0.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
-mistocr-0.4.0.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
-mistocr-0.4.0.dist-info/RECORD,,
{mistocr-0.4.0.dist-info → mistocr-0.4.1.dist-info}/WHEEL
File without changes
{mistocr-0.4.0.dist-info → mistocr-0.4.1.dist-info}/entry_points.txt
File without changes
{mistocr-0.4.0.dist-info → mistocr-0.4.1.dist-info}/licenses/LICENSE
File without changes
{mistocr-0.4.0.dist-info → mistocr-0.4.1.dist-info}/top_level.txt
File without changes