PyPI - mistocr - Versions diffs - 0.2.5__py3-none-any.whl → 0.2.10__py3-none-any.whl - Mend

mistocr 0.2.5__py3-none-any.whl → 0.2.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

mistocr/__init__.py +1 -1
mistocr/_modidx.py +1 -1
mistocr/refine.py +43 -25
{mistocr-0.2.5.dist-info → mistocr-0.2.10.dist-info}/METADATA +1 -1
mistocr-0.2.10.dist-info/RECORD +11 -0
mistocr-0.2.5.dist-info/RECORD +0 -11
{mistocr-0.2.5.dist-info → mistocr-0.2.10.dist-info}/WHEEL +0 -0
{mistocr-0.2.5.dist-info → mistocr-0.2.10.dist-info}/entry_points.txt +0 -0
{mistocr-0.2.5.dist-info → mistocr-0.2.10.dist-info}/licenses/LICENSE +0 -0
{mistocr-0.2.5.dist-info → mistocr-0.2.10.dist-info}/top_level.txt +0 -0

mistocr/__init__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.2.5"
1	+ __version__ = "0.2.10"

mistocr/_modidx.py CHANGED Viewed

@@ -21,7 +21,7 @@ d = { 'settings': { 'branch': 'main',
                               'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
                               'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
             'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
-            'mistocr.refine': { 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
+            'mistocr.refine': { 'mistocr.refine.HeadingCorrection': ('refine.html#headingcorrection', 'mistocr/refine.py'),
                                 'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
                                 'mistocr.refine.add_descs_to_pg': ('refine.html#add_descs_to_pg', 'mistocr/refine.py'),
                                 'mistocr.refine.add_descs_to_pgs': ('refine.html#add_descs_to_pgs', 'mistocr/refine.py'),

mistocr/refine.py CHANGED Viewed

@@ -4,7 +4,7 @@
 # %% auto 0
 __all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
-           'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
+           'HeadingCorrection', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
            'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs', 'add_descs_to_pg', 'add_descs_to_pgs',
            'add_img_descs']
@@ -59,14 +59,21 @@ def fmt_hdgs_idx(
 # %% ../nbs/01_refine.ipynb 18
-class HeadingCorrections(BaseModel):
-    corrections: dict[int, str]  # index → corrected heading
+class HeadingCorrection(BaseModel):
+    "A single heading correction mapping an index to its corrected markdown heading"
+    index: int
+    corrected: str
-# %% ../nbs/01_refine.ipynb 20
+# %% ../nbs/01_refine.ipynb 21
 prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
 INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
+ANALYSIS STEPS (think through these before outputting corrections):
+1. For each numbered heading (e.g., "4.1", "2.a", "A.1"), identify its parent heading (e.g., "4", "2", "A")
+2. Verify the child heading is exactly one # deeper than its parent
+3. If not, mark it for correction
 RULES - Apply these fixes in order:
 1. **Single H1 rule**: Documents must have exactly ONE # heading (typically the document title at the top)
@@ -75,8 +82,8 @@ RULES - Apply these fixes in order:
    - NO exceptions: appendices, references, and all sections are ## or deeper after the title
 2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
-   - Parent section (e.g., "1", "2", "A") should be shallower than child (e.g., "1.1", "2.a", "A.1")
-   - Child section should be one # deeper than parent
+   - Parent section (e.g., "1", "2", "A") MUST be shallower than child (e.g., "1.1", "2.a", "A.1")
+   - Child section MUST be exactly one # deeper than parent
    - Works with any numbering: "1/1.1/1.1.1", "A/A.1/A.1.a", "I/I.A/I.A.1", etc.
 3. **Level jumps**: Headings can only increase by one # at a time when moving deeper
@@ -85,16 +92,19 @@ RULES - Apply these fixes in order:
 4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
-OUTPUT: Return a Python dictionary mapping index to corrected heading (without the index prefix).
+5. **Unnumbered headings in numbered documents**: If the document uses numbered headings consistently, any unnumbered heading appearing within that structure is likely misclassified bold text and should be converted to regular text (output the heading text without any # symbols in the corrected field)
+OUTPUT: Return a list of corrections, where each correction has:
+- index: the heading's index number
+- corrected: the fixed heading text (without the index prefix), or empty string "" to remove the heading entirely
 IMPORTANT: Preserve the " ... page N" suffix in all corrected headings.
-Only include entries that need changes.
+Only include headings that need changes.
 Headings to analyze:
 {headings_list}
 """
-# %% ../nbs/01_refine.ipynb 22
+# %% ../nbs/01_refine.ipynb 23
 def fix_hdg_hierarchy(
     hdgs: list[str], # List of markdown headings
     prompt: str=None, # Prompt to use
@@ -106,10 +116,11 @@ def fix_hdg_hierarchy(
     if prompt is None: prompt = prompt_fix_hdgs
     prompt = prompt.format(headings_list=fmt_hdgs_idx(hdgs))
     r = completion(model=model, messages=[{"role": "user", "content": prompt}], response_format=HeadingCorrections, api_key=api_key)
-    return json.loads(r.choices[0].message.content)['corrections']
+    fixes =  json.loads(r.choices[0].message.content)['corrections']
+    return {o['index']: o['corrected'] for o in fixes}
-# %% ../nbs/01_refine.ipynb 25
+# %% ../nbs/01_refine.ipynb 26
 @delegates(fix_hdg_hierarchy)
 def mk_fixes_lut(
     hdgs: list[str], # List of markdown headings
@@ -120,9 +131,9 @@ def mk_fixes_lut(
     "Make a lookup table of fixes"
     if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
     fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
-    return {hdgs[int(k)]:v for k,v in fixes.items()}
+    return {hdgs[k]:v for k,v in fixes.items()}
-# %% ../nbs/01_refine.ipynb 28
+# %% ../nbs/01_refine.ipynb 29
 def apply_hdg_fixes(
     p:str, # Page to fix
     lut_fixes: dict[str, str], # Lookup table of fixes
@@ -131,7 +142,7 @@ def apply_hdg_fixes(
     for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
     return p
-# %% ../nbs/01_refine.ipynb 31
+# %% ../nbs/01_refine.ipynb 32
 @delegates(mk_fixes_lut)
 def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:str='img', **kwargs):
     "Fix heading hierarchy in markdown document"
@@ -143,13 +154,13 @@ def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:st
     lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model, **kwargs)
     for i,p in enumerate(pgs_with_pg, 1): (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))
-# %% ../nbs/01_refine.ipynb 37
+# %% ../nbs/01_refine.ipynb 38
 class ImgDescription(BaseModel):
     "Image classification and description for OCR'd documents"
     is_informative:bool # Whether image contains informative content (charts, diagrams, tables) vs decorative (logos, backgrounds)
     description:str # Detailed description of the image content for RAG and accessibility
-# %% ../nbs/01_refine.ipynb 40
+# %% ../nbs/01_refine.ipynb 41
 describe_img_prompt = """Analyze this image from an academic/technical document.
 Step 1: Determine if this image is informative for understanding the document content.
@@ -162,7 +173,7 @@ Step 2:
 Return your response as JSON with 'is_informative' (boolean) and 'description' (string) fields."""
-# %% ../nbs/01_refine.ipynb 41
+# %% ../nbs/01_refine.ipynb 42
 async def describe_img(
     img_path: Path,  # Path to the image file
     model: str = 'claude-sonnet-4-5',  # Model to use
@@ -173,7 +184,7 @@ async def describe_img(
     r = await chat([img_path.read_bytes(), prompt], response_format=ImgDescription)
     return r
-# %% ../nbs/01_refine.ipynb 45
+# %% ../nbs/01_refine.ipynb 46
 async def limit(
     semaphore, # Semaphore for concurrency control
     coro, # Coroutine to execute
@@ -185,14 +196,14 @@ async def limit(
         if delay: await sleep(delay)
         return r
-# %% ../nbs/01_refine.ipynb 47
+# %% ../nbs/01_refine.ipynb 48
 def parse_r(
     result # ModelResponse object from API call
 ): # Dictionary with 'is_informative' and 'description' keys
     "Extract and parse JSON content from model response"
     return json.loads(result.choices[0].message.content)
-# %% ../nbs/01_refine.ipynb 49
+# %% ../nbs/01_refine.ipynb 50
 async def describe_imgs(
     imgs: list[Path], # List of image file paths to describe
     model: str = 'claude-sonnet-4-5', # Model to use for image description
@@ -205,7 +216,7 @@ async def describe_imgs(
     results = await gather(*[limit(sem, describe_img(img, model, prompt), delay) for img in imgs])
     return {img.name: parse_r(r) for img, r in zip(imgs, results)}
-# %% ../nbs/01_refine.ipynb 51
+# %% ../nbs/01_refine.ipynb 52
 def save_img_descs(
     descs: dict, # Dictionary of image descriptions
     dst_fname: Path, # Path to save the JSON file
@@ -213,7 +224,7 @@ def save_img_descs(
     "Save image descriptions to JSON file"
     Path(dst_fname).write_text(json.dumps(descs, indent=2))
-# %% ../nbs/01_refine.ipynb 56
+# %% ../nbs/01_refine.ipynb 57
 def add_descs_to_pg(
     pg:str, # Page markdown content
     descs:dict # Dictionary mapping image filenames to their descriptions
@@ -224,7 +235,7 @@ def add_descs_to_pg(
         if fname in descs: pg = pg.replace(link, f"{link}\nAI-generated image description:\n___\n{descs[fname]['description']}\n___")
     return pg
-# %% ../nbs/01_refine.ipynb 61
+# %% ../nbs/01_refine.ipynb 62
 def add_descs_to_pgs(
     pgs:list, # List of page markdown strings
     descs:dict # Dictionary mapping image filenames to their descriptions
@@ -232,7 +243,7 @@ def add_descs_to_pgs(
     "Add AI-generated descriptions to images in all pages"
     return [add_descs_to_pg(pg, descs) for pg in pgs]
-# %% ../nbs/01_refine.ipynb 64
+# %% ../nbs/01_refine.ipynb 65
 async def add_img_descs(
     src:str, # Path to source markdown directory
     dst:str=None, # Destination directory (defaults to src if None)
@@ -247,6 +258,12 @@ async def add_img_descs(
     src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
     if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
     src_imgs = src_path/img_folder
+    # Check if image folder exists
+    if not src_imgs.exists():
+        if progress: print(f"No images to describe in the document (no '{img_folder}' folder found)")
+        return
     if src_imgs.exists() and dst_path != src_path: shutil.copytree(src_imgs, dst_path/img_folder, dirs_exist_ok=True)
     desc_file = src_path/'img_descriptions.json'
     if desc_file.exists() and not force:
@@ -263,3 +280,4 @@ async def add_img_descs(
     enriched = [add_descs_to_pg(pg, descs) for pg in pgs]
     for i,pg in enumerate(enriched, 1): (dst_path/f'page_{i}.md').write_text(pg)
     if progress: print(f"Done! Enriched pages saved to {dst_path}")

{mistocr-0.2.5.dist-info → mistocr-0.2.10.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mistocr
-Version: 0.2.5
+Version: 0.2.10
 Summary: Batch OCR for PDFs with heading restoration and visual content integration
 Home-page: https://github.com/franckalbinet/mistocr
 Author: Solveit

mistocr-0.2.10.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,11 @@
+mistocr/__init__.py,sha256=waXgc7p-jgGCsUjdVfO_KjlVZblnCvrzf4A0dsBj_lg,23
+mistocr/_modidx.py,sha256=WTS9JpZdbrp2LghjhOV-CK0JYChHE4PzttgKfh7pTy4,4028
+mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
+mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
+mistocr/refine.py,sha256=H_IAF02k6CwBQdDJm9txknzUcTlz245zXitaHELX-P4,12791
+mistocr-0.2.10.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
+mistocr-0.2.10.dist-info/METADATA,sha256=mkMu_9nYAXZ5jFdJd01AZqK3t93_Rt0xkkD0rRnl9Ew,8417
+mistocr-0.2.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mistocr-0.2.10.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
+mistocr-0.2.10.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
+mistocr-0.2.10.dist-info/RECORD,,

mistocr-0.2.5.dist-info/RECORD DELETED Viewed

@@ -1,11 +0,0 @@
-mistocr/__init__.py,sha256=Xsa3ayOMVkhUWm4t06YeyHE0apjpZefxLH4ylp0CDtU,22
-mistocr/_modidx.py,sha256=JtXJb6ErVwyGFCWgeVz7N9JvybBp8Zyao4Ahn3smr90,4030
-mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
-mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
-mistocr/refine.py,sha256=arJPOg1eP4MwtkD1zwnYY4EFrBfTTSP-mtR4AVnTiR8,11788
-mistocr-0.2.5.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
-mistocr-0.2.5.dist-info/METADATA,sha256=uGim0pZ4V3-oolsihRFr4aOWh3ZDOO7u3d8Mn0n-gmc,8416
-mistocr-0.2.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mistocr-0.2.5.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
-mistocr-0.2.5.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
-mistocr-0.2.5.dist-info/RECORD,,

{mistocr-0.2.5.dist-info → mistocr-0.2.10.dist-info}/WHEEL RENAMED Viewed

File without changes

{mistocr-0.2.5.dist-info → mistocr-0.2.10.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{mistocr-0.2.5.dist-info → mistocr-0.2.10.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{mistocr-0.2.5.dist-info → mistocr-0.2.10.dist-info}/top_level.txt RENAMED Viewed

File without changes

mistocr 0.2.5__py3-none-any.whl → 0.2.10__py3-none-any.whl

mistocr 0.2.5__py3-none-any.whl → 0.2.10__py3-none-any.whl