PyPI - mistocr - Versions diffs - 0.2.4__tar.gz → 0.2.7__tar.gz - Mend

mistocr 0.2.4.tar.gz → 0.2.7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

{mistocr-0.2.4/mistocr.egg-info → mistocr-0.2.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mistocr
-Version: 0.2.4
+Version: 0.2.7
 Summary: Batch OCR for PDFs with heading restoration and visual content integration
 Home-page: https://github.com/franckalbinet/mistocr
 Author: Solveit
@@ -76,6 +76,14 @@ fundamental challenges that raw OCR output leaves unsolved:
 **In short**: Complete PDF OCR with heading hierarchy fixes and image
 descriptions for RAG and LLM pipelines.
+> [!NOTE]
+>
+> **Want to see mistocr in action?** This
+> [tutorial](https://share.solve.it.com/d/97f75412ca949af76a5945b4dfc443c7)
+> demonstrates real-world PDF processing and shows how clean markdown
+> enables structure-aware navigation through long documents—letting you
+> find exactly what you need, fast.
 ## Get Started
 Install latest from [pypi](https://pypi.org/project/mistocr), then:

{mistocr-0.2.4 → mistocr-0.2.7}/README.md RENAMED Viewed

@@ -36,6 +36,14 @@ fundamental challenges that raw OCR output leaves unsolved:
 **In short**: Complete PDF OCR with heading hierarchy fixes and image
 descriptions for RAG and LLM pipelines.
+> [!NOTE]
+>
+> **Want to see mistocr in action?** This
+> [tutorial](https://share.solve.it.com/d/97f75412ca949af76a5945b4dfc443c7)
+> demonstrates real-world PDF processing and shows how clean markdown
+> enables structure-aware navigation through long documents—letting you
+> find exactly what you need, fast.
 ## Get Started
 Install latest from [pypi](https://pypi.org/project/mistocr), then:

mistocr-0.2.7/mistocr/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "0.2.7"

{mistocr-0.2.4 → mistocr-0.2.7}/mistocr/_modidx.py RENAMED Viewed

@@ -21,7 +21,8 @@ d = { 'settings': { 'branch': 'main',
                               'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
                               'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
             'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
-            'mistocr.refine': { 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
+            'mistocr.refine': { 'mistocr.refine.HeadingCorrection': ('refine.html#headingcorrection', 'mistocr/refine.py'),
+                                'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
                                 'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
                                 'mistocr.refine.add_descs_to_pg': ('refine.html#add_descs_to_pg', 'mistocr/refine.py'),
                                 'mistocr.refine.add_descs_to_pgs': ('refine.html#add_descs_to_pgs', 'mistocr/refine.py'),

{mistocr-0.2.4 → mistocr-0.2.7}/mistocr/refine.py RENAMED Viewed

@@ -3,7 +3,7 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
 # %% auto 0
-__all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
+__all__ = ['describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx', 'HeadingCorrection',
            'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
            'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs', 'add_descs_to_pg', 'add_descs_to_pgs',
            'add_img_descs']
@@ -59,40 +59,12 @@ def fmt_hdgs_idx(
 # %% ../nbs/01_refine.ipynb 18
-class HeadingCorrections(BaseModel):
-    corrections: dict[int, str]  # index → corrected heading
-# %% ../nbs/01_refine.ipynb 20
-prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
-INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
-RULES - Apply these fixes in order:
-1. **Single H1 rule**: Documents must have exactly ONE # heading (typically the document title at the top)
-   - If index 0 is already #, then all subsequent headings (index 1+) must be ## or deeper
-   - If no H1 exists, the first major heading should be #, and all others ## or deeper
-   - NO exceptions: appendices, references, and all sections are ## or deeper after the title
-2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
-   - Parent section (e.g., "1", "2", "A") should be shallower than child (e.g., "1.1", "2.a", "A.1")
-   - Child section should be one # deeper than parent
-   - Works with any numbering: "1/1.1/1.1.1", "A/A.1/A.1.a", "I/I.A/I.A.1", etc.
-3. **Level jumps**: Headings can only increase by one # at a time when moving deeper
-   - Wrong: ## Section → ##### Subsection
-   - Fixed: ## Section → ### Subsection
-4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
-OUTPUT: Return a Python dictionary mapping index to corrected heading (without the index prefix).
-IMPORTANT: Preserve the " ... page N" suffix in all corrected headings.
-Only include entries that need changes.
-Headings to analyze:
-{headings_list}
-"""
+class HeadingCorrection(BaseModel):
+    index: int
+    corrected: str
+class HeadingCorrections(BaseModel):
+    corrections: list[HeadingCorrection]
 # %% ../nbs/01_refine.ipynb 22
 def fix_hdg_hierarchy(
@@ -106,7 +78,8 @@ def fix_hdg_hierarchy(
     if prompt is None: prompt = prompt_fix_hdgs
     prompt = prompt.format(headings_list=fmt_hdgs_idx(hdgs))
     r = completion(model=model, messages=[{"role": "user", "content": prompt}], response_format=HeadingCorrections, api_key=api_key)
-    return json.loads(r.choices[0].message.content)['corrections']
+    fixes =  json.loads(r.choices[0].message.content)['corrections']
+    return {o['index']: o['corrected'] for o in fixes}
 # %% ../nbs/01_refine.ipynb 25
@@ -120,7 +93,7 @@ def mk_fixes_lut(
     "Make a lookup table of fixes"
     if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
     fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
-    return {hdgs[int(k)]:v for k,v in fixes.items()}
+    return {hdgs[k]:v for k,v in fixes.items()}
 # %% ../nbs/01_refine.ipynb 28
 def apply_hdg_fixes(

{mistocr-0.2.4 → mistocr-0.2.7/mistocr.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mistocr
-Version: 0.2.4
+Version: 0.2.7
 Summary: Batch OCR for PDFs with heading restoration and visual content integration
 Home-page: https://github.com/franckalbinet/mistocr
 Author: Solveit
@@ -76,6 +76,14 @@ fundamental challenges that raw OCR output leaves unsolved:
 **In short**: Complete PDF OCR with heading hierarchy fixes and image
 descriptions for RAG and LLM pipelines.
+> [!NOTE]
+>
+> **Want to see mistocr in action?** This
+> [tutorial](https://share.solve.it.com/d/97f75412ca949af76a5945b4dfc443c7)
+> demonstrates real-world PDF processing and shows how clean markdown
+> enables structure-aware navigation through long documents—letting you
+> find exactly what you need, fast.
 ## Get Started
 Install latest from [pypi](https://pypi.org/project/mistocr), then:

{mistocr-0.2.4 → mistocr-0.2.7}/settings.ini RENAMED Viewed

@@ -1,7 +1,7 @@
 [DEFAULT]
 repo = mistocr
 lib_name = mistocr
-version = 0.2.4
+version = 0.2.7
 min_python = 3.9
 license = apache2
 black_formatting = False