mistocr 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mistocr/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.2.4"
1
+ __version__ = "0.2.6"
mistocr/_modidx.py CHANGED
@@ -21,7 +21,8 @@ d = { 'settings': { 'branch': 'main',
21
21
  'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
22
22
  'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
23
23
  'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
24
- 'mistocr.refine': { 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
24
+ 'mistocr.refine': { 'mistocr.refine.HeadingCorrection': ('refine.html#headingcorrection', 'mistocr/refine.py'),
25
+ 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
25
26
  'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
26
27
  'mistocr.refine.add_descs_to_pg': ('refine.html#add_descs_to_pg', 'mistocr/refine.py'),
27
28
  'mistocr.refine.add_descs_to_pgs': ('refine.html#add_descs_to_pgs', 'mistocr/refine.py'),
mistocr/refine.py CHANGED
@@ -4,9 +4,9 @@
4
4
 
5
5
  # %% auto 0
6
6
  __all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
7
- 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
8
- 'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs', 'add_descs_to_pg', 'add_descs_to_pgs',
9
- 'add_img_descs']
7
+ 'HeadingCorrection', 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes',
8
+ 'fix_hdgs', 'ImgDescription', 'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs',
9
+ 'add_descs_to_pg', 'add_descs_to_pgs', 'add_img_descs']
10
10
 
11
11
  # %% ../nbs/01_refine.ipynb 3
12
12
  from fastcore.all import *
@@ -59,8 +59,12 @@ def fmt_hdgs_idx(
59
59
 
60
60
 
61
61
  # %% ../nbs/01_refine.ipynb 18
62
+ class HeadingCorrection(BaseModel):
63
+ index: int
64
+ corrected: str
65
+
62
66
  class HeadingCorrections(BaseModel):
63
- corrections: dict[int, str] # index → corrected heading
67
+ corrections: list[HeadingCorrection]
64
68
 
65
69
  # %% ../nbs/01_refine.ipynb 20
66
70
  prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
@@ -85,15 +89,16 @@ RULES - Apply these fixes in order:
85
89
 
86
90
  4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
87
91
 
88
- OUTPUT: Return a Python dictionary mapping index to corrected heading (without the index prefix).
92
+ OUTPUT: Return a list of corrections, where each correction has:
93
+ - index: the heading's index number
94
+ - corrected: the fixed heading text (without the index prefix)
89
95
  IMPORTANT: Preserve the " ... page N" suffix in all corrected headings.
90
- Only include entries that need changes.
96
+ Only include headings that need changes.
91
97
 
92
98
  Headings to analyze:
93
99
  {headings_list}
94
100
  """
95
101
 
96
-
97
102
  # %% ../nbs/01_refine.ipynb 22
98
103
  def fix_hdg_hierarchy(
99
104
  hdgs: list[str], # List of markdown headings
@@ -106,7 +111,8 @@ def fix_hdg_hierarchy(
106
111
  if prompt is None: prompt = prompt_fix_hdgs
107
112
  prompt = prompt.format(headings_list=fmt_hdgs_idx(hdgs))
108
113
  r = completion(model=model, messages=[{"role": "user", "content": prompt}], response_format=HeadingCorrections, api_key=api_key)
109
- return json.loads(r.choices[0].message.content)['corrections']
114
+ fixes = json.loads(r.choices[0].message.content)['corrections']
115
+ return {o['index']: o['corrected'] for o in fixes}
110
116
 
111
117
 
112
118
  # %% ../nbs/01_refine.ipynb 25
@@ -120,7 +126,7 @@ def mk_fixes_lut(
120
126
  "Make a lookup table of fixes"
121
127
  if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
122
128
  fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
123
- return {hdgs[int(k)]:v for k,v in fixes.items()}
129
+ return {hdgs[k]:v for k,v in fixes.items()}
124
130
 
125
131
  # %% ../nbs/01_refine.ipynb 28
126
132
  def apply_hdg_fixes(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.2.4
3
+ Version: 0.2.6
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -76,6 +76,14 @@ fundamental challenges that raw OCR output leaves unsolved:
76
76
  **In short**: Complete PDF OCR with heading hierarchy fixes and image
77
77
  descriptions for RAG and LLM pipelines.
78
78
 
79
+ > [!NOTE]
80
+ >
81
+ > **Want to see mistocr in action?** This
82
+ > [tutorial](https://share.solve.it.com/d/97f75412ca949af76a5945b4dfc443c7)
83
+ > demonstrates real-world PDF processing and shows how clean markdown
84
+ > enables structure-aware navigation through long documents—letting you
85
+ > find exactly what you need, fast.
86
+
79
87
  ## Get Started
80
88
 
81
89
  Install latest from [pypi](https://pypi.org/project/mistocr), then:
@@ -0,0 +1,11 @@
1
+ mistocr/__init__.py,sha256=Oz5HbwHMyE87nmwV80AZzpkJPf-wBg7eDuJr_BXZkhU,22
2
+ mistocr/_modidx.py,sha256=55k_EmMPrRmLa04qCE-_8lukJPCax66q5MALfpa8T4A,4154
3
+ mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
4
+ mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
5
+ mistocr/refine.py,sha256=EXlCKiC16dnQfPKHUguDwypnhSQ3vK2TKdkPfkSWras,11976
6
+ mistocr-0.2.6.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
+ mistocr-0.2.6.dist-info/METADATA,sha256=KHkqvB4eYBpPKVsj9nUg0dwmjMrHfWEcAyOCUFAHgTk,8416
8
+ mistocr-0.2.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
+ mistocr-0.2.6.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
+ mistocr-0.2.6.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
+ mistocr-0.2.6.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- mistocr/__init__.py,sha256=SBl2EPFW-ltPvQ7vbVWItyAsz3aKYIpjO7vcfr84GkU,22
2
- mistocr/_modidx.py,sha256=JtXJb6ErVwyGFCWgeVz7N9JvybBp8Zyao4Ahn3smr90,4030
3
- mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
4
- mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
5
- mistocr/refine.py,sha256=arJPOg1eP4MwtkD1zwnYY4EFrBfTTSP-mtR4AVnTiR8,11788
6
- mistocr-0.2.4.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
- mistocr-0.2.4.dist-info/METADATA,sha256=CWC4FuabSb3xj9qfV-R_lDKo1QJ8kAk3V-br6Y6AsOk,8105
8
- mistocr-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
- mistocr-0.2.4.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
- mistocr-0.2.4.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
- mistocr-0.2.4.dist-info/RECORD,,