mistocr 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mistocr/__init__.py +1 -1
- mistocr/_modidx.py +3 -1
- mistocr/refine.py +35 -19
- {mistocr-0.1.3.dist-info → mistocr-0.1.5.dist-info}/METADATA +1 -1
- mistocr-0.1.5.dist-info/RECORD +10 -0
- mistocr-0.1.3.dist-info/RECORD +0 -10
- {mistocr-0.1.3.dist-info → mistocr-0.1.5.dist-info}/WHEEL +0 -0
- {mistocr-0.1.3.dist-info → mistocr-0.1.5.dist-info}/entry_points.txt +0 -0
- {mistocr-0.1.3.dist-info → mistocr-0.1.5.dist-info}/licenses/LICENSE +0 -0
- {mistocr-0.1.3.dist-info → mistocr-0.1.5.dist-info}/top_level.txt +0 -0
mistocr/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.1.3"
|
|
1
|
+
__version__ = "0.1.5"
|
mistocr/_modidx.py
CHANGED
|
@@ -21,9 +21,11 @@ d = { 'settings': { 'branch': 'main',
|
|
|
21
21
|
'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
|
|
22
22
|
'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
|
|
23
23
|
'mistocr.refine': { 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
|
|
24
|
+
'mistocr.refine.add_pg_hdgs': ('refine.html#add_pg_hdgs', 'mistocr/refine.py'),
|
|
24
25
|
'mistocr.refine.apply_hdg_fixes': ('refine.html#apply_hdg_fixes', 'mistocr/refine.py'),
|
|
25
26
|
'mistocr.refine.fix_hdg_hierarchy': ('refine.html#fix_hdg_hierarchy', 'mistocr/refine.py'),
|
|
26
27
|
'mistocr.refine.fix_md_hdgs': ('refine.html#fix_md_hdgs', 'mistocr/refine.py'),
|
|
27
28
|
'mistocr.refine.fmt_hdgs_idx': ('refine.html#fmt_hdgs_idx', 'mistocr/refine.py'),
|
|
28
29
|
'mistocr.refine.get_hdgs': ('refine.html#get_hdgs', 'mistocr/refine.py'),
|
|
29
|
-
'mistocr.refine.mk_fixes_lut': ('refine.html#mk_fixes_lut', 'mistocr/refine.py')
|
|
30
|
+
'mistocr.refine.mk_fixes_lut': ('refine.html#mk_fixes_lut', 'mistocr/refine.py'),
|
|
31
|
+
'mistocr.refine.read_pgs_pg': ('refine.html#read_pgs_pg', 'mistocr/refine.py')}}}
|
mistocr/refine.py
CHANGED
|
@@ -3,8 +3,8 @@
|
|
|
3
3
|
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
|
|
4
4
|
|
|
5
5
|
# %% auto 0
|
|
6
|
-
__all__ = ['prompt_fix_hdgs', 'get_hdgs', '
|
|
7
|
-
'apply_hdg_fixes', 'fix_md_hdgs']
|
|
6
|
+
__all__ = ['prompt_fix_hdgs', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx', 'HeadingCorrections', 'fix_hdg_hierarchy',
|
|
7
|
+
'mk_fixes_lut', 'apply_hdg_fixes', 'fix_md_hdgs']
|
|
8
8
|
|
|
9
9
|
# %% ../nbs/01_refine.ipynb 3
|
|
10
10
|
from fastcore.all import *
|
|
@@ -15,7 +15,7 @@ from lisette.core import completion
|
|
|
15
15
|
import os
|
|
16
16
|
import json
|
|
17
17
|
|
|
18
|
-
# %% ../nbs/01_refine.ipynb
|
|
18
|
+
# %% ../nbs/01_refine.ipynb 8
|
|
19
19
|
def get_hdgs(
|
|
20
20
|
md:str # Markdown file string
|
|
21
21
|
):
|
|
@@ -26,7 +26,20 @@ def get_hdgs(
|
|
|
26
26
|
|
|
27
27
|
|
|
28
28
|
|
|
29
|
-
# %% ../nbs/01_refine.ipynb
|
|
29
|
+
# %% ../nbs/01_refine.ipynb 9
|
|
30
|
+
def add_pg_hdgs(md, n):
|
|
31
|
+
"Add page number to all headings in markdown"
|
|
32
|
+
md = sub(r'```[\s\S]*?```', '', md)
|
|
33
|
+
def repl(m): return m.group(0) + f' ... page {n}'
|
|
34
|
+
return sub(r'^#{1,6} .+$', repl, md, flags=MULTILINE)
|
|
35
|
+
|
|
36
|
+
# %% ../nbs/01_refine.ipynb 11
|
|
37
|
+
def read_pgs_pg(path):
|
|
38
|
+
"Read all pages of a markdown file and add page numbers to all headings"
|
|
39
|
+
pgs = read_pgs(path, join=False)
|
|
40
|
+
return L([add_pg_hdgs(pg, n) for n, pg in enumerate(pgs, 1)]).concat()
|
|
41
|
+
|
|
42
|
+
# %% ../nbs/01_refine.ipynb 15
|
|
30
43
|
def fmt_hdgs_idx(
|
|
31
44
|
hdgs: list[str] # List of markdown headings
|
|
32
45
|
) -> str: # Formatted string with index
|
|
@@ -34,19 +47,21 @@ def fmt_hdgs_idx(
|
|
|
34
47
|
return '\n'.join(f"{i}. {h}" for i, h in enumerate(hdgs))
|
|
35
48
|
|
|
36
49
|
|
|
37
|
-
# %% ../nbs/01_refine.ipynb
|
|
50
|
+
# %% ../nbs/01_refine.ipynb 18
|
|
38
51
|
class HeadingCorrections(BaseModel):
|
|
39
52
|
corrections: dict[int, str] # index → corrected heading
|
|
40
53
|
|
|
41
|
-
# %% ../nbs/01_refine.ipynb
|
|
54
|
+
# %% ../nbs/01_refine.ipynb 20
|
|
42
55
|
prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
|
|
43
56
|
|
|
44
|
-
INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title")
|
|
57
|
+
INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
|
|
45
58
|
|
|
46
59
|
RULES - Apply these fixes in order:
|
|
47
60
|
|
|
48
|
-
1. **Single H1 rule**: Documents must have exactly ONE # heading (the title
|
|
49
|
-
-
|
|
61
|
+
1. **Single H1 rule**: Documents must have exactly ONE # heading (typically the document title at the top)
|
|
62
|
+
- If index 0 is already #, then all subsequent headings (index 1+) must be ## or deeper
|
|
63
|
+
- If no H1 exists, the first major heading should be #, and all others ## or deeper
|
|
64
|
+
- NO exceptions: appendices, references, and all sections are ## or deeper after the title
|
|
50
65
|
|
|
51
66
|
2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
|
|
52
67
|
- Parent section (e.g., "1", "2", "A") should be shallower than child (e.g., "1.1", "2.a", "A.1")
|
|
@@ -60,13 +75,15 @@ RULES - Apply these fixes in order:
|
|
|
60
75
|
4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
|
|
61
76
|
|
|
62
77
|
OUTPUT: Return a Python dictionary mapping index to corrected heading (without the index prefix).
|
|
78
|
+
IMPORTANT: Preserve the " ... page N" suffix in all corrected headings.
|
|
63
79
|
Only include entries that need changes.
|
|
64
80
|
|
|
65
81
|
Headings to analyze:
|
|
66
82
|
{headings_list}
|
|
67
83
|
"""
|
|
68
84
|
|
|
69
|
-
|
|
85
|
+
|
|
86
|
+
# %% ../nbs/01_refine.ipynb 21
|
|
70
87
|
def fix_hdg_hierarchy(
|
|
71
88
|
hdgs: list[str], # List of markdown headings
|
|
72
89
|
prompt: str=prompt_fix_hdgs, # Prompt to use
|
|
@@ -82,7 +99,7 @@ def fix_hdg_hierarchy(
|
|
|
82
99
|
)
|
|
83
100
|
return json.loads(r.choices[0].message.content)['corrections']
|
|
84
101
|
|
|
85
|
-
# %% ../nbs/01_refine.ipynb
|
|
102
|
+
# %% ../nbs/01_refine.ipynb 24
|
|
86
103
|
def mk_fixes_lut(
|
|
87
104
|
hdgs: list[str], # List of markdown headings
|
|
88
105
|
model: str='claude-sonnet-4-5', # Model to use
|
|
@@ -92,26 +109,25 @@ def mk_fixes_lut(
|
|
|
92
109
|
fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key)
|
|
93
110
|
return {hdgs[int(k)]:v for k,v in fixes.items()}
|
|
94
111
|
|
|
95
|
-
# %% ../nbs/01_refine.ipynb
|
|
112
|
+
# %% ../nbs/01_refine.ipynb 27
|
|
96
113
|
def apply_hdg_fixes(
|
|
97
114
|
p:str, # Page to fix
|
|
98
115
|
lut_fixes: dict[str, str], # Lookup table of fixes
|
|
99
|
-
pg: int=None, # Optionnaly specify the page number to append to original heading
|
|
100
116
|
) -> str: # Page with fixes applied
|
|
101
117
|
"Apply the fixes to the page"
|
|
102
|
-
for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old)
|
|
118
|
+
for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
|
|
103
119
|
return p
|
|
104
120
|
|
|
105
|
-
# %% ../nbs/01_refine.ipynb
|
|
121
|
+
# %% ../nbs/01_refine.ipynb 30
|
|
106
122
|
def fix_md_hdgs(
|
|
107
123
|
src:str, # Source directory with markdown pages
|
|
108
124
|
model:str='claude-sonnet-4-5', # Model
|
|
109
125
|
dst:str=None, # Destination directory (None=overwrite)
|
|
110
|
-
pg_nums:bool=True # Add page numbers
|
|
111
126
|
):
|
|
112
127
|
"Fix heading hierarchy in markdown document"
|
|
113
128
|
src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
|
|
114
129
|
if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
|
|
115
|
-
|
|
116
|
-
for
|
|
117
|
-
|
|
130
|
+
pgs_with_pg = read_pgs_pg(src_path)
|
|
131
|
+
lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model)
|
|
132
|
+
for i,p in enumerate(pgs_with_pg, 1):
|
|
133
|
+
(dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
mistocr/__init__.py,sha256=rPSfWgIeq2YWVPyESOAwCBt8vftsTpIkuLAGDEzyRQc,22
|
|
2
|
+
mistocr/_modidx.py,sha256=sZ3ISGF-2f7VEOD9MVgqMVs5SifUNe-1YP0wy8Ey0cU,2884
|
|
3
|
+
mistocr/core.py,sha256=wtaYZ_Fz0dXIb1cYLAtymwR9Z7_KBI4ULy-UpM3YTX4,7154
|
|
4
|
+
mistocr/refine.py,sha256=314r4MBZRIvUmu6B_dvvq9P4d4a_japKBpsg4wnU9oU,5253
|
|
5
|
+
mistocr-0.1.5.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
6
|
+
mistocr-0.1.5.dist-info/METADATA,sha256=kcrDK0kJadP5Sze0tVzRo-pLWWTJQiEnvwlLpWJZz2o,4848
|
|
7
|
+
mistocr-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
8
|
+
mistocr-0.1.5.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
9
|
+
mistocr-0.1.5.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
10
|
+
mistocr-0.1.5.dist-info/RECORD,,
|
mistocr-0.1.3.dist-info/RECORD
DELETED
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
mistocr/__init__.py,sha256=XEqb2aiIn8fzGE68Mph4ck1FtQqsR_am0wRWvrYPffQ,22
|
|
2
|
-
mistocr/_modidx.py,sha256=R9zVMv4dKz2sLStoB5wBoKRqjza216_z8xPXszoplU4,2660
|
|
3
|
-
mistocr/core.py,sha256=wtaYZ_Fz0dXIb1cYLAtymwR9Z7_KBI4ULy-UpM3YTX4,7154
|
|
4
|
-
mistocr/refine.py,sha256=572SDG8vhGjNMiET5eZhgVemNpUIHNFqi0ZSSl4eKCM,4545
|
|
5
|
-
mistocr-0.1.3.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
6
|
-
mistocr-0.1.3.dist-info/METADATA,sha256=jHRc6nm_uk7V-03y6Bd268hUWmkkOFNdt4s5cH3YPu0,4848
|
|
7
|
-
mistocr-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
8
|
-
mistocr-0.1.3.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
9
|
-
mistocr-0.1.3.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
10
|
-
mistocr-0.1.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|