mistocr 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mistocr/__init__.py +1 -1
- mistocr/refine.py +17 -14
- {mistocr-0.1.1.dist-info → mistocr-0.1.2.dist-info}/METADATA +1 -1
- mistocr-0.1.2.dist-info/RECORD +10 -0
- mistocr-0.1.1.dist-info/RECORD +0 -10
- {mistocr-0.1.1.dist-info → mistocr-0.1.2.dist-info}/WHEEL +0 -0
- {mistocr-0.1.1.dist-info → mistocr-0.1.2.dist-info}/entry_points.txt +0 -0
- {mistocr-0.1.1.dist-info → mistocr-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {mistocr-0.1.1.dist-info → mistocr-0.1.2.dist-info}/top_level.txt +0 -0
mistocr/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.1.1"
|
|
1
|
+
__version__ = "0.1.2"
|
mistocr/refine.py
CHANGED
|
@@ -43,27 +43,30 @@ prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the
|
|
|
43
43
|
|
|
44
44
|
INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title")
|
|
45
45
|
|
|
46
|
-
RULES -
|
|
47
|
-
1. **Level jumps**: Headings can only increase by one # at a time
|
|
48
|
-
- Wrong: 0. # Title → 1. #### Abstract
|
|
49
|
-
- Fixed: 0. # Title → 1. ## Abstract
|
|
46
|
+
RULES - Apply these fixes in order:
|
|
50
47
|
|
|
51
|
-
|
|
52
|
-
-
|
|
53
|
-
- Fixed: 4. ## 3. Section → 5. ### 3.1 Subsection
|
|
48
|
+
1. **Single H1 rule**: Documents must have exactly ONE # heading (the title/main heading)
|
|
49
|
+
- All other headings should be ## or deeper
|
|
54
50
|
|
|
55
|
-
|
|
51
|
+
2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
|
|
52
|
+
- Parent section (e.g., "1", "2", "A") should be shallower than child (e.g., "1.1", "2.a", "A.1")
|
|
53
|
+
- Child section should be one # deeper than parent
|
|
54
|
+
- Works with any numbering: "1/1.1/1.1.1", "A/A.1/A.1.a", "I/I.A/I.A.1", etc.
|
|
56
55
|
|
|
57
|
-
|
|
56
|
+
3. **Level jumps**: Headings can only increase by one # at a time when moving deeper
|
|
57
|
+
- Wrong: ## Section → ##### Subsection
|
|
58
|
+
- Fixed: ## Section → ### Subsection
|
|
59
|
+
|
|
60
|
+
4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
|
|
58
61
|
|
|
59
62
|
OUTPUT: Return a Python dictionary mapping index to corrected heading (without the index prefix).
|
|
60
|
-
Only include entries that need changes.
|
|
63
|
+
Only include entries that need changes.
|
|
61
64
|
|
|
62
65
|
Headings to analyze:
|
|
63
66
|
{headings_list}
|
|
64
67
|
"""
|
|
65
68
|
|
|
66
|
-
# %% ../nbs/01_refine.ipynb
|
|
69
|
+
# %% ../nbs/01_refine.ipynb 16
|
|
67
70
|
def fix_hdg_hierarchy(
|
|
68
71
|
hdgs: list[str], # List of markdown headings
|
|
69
72
|
prompt: str=prompt_fix_hdgs, # Prompt to use
|
|
@@ -79,7 +82,7 @@ def fix_hdg_hierarchy(
|
|
|
79
82
|
)
|
|
80
83
|
return json.loads(r.choices[0].message.content)['corrections']
|
|
81
84
|
|
|
82
|
-
# %% ../nbs/01_refine.ipynb
|
|
85
|
+
# %% ../nbs/01_refine.ipynb 19
|
|
83
86
|
def mk_fixes_lut(
|
|
84
87
|
hdgs: list[str], # List of markdown headings
|
|
85
88
|
model: str='claude-sonnet-4-5', # Model to use
|
|
@@ -89,7 +92,7 @@ def mk_fixes_lut(
|
|
|
89
92
|
fixes = fix_hdg_hierarchy(hdgs, model, api_key)
|
|
90
93
|
return {hdgs[int(k)]:v for k,v in fixes.items()}
|
|
91
94
|
|
|
92
|
-
# %% ../nbs/01_refine.ipynb
|
|
95
|
+
# %% ../nbs/01_refine.ipynb 22
|
|
93
96
|
def apply_hdg_fixes(
|
|
94
97
|
p:str, # Page to fix
|
|
95
98
|
lut_fixes: dict[str, str], # Lookup table of fixes
|
|
@@ -99,7 +102,7 @@ def apply_hdg_fixes(
|
|
|
99
102
|
for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old) + (f' .... page {pg}' if pg else ''))
|
|
100
103
|
return p
|
|
101
104
|
|
|
102
|
-
# %% ../nbs/01_refine.ipynb
|
|
105
|
+
# %% ../nbs/01_refine.ipynb 25
|
|
103
106
|
def fix_md_hdgs(
|
|
104
107
|
src:str, # Source directory with markdown pages
|
|
105
108
|
model:str='claude-sonnet-4-5', # Model
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
mistocr/__init__.py,sha256=YvuYzWnKtqBb-IqG8HAu-nhIYAsgj9Vmc_b9o7vO-js,22
|
|
2
|
+
mistocr/_modidx.py,sha256=R9zVMv4dKz2sLStoB5wBoKRqjza216_z8xPXszoplU4,2660
|
|
3
|
+
mistocr/core.py,sha256=wtaYZ_Fz0dXIb1cYLAtymwR9Z7_KBI4ULy-UpM3YTX4,7154
|
|
4
|
+
mistocr/refine.py,sha256=kZH-z55zNcIgmHSpHETkHjGCoZENmMvL9QJOpNic2GI,4531
|
|
5
|
+
mistocr-0.1.2.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
6
|
+
mistocr-0.1.2.dist-info/METADATA,sha256=AOoj8PRpmJDlfRgEFzfR7UQYa7ZI5cVTsXlopHzk6JM,4848
|
|
7
|
+
mistocr-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
8
|
+
mistocr-0.1.2.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
9
|
+
mistocr-0.1.2.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
10
|
+
mistocr-0.1.2.dist-info/RECORD,,
|
mistocr-0.1.1.dist-info/RECORD
DELETED
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
mistocr/__init__.py,sha256=rnObPjuBcEStqSO0S6gsdS_ot8ITOQjVj_-P1LUUYpg,22
|
|
2
|
-
mistocr/_modidx.py,sha256=R9zVMv4dKz2sLStoB5wBoKRqjza216_z8xPXszoplU4,2660
|
|
3
|
-
mistocr/core.py,sha256=wtaYZ_Fz0dXIb1cYLAtymwR9Z7_KBI4ULy-UpM3YTX4,7154
|
|
4
|
-
mistocr/refine.py,sha256=0N0omMZvPqydArPaiTdkX8tts4eS4AgfAJQP98WnvwY,4307
|
|
5
|
-
mistocr-0.1.1.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
6
|
-
mistocr-0.1.1.dist-info/METADATA,sha256=ZPvywkk_QS7UTfPtup5IppiLzf5aZ1tCSPkZLV9g4WE,4848
|
|
7
|
-
mistocr-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
8
|
-
mistocr-0.1.1.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
9
|
-
mistocr-0.1.1.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
10
|
-
mistocr-0.1.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|