mistocr 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mistocr/__init__.py +1 -1
- mistocr/refine.py +34 -24
- {mistocr-0.1.4.dist-info → mistocr-0.1.6.dist-info}/METADATA +1 -1
- mistocr-0.1.6.dist-info/RECORD +10 -0
- mistocr-0.1.4.dist-info/RECORD +0 -10
- {mistocr-0.1.4.dist-info → mistocr-0.1.6.dist-info}/WHEEL +0 -0
- {mistocr-0.1.4.dist-info → mistocr-0.1.6.dist-info}/entry_points.txt +0 -0
- {mistocr-0.1.4.dist-info → mistocr-0.1.6.dist-info}/licenses/LICENSE +0 -0
- {mistocr-0.1.4.dist-info → mistocr-0.1.6.dist-info}/top_level.txt +0 -0
mistocr/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.1.4"
|
|
1
|
+
__version__ = "0.1.6"
|
mistocr/refine.py
CHANGED
|
@@ -12,13 +12,14 @@ from .core import read_pgs
|
|
|
12
12
|
from re import sub, findall, MULTILINE
|
|
13
13
|
from pydantic import BaseModel
|
|
14
14
|
from lisette.core import completion
|
|
15
|
+
from typing import Callable
|
|
15
16
|
import os
|
|
16
17
|
import json
|
|
17
18
|
|
|
18
|
-
# %% ../nbs/01_refine.ipynb
|
|
19
|
+
# %% ../nbs/01_refine.ipynb 7
|
|
19
20
|
def get_hdgs(
|
|
20
21
|
md:str # Markdown file string
|
|
21
|
-
):
|
|
22
|
+
) -> L: # L of strings
|
|
22
23
|
"Return the markdown headings"
|
|
23
24
|
# Sanitize removing '#' in python snippet if any
|
|
24
25
|
md = sub(r'```[\s\S]*?```', '', md)
|
|
@@ -26,15 +27,20 @@ def get_hdgs(
|
|
|
26
27
|
|
|
27
28
|
|
|
28
29
|
|
|
29
|
-
# %% ../nbs/01_refine.ipynb
|
|
30
|
-
def add_pg_hdgs(
|
|
31
|
-
|
|
30
|
+
# %% ../nbs/01_refine.ipynb 8
|
|
31
|
+
def add_pg_hdgs(
|
|
32
|
+
md:str, # Markdown file string,
|
|
33
|
+
n:int # Page number
|
|
34
|
+
) -> str: # Markdown file string
|
|
35
|
+
"Add page number to all headings in page markdown"
|
|
32
36
|
md = sub(r'```[\s\S]*?```', '', md)
|
|
33
37
|
def repl(m): return m.group(0) + f' ... page {n}'
|
|
34
38
|
return sub(r'^#{1,6} .+$', repl, md, flags=MULTILINE)
|
|
35
39
|
|
|
36
|
-
# %% ../nbs/01_refine.ipynb
|
|
37
|
-
def read_pgs_pg(
|
|
40
|
+
# %% ../nbs/01_refine.ipynb 12
|
|
41
|
+
def read_pgs_pg(
|
|
42
|
+
path:str # Path to the markdown file
|
|
43
|
+
) -> L: # List of markdown pages
|
|
38
44
|
"Read all pages of a markdown file and add page numbers to all headings"
|
|
39
45
|
pgs = read_pgs(path, join=False)
|
|
40
46
|
return L([add_pg_hdgs(pg, n) for n, pg in enumerate(pgs, 1)]).concat()
|
|
@@ -83,51 +89,55 @@ Headings to analyze:
|
|
|
83
89
|
"""
|
|
84
90
|
|
|
85
91
|
|
|
86
|
-
# %% ../nbs/01_refine.ipynb
|
|
92
|
+
# %% ../nbs/01_refine.ipynb 22
|
|
87
93
|
def fix_hdg_hierarchy(
|
|
88
94
|
hdgs: list[str], # List of markdown headings
|
|
89
|
-
prompt: str=
|
|
95
|
+
prompt: str=None, # Prompt to use
|
|
90
96
|
model: str='claude-sonnet-4-5', # Model to use
|
|
91
|
-
api_key: str=
|
|
97
|
+
api_key: str=None # API key
|
|
92
98
|
) -> dict[int, str]: # Dictionary of index → corrected heading
|
|
93
99
|
"Fix the heading hierarchy"
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
api_key=api_key
|
|
99
|
-
)
|
|
100
|
+
if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
|
|
101
|
+
if prompt is None: prompt = prompt_fix_hdgs
|
|
102
|
+
prompt = prompt.format(headings_list=fmt_hdgs_idx(hdgs))
|
|
103
|
+
r = completion(model=model, messages=[{"role": "user", "content": prompt}], response_format=HeadingCorrections, api_key=api_key)
|
|
100
104
|
return json.loads(r.choices[0].message.content)['corrections']
|
|
101
105
|
|
|
102
|
-
|
|
106
|
+
|
|
107
|
+
# %% ../nbs/01_refine.ipynb 25
|
|
108
|
+
@delegates(fix_hdg_hierarchy)
|
|
103
109
|
def mk_fixes_lut(
|
|
104
110
|
hdgs: list[str], # List of markdown headings
|
|
105
111
|
model: str='claude-sonnet-4-5', # Model to use
|
|
106
|
-
api_key: str=
|
|
112
|
+
api_key: str=None, # API key
|
|
113
|
+
**kwargs
|
|
107
114
|
) -> dict[str, str]: # Dictionary of old → new heading
|
|
108
115
|
"Make a lookup table of fixes"
|
|
109
|
-
|
|
116
|
+
if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
|
|
117
|
+
fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
|
|
110
118
|
return {hdgs[int(k)]:v for k,v in fixes.items()}
|
|
111
119
|
|
|
112
|
-
# %% ../nbs/01_refine.ipynb
|
|
120
|
+
# %% ../nbs/01_refine.ipynb 28
|
|
113
121
|
def apply_hdg_fixes(
|
|
114
122
|
p:str, # Page to fix
|
|
115
123
|
lut_fixes: dict[str, str], # Lookup table of fixes
|
|
116
124
|
) -> str: # Page with fixes applied
|
|
117
125
|
"Apply the fixes to the page"
|
|
118
|
-
#for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old) + (f' .... page {pg}' if pg else ''))
|
|
119
126
|
for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
|
|
120
127
|
return p
|
|
121
128
|
|
|
122
|
-
# %% ../nbs/01_refine.ipynb
|
|
129
|
+
# %% ../nbs/01_refine.ipynb 31
|
|
130
|
+
@delegates(mk_fixes_lut)
|
|
123
131
|
def fix_md_hdgs(
|
|
124
132
|
src:str, # Source directory with markdown pages
|
|
125
133
|
model:str='claude-sonnet-4-5', # Model
|
|
126
134
|
dst:str=None, # Destination directory (None=overwrite)
|
|
135
|
+
**kwargs
|
|
127
136
|
):
|
|
128
137
|
"Fix heading hierarchy in markdown document"
|
|
129
138
|
src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
|
|
130
139
|
if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
|
|
131
|
-
|
|
132
|
-
for
|
|
140
|
+
pgs_with_pg = read_pgs_pg(src_path)
|
|
141
|
+
lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model, **kwargs)
|
|
142
|
+
for i,p in enumerate(pgs_with_pg, 1):
|
|
133
143
|
(dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
mistocr/__init__.py,sha256=n3oM6B_EMz93NsTI18NNZd-jKFcUPzUkbIKj5VFK5ok,22
|
|
2
|
+
mistocr/_modidx.py,sha256=sZ3ISGF-2f7VEOD9MVgqMVs5SifUNe-1YP0wy8Ey0cU,2884
|
|
3
|
+
mistocr/core.py,sha256=wtaYZ_Fz0dXIb1cYLAtymwR9Z7_KBI4ULy-UpM3YTX4,7154
|
|
4
|
+
mistocr/refine.py,sha256=BjswapfwJvyfLYyiNw-2048nAWZae6ujeO7ELvXUrMM,5653
|
|
5
|
+
mistocr-0.1.6.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
6
|
+
mistocr-0.1.6.dist-info/METADATA,sha256=c2aZrFY597Zd1iRSm5istCZWKkK1QKFq5cktr-QLtzs,4848
|
|
7
|
+
mistocr-0.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
8
|
+
mistocr-0.1.6.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
9
|
+
mistocr-0.1.6.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
10
|
+
mistocr-0.1.6.dist-info/RECORD,,
|
mistocr-0.1.4.dist-info/RECORD
DELETED
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
mistocr/__init__.py,sha256=Wzf5T3NBDfhQoTnhnRNHSlAsE0XMqbclXG-M81Vas70,22
|
|
2
|
-
mistocr/_modidx.py,sha256=sZ3ISGF-2f7VEOD9MVgqMVs5SifUNe-1YP0wy8Ey0cU,2884
|
|
3
|
-
mistocr/core.py,sha256=wtaYZ_Fz0dXIb1cYLAtymwR9Z7_KBI4ULy-UpM3YTX4,7154
|
|
4
|
-
mistocr/refine.py,sha256=SMtZ2dvEE5iH8hL86Au8kXRB-M_uewaexB7RhMqpWnw,5313
|
|
5
|
-
mistocr-0.1.4.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
6
|
-
mistocr-0.1.4.dist-info/METADATA,sha256=k_4gVsWOX_w2pZGHv6G4FipRdWHCm76V2dw1TijpfjM,4848
|
|
7
|
-
mistocr-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
8
|
-
mistocr-0.1.4.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
9
|
-
mistocr-0.1.4.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
10
|
-
mistocr-0.1.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|