mistocr 0.2.5__tar.gz → 0.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mistocr-0.2.5/mistocr.egg-info → mistocr-0.2.6}/PKG-INFO +1 -1
- mistocr-0.2.6/mistocr/__init__.py +1 -0
- {mistocr-0.2.5 → mistocr-0.2.6}/mistocr/_modidx.py +2 -1
- {mistocr-0.2.5 → mistocr-0.2.6}/mistocr/refine.py +15 -9
- {mistocr-0.2.5 → mistocr-0.2.6/mistocr.egg-info}/PKG-INFO +1 -1
- {mistocr-0.2.5 → mistocr-0.2.6}/settings.ini +1 -1
- mistocr-0.2.5/mistocr/__init__.py +0 -1
- {mistocr-0.2.5 → mistocr-0.2.6}/LICENSE +0 -0
- {mistocr-0.2.5 → mistocr-0.2.6}/MANIFEST.in +0 -0
- {mistocr-0.2.5 → mistocr-0.2.6}/README.md +0 -0
- {mistocr-0.2.5 → mistocr-0.2.6}/mistocr/core.py +0 -0
- {mistocr-0.2.5 → mistocr-0.2.6}/mistocr/pipeline.py +0 -0
- {mistocr-0.2.5 → mistocr-0.2.6}/mistocr.egg-info/SOURCES.txt +0 -0
- {mistocr-0.2.5 → mistocr-0.2.6}/mistocr.egg-info/dependency_links.txt +0 -0
- {mistocr-0.2.5 → mistocr-0.2.6}/mistocr.egg-info/entry_points.txt +0 -0
- {mistocr-0.2.5 → mistocr-0.2.6}/mistocr.egg-info/not-zip-safe +0 -0
- {mistocr-0.2.5 → mistocr-0.2.6}/mistocr.egg-info/requires.txt +0 -0
- {mistocr-0.2.5 → mistocr-0.2.6}/mistocr.egg-info/top_level.txt +0 -0
- {mistocr-0.2.5 → mistocr-0.2.6}/pyproject.toml +0 -0
- {mistocr-0.2.5 → mistocr-0.2.6}/setup.cfg +0 -0
- {mistocr-0.2.5 → mistocr-0.2.6}/setup.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.6"
|
|
@@ -21,7 +21,8 @@ d = { 'settings': { 'branch': 'main',
|
|
|
21
21
|
'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
|
|
22
22
|
'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
|
|
23
23
|
'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
|
|
24
|
-
'mistocr.refine': { 'mistocr.refine.
|
|
24
|
+
'mistocr.refine': { 'mistocr.refine.HeadingCorrection': ('refine.html#headingcorrection', 'mistocr/refine.py'),
|
|
25
|
+
'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
|
|
25
26
|
'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
|
|
26
27
|
'mistocr.refine.add_descs_to_pg': ('refine.html#add_descs_to_pg', 'mistocr/refine.py'),
|
|
27
28
|
'mistocr.refine.add_descs_to_pgs': ('refine.html#add_descs_to_pgs', 'mistocr/refine.py'),
|
|
@@ -4,9 +4,9 @@
|
|
|
4
4
|
|
|
5
5
|
# %% auto 0
|
|
6
6
|
__all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
|
|
7
|
-
'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes',
|
|
8
|
-
'
|
|
9
|
-
'add_img_descs']
|
|
7
|
+
'HeadingCorrection', 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes',
|
|
8
|
+
'fix_hdgs', 'ImgDescription', 'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs',
|
|
9
|
+
'add_descs_to_pg', 'add_descs_to_pgs', 'add_img_descs']
|
|
10
10
|
|
|
11
11
|
# %% ../nbs/01_refine.ipynb 3
|
|
12
12
|
from fastcore.all import *
|
|
@@ -59,8 +59,12 @@ def fmt_hdgs_idx(
|
|
|
59
59
|
|
|
60
60
|
|
|
61
61
|
# %% ../nbs/01_refine.ipynb 18
|
|
62
|
+
class HeadingCorrection(BaseModel):
|
|
63
|
+
index: int
|
|
64
|
+
corrected: str
|
|
65
|
+
|
|
62
66
|
class HeadingCorrections(BaseModel):
|
|
63
|
-
corrections:
|
|
67
|
+
corrections: list[HeadingCorrection]
|
|
64
68
|
|
|
65
69
|
# %% ../nbs/01_refine.ipynb 20
|
|
66
70
|
prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
|
|
@@ -85,15 +89,16 @@ RULES - Apply these fixes in order:
|
|
|
85
89
|
|
|
86
90
|
4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
|
|
87
91
|
|
|
88
|
-
OUTPUT: Return a
|
|
92
|
+
OUTPUT: Return a list of corrections, where each correction has:
|
|
93
|
+
- index: the heading's index number
|
|
94
|
+
- corrected: the fixed heading text (without the index prefix)
|
|
89
95
|
IMPORTANT: Preserve the " ... page N" suffix in all corrected headings.
|
|
90
|
-
Only include
|
|
96
|
+
Only include headings that need changes.
|
|
91
97
|
|
|
92
98
|
Headings to analyze:
|
|
93
99
|
{headings_list}
|
|
94
100
|
"""
|
|
95
101
|
|
|
96
|
-
|
|
97
102
|
# %% ../nbs/01_refine.ipynb 22
|
|
98
103
|
def fix_hdg_hierarchy(
|
|
99
104
|
hdgs: list[str], # List of markdown headings
|
|
@@ -106,7 +111,8 @@ def fix_hdg_hierarchy(
|
|
|
106
111
|
if prompt is None: prompt = prompt_fix_hdgs
|
|
107
112
|
prompt = prompt.format(headings_list=fmt_hdgs_idx(hdgs))
|
|
108
113
|
r = completion(model=model, messages=[{"role": "user", "content": prompt}], response_format=HeadingCorrections, api_key=api_key)
|
|
109
|
-
|
|
114
|
+
fixes = json.loads(r.choices[0].message.content)['corrections']
|
|
115
|
+
return {o['index']: o['corrected'] for o in fixes}
|
|
110
116
|
|
|
111
117
|
|
|
112
118
|
# %% ../nbs/01_refine.ipynb 25
|
|
@@ -120,7 +126,7 @@ def mk_fixes_lut(
|
|
|
120
126
|
"Make a lookup table of fixes"
|
|
121
127
|
if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
|
|
122
128
|
fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
|
|
123
|
-
return {hdgs[
|
|
129
|
+
return {hdgs[k]:v for k,v in fixes.items()}
|
|
124
130
|
|
|
125
131
|
# %% ../nbs/01_refine.ipynb 28
|
|
126
132
|
def apply_hdg_fixes(
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.2.5"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|