mistocr 0.0.4__tar.gz → 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mistocr-0.0.4/mistocr.egg-info → mistocr-0.1.5}/PKG-INFO +2 -1
- mistocr-0.1.5/mistocr/__init__.py +1 -0
- {mistocr-0.0.4 → mistocr-0.1.5}/mistocr/_modidx.py +10 -1
- {mistocr-0.0.4 → mistocr-0.1.5}/mistocr/core.py +14 -17
- mistocr-0.1.5/mistocr/refine.py +133 -0
- {mistocr-0.0.4 → mistocr-0.1.5/mistocr.egg-info}/PKG-INFO +2 -1
- {mistocr-0.0.4 → mistocr-0.1.5}/mistocr.egg-info/SOURCES.txt +1 -0
- {mistocr-0.0.4 → mistocr-0.1.5}/mistocr.egg-info/requires.txt +1 -0
- {mistocr-0.0.4 → mistocr-0.1.5}/settings.ini +2 -2
- mistocr-0.0.4/mistocr/__init__.py +0 -1
- {mistocr-0.0.4 → mistocr-0.1.5}/LICENSE +0 -0
- {mistocr-0.0.4 → mistocr-0.1.5}/MANIFEST.in +0 -0
- {mistocr-0.0.4 → mistocr-0.1.5}/README.md +0 -0
- {mistocr-0.0.4 → mistocr-0.1.5}/mistocr.egg-info/dependency_links.txt +0 -0
- {mistocr-0.0.4 → mistocr-0.1.5}/mistocr.egg-info/entry_points.txt +0 -0
- {mistocr-0.0.4 → mistocr-0.1.5}/mistocr.egg-info/not-zip-safe +0 -0
- {mistocr-0.0.4 → mistocr-0.1.5}/mistocr.egg-info/top_level.txt +0 -0
- {mistocr-0.0.4 → mistocr-0.1.5}/pyproject.toml +0 -0
- {mistocr-0.0.4 → mistocr-0.1.5}/setup.cfg +0 -0
- {mistocr-0.0.4 → mistocr-0.1.5}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mistocr
|
|
3
|
-
Version: 0.0.4
|
|
3
|
+
Version: 0.1.5
|
|
4
4
|
Summary: Simple batch OCR for PDFs using Mistral's state-of-the-art vision model
|
|
5
5
|
Home-page: https://github.com/franckalbinet/mistocr
|
|
6
6
|
Author: Solveit
|
|
@@ -22,6 +22,7 @@ Requires-Dist: fastcore
|
|
|
22
22
|
Requires-Dist: mistralai
|
|
23
23
|
Requires-Dist: pillow
|
|
24
24
|
Requires-Dist: dotenv
|
|
25
|
+
Requires-Dist: lisette
|
|
25
26
|
Provides-Extra: dev
|
|
26
27
|
Dynamic: author
|
|
27
28
|
Dynamic: author-email
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.5"
|
|
@@ -19,4 +19,13 @@ d = { 'settings': { 'branch': 'main',
|
|
|
19
19
|
'mistocr.core.save_pages': ('core.html#save_pages', 'mistocr/core.py'),
|
|
20
20
|
'mistocr.core.submit_batch': ('core.html#submit_batch', 'mistocr/core.py'),
|
|
21
21
|
'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
|
|
22
|
-
'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')}
|
|
22
|
+
'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
|
|
23
|
+
'mistocr.refine': { 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
|
|
24
|
+
'mistocr.refine.add_pg_hdgs': ('refine.html#add_pg_hdgs', 'mistocr/refine.py'),
|
|
25
|
+
'mistocr.refine.apply_hdg_fixes': ('refine.html#apply_hdg_fixes', 'mistocr/refine.py'),
|
|
26
|
+
'mistocr.refine.fix_hdg_hierarchy': ('refine.html#fix_hdg_hierarchy', 'mistocr/refine.py'),
|
|
27
|
+
'mistocr.refine.fix_md_hdgs': ('refine.html#fix_md_hdgs', 'mistocr/refine.py'),
|
|
28
|
+
'mistocr.refine.fmt_hdgs_idx': ('refine.html#fmt_hdgs_idx', 'mistocr/refine.py'),
|
|
29
|
+
'mistocr.refine.get_hdgs': ('refine.html#get_hdgs', 'mistocr/refine.py'),
|
|
30
|
+
'mistocr.refine.mk_fixes_lut': ('refine.html#mk_fixes_lut', 'mistocr/refine.py'),
|
|
31
|
+
'mistocr.refine.read_pgs_pg': ('refine.html#read_pgs_pg', 'mistocr/refine.py')}}}
|
|
@@ -110,11 +110,11 @@ def save_images(
|
|
|
110
110
|
# %% ../nbs/00_core.ipynb 32
|
|
111
111
|
def save_page(
|
|
112
112
|
page:dict, # Page dict,
|
|
113
|
-
|
|
113
|
+
dst:str, # Directory to save page
|
|
114
114
|
img_dir:str='img' # Directory to save images
|
|
115
115
|
) -> None:
|
|
116
116
|
"Save single page markdown and images"
|
|
117
|
-
(
|
|
117
|
+
(dst / f"page_{page['index']+1}.md").write_text(page['markdown'])
|
|
118
118
|
if page.get('images'):
|
|
119
119
|
img_dir.mkdir(exist_ok=True)
|
|
120
120
|
save_images(page, img_dir)
|
|
@@ -122,15 +122,15 @@ def save_page(
|
|
|
122
122
|
# %% ../nbs/00_core.ipynb 34
|
|
123
123
|
def save_pages(
|
|
124
124
|
ocr_resp:dict, # OCR response,
|
|
125
|
-
|
|
125
|
+
dst:str, # Directory to save pages,
|
|
126
126
|
cid:str # Custom ID
|
|
127
127
|
) -> Path: # Output directory
|
|
128
128
|
"Save markdown pages and images from OCR response to output directory"
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
img_dir =
|
|
132
|
-
for page in ocr_resp['pages']: save_page(page,
|
|
133
|
-
return
|
|
129
|
+
dst = Path(dst) / cid
|
|
130
|
+
dst.mkdir(parents=True, exist_ok=True)
|
|
131
|
+
img_dir = dst / 'img'
|
|
132
|
+
for page in ocr_resp['pages']: save_page(page, dst, img_dir)
|
|
133
|
+
return dst
|
|
134
134
|
|
|
135
135
|
# %% ../nbs/00_core.ipynb 40
|
|
136
136
|
def _get_paths(path:str) -> list[Path]:
|
|
@@ -163,7 +163,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
|
|
|
163
163
|
# %% ../nbs/00_core.ipynb 43
|
|
164
164
|
def ocr(
|
|
165
165
|
path:str, # Path to PDF file or folder,
|
|
166
|
-
|
|
166
|
+
dst:str='md', # Directory to save markdown pages,
|
|
167
167
|
inc_img:bool=True, # Include image in response,
|
|
168
168
|
key:str=None, # API key,
|
|
169
169
|
poll_interval:int=2 # Poll interval in seconds
|
|
@@ -172,18 +172,15 @@ def ocr(
|
|
|
172
172
|
pdfs = _get_paths(path)
|
|
173
173
|
entries, c = _prep_batch(pdfs, inc_img, key)
|
|
174
174
|
results = _run_batch(entries, c, poll_interval)
|
|
175
|
-
return L([save_pages(r['response']['body'],
|
|
175
|
+
return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
|
|
176
176
|
|
|
177
177
|
# %% ../nbs/00_core.ipynb 48
|
|
178
178
|
def read_pgs(
|
|
179
179
|
path:str, # OCR output directory,
|
|
180
|
-
|
|
181
|
-
) -> str:
|
|
180
|
+
join:bool=True # Join pages into single string
|
|
181
|
+
) -> str|list[str]: # Joined string or list of page contents
|
|
182
182
|
"Read specific page or all pages from OCR output directory"
|
|
183
183
|
path = Path(path)
|
|
184
|
-
if pg:
|
|
185
|
-
pg_path = path / f'page_{pg}.md'
|
|
186
|
-
if not pg_path.exists(): raise ValueError(f"Page {pg} not found")
|
|
187
|
-
return pg_path.read_text()
|
|
188
184
|
pgs = sorted(path.glob('page_*.md'), key=lambda p: int(p.stem.split('_')[1]))
|
|
189
|
-
|
|
185
|
+
contents = L([p.read_text() for p in pgs])
|
|
186
|
+
return '\n\n'.join(contents) if join else contents
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""Postprocess markdown files by fixing heading hierarchy and describint images"""
|
|
2
|
+
|
|
3
|
+
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
|
|
4
|
+
|
|
5
|
+
# %% auto 0
|
|
6
|
+
__all__ = ['prompt_fix_hdgs', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx', 'HeadingCorrections', 'fix_hdg_hierarchy',
|
|
7
|
+
'mk_fixes_lut', 'apply_hdg_fixes', 'fix_md_hdgs']
|
|
8
|
+
|
|
9
|
+
# %% ../nbs/01_refine.ipynb 3
|
|
10
|
+
from fastcore.all import *
|
|
11
|
+
from .core import read_pgs
|
|
12
|
+
from re import sub, findall, MULTILINE
|
|
13
|
+
from pydantic import BaseModel
|
|
14
|
+
from lisette.core import completion
|
|
15
|
+
import os
|
|
16
|
+
import json
|
|
17
|
+
|
|
18
|
+
# %% ../nbs/01_refine.ipynb 8
|
|
19
|
+
def get_hdgs(
|
|
20
|
+
md:str # Markdown file string
|
|
21
|
+
):
|
|
22
|
+
"Return the markdown headings"
|
|
23
|
+
# Sanitize removing '#' in python snippet if any
|
|
24
|
+
md = sub(r'```[\s\S]*?```', '', md)
|
|
25
|
+
return L(findall(r'^#{1,6} .+$', md, MULTILINE))
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# %% ../nbs/01_refine.ipynb 9
|
|
30
|
+
def add_pg_hdgs(md, n):
|
|
31
|
+
"Add page number to all headings in markdown"
|
|
32
|
+
md = sub(r'```[\s\S]*?```', '', md)
|
|
33
|
+
def repl(m): return m.group(0) + f' ... page {n}'
|
|
34
|
+
return sub(r'^#{1,6} .+$', repl, md, flags=MULTILINE)
|
|
35
|
+
|
|
36
|
+
# %% ../nbs/01_refine.ipynb 11
|
|
37
|
+
def read_pgs_pg(path):
|
|
38
|
+
"Read all pages of a markdown file and add page numbers to all headings"
|
|
39
|
+
pgs = read_pgs(path, join=False)
|
|
40
|
+
return L([add_pg_hdgs(pg, n) for n, pg in enumerate(pgs, 1)]).concat()
|
|
41
|
+
|
|
42
|
+
# %% ../nbs/01_refine.ipynb 15
|
|
43
|
+
def fmt_hdgs_idx(
|
|
44
|
+
hdgs: list[str] # List of markdown headings
|
|
45
|
+
) -> str: # Formatted string with index
|
|
46
|
+
"Format the headings with index"
|
|
47
|
+
return '\n'.join(f"{i}. {h}" for i, h in enumerate(hdgs))
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# %% ../nbs/01_refine.ipynb 18
|
|
51
|
+
class HeadingCorrections(BaseModel):
|
|
52
|
+
corrections: dict[int, str] # index → corrected heading
|
|
53
|
+
|
|
54
|
+
# %% ../nbs/01_refine.ipynb 20
|
|
55
|
+
prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
|
|
56
|
+
|
|
57
|
+
INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
|
|
58
|
+
|
|
59
|
+
RULES - Apply these fixes in order:
|
|
60
|
+
|
|
61
|
+
1. **Single H1 rule**: Documents must have exactly ONE # heading (typically the document title at the top)
|
|
62
|
+
- If index 0 is already #, then all subsequent headings (index 1+) must be ## or deeper
|
|
63
|
+
- If no H1 exists, the first major heading should be #, and all others ## or deeper
|
|
64
|
+
- NO exceptions: appendices, references, and all sections are ## or deeper after the title
|
|
65
|
+
|
|
66
|
+
2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
|
|
67
|
+
- Parent section (e.g., "1", "2", "A") should be shallower than child (e.g., "1.1", "2.a", "A.1")
|
|
68
|
+
- Child section should be one # deeper than parent
|
|
69
|
+
- Works with any numbering: "1/1.1/1.1.1", "A/A.1/A.1.a", "I/I.A/I.A.1", etc.
|
|
70
|
+
|
|
71
|
+
3. **Level jumps**: Headings can only increase by one # at a time when moving deeper
|
|
72
|
+
- Wrong: ## Section → ##### Subsection
|
|
73
|
+
- Fixed: ## Section → ### Subsection
|
|
74
|
+
|
|
75
|
+
4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
|
|
76
|
+
|
|
77
|
+
OUTPUT: Return a Python dictionary mapping index to corrected heading (without the index prefix).
|
|
78
|
+
IMPORTANT: Preserve the " ... page N" suffix in all corrected headings.
|
|
79
|
+
Only include entries that need changes.
|
|
80
|
+
|
|
81
|
+
Headings to analyze:
|
|
82
|
+
{headings_list}
|
|
83
|
+
"""
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# %% ../nbs/01_refine.ipynb 21
|
|
87
|
+
def fix_hdg_hierarchy(
|
|
88
|
+
hdgs: list[str], # List of markdown headings
|
|
89
|
+
prompt: str=prompt_fix_hdgs, # Prompt to use
|
|
90
|
+
model: str='claude-sonnet-4-5', # Model to use
|
|
91
|
+
api_key: str=os.getenv('ANTHROPIC_API_KEY') # API key
|
|
92
|
+
) -> dict[int, str]: # Dictionary of index → corrected heading
|
|
93
|
+
"Fix the heading hierarchy"
|
|
94
|
+
r = completion(
|
|
95
|
+
model=model,
|
|
96
|
+
messages=[{"role": "user", "content": prompt_fix_hdgs.format(headings_list=fmt_hdgs_idx(hdgs))}],
|
|
97
|
+
response_format=HeadingCorrections,
|
|
98
|
+
api_key=api_key
|
|
99
|
+
)
|
|
100
|
+
return json.loads(r.choices[0].message.content)['corrections']
|
|
101
|
+
|
|
102
|
+
# %% ../nbs/01_refine.ipynb 24
|
|
103
|
+
def mk_fixes_lut(
|
|
104
|
+
hdgs: list[str], # List of markdown headings
|
|
105
|
+
model: str='claude-sonnet-4-5', # Model to use
|
|
106
|
+
api_key: str=os.getenv('ANTHROPIC_API_KEY') # API key
|
|
107
|
+
) -> dict[str, str]: # Dictionary of old → new heading
|
|
108
|
+
"Make a lookup table of fixes"
|
|
109
|
+
fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key)
|
|
110
|
+
return {hdgs[int(k)]:v for k,v in fixes.items()}
|
|
111
|
+
|
|
112
|
+
# %% ../nbs/01_refine.ipynb 27
|
|
113
|
+
def apply_hdg_fixes(
|
|
114
|
+
p:str, # Page to fix
|
|
115
|
+
lut_fixes: dict[str, str], # Lookup table of fixes
|
|
116
|
+
) -> str: # Page with fixes applied
|
|
117
|
+
"Apply the fixes to the page"
|
|
118
|
+
for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
|
|
119
|
+
return p
|
|
120
|
+
|
|
121
|
+
# %% ../nbs/01_refine.ipynb 30
|
|
122
|
+
def fix_md_hdgs(
|
|
123
|
+
src:str, # Source directory with markdown pages
|
|
124
|
+
model:str='claude-sonnet-4-5', # Model
|
|
125
|
+
dst:str=None, # Destination directory (None=overwrite)
|
|
126
|
+
):
|
|
127
|
+
"Fix heading hierarchy in markdown document"
|
|
128
|
+
src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
|
|
129
|
+
if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
|
|
130
|
+
pgs_with_pg = read_pgs_pg(src_path)
|
|
131
|
+
lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model)
|
|
132
|
+
for i,p in enumerate(pgs_with_pg, 1):
|
|
133
|
+
(dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mistocr
|
|
3
|
-
Version: 0.0.4
|
|
3
|
+
Version: 0.1.5
|
|
4
4
|
Summary: Simple batch OCR for PDFs using Mistral's state-of-the-art vision model
|
|
5
5
|
Home-page: https://github.com/franckalbinet/mistocr
|
|
6
6
|
Author: Solveit
|
|
@@ -22,6 +22,7 @@ Requires-Dist: fastcore
|
|
|
22
22
|
Requires-Dist: mistralai
|
|
23
23
|
Requires-Dist: pillow
|
|
24
24
|
Requires-Dist: dotenv
|
|
25
|
+
Requires-Dist: lisette
|
|
25
26
|
Provides-Extra: dev
|
|
26
27
|
Dynamic: author
|
|
27
28
|
Dynamic: author-email
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
[DEFAULT]
|
|
2
2
|
repo = mistocr
|
|
3
3
|
lib_name = mistocr
|
|
4
|
-
version = 0.0.4
|
|
4
|
+
version = 0.1.5
|
|
5
5
|
min_python = 3.9
|
|
6
6
|
license = apache2
|
|
7
7
|
black_formatting = False
|
|
@@ -27,7 +27,7 @@ keywords = nbdev jupyter notebook python
|
|
|
27
27
|
language = English
|
|
28
28
|
status = 3
|
|
29
29
|
user = franckalbinet
|
|
30
|
-
requirements = fastcore mistralai pillow dotenv
|
|
30
|
+
requirements = fastcore mistralai pillow dotenv lisette
|
|
31
31
|
readme_nb = index.ipynb
|
|
32
32
|
allowed_metadata_keys =
|
|
33
33
|
allowed_cell_metadata_keys =
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.0.4"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|