mistocr 0.0.4__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mistocr/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.0.4"
1
+ __version__ = "0.1.0"
mistocr/_modidx.py CHANGED
@@ -19,4 +19,11 @@ d = { 'settings': { 'branch': 'main',
19
19
  'mistocr.core.save_pages': ('core.html#save_pages', 'mistocr/core.py'),
20
20
  'mistocr.core.submit_batch': ('core.html#submit_batch', 'mistocr/core.py'),
21
21
  'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
22
- 'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')}}}
22
+ 'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
23
+ 'mistocr.refine': { 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
24
+ 'mistocr.refine.apply_hdg_fixes': ('refine.html#apply_hdg_fixes', 'mistocr/refine.py'),
25
+ 'mistocr.refine.fix_hdg_hierarchy': ('refine.html#fix_hdg_hierarchy', 'mistocr/refine.py'),
26
+ 'mistocr.refine.fix_md_hdgs': ('refine.html#fix_md_hdgs', 'mistocr/refine.py'),
27
+ 'mistocr.refine.fmt_hdgs_idx': ('refine.html#fmt_hdgs_idx', 'mistocr/refine.py'),
28
+ 'mistocr.refine.get_hdgs': ('refine.html#get_hdgs', 'mistocr/refine.py'),
29
+ 'mistocr.refine.mk_fixes_lut': ('refine.html#mk_fixes_lut', 'mistocr/refine.py')}}}
mistocr/core.py CHANGED
@@ -110,11 +110,11 @@ def save_images(
110
110
  # %% ../nbs/00_core.ipynb 32
111
111
  def save_page(
112
112
  page:dict, # Page dict,
113
- out_dir:str, # Directory to save page
113
+ dst:str, # Directory to save page
114
114
  img_dir:str='img' # Directory to save images
115
115
  ) -> None:
116
116
  "Save single page markdown and images"
117
- (out_dir / f"page_{page['index']+1}.md").write_text(page['markdown'])
117
+ (dst / f"page_{page['index']+1}.md").write_text(page['markdown'])
118
118
  if page.get('images'):
119
119
  img_dir.mkdir(exist_ok=True)
120
120
  save_images(page, img_dir)
@@ -122,15 +122,15 @@ def save_page(
122
122
  # %% ../nbs/00_core.ipynb 34
123
123
  def save_pages(
124
124
  ocr_resp:dict, # OCR response,
125
- out_dir:str, # Directory to save pages,
125
+ dst:str, # Directory to save pages,
126
126
  cid:str # Custom ID
127
127
  ) -> Path: # Output directory
128
128
  "Save markdown pages and images from OCR response to output directory"
129
- out_dir = Path(out_dir) / cid
130
- out_dir.mkdir(parents=True, exist_ok=True)
131
- img_dir = out_dir / 'img'
132
- for page in ocr_resp['pages']: save_page(page, out_dir, img_dir)
133
- return out_dir
129
+ dst = Path(dst) / cid
130
+ dst.mkdir(parents=True, exist_ok=True)
131
+ img_dir = dst / 'img'
132
+ for page in ocr_resp['pages']: save_page(page, dst, img_dir)
133
+ return dst
134
134
 
135
135
  # %% ../nbs/00_core.ipynb 40
136
136
  def _get_paths(path:str) -> list[Path]:
@@ -163,7 +163,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
163
163
  # %% ../nbs/00_core.ipynb 43
164
164
  def ocr(
165
165
  path:str, # Path to PDF file or folder,
166
- out_dir:str='md', # Directory to save markdown pages,
166
+ dst:str='md', # Directory to save markdown pages,
167
167
  inc_img:bool=True, # Include image in response,
168
168
  key:str=None, # API key,
169
169
  poll_interval:int=2 # Poll interval in seconds
@@ -172,18 +172,15 @@ def ocr(
172
172
  pdfs = _get_paths(path)
173
173
  entries, c = _prep_batch(pdfs, inc_img, key)
174
174
  results = _run_batch(entries, c, poll_interval)
175
- return L([save_pages(r['response']['body'], out_dir, r['custom_id']) for r in results])
175
+ return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
176
176
 
177
177
  # %% ../nbs/00_core.ipynb 48
178
178
  def read_pgs(
179
179
  path:str, # OCR output directory,
180
- pg:int=None, # Page number
181
- ) -> str:
180
+ join:bool=True # Join pages into single string
181
+ ) -> str|list[str]: # Joined string or list of page contents
182
182
  "Read specific page or all pages from OCR output directory"
183
183
  path = Path(path)
184
- if pg:
185
- pg_path = path / f'page_{pg}.md'
186
- if not pg_path.exists(): raise ValueError(f"Page {pg} not found")
187
- return pg_path.read_text()
188
184
  pgs = sorted(path.glob('page_*.md'), key=lambda p: int(p.stem.split('_')[1]))
189
- return '\n\n'.join([p.read_text() for p in pgs])
185
+ contents = L([p.read_text() for p in pgs])
186
+ return '\n\n'.join(contents) if join else contents
mistocr/refine.py ADDED
@@ -0,0 +1,113 @@
1
+ """Postprocess markdown files by fixing heading hierarchy and describing images"""
2
+
3
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
4
+
5
+ # %% auto 0
6
+ __all__ = ['prompt_fix_hdgs', 'get_hdgs', 'fmt_hdgs_idx', 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut',
7
+ 'apply_hdg_fixes', 'fix_md_hdgs']
8
+
9
+ # %% ../nbs/01_refine.ipynb 3
10
+ from fastcore.all import *
11
+ from .core import read_pgs
12
+ from re import sub, findall, MULTILINE
13
+ from pydantic import BaseModel
14
+ from lisette.core import completion
15
+ import os
16
+ import json
17
+
18
+ # %% ../nbs/01_refine.ipynb 7
19
def get_hdgs(
    md:str # Markdown file string
):
    "Collect every markdown heading line (`#` to `######`) found outside fenced code blocks"
    # Strip ``` fenced snippets first so '#' comments inside code are not picked up as headings
    without_fences = sub(r'```[\s\S]*?```', '', md)
    heading_lines = findall(r'^#{1,6} .+$', without_fences, MULTILINE)
    return L(heading_lines)
26
+
27
+
28
+
29
+ # %% ../nbs/01_refine.ipynb 10
30
def fmt_hdgs_idx(
    hdgs: list[str] # List of markdown headings
) -> str: # Formatted string with index
    "Render the headings one per line, each prefixed with its 0-based position and a dot"
    numbered = [f"{pos}. {hdg}" for pos, hdg in enumerate(hdgs)]
    return '\n'.join(numbered)
35
+
36
+
37
+ # %% ../nbs/01_refine.ipynb 13
38
# Schema handed to `completion(..., response_format=HeadingCorrections)` in `fix_hdg_hierarchy`;
# the reply is parsed as JSON and its 'corrections' entry is read back.
class HeadingCorrections(BaseModel):
    corrections: dict[int, str] # index → corrected heading
40
+
41
+ # %% ../nbs/01_refine.ipynb 15
42
# Prompt template for the heading-hierarchy fix. It is filled with `str.format`, so
# `{headings_list}` is the only placeholder and the doubled braces in the example
# (`{{ ... }}`) are literal braces once formatted.
prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.

INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title")

RULES - Only fix these errors:
1. **Level jumps**: Headings can only increase by one # at a time
- Wrong: 0. # Title → 1. #### Abstract
- Fixed: 0. # Title → 1. ## Abstract

2. **Numbering inconsistency**: Subsection numbers must be one level deeper
- Wrong: 4. ## 3. Section → 5. ## 3.1 Subsection
- Fixed: 4. ## 3. Section → 5. ### 3.1 Subsection

3. **Preserve working structure**: If sections are consistently marked, keep it

4. **Decreasing levels is OK**: Going from ### to ## is valid for new sections

OUTPUT: Return a Python dictionary mapping index to corrected heading (without the index prefix).
Only include entries that need changes. Example: {{1: '## Abstract', 15: '### PASCAL VOC'}}

Headings to analyze:
{headings_list}
"""
65
+
66
+ # %% ../nbs/01_refine.ipynb 16
67
def fix_hdg_hierarchy(
    hdgs: list[str], # List of markdown headings
    model: str='claude-sonnet-4-5', # Model to use
    api_key: str=None # API key (None = read `ANTHROPIC_API_KEY` from the environment at call time)
) -> dict[int, str]: # Dictionary of index → corrected heading
    "Ask the model which headings need a different level; return index → corrected heading"
    # Nothing to analyse: skip the (paid, slow) model call altogether
    if not hdgs: return {}
    # Resolve the key here rather than in the signature: a default of `os.getenv(...)` is
    # evaluated once at import time, so a key exported afterwards would never be seen.
    if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
    r = completion(
        model=model,
        messages=[{"role": "user", "content": prompt_fix_hdgs.format(headings_list=fmt_hdgs_idx(hdgs))}],
        response_format=HeadingCorrections,
        api_key=api_key
    )
    corrections = json.loads(r.choices[0].message.content)['corrections']
    # JSON object keys are always strings; convert them to the int indices the return type promises
    return {int(idx): hdg for idx, hdg in corrections.items()}
80
+
81
+ # %% ../nbs/01_refine.ipynb 19
82
def mk_fixes_lut(
    hdgs: list[str], # List of markdown headings
    model: str='claude-sonnet-4-5', # Model to use
    api_key: str=None # API key (None = read `ANTHROPIC_API_KEY` from the environment at call time)
) -> dict[str, str]: # Dictionary of old → new heading
    "Make a lookup table mapping each heading that needs fixing to its corrected form"
    # No headings means no fixes; avoid calling the model for nothing
    if not hdgs: return {}
    # Read the environment at call time (a signature default would be frozen at import time)
    if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
    fixes = fix_hdg_hierarchy(hdgs, model, api_key)
    lut = {}
    for k, new_hdg in fixes.items():
        idx = int(k)  # keys may arrive as strings when decoded from JSON
        # The indices come from model output: ignore any that do not point at a real heading
        # (a negative index would otherwise silently select a heading from the end).
        if 0 <= idx < len(hdgs): lut[hdgs[idx]] = new_hdg
    return lut
90
+
91
+ # %% ../nbs/01_refine.ipynb 22
92
def apply_hdg_fixes(
    p:str, # Page to fix
    lut_fixes: dict[str, str], # Lookup table of fixes
    pg: int=None, # Optionally specify the page number to append to original heading
) -> str: # Page with fixes applied
    "Apply the fixes to the page, rewriting each heading line exactly once"
    suffix = f' .... page {pg}' if pg else ''

    def _fix(m):
        txt = m.group(0)
        # Fenced code blocks are matched only so they can be passed through untouched:
        # '#' lines inside them are code comments, not headings.
        if txt.startswith('```'): return txt
        return lut_fixes.get(txt, txt) + suffix

    # One regex pass over whole heading lines. Repeated `str.replace` calls would rewrite
    # duplicate headings (and headings contained in longer ones, e.g. '# A' inside '## A')
    # several times, stacking the page suffix.
    return sub(r'```[\s\S]*?```|^#{1,6} .+$', _fix, p, flags=MULTILINE)
100
+
101
+ # %% ../nbs/01_refine.ipynb 25
102
def fix_md_hdgs(
    src:str, # Source directory with markdown pages
    model:str='claude-sonnet-4-5', # Model
    dst:str=None, # Destination directory (None=overwrite)
    pg_nums:bool=True # Add page numbers
):
    "Fix heading hierarchy across all pages in `src`, writing the result to `dst` (or back to `src`)"
    src_path = Path(src)
    dst_path = Path(dst) if dst else src_path
    if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
    # Read the pages from disk once and reuse them for both the document-wide heading
    # analysis and the per-page rewrite.
    pgs = read_pgs(src_path, join=False)
    # Same text `read_pgs(src_path)` produces when joining: pages separated by a blank line
    lut = mk_fixes_lut(get_hdgs('\n\n'.join(pgs)), model)
    for i, p in enumerate(pgs, 1):
        (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut, pg=i if pg_nums else None))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.0.4
3
+ Version: 0.1.0
4
4
  Summary: Simple batch OCR for PDFs using Mistral's state-of-the-art vision model
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -22,6 +22,7 @@ Requires-Dist: fastcore
22
22
  Requires-Dist: mistralai
23
23
  Requires-Dist: pillow
24
24
  Requires-Dist: dotenv
25
+ Requires-Dist: lisette
25
26
  Provides-Extra: dev
26
27
  Dynamic: author
27
28
  Dynamic: author-email
@@ -0,0 +1,10 @@
1
+ mistocr/__init__.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
2
+ mistocr/_modidx.py,sha256=R9zVMv4dKz2sLStoB5wBoKRqjza216_z8xPXszoplU4,2660
3
+ mistocr/core.py,sha256=wtaYZ_Fz0dXIb1cYLAtymwR9Z7_KBI4ULy-UpM3YTX4,7154
4
+ mistocr/refine.py,sha256=gWup79LGjmvKW5RyY1dRKUeAEt94mUJIeTZB3V4D-JE,4258
5
+ mistocr-0.1.0.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
6
+ mistocr-0.1.0.dist-info/METADATA,sha256=JOyUQONpYUmmGk2kFzMkxaIBrHwjC9CfmI7fc9qa6ms,4848
7
+ mistocr-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
+ mistocr-0.1.0.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
9
+ mistocr-0.1.0.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
10
+ mistocr-0.1.0.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- mistocr/__init__.py,sha256=1mptEzQihbdyqqzMgdns_j5ZGK9gz7hR2bsgA_TnjO4,22
2
- mistocr/_modidx.py,sha256=zA12OvdPdNkQ7K_oQx8rzto1mWnpQa3kyz8N-az6kMw,1843
3
- mistocr/core.py,sha256=qMV6ZFqs3PNHNUL6o6612WkWzOQiiA1jIKreAaYwORg,7239
4
- mistocr-0.0.4.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
5
- mistocr-0.0.4.dist-info/METADATA,sha256=01uXdXnZhKv334UNN1ZNlWCxNeozrptZpvAN9MFYIF4,4825
6
- mistocr-0.0.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
- mistocr-0.0.4.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
8
- mistocr-0.0.4.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
9
- mistocr-0.0.4.dist-info/RECORD,,