mistocr 0.0.4__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.0.4
3
+ Version: 0.1.5
4
4
  Summary: Simple batch OCR for PDFs using Mistral's state-of-the-art vision model
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -22,6 +22,7 @@ Requires-Dist: fastcore
22
22
  Requires-Dist: mistralai
23
23
  Requires-Dist: pillow
24
24
  Requires-Dist: dotenv
25
+ Requires-Dist: lisette
25
26
  Provides-Extra: dev
26
27
  Dynamic: author
27
28
  Dynamic: author-email
@@ -0,0 +1 @@
1
+ __version__ = "0.1.5"
@@ -19,4 +19,13 @@ d = { 'settings': { 'branch': 'main',
19
19
  'mistocr.core.save_pages': ('core.html#save_pages', 'mistocr/core.py'),
20
20
  'mistocr.core.submit_batch': ('core.html#submit_batch', 'mistocr/core.py'),
21
21
  'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
22
- 'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')}}}
22
+ 'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
23
+ 'mistocr.refine': { 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
24
+ 'mistocr.refine.add_pg_hdgs': ('refine.html#add_pg_hdgs', 'mistocr/refine.py'),
25
+ 'mistocr.refine.apply_hdg_fixes': ('refine.html#apply_hdg_fixes', 'mistocr/refine.py'),
26
+ 'mistocr.refine.fix_hdg_hierarchy': ('refine.html#fix_hdg_hierarchy', 'mistocr/refine.py'),
27
+ 'mistocr.refine.fix_md_hdgs': ('refine.html#fix_md_hdgs', 'mistocr/refine.py'),
28
+ 'mistocr.refine.fmt_hdgs_idx': ('refine.html#fmt_hdgs_idx', 'mistocr/refine.py'),
29
+ 'mistocr.refine.get_hdgs': ('refine.html#get_hdgs', 'mistocr/refine.py'),
30
+ 'mistocr.refine.mk_fixes_lut': ('refine.html#mk_fixes_lut', 'mistocr/refine.py'),
31
+ 'mistocr.refine.read_pgs_pg': ('refine.html#read_pgs_pg', 'mistocr/refine.py')}}}
@@ -110,11 +110,11 @@ def save_images(
110
110
  # %% ../nbs/00_core.ipynb 32
111
111
  def save_page(
112
112
  page:dict, # Page dict,
113
- out_dir:str, # Directory to save page
113
+ dst:str, # Directory to save page
114
114
  img_dir:str='img' # Directory to save images
115
115
  ) -> None:
116
116
  "Save single page markdown and images"
117
- (out_dir / f"page_{page['index']+1}.md").write_text(page['markdown'])
117
+ (dst / f"page_{page['index']+1}.md").write_text(page['markdown'])
118
118
  if page.get('images'):
119
119
  img_dir.mkdir(exist_ok=True)
120
120
  save_images(page, img_dir)
@@ -122,15 +122,15 @@ def save_page(
122
122
  # %% ../nbs/00_core.ipynb 34
123
123
  def save_pages(
124
124
  ocr_resp:dict, # OCR response,
125
- out_dir:str, # Directory to save pages,
125
+ dst:str, # Directory to save pages,
126
126
  cid:str # Custom ID
127
127
  ) -> Path: # Output directory
128
128
  "Save markdown pages and images from OCR response to output directory"
129
- out_dir = Path(out_dir) / cid
130
- out_dir.mkdir(parents=True, exist_ok=True)
131
- img_dir = out_dir / 'img'
132
- for page in ocr_resp['pages']: save_page(page, out_dir, img_dir)
133
- return out_dir
129
+ dst = Path(dst) / cid
130
+ dst.mkdir(parents=True, exist_ok=True)
131
+ img_dir = dst / 'img'
132
+ for page in ocr_resp['pages']: save_page(page, dst, img_dir)
133
+ return dst
134
134
 
135
135
  # %% ../nbs/00_core.ipynb 40
136
136
  def _get_paths(path:str) -> list[Path]:
@@ -163,7 +163,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
163
163
  # %% ../nbs/00_core.ipynb 43
164
164
  def ocr(
165
165
  path:str, # Path to PDF file or folder,
166
- out_dir:str='md', # Directory to save markdown pages,
166
+ dst:str='md', # Directory to save markdown pages,
167
167
  inc_img:bool=True, # Include image in response,
168
168
  key:str=None, # API key,
169
169
  poll_interval:int=2 # Poll interval in seconds
@@ -172,18 +172,15 @@ def ocr(
172
172
  pdfs = _get_paths(path)
173
173
  entries, c = _prep_batch(pdfs, inc_img, key)
174
174
  results = _run_batch(entries, c, poll_interval)
175
- return L([save_pages(r['response']['body'], out_dir, r['custom_id']) for r in results])
175
+ return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
176
176
 
177
177
  # %% ../nbs/00_core.ipynb 48
178
178
  def read_pgs(
179
179
  path:str, # OCR output directory,
180
- pg:int=None, # Page number
181
- ) -> str:
180
+ join:bool=True # Join pages into single string
181
+ ) -> str|list[str]: # Joined string or list of page contents
182
182
  "Read all pages from OCR output directory, joined into one string or as a list of page contents"
183
183
  path = Path(path)
184
- if pg:
185
- pg_path = path / f'page_{pg}.md'
186
- if not pg_path.exists(): raise ValueError(f"Page {pg} not found")
187
- return pg_path.read_text()
188
184
  pgs = sorted(path.glob('page_*.md'), key=lambda p: int(p.stem.split('_')[1]))
189
- return '\n\n'.join([p.read_text() for p in pgs])
185
+ contents = L([p.read_text() for p in pgs])
186
+ return '\n\n'.join(contents) if join else contents
@@ -0,0 +1,133 @@
1
+ """Postprocess markdown files by fixing heading hierarchy and describing images"""
2
+
3
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
4
+
5
+ # %% auto 0
6
+ __all__ = ['prompt_fix_hdgs', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx', 'HeadingCorrections', 'fix_hdg_hierarchy',
7
+ 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_md_hdgs']
8
+
9
+ # %% ../nbs/01_refine.ipynb 3
10
+ from fastcore.all import *
11
+ from .core import read_pgs
12
+ from re import sub, findall, MULTILINE
13
+ from pydantic import BaseModel
14
+ from lisette.core import completion
15
+ import os
16
+ import json
17
+
18
+ # %% ../nbs/01_refine.ipynb 8
19
+ def get_hdgs(
20
+ md:str # Markdown file string
21
+ ):
22
+ "Return the markdown headings"
23
+ # Sanitize removing '#' in python snippet if any
24
+ md = sub(r'```[\s\S]*?```', '', md)
25
+ return L(findall(r'^#{1,6} .+$', md, MULTILINE))
26
+
27
+
28
+
29
+ # %% ../nbs/01_refine.ipynb 9
30
+ def add_pg_hdgs(md, n):
31
+ "Add page number to all headings in markdown"
32
+ md = sub(r'```[\s\S]*?```', '', md)
33
+ def repl(m): return m.group(0) + f' ... page {n}'
34
+ return sub(r'^#{1,6} .+$', repl, md, flags=MULTILINE)
35
+
36
+ # %% ../nbs/01_refine.ipynb 11
37
+ def read_pgs_pg(path):
38
+ "Read all pages of a markdown file and add page numbers to all headings"
39
+ pgs = read_pgs(path, join=False)
40
+ return L([add_pg_hdgs(pg, n) for n, pg in enumerate(pgs, 1)]).concat()
41
+
42
+ # %% ../nbs/01_refine.ipynb 15
43
+ def fmt_hdgs_idx(
44
+ hdgs: list[str] # List of markdown headings
45
+ ) -> str: # Formatted string with index
46
+ "Format the headings with index"
47
+ return '\n'.join(f"{i}. {h}" for i, h in enumerate(hdgs))
48
+
49
+
50
+ # %% ../nbs/01_refine.ipynb 18
51
+ class HeadingCorrections(BaseModel):
52
+ corrections: dict[int, str] # index → corrected heading
53
+
54
+ # %% ../nbs/01_refine.ipynb 20
55
+ prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
56
+
57
+ INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
58
+
59
+ RULES - Apply these fixes in order:
60
+
61
+ 1. **Single H1 rule**: Documents must have exactly ONE # heading (typically the document title at the top)
62
+ - If index 0 is already #, then all subsequent headings (index 1+) must be ## or deeper
63
+ - If no H1 exists, the first major heading should be #, and all others ## or deeper
64
+ - NO exceptions: appendices, references, and all sections are ## or deeper after the title
65
+
66
+ 2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
67
+ - Parent section (e.g., "1", "2", "A") should be shallower than child (e.g., "1.1", "2.a", "A.1")
68
+ - Child section should be one # deeper than parent
69
+ - Works with any numbering: "1/1.1/1.1.1", "A/A.1/A.1.a", "I/I.A/I.A.1", etc.
70
+
71
+ 3. **Level jumps**: Headings can only increase by one # at a time when moving deeper
72
+ - Wrong: ## Section → ##### Subsection
73
+ - Fixed: ## Section → ### Subsection
74
+
75
+ 4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
76
+
77
+ OUTPUT: Return a Python dictionary mapping index to corrected heading (without the index prefix).
78
+ IMPORTANT: Preserve the " ... page N" suffix in all corrected headings.
79
+ Only include entries that need changes.
80
+
81
+ Headings to analyze:
82
+ {headings_list}
83
+ """
84
+
85
+
86
+ # %% ../nbs/01_refine.ipynb 21
87
+ def fix_hdg_hierarchy(
88
+ hdgs: list[str], # List of markdown headings
89
+ prompt: str=prompt_fix_hdgs, # Prompt to use
90
+ model: str='claude-sonnet-4-5', # Model to use
91
+ api_key: str=os.getenv('ANTHROPIC_API_KEY') # API key
92
+ ) -> dict[int, str]: # Dictionary of index → corrected heading
93
+ "Fix the heading hierarchy"
94
+ r = completion(
95
+ model=model,
96
+ messages=[{"role": "user", "content": prompt_fix_hdgs.format(headings_list=fmt_hdgs_idx(hdgs))}],
97
+ response_format=HeadingCorrections,
98
+ api_key=api_key
99
+ )
100
+ return json.loads(r.choices[0].message.content)['corrections']
101
+
102
+ # %% ../nbs/01_refine.ipynb 24
103
+ def mk_fixes_lut(
104
+ hdgs: list[str], # List of markdown headings
105
+ model: str='claude-sonnet-4-5', # Model to use
106
+ api_key: str=os.getenv('ANTHROPIC_API_KEY') # API key
107
+ ) -> dict[str, str]: # Dictionary of old → new heading
108
+ "Make a lookup table of fixes"
109
+ fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key)
110
+ return {hdgs[int(k)]:v for k,v in fixes.items()}
111
+
112
+ # %% ../nbs/01_refine.ipynb 27
113
+ def apply_hdg_fixes(
114
+ p:str, # Page to fix
115
+ lut_fixes: dict[str, str], # Lookup table of fixes
116
+ ) -> str: # Page with fixes applied
117
+ "Apply the fixes to the page"
118
+ for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
119
+ return p
120
+
121
+ # %% ../nbs/01_refine.ipynb 30
122
+ def fix_md_hdgs(
123
+ src:str, # Source directory with markdown pages
124
+ model:str='claude-sonnet-4-5', # Model
125
+ dst:str=None, # Destination directory (None=overwrite)
126
+ ):
127
+ "Fix heading hierarchy in markdown document"
128
+ src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
129
+ if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
130
+ pgs_with_pg = read_pgs_pg(src_path)
131
+ lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model)
132
+ for i,p in enumerate(pgs_with_pg, 1):
133
+ (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.0.4
3
+ Version: 0.1.5
4
4
  Summary: Simple batch OCR for PDFs using Mistral's state-of-the-art vision model
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -22,6 +22,7 @@ Requires-Dist: fastcore
22
22
  Requires-Dist: mistralai
23
23
  Requires-Dist: pillow
24
24
  Requires-Dist: dotenv
25
+ Requires-Dist: lisette
25
26
  Provides-Extra: dev
26
27
  Dynamic: author
27
28
  Dynamic: author-email
@@ -7,6 +7,7 @@ setup.py
7
7
  mistocr/__init__.py
8
8
  mistocr/_modidx.py
9
9
  mistocr/core.py
10
+ mistocr/refine.py
10
11
  mistocr.egg-info/PKG-INFO
11
12
  mistocr.egg-info/SOURCES.txt
12
13
  mistocr.egg-info/dependency_links.txt
@@ -2,5 +2,6 @@ fastcore
2
2
  mistralai
3
3
  pillow
4
4
  dotenv
5
+ lisette
5
6
 
6
7
  [dev]
@@ -1,7 +1,7 @@
1
1
  [DEFAULT]
2
2
  repo = mistocr
3
3
  lib_name = mistocr
4
- version = 0.0.4
4
+ version = 0.1.5
5
5
  min_python = 3.9
6
6
  license = apache2
7
7
  black_formatting = False
@@ -27,7 +27,7 @@ keywords = nbdev jupyter notebook python
27
27
  language = English
28
28
  status = 3
29
29
  user = franckalbinet
30
- requirements = fastcore mistralai pillow dotenv
30
+ requirements = fastcore mistralai pillow dotenv lisette
31
31
  readme_nb = index.ipynb
32
32
  allowed_metadata_keys =
33
33
  allowed_cell_metadata_keys =
@@ -1 +0,0 @@
1
- __version__ = "0.0.4"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes