mistocr 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mistocr/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.1.4"
1
+ __version__ = "0.1.6"
mistocr/refine.py CHANGED
@@ -12,13 +12,14 @@ from .core import read_pgs
12
12
  from re import sub, findall, MULTILINE
13
13
  from pydantic import BaseModel
14
14
  from lisette.core import completion
15
+ from typing import Callable
15
16
  import os
16
17
  import json
17
18
 
18
- # %% ../nbs/01_refine.ipynb 8
19
+ # %% ../nbs/01_refine.ipynb 7
19
20
  def get_hdgs(
20
21
  md:str # Markdown file string
21
- ):
22
+ ) -> L: # L of strings
22
23
  "Return the markdown headings"
23
24
  # Sanitize removing '#' in python snippet if any
24
25
  md = sub(r'```[\s\S]*?```', '', md)
@@ -26,15 +27,20 @@ def get_hdgs(
26
27
 
27
28
 
28
29
 
29
- # %% ../nbs/01_refine.ipynb 9
30
- def add_pg_hdgs(md, n):
31
- "Add page number to all headings in markdown"
30
+ # %% ../nbs/01_refine.ipynb 8
31
+ def add_pg_hdgs(
32
+ md:str, # Markdown file string,
33
+ n:int # Page number
34
+ ) -> str: # Markdown file string
35
+ "Add page number to all headings in page markdown"
32
36
  md = sub(r'```[\s\S]*?```', '', md)
33
37
  def repl(m): return m.group(0) + f' ... page {n}'
34
38
  return sub(r'^#{1,6} .+$', repl, md, flags=MULTILINE)
35
39
 
36
- # %% ../nbs/01_refine.ipynb 11
37
- def read_pgs_pg(path):
40
+ # %% ../nbs/01_refine.ipynb 12
41
+ def read_pgs_pg(
42
+ path:str # Path to the markdown file
43
+ ) -> L: # List of markdown pages
38
44
  "Read all pages of a markdown file and add page numbers to all headings"
39
45
  pgs = read_pgs(path, join=False)
40
46
  return L([add_pg_hdgs(pg, n) for n, pg in enumerate(pgs, 1)]).concat()
@@ -83,51 +89,55 @@ Headings to analyze:
83
89
  """
84
90
 
85
91
 
86
- # %% ../nbs/01_refine.ipynb 21
92
+ # %% ../nbs/01_refine.ipynb 22
87
93
  def fix_hdg_hierarchy(
88
94
  hdgs: list[str], # List of markdown headings
89
- prompt: str=prompt_fix_hdgs, # Prompt to use
95
+ prompt: str=None, # Prompt to use
90
96
  model: str='claude-sonnet-4-5', # Model to use
91
- api_key: str=os.getenv('ANTHROPIC_API_KEY') # API key
97
+ api_key: str=None # API key
92
98
  ) -> dict[int, str]: # Dictionary of index → corrected heading
93
99
  "Fix the heading hierarchy"
94
- r = completion(
95
- model=model,
96
- messages=[{"role": "user", "content": prompt_fix_hdgs.format(headings_list=fmt_hdgs_idx(hdgs))}],
97
- response_format=HeadingCorrections,
98
- api_key=api_key
99
- )
100
+ if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
101
+ if prompt is None: prompt = prompt_fix_hdgs
102
+ prompt = prompt.format(headings_list=fmt_hdgs_idx(hdgs))
103
+ r = completion(model=model, messages=[{"role": "user", "content": prompt}], response_format=HeadingCorrections, api_key=api_key)
100
104
  return json.loads(r.choices[0].message.content)['corrections']
101
105
 
102
- # %% ../nbs/01_refine.ipynb 24
106
+
107
+ # %% ../nbs/01_refine.ipynb 25
108
+ @delegates(fix_hdg_hierarchy)
103
109
  def mk_fixes_lut(
104
110
  hdgs: list[str], # List of markdown headings
105
111
  model: str='claude-sonnet-4-5', # Model to use
106
- api_key: str=os.getenv('ANTHROPIC_API_KEY') # API key
112
+ api_key: str=None, # API key
113
+ **kwargs
107
114
  ) -> dict[str, str]: # Dictionary of old → new heading
108
115
  "Make a lookup table of fixes"
109
- fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key)
116
+ if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
117
+ fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
110
118
  return {hdgs[int(k)]:v for k,v in fixes.items()}
111
119
 
112
- # %% ../nbs/01_refine.ipynb 27
120
+ # %% ../nbs/01_refine.ipynb 28
113
121
  def apply_hdg_fixes(
114
122
  p:str, # Page to fix
115
123
  lut_fixes: dict[str, str], # Lookup table of fixes
116
124
  ) -> str: # Page with fixes applied
117
125
  "Apply the fixes to the page"
118
- #for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old) + (f' .... page {pg}' if pg else ''))
119
126
  for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
120
127
  return p
121
128
 
122
- # %% ../nbs/01_refine.ipynb 30
129
+ # %% ../nbs/01_refine.ipynb 31
130
+ @delegates(mk_fixes_lut)
123
131
  def fix_md_hdgs(
124
132
  src:str, # Source directory with markdown pages
125
133
  model:str='claude-sonnet-4-5', # Model
126
134
  dst:str=None, # Destination directory (None=overwrite)
135
+ **kwargs
127
136
  ):
128
137
  "Fix heading hierarchy in markdown document"
129
138
  src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
130
139
  if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
131
- lut = mk_fixes_lut(get_hdgs(read_pgs(src_path)), model)
132
- for i,p in enumerate(read_pgs_pg(src_path), 1):
140
+ pgs_with_pg = read_pgs_pg(src_path)
141
+ lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model, **kwargs)
142
+ for i,p in enumerate(pgs_with_pg, 1):
133
143
  (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.1.4
3
+ Version: 0.1.6
4
4
  Summary: Simple batch OCR for PDFs using Mistral's state-of-the-art vision model
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -0,0 +1,10 @@
1
+ mistocr/__init__.py,sha256=n3oM6B_EMz93NsTI18NNZd-jKFcUPzUkbIKj5VFK5ok,22
2
+ mistocr/_modidx.py,sha256=sZ3ISGF-2f7VEOD9MVgqMVs5SifUNe-1YP0wy8Ey0cU,2884
3
+ mistocr/core.py,sha256=wtaYZ_Fz0dXIb1cYLAtymwR9Z7_KBI4ULy-UpM3YTX4,7154
4
+ mistocr/refine.py,sha256=BjswapfwJvyfLYyiNw-2048nAWZae6ujeO7ELvXUrMM,5653
5
+ mistocr-0.1.6.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
6
+ mistocr-0.1.6.dist-info/METADATA,sha256=c2aZrFY597Zd1iRSm5istCZWKkK1QKFq5cktr-QLtzs,4848
7
+ mistocr-0.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
+ mistocr-0.1.6.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
9
+ mistocr-0.1.6.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
10
+ mistocr-0.1.6.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- mistocr/__init__.py,sha256=Wzf5T3NBDfhQoTnhnRNHSlAsE0XMqbclXG-M81Vas70,22
2
- mistocr/_modidx.py,sha256=sZ3ISGF-2f7VEOD9MVgqMVs5SifUNe-1YP0wy8Ey0cU,2884
3
- mistocr/core.py,sha256=wtaYZ_Fz0dXIb1cYLAtymwR9Z7_KBI4ULy-UpM3YTX4,7154
4
- mistocr/refine.py,sha256=SMtZ2dvEE5iH8hL86Au8kXRB-M_uewaexB7RhMqpWnw,5313
5
- mistocr-0.1.4.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
6
- mistocr-0.1.4.dist-info/METADATA,sha256=k_4gVsWOX_w2pZGHv6G4FipRdWHCm76V2dw1TijpfjM,4848
7
- mistocr-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
- mistocr-0.1.4.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
9
- mistocr-0.1.4.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
10
- mistocr-0.1.4.dist-info/RECORD,,