mistocr 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mistocr/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.1.3"
1
+ __version__ = "0.1.5"
mistocr/_modidx.py CHANGED
@@ -21,9 +21,11 @@ d = { 'settings': { 'branch': 'main',
21
21
  'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
22
22
  'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
23
23
  'mistocr.refine': { 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
24
+ 'mistocr.refine.add_pg_hdgs': ('refine.html#add_pg_hdgs', 'mistocr/refine.py'),
24
25
  'mistocr.refine.apply_hdg_fixes': ('refine.html#apply_hdg_fixes', 'mistocr/refine.py'),
25
26
  'mistocr.refine.fix_hdg_hierarchy': ('refine.html#fix_hdg_hierarchy', 'mistocr/refine.py'),
26
27
  'mistocr.refine.fix_md_hdgs': ('refine.html#fix_md_hdgs', 'mistocr/refine.py'),
27
28
  'mistocr.refine.fmt_hdgs_idx': ('refine.html#fmt_hdgs_idx', 'mistocr/refine.py'),
28
29
  'mistocr.refine.get_hdgs': ('refine.html#get_hdgs', 'mistocr/refine.py'),
29
- 'mistocr.refine.mk_fixes_lut': ('refine.html#mk_fixes_lut', 'mistocr/refine.py')}}}
30
+ 'mistocr.refine.mk_fixes_lut': ('refine.html#mk_fixes_lut', 'mistocr/refine.py'),
31
+ 'mistocr.refine.read_pgs_pg': ('refine.html#read_pgs_pg', 'mistocr/refine.py')}}}
mistocr/refine.py CHANGED
@@ -3,8 +3,8 @@
3
3
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
4
4
 
5
5
  # %% auto 0
6
- __all__ = ['prompt_fix_hdgs', 'get_hdgs', 'fmt_hdgs_idx', 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut',
7
- 'apply_hdg_fixes', 'fix_md_hdgs']
6
+ __all__ = ['prompt_fix_hdgs', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx', 'HeadingCorrections', 'fix_hdg_hierarchy',
7
+ 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_md_hdgs']
8
8
 
9
9
  # %% ../nbs/01_refine.ipynb 3
10
10
  from fastcore.all import *
@@ -15,7 +15,7 @@ from lisette.core import completion
15
15
  import os
16
16
  import json
17
17
 
18
- # %% ../nbs/01_refine.ipynb 7
18
+ # %% ../nbs/01_refine.ipynb 8
19
19
  def get_hdgs(
20
20
  md:str # Markdown file string
21
21
  ):
@@ -26,7 +26,20 @@ def get_hdgs(
26
26
 
27
27
 
28
28
 
29
- # %% ../nbs/01_refine.ipynb 10
29
+ # %% ../nbs/01_refine.ipynb 9
30
+ def add_pg_hdgs(md, n):
31
+ "Add page number to all headings in markdown"
32
+ md = sub(r'```[\s\S]*?```', '', md)
33
+ def repl(m): return m.group(0) + f' ... page {n}'
34
+ return sub(r'^#{1,6} .+$', repl, md, flags=MULTILINE)
35
+
36
+ # %% ../nbs/01_refine.ipynb 11
37
+ def read_pgs_pg(path):
38
+ "Read all pages of a markdown file and add page numbers to all headings"
39
+ pgs = read_pgs(path, join=False)
40
+ return L([add_pg_hdgs(pg, n) for n, pg in enumerate(pgs, 1)]).concat()
41
+
42
+ # %% ../nbs/01_refine.ipynb 15
30
43
  def fmt_hdgs_idx(
31
44
  hdgs: list[str] # List of markdown headings
32
45
  ) -> str: # Formatted string with index
@@ -34,19 +47,21 @@ def fmt_hdgs_idx(
34
47
  return '\n'.join(f"{i}. {h}" for i, h in enumerate(hdgs))
35
48
 
36
49
 
37
- # %% ../nbs/01_refine.ipynb 13
50
+ # %% ../nbs/01_refine.ipynb 18
38
51
  class HeadingCorrections(BaseModel):
39
52
  corrections: dict[int, str] # index → corrected heading
40
53
 
41
- # %% ../nbs/01_refine.ipynb 15
54
+ # %% ../nbs/01_refine.ipynb 20
42
55
  prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
43
56
 
44
- INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title")
57
+ INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
45
58
 
46
59
  RULES - Apply these fixes in order:
47
60
 
48
- 1. **Single H1 rule**: Documents must have exactly ONE # heading (the title/main heading)
49
- - All other headings should be ## or deeper
61
+ 1. **Single H1 rule**: Documents must have exactly ONE # heading (typically the document title at the top)
62
+ - If index 0 is already #, then all subsequent headings (index 1+) must be ## or deeper
63
+ - If no H1 exists, the first major heading should be #, and all others ## or deeper
64
+ - NO exceptions: appendices, references, and all sections are ## or deeper after the title
50
65
 
51
66
  2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
52
67
  - Parent section (e.g., "1", "2", "A") should be shallower than child (e.g., "1.1", "2.a", "A.1")
@@ -60,13 +75,15 @@ RULES - Apply these fixes in order:
60
75
  4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
61
76
 
62
77
  OUTPUT: Return a Python dictionary mapping index to corrected heading (without the index prefix).
78
+ IMPORTANT: Preserve the " ... page N" suffix in all corrected headings.
63
79
  Only include entries that need changes.
64
80
 
65
81
  Headings to analyze:
66
82
  {headings_list}
67
83
  """
68
84
 
69
- # %% ../nbs/01_refine.ipynb 16
85
+
86
+ # %% ../nbs/01_refine.ipynb 21
70
87
  def fix_hdg_hierarchy(
71
88
  hdgs: list[str], # List of markdown headings
72
89
  prompt: str=prompt_fix_hdgs, # Prompt to use
@@ -82,7 +99,7 @@ def fix_hdg_hierarchy(
82
99
  )
83
100
  return json.loads(r.choices[0].message.content)['corrections']
84
101
 
85
- # %% ../nbs/01_refine.ipynb 19
102
+ # %% ../nbs/01_refine.ipynb 24
86
103
  def mk_fixes_lut(
87
104
  hdgs: list[str], # List of markdown headings
88
105
  model: str='claude-sonnet-4-5', # Model to use
@@ -92,26 +109,25 @@ def mk_fixes_lut(
92
109
  fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key)
93
110
  return {hdgs[int(k)]:v for k,v in fixes.items()}
94
111
 
95
- # %% ../nbs/01_refine.ipynb 22
112
+ # %% ../nbs/01_refine.ipynb 27
96
113
  def apply_hdg_fixes(
97
114
  p:str, # Page to fix
98
115
  lut_fixes: dict[str, str], # Lookup table of fixes
99
- pg: int=None, # Optionnaly specify the page number to append to original heading
100
116
  ) -> str: # Page with fixes applied
101
117
  "Apply the fixes to the page"
102
- for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old) + (f' .... page {pg}' if pg else ''))
118
+ for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
103
119
  return p
104
120
 
105
- # %% ../nbs/01_refine.ipynb 25
121
+ # %% ../nbs/01_refine.ipynb 30
106
122
  def fix_md_hdgs(
107
123
  src:str, # Source directory with markdown pages
108
124
  model:str='claude-sonnet-4-5', # Model
109
125
  dst:str=None, # Destination directory (None=overwrite)
110
- pg_nums:bool=True # Add page numbers
111
126
  ):
112
127
  "Fix heading hierarchy in markdown document"
113
128
  src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
114
129
  if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
115
- lut = mk_fixes_lut(get_hdgs(read_pgs(src_path)), model)
116
- for i,p in enumerate(read_pgs(src_path, join=False), 1):
117
- (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut, pg=i if pg_nums else None))
130
+ pgs_with_pg = read_pgs_pg(src_path)
131
+ lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model)
132
+ for i,p in enumerate(pgs_with_pg, 1):
133
+ (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.1.3
3
+ Version: 0.1.5
4
4
  Summary: Simple batch OCR for PDFs using Mistral's state-of-the-art vision model
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -0,0 +1,10 @@
1
+ mistocr/__init__.py,sha256=rPSfWgIeq2YWVPyESOAwCBt8vftsTpIkuLAGDEzyRQc,22
2
+ mistocr/_modidx.py,sha256=sZ3ISGF-2f7VEOD9MVgqMVs5SifUNe-1YP0wy8Ey0cU,2884
3
+ mistocr/core.py,sha256=wtaYZ_Fz0dXIb1cYLAtymwR9Z7_KBI4ULy-UpM3YTX4,7154
4
+ mistocr/refine.py,sha256=314r4MBZRIvUmu6B_dvvq9P4d4a_japKBpsg4wnU9oU,5253
5
+ mistocr-0.1.5.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
6
+ mistocr-0.1.5.dist-info/METADATA,sha256=kcrDK0kJadP5Sze0tVzRo-pLWWTJQiEnvwlLpWJZz2o,4848
7
+ mistocr-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
+ mistocr-0.1.5.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
9
+ mistocr-0.1.5.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
10
+ mistocr-0.1.5.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- mistocr/__init__.py,sha256=XEqb2aiIn8fzGE68Mph4ck1FtQqsR_am0wRWvrYPffQ,22
2
- mistocr/_modidx.py,sha256=R9zVMv4dKz2sLStoB5wBoKRqjza216_z8xPXszoplU4,2660
3
- mistocr/core.py,sha256=wtaYZ_Fz0dXIb1cYLAtymwR9Z7_KBI4ULy-UpM3YTX4,7154
4
- mistocr/refine.py,sha256=572SDG8vhGjNMiET5eZhgVemNpUIHNFqi0ZSSl4eKCM,4545
5
- mistocr-0.1.3.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
6
- mistocr-0.1.3.dist-info/METADATA,sha256=jHRc6nm_uk7V-03y6Bd268hUWmkkOFNdt4s5cH3YPu0,4848
7
- mistocr-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
- mistocr-0.1.3.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
9
- mistocr-0.1.3.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
10
- mistocr-0.1.3.dist-info/RECORD,,