mistocr 0.1.1__tar.gz → 0.1.3__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.1.1
3
+ Version: 0.1.3
4
4
  Summary: Simple batch OCR for PDFs using Mistral's state-of-the-art vision model
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -0,0 +1 @@
1
+ __version__ = "0.1.3"
@@ -43,27 +43,30 @@ prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the
43
43
 
44
44
  INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title")
45
45
 
46
- RULES - Only fix these errors:
47
- 1. **Level jumps**: Headings can only increase by one # at a time
48
- - Wrong: 0. # Title → 1. #### Abstract
49
- - Fixed: 0. # Title → 1. ## Abstract
46
+ RULES - Apply these fixes in order:
50
47
 
51
- 2. **Numbering inconsistency**: Subsection numbers must be one level deeper
52
- - Wrong: 4. ## 3. Section → 5. ## 3.1 Subsection
53
- - Fixed: 4. ## 3. Section → 5. ### 3.1 Subsection
48
+ 1. **Single H1 rule**: Documents must have exactly ONE # heading (the title/main heading)
49
+ - All other headings should be ## or deeper
54
50
 
55
- 3. **Preserve working structure**: If sections are consistently marked, keep it
51
+ 2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
52
+ - Parent section (e.g., "1", "2", "A") should be shallower than child (e.g., "1.1", "2.a", "A.1")
53
+ - Child section should be one # deeper than parent
54
+ - Works with any numbering: "1/1.1/1.1.1", "A/A.1/A.1.a", "I/I.A/I.A.1", etc.
56
55
 
57
- 4. **Decreasing levels is OK**: Going from ### to ## is valid for new sections
56
+ 3. **Level jumps**: Headings can only increase by one # at a time when moving deeper
57
+ - Wrong: ## Section → ##### Subsection
58
+ - Fixed: ## Section → ### Subsection
59
+
60
+ 4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
58
61
 
59
62
  OUTPUT: Return a Python dictionary mapping index to corrected heading (without the index prefix).
60
- Only include entries that need changes. Example: {{1: '## Abstract', 15: '### PASCAL VOC'}}
63
+ Only include entries that need changes.
61
64
 
62
65
  Headings to analyze:
63
66
  {headings_list}
64
67
  """
65
68
 
66
- # %% ../nbs/01_refine.ipynb 18
69
+ # %% ../nbs/01_refine.ipynb 16
67
70
  def fix_hdg_hierarchy(
68
71
  hdgs: list[str], # List of markdown headings
69
72
  prompt: str=prompt_fix_hdgs, # Prompt to use
@@ -79,17 +82,17 @@ def fix_hdg_hierarchy(
79
82
  )
80
83
  return json.loads(r.choices[0].message.content)['corrections']
81
84
 
82
- # %% ../nbs/01_refine.ipynb 21
85
+ # %% ../nbs/01_refine.ipynb 19
83
86
  def mk_fixes_lut(
84
87
  hdgs: list[str], # List of markdown headings
85
88
  model: str='claude-sonnet-4-5', # Model to use
86
89
  api_key: str=os.getenv('ANTHROPIC_API_KEY') # API key
87
90
  ) -> dict[str, str]: # Dictionary of old → new heading
88
91
  "Make a lookup table of fixes"
89
- fixes = fix_hdg_hierarchy(hdgs, model, api_key)
92
+ fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key)
90
93
  return {hdgs[int(k)]:v for k,v in fixes.items()}
91
94
 
92
- # %% ../nbs/01_refine.ipynb 24
95
+ # %% ../nbs/01_refine.ipynb 22
93
96
  def apply_hdg_fixes(
94
97
  p:str, # Page to fix
95
98
  lut_fixes: dict[str, str], # Lookup table of fixes
@@ -99,7 +102,7 @@ def apply_hdg_fixes(
99
102
  for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old) + (f' .... page {pg}' if pg else ''))
100
103
  return p
101
104
 
102
- # %% ../nbs/01_refine.ipynb 27
105
+ # %% ../nbs/01_refine.ipynb 25
103
106
  def fix_md_hdgs(
104
107
  src:str, # Source directory with markdown pages
105
108
  model:str='claude-sonnet-4-5', # Model
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.1.1
3
+ Version: 0.1.3
4
4
  Summary: Simple batch OCR for PDFs using Mistral's state-of-the-art vision model
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -1,7 +1,7 @@
1
1
  [DEFAULT]
2
2
  repo = mistocr
3
3
  lib_name = mistocr
4
- version = 0.1.1
4
+ version = 0.1.3
5
5
  min_python = 3.9
6
6
  license = apache2
7
7
  black_formatting = False
@@ -1 +0,0 @@
1
- __version__ = "0.1.1"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes