mistocr 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
mistocr/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.1.0"
1
+ __version__ = "0.1.2"
mistocr/refine.py CHANGED
@@ -43,21 +43,24 @@ prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the
43
43
 
44
44
  INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title")
45
45
 
46
- RULES - Only fix these errors:
47
- 1. **Level jumps**: Headings can only increase by one # at a time
48
- - Wrong: 0. # Title → 1. #### Abstract
49
- - Fixed: 0. # Title → 1. ## Abstract
46
+ RULES - Apply these fixes in order:
50
47
 
51
- 2. **Numbering inconsistency**: Subsection numbers must be one level deeper
52
- - Wrong: 4. ## 3. Section → 5. ## 3.1 Subsection
53
- - Fixed: 4. ## 3. Section → 5. ### 3.1 Subsection
48
+ 1. **Single H1 rule**: Documents must have exactly ONE # heading (the title/main heading)
49
+ - All other headings should be ## or deeper
54
50
 
55
- 3. **Preserve working structure**: If sections are consistently marked, keep it
51
+ 2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
52
+ - Parent section (e.g., "1", "2", "A") should be shallower than child (e.g., "1.1", "2.a", "A.1")
53
+ - Child section should be one # deeper than parent
54
+ - Works with any numbering: "1/1.1/1.1.1", "A/A.1/A.1.a", "I/I.A/I.A.1", etc.
56
55
 
57
- 4. **Decreasing levels is OK**: Going from ### to ## is valid for new sections
56
+ 3. **Level jumps**: Headings can only increase by one # at a time when moving deeper
57
+ - Wrong: ## Section → ##### Subsection
58
+ - Fixed: ## Section → ### Subsection
59
+
60
+ 4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
58
61
 
59
62
  OUTPUT: Return a Python dictionary mapping index to corrected heading (without the index prefix).
60
- Only include entries that need changes. Example: {{1: '## Abstract', 15: '### PASCAL VOC'}}
63
+ Only include entries that need changes.
61
64
 
62
65
  Headings to analyze:
63
66
  {headings_list}
@@ -66,6 +69,7 @@ Headings to analyze:
66
69
  # %% ../nbs/01_refine.ipynb 16
67
70
  def fix_hdg_hierarchy(
68
71
  hdgs: list[str], # List of markdown headings
72
+ prompt: str=prompt_fix_hdgs, # Prompt to use
69
73
  model: str='claude-sonnet-4-5', # Model to use
70
74
  api_key: str=os.getenv('ANTHROPIC_API_KEY') # API key
71
75
  ) -> dict[int, str]: # Dictionary of index → corrected heading
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.1.0
3
+ Version: 0.1.2
4
4
  Summary: Simple batch OCR for PDFs using Mistral's state-of-the-art vision model
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -0,0 +1,10 @@
1
+ mistocr/__init__.py,sha256=YvuYzWnKtqBb-IqG8HAu-nhIYAsgj9Vmc_b9o7vO-js,22
2
+ mistocr/_modidx.py,sha256=R9zVMv4dKz2sLStoB5wBoKRqjza216_z8xPXszoplU4,2660
3
+ mistocr/core.py,sha256=wtaYZ_Fz0dXIb1cYLAtymwR9Z7_KBI4ULy-UpM3YTX4,7154
4
+ mistocr/refine.py,sha256=kZH-z55zNcIgmHSpHETkHjGCoZENmMvL9QJOpNic2GI,4531
5
+ mistocr-0.1.2.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
6
+ mistocr-0.1.2.dist-info/METADATA,sha256=AOoj8PRpmJDlfRgEFzfR7UQYa7ZI5cVTsXlopHzk6JM,4848
7
+ mistocr-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
+ mistocr-0.1.2.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
9
+ mistocr-0.1.2.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
10
+ mistocr-0.1.2.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- mistocr/__init__.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
2
- mistocr/_modidx.py,sha256=R9zVMv4dKz2sLStoB5wBoKRqjza216_z8xPXszoplU4,2660
3
- mistocr/core.py,sha256=wtaYZ_Fz0dXIb1cYLAtymwR9Z7_KBI4ULy-UpM3YTX4,7154
4
- mistocr/refine.py,sha256=gWup79LGjmvKW5RyY1dRKUeAEt94mUJIeTZB3V4D-JE,4258
5
- mistocr-0.1.0.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
6
- mistocr-0.1.0.dist-info/METADATA,sha256=JOyUQONpYUmmGk2kFzMkxaIBrHwjC9CfmI7fc9qa6ms,4848
7
- mistocr-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
- mistocr-0.1.0.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
9
- mistocr-0.1.0.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
10
- mistocr-0.1.0.dist-info/RECORD,,