mistocr 0.2.4__tar.gz → 0.2.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.2.4
3
+ Version: 0.2.7
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -76,6 +76,14 @@ fundamental challenges that raw OCR output leaves unsolved:
76
76
  **In short**: Complete PDF OCR with heading hierarchy fixes and image
77
77
  descriptions for RAG and LLM pipelines.
78
78
 
79
+ > [!NOTE]
80
+ >
81
+ > **Want to see mistocr in action?** This
82
+ > [tutorial](https://share.solve.it.com/d/97f75412ca949af76a5945b4dfc443c7)
83
+ > demonstrates real-world PDF processing and shows how clean markdown
84
+ > enables structure-aware navigation through long documents—letting you
85
+ > find exactly what you need, fast.
86
+
79
87
  ## Get Started
80
88
 
81
89
  Install latest from [pypi](https://pypi.org/project/mistocr), then:
@@ -36,6 +36,14 @@ fundamental challenges that raw OCR output leaves unsolved:
36
36
  **In short**: Complete PDF OCR with heading hierarchy fixes and image
37
37
  descriptions for RAG and LLM pipelines.
38
38
 
39
+ > [!NOTE]
40
+ >
41
+ > **Want to see mistocr in action?** This
42
+ > [tutorial](https://share.solve.it.com/d/97f75412ca949af76a5945b4dfc443c7)
43
+ > demonstrates real-world PDF processing and shows how clean markdown
44
+ > enables structure-aware navigation through long documents—letting you
45
+ > find exactly what you need, fast.
46
+
39
47
  ## Get Started
40
48
 
41
49
  Install latest from [pypi](https://pypi.org/project/mistocr), then:
@@ -0,0 +1 @@
1
+ __version__ = "0.2.7"
@@ -21,7 +21,8 @@ d = { 'settings': { 'branch': 'main',
21
21
  'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
22
22
  'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
23
23
  'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
24
- 'mistocr.refine': { 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
24
+ 'mistocr.refine': { 'mistocr.refine.HeadingCorrection': ('refine.html#headingcorrection', 'mistocr/refine.py'),
25
+ 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
25
26
  'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
26
27
  'mistocr.refine.add_descs_to_pg': ('refine.html#add_descs_to_pg', 'mistocr/refine.py'),
27
28
  'mistocr.refine.add_descs_to_pgs': ('refine.html#add_descs_to_pgs', 'mistocr/refine.py'),
@@ -3,7 +3,7 @@
3
3
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
4
4
 
5
5
  # %% auto 0
6
- __all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
6
+ __all__ = ['describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx', 'HeadingCorrection',
7
7
  'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
8
8
  'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs', 'add_descs_to_pg', 'add_descs_to_pgs',
9
9
  'add_img_descs']
@@ -59,40 +59,12 @@ def fmt_hdgs_idx(
59
59
 
60
60
 
61
61
  # %% ../nbs/01_refine.ipynb 18
62
- class HeadingCorrections(BaseModel):
63
- corrections: dict[int, str] # index → corrected heading
64
-
65
- # %% ../nbs/01_refine.ipynb 20
66
- prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
67
-
68
- INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
69
-
70
- RULES - Apply these fixes in order:
71
-
72
- 1. **Single H1 rule**: Documents must have exactly ONE # heading (typically the document title at the top)
73
- - If index 0 is already #, then all subsequent headings (index 1+) must be ## or deeper
74
- - If no H1 exists, the first major heading should be #, and all others ## or deeper
75
- - NO exceptions: appendices, references, and all sections are ## or deeper after the title
76
-
77
- 2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
78
- - Parent section (e.g., "1", "2", "A") should be shallower than child (e.g., "1.1", "2.a", "A.1")
79
- - Child section should be one # deeper than parent
80
- - Works with any numbering: "1/1.1/1.1.1", "A/A.1/A.1.a", "I/I.A/I.A.1", etc.
81
-
82
- 3. **Level jumps**: Headings can only increase by one # at a time when moving deeper
83
- - Wrong: ## Section → ##### Subsection
84
- - Fixed: ## Section → ### Subsection
85
-
86
- 4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
87
-
88
- OUTPUT: Return a Python dictionary mapping index to corrected heading (without the index prefix).
89
- IMPORTANT: Preserve the " ... page N" suffix in all corrected headings.
90
- Only include entries that need changes.
91
-
92
- Headings to analyze:
93
- {headings_list}
94
- """
62
+ class HeadingCorrection(BaseModel):
63
+ index: int
64
+ corrected: str
95
65
 
66
+ class HeadingCorrections(BaseModel):
67
+ corrections: list[HeadingCorrection]
96
68
 
97
69
  # %% ../nbs/01_refine.ipynb 22
98
70
  def fix_hdg_hierarchy(
@@ -106,7 +78,8 @@ def fix_hdg_hierarchy(
106
78
  if prompt is None: prompt = prompt_fix_hdgs
107
79
  prompt = prompt.format(headings_list=fmt_hdgs_idx(hdgs))
108
80
  r = completion(model=model, messages=[{"role": "user", "content": prompt}], response_format=HeadingCorrections, api_key=api_key)
109
- return json.loads(r.choices[0].message.content)['corrections']
81
+ fixes = json.loads(r.choices[0].message.content)['corrections']
82
+ return {o['index']: o['corrected'] for o in fixes}
110
83
 
111
84
 
112
85
  # %% ../nbs/01_refine.ipynb 25
@@ -120,7 +93,7 @@ def mk_fixes_lut(
120
93
  "Make a lookup table of fixes"
121
94
  if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
122
95
  fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
123
- return {hdgs[int(k)]:v for k,v in fixes.items()}
96
+ return {hdgs[k]:v for k,v in fixes.items()}
124
97
 
125
98
  # %% ../nbs/01_refine.ipynb 28
126
99
  def apply_hdg_fixes(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.2.4
3
+ Version: 0.2.7
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -76,6 +76,14 @@ fundamental challenges that raw OCR output leaves unsolved:
76
76
  **In short**: Complete PDF OCR with heading hierarchy fixes and image
77
77
  descriptions for RAG and LLM pipelines.
78
78
 
79
+ > [!NOTE]
80
+ >
81
+ > **Want to see mistocr in action?** This
82
+ > [tutorial](https://share.solve.it.com/d/97f75412ca949af76a5945b4dfc443c7)
83
+ > demonstrates real-world PDF processing and shows how clean markdown
84
+ > enables structure-aware navigation through long documents—letting you
85
+ > find exactly what you need, fast.
86
+
79
87
  ## Get Started
80
88
 
81
89
  Install latest from [pypi](https://pypi.org/project/mistocr), then:
@@ -1,7 +1,7 @@
1
1
  [DEFAULT]
2
2
  repo = mistocr
3
3
  lib_name = mistocr
4
- version = 0.2.4
4
+ version = 0.2.7
5
5
  min_python = 3.9
6
6
  license = apache2
7
7
  black_formatting = False
@@ -1 +0,0 @@
1
- __version__ = "0.2.4"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes