mistocr 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mistocr/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.2.7"
1
+ __version__ = "0.2.9"
mistocr/refine.py CHANGED
@@ -3,10 +3,10 @@
3
3
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
4
4
 
5
5
  # %% auto 0
6
- __all__ = ['describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx', 'HeadingCorrection',
7
- 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
8
- 'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs', 'add_descs_to_pg', 'add_descs_to_pgs',
9
- 'add_img_descs']
6
+ __all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
7
+ 'HeadingCorrection', 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes',
8
+ 'fix_hdgs', 'ImgDescription', 'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs',
9
+ 'add_descs_to_pg', 'add_descs_to_pgs', 'add_img_descs']
10
10
 
11
11
  # %% ../nbs/01_refine.ipynb 3
12
12
  from fastcore.all import *
@@ -66,6 +66,46 @@ class HeadingCorrection(BaseModel):
66
66
  class HeadingCorrections(BaseModel):
67
67
  corrections: list[HeadingCorrection]
68
68
 
69
+ # %% ../nbs/01_refine.ipynb 20
70
+ prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
71
+
72
+ INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
73
+
74
+ ANALYSIS STEPS (think through these before outputting corrections):
75
+ 1. For each numbered heading (e.g., "4.1", "2.a", "A.1"), identify its parent heading (e.g., "4", "2", "A")
76
+ 2. Verify the child heading is exactly one # deeper than its parent
77
+ 3. If not, mark it for correction
78
+
79
+ RULES - Apply these fixes in order:
80
+
81
+ 1. **Single H1 rule**: Documents must have exactly ONE # heading (typically the document title at the top)
82
+ - If index 0 is already #, then all subsequent headings (index 1+) must be ## or deeper
83
+ - If no H1 exists, the first major heading should be #, and all others ## or deeper
84
+ - NO exceptions: appendices, references, and all sections are ## or deeper after the title
85
+
86
+ 2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
87
+ - Parent section (e.g., "1", "2", "A") MUST be shallower than child (e.g., "1.1", "2.a", "A.1")
88
+ - Child section MUST be exactly one # deeper than parent
89
+ - Works with any numbering: "1/1.1/1.1.1", "A/A.1/A.1.a", "I/I.A/I.A.1", etc.
90
+
91
+ 3. **Level jumps**: Headings can only increase by one # at a time when moving deeper
92
+ - Wrong: ## Section → ##### Subsection
93
+ - Fixed: ## Section → ### Subsection
94
+
95
+ 4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
96
+
97
+ 5. **Unnumbered headings in numbered documents**: If the document uses numbered headings consistently, any unnumbered heading appearing within that structure is likely misclassified bold text and should be converted to regular text (output the heading text without any # symbols in the corrected field)
98
+
99
+ OUTPUT: Return a list of corrections, where each correction has:
100
+ - index: the heading's index number
101
+ - corrected: the fixed heading text (without the index prefix), or empty string "" to remove the heading entirely
102
+ IMPORTANT: Preserve the " ... page N" suffix in all corrected headings.
103
+ Only include headings that need changes.
104
+
105
+ Headings to analyze:
106
+ {headings_list}
107
+ """
108
+
69
109
  # %% ../nbs/01_refine.ipynb 22
70
110
  def fix_hdg_hierarchy(
71
111
  hdgs: list[str], # List of markdown headings
@@ -220,6 +260,12 @@ async def add_img_descs(
220
260
  src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
221
261
  if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
222
262
  src_imgs = src_path/img_folder
263
+
264
+ # Check if image folder exists
265
+ if not src_imgs.exists():
266
+ if progress: print(f"No images to describe in the document (no '{img_folder}' folder found)")
267
+ return
268
+
223
269
  if src_imgs.exists() and dst_path != src_path: shutil.copytree(src_imgs, dst_path/img_folder, dirs_exist_ok=True)
224
270
  desc_file = src_path/'img_descriptions.json'
225
271
  if desc_file.exists() and not force:
@@ -236,3 +282,4 @@ async def add_img_descs(
236
282
  enriched = [add_descs_to_pg(pg, descs) for pg in pgs]
237
283
  for i,pg in enumerate(enriched, 1): (dst_path/f'page_{i}.md').write_text(pg)
238
284
  if progress: print(f"Done! Enriched pages saved to {dst_path}")
285
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.2.7
3
+ Version: 0.2.9
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -0,0 +1,11 @@
1
+ mistocr/__init__.py,sha256=F8OVhAhMXSkvvXYgZtbPn2SG1AQC3joK4yu-FrHt81Y,22
2
+ mistocr/_modidx.py,sha256=55k_EmMPrRmLa04qCE-_8lukJPCax66q5MALfpa8T4A,4154
3
+ mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
4
+ mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
5
+ mistocr/refine.py,sha256=wtfS_bHlD39R8T2RbITgNX8cDCIPXI9gRrJ4y9nI_rM,12807
6
+ mistocr-0.2.9.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
+ mistocr-0.2.9.dist-info/METADATA,sha256=n9pFeWhh-Vzd7KR7s6s3R0mdJ3xvpexujEH-0iQsRQY,8416
8
+ mistocr-0.2.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
+ mistocr-0.2.9.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
+ mistocr-0.2.9.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
+ mistocr-0.2.9.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- mistocr/__init__.py,sha256=XHypfHSPdgXFKmOdoewn7czU670gt8InhHhzlP5j_aA,22
2
- mistocr/_modidx.py,sha256=55k_EmMPrRmLa04qCE-_8lukJPCax66q5MALfpa8T4A,4154
3
- mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
4
- mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
5
- mistocr/refine.py,sha256=zSCF0gOtEKhhQTQgVq4Jh5Ujk8l8CGSO_rURhsQ09P8,10351
6
- mistocr-0.2.7.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
- mistocr-0.2.7.dist-info/METADATA,sha256=eyQ65s8HsoHUUINrGiijrC8e0RzO_Wvte3rk2OLU8QY,8416
8
- mistocr-0.2.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
- mistocr-0.2.7.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
- mistocr-0.2.7.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
- mistocr-0.2.7.dist-info/RECORD,,