mistocr 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mistocr/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.2.6"
1
+ __version__ = "0.2.8"
mistocr/refine.py CHANGED
@@ -3,10 +3,10 @@
3
3
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
4
4
 
5
5
  # %% auto 0
6
- __all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
7
- 'HeadingCorrection', 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes',
8
- 'fix_hdgs', 'ImgDescription', 'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs',
9
- 'add_descs_to_pg', 'add_descs_to_pgs', 'add_img_descs']
6
+ __all__ = ['describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx', 'HeadingCorrection',
7
+ 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
8
+ 'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs', 'add_descs_to_pg', 'add_descs_to_pgs',
9
+ 'add_img_descs']
10
10
 
11
11
  # %% ../nbs/01_refine.ipynb 3
12
12
  from fastcore.all import *
@@ -66,39 +66,6 @@ class HeadingCorrection(BaseModel):
66
66
  class HeadingCorrections(BaseModel):
67
67
  corrections: list[HeadingCorrection]
68
68
 
69
- # %% ../nbs/01_refine.ipynb 20
70
- prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
71
-
72
- INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
73
-
74
- RULES - Apply these fixes in order:
75
-
76
- 1. **Single H1 rule**: Documents must have exactly ONE # heading (typically the document title at the top)
77
- - If index 0 is already #, then all subsequent headings (index 1+) must be ## or deeper
78
- - If no H1 exists, the first major heading should be #, and all others ## or deeper
79
- - NO exceptions: appendices, references, and all sections are ## or deeper after the title
80
-
81
- 2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
82
- - Parent section (e.g., "1", "2", "A") should be shallower than child (e.g., "1.1", "2.a", "A.1")
83
- - Child section should be one # deeper than parent
84
- - Works with any numbering: "1/1.1/1.1.1", "A/A.1/A.1.a", "I/I.A/I.A.1", etc.
85
-
86
- 3. **Level jumps**: Headings can only increase by one # at a time when moving deeper
87
- - Wrong: ## Section → ##### Subsection
88
- - Fixed: ## Section → ### Subsection
89
-
90
- 4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
91
-
92
- OUTPUT: Return a list of corrections, where each correction has:
93
- - index: the heading's index number
94
- - corrected: the fixed heading text (without the index prefix)
95
- IMPORTANT: Preserve the " ... page N" suffix in all corrected headings.
96
- Only include headings that need changes.
97
-
98
- Headings to analyze:
99
- {headings_list}
100
- """
101
-
102
69
  # %% ../nbs/01_refine.ipynb 22
103
70
  def fix_hdg_hierarchy(
104
71
  hdgs: list[str], # List of markdown headings
@@ -253,6 +220,12 @@ async def add_img_descs(
253
220
  src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
254
221
  if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
255
222
  src_imgs = src_path/img_folder
223
+
224
+ # Check if image folder exists
225
+ if not src_imgs.exists():
226
+ if progress: print(f"No images to describe in the document (no '{img_folder}' folder found)")
227
+ return
228
+
256
229
  if src_imgs.exists() and dst_path != src_path: shutil.copytree(src_imgs, dst_path/img_folder, dirs_exist_ok=True)
257
230
  desc_file = src_path/'img_descriptions.json'
258
231
  if desc_file.exists() and not force:
@@ -269,3 +242,4 @@ async def add_img_descs(
269
242
  enriched = [add_descs_to_pg(pg, descs) for pg in pgs]
270
243
  for i,pg in enumerate(enriched, 1): (dst_path/f'page_{i}.md').write_text(pg)
271
244
  if progress: print(f"Done! Enriched pages saved to {dst_path}")
245
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.2.6
3
+ Version: 0.2.8
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -0,0 +1,11 @@
1
+ mistocr/__init__.py,sha256=G6Dbxq2ws-1ZAXwDD8q0KWueYtso_Y6Uyvtj8sRWsPI,22
2
+ mistocr/_modidx.py,sha256=55k_EmMPrRmLa04qCE-_8lukJPCax66q5MALfpa8T4A,4154
3
+ mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
4
+ mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
5
+ mistocr/refine.py,sha256=yLnpqCEHRYfo1QQoAsgqk4INUjz_lYU8-tfEDgTC738,10544
6
+ mistocr-0.2.8.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
+ mistocr-0.2.8.dist-info/METADATA,sha256=qATVQfErJIL_AbUSis3DhuM59fECxCWvgEoM15cRXf8,8416
8
+ mistocr-0.2.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
+ mistocr-0.2.8.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
+ mistocr-0.2.8.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
+ mistocr-0.2.8.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- mistocr/__init__.py,sha256=Oz5HbwHMyE87nmwV80AZzpkJPf-wBg7eDuJr_BXZkhU,22
2
- mistocr/_modidx.py,sha256=55k_EmMPrRmLa04qCE-_8lukJPCax66q5MALfpa8T4A,4154
3
- mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
4
- mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
5
- mistocr/refine.py,sha256=EXlCKiC16dnQfPKHUguDwypnhSQ3vK2TKdkPfkSWras,11976
6
- mistocr-0.2.6.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
- mistocr-0.2.6.dist-info/METADATA,sha256=KHkqvB4eYBpPKVsj9nUg0dwmjMrHfWEcAyOCUFAHgTk,8416
8
- mistocr-0.2.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
- mistocr-0.2.6.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
- mistocr-0.2.6.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
- mistocr-0.2.6.dist-info/RECORD,,