mistocr 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mistocr/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.3.0"
1
+ __version__ = "0.3.2"
mistocr/pipeline.py CHANGED
@@ -11,27 +11,31 @@ from .core import read_pgs, ocr_pdf
11
11
  from .refine import add_img_descs, fix_hdgs
12
12
  from pathlib import Path
13
13
  from asyncio import Semaphore, gather, sleep
14
+ import tempfile
14
15
  import os, json, shutil
15
16
 
16
17
  # %% ../nbs/02_pipeline.ipynb 4
17
18
  @delegates(add_img_descs)
18
19
  async def pdf_to_md(
19
- pdf_path:str, # Path to input PDF file
20
- dst:str, # Destination directory for output markdown
21
- ocr_output:str=None, # Optional OCR output directory (defaults to pdf_path stem)
22
- model:str='claude-sonnet-4-5', # Model to use for heading fixes and image descriptions
23
- add_img_desc:bool=True, # Whether to add image descriptions
24
- progress:bool=True, # Whether to show progress messages
25
- **kwargs):
20
+ pdf_path:str, # Path to input PDF file
21
+ dst:str, # Destination directory for output markdown
22
+ ocr_dst:str=None, # Optional OCR output directory
23
+ model:str='claude-sonnet-4-5', # Model to use for heading fixes and image descriptions
24
+ add_img_desc:bool=True, # Whether to add image descriptions
25
+ progress:bool=True, # Whether to show progress messages
26
+ **kwargs
27
+ ):
26
28
  "Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
29
+ cleanup = ocr_dst is None
30
+ if cleanup: ocr_dst = tempfile.mkdtemp()
27
31
  n_steps = 3 if add_img_desc else 2
28
32
  if progress: print(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
29
- ocr_dirs = ocr_pdf(pdf_path, ocr_output or 'ocr_temp')
30
- ocr_dir = ocr_dirs[0]
33
+ ocr_dir = ocr_pdf(pdf_path, ocr_dst)[0]
31
34
  if progress: print(f"Step 2/{n_steps}: Fixing heading hierarchy...")
32
35
  fix_hdgs(ocr_dir, model=model)
33
36
  if add_img_desc:
34
37
  if progress: print(f"Step 3/{n_steps}: Adding image descriptions...")
35
38
  await add_img_descs(ocr_dir, dst=dst, model=model, progress=progress, **kwargs)
36
- elif dst and Path(dst) != ocr_dir: shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
39
+ elif dst != str(ocr_dir): shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
40
+ if cleanup: shutil.rmtree(ocr_dst)
37
41
  if progress: print("Done!")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -0,0 +1,11 @@
1
+ mistocr/__init__.py,sha256=vNiWJ14r_cw5t_7UDqDQIVZvladKFGyHH2avsLpN7Vg,22
2
+ mistocr/_modidx.py,sha256=2rHVTcz3A3BfDCmGqUNsJFW3_n3Ch1vxSorrPVyLvMI,4256
3
+ mistocr/core.py,sha256=EQYQgpnX2skgSX123u3dYaJHc1oDk5Nhgt5uBdXnCKs,8386
4
+ mistocr/pipeline.py,sha256=hVXpxRYtshaiUm9qXgfSLlyHCAxHZ6nAfPzoGXGmJMQ,1769
5
+ mistocr/refine.py,sha256=hQg4ZYRTkPFEiCNNBU2ykgXFfVLCNAp2IuwOwHBVQ2k,12983
6
+ mistocr-0.3.2.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
+ mistocr-0.3.2.dist-info/METADATA,sha256=igTgaDeBu00u_xJYtIcGlQswQCj2gIrdBi6NLiN5NNU,8438
8
+ mistocr-0.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
+ mistocr-0.3.2.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
+ mistocr-0.3.2.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
+ mistocr-0.3.2.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- mistocr/__init__.py,sha256=VrXpHDu3erkzwl_WXrqINBm9xWkcyUy53IQOj042dOs,22
2
- mistocr/_modidx.py,sha256=2rHVTcz3A3BfDCmGqUNsJFW3_n3Ch1vxSorrPVyLvMI,4256
3
- mistocr/core.py,sha256=EQYQgpnX2skgSX123u3dYaJHc1oDk5Nhgt5uBdXnCKs,8386
4
- mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
5
- mistocr/refine.py,sha256=hQg4ZYRTkPFEiCNNBU2ykgXFfVLCNAp2IuwOwHBVQ2k,12983
6
- mistocr-0.3.0.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
- mistocr-0.3.0.dist-info/METADATA,sha256=XTduA6AA_UcHYrjeJo4oOl9tQYnIfqfiP3oNJvy_nTI,8438
8
- mistocr-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
- mistocr-0.3.0.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
- mistocr-0.3.0.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
- mistocr-0.3.0.dist-info/RECORD,,