mistocr 0.3.1__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.3.1
3
+ Version: 0.3.2
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -0,0 +1 @@
1
+ __version__ = "0.3.2"
@@ -21,7 +21,7 @@ d = { 'settings': { 'branch': 'main',
21
21
  'mistocr.core.subset_pdf': ('core.html#subset_pdf', 'mistocr/core.py'),
22
22
  'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
23
23
  'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
24
- 'mistocr.pipeline': {},
24
+ 'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
25
25
  'mistocr.refine': { 'mistocr.refine.HeadingCorrection': ('refine.html#headingcorrection', 'mistocr/refine.py'),
26
26
  'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
27
27
  'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
@@ -0,0 +1,41 @@
1
+ """End-to-End Pipeline: PDF OCR, Markdown Heading Correction, and AI Image Descriptions"""
2
+
3
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
4
+
5
+ # %% auto 0
6
+ __all__ = ['pdf_to_md']
7
+
8
+ # %% ../nbs/02_pipeline.ipynb 3
9
+ from fastcore.all import *
10
+ from .core import read_pgs, ocr_pdf
11
+ from .refine import add_img_descs, fix_hdgs
12
+ from pathlib import Path
13
+ from asyncio import Semaphore, gather, sleep
14
+ import tempfile
15
+ import os, json, shutil
16
+
17
+ # %% ../nbs/02_pipeline.ipynb 4
18
@delegates(add_img_descs)
async def pdf_to_md(
    pdf_path:str, # Path to input PDF file
    dst:str, # Destination directory for output markdown
    ocr_dst:str=None, # Optional OCR output directory
    model:str='claude-sonnet-4-5', # Model to use for heading fixes and image descriptions
    add_img_desc:bool=True, # Whether to add image descriptions
    progress:bool=True, # Whether to show progress messages
    **kwargs
):
    """Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions

    Runs up to three steps in order: `ocr_pdf` on `pdf_path`, `fix_hdgs` on the
    resulting OCR directory, then either `add_img_descs` (writing to `dst`) or,
    when `add_img_desc` is False, a plain copy of the OCR directory to `dst`.

    When `ocr_dst` is None a temporary directory is created for the OCR output
    and removed before returning, including when one of the steps raises.
    Extra keyword arguments are forwarded unchanged to `add_img_descs`.
    Returns None; progress messages are printed when `progress` is True.
    """
    # Only a directory created here is ours to delete; a caller-supplied
    # `ocr_dst` is left in place.
    cleanup = ocr_dst is None
    if cleanup:
        ocr_dst = tempfile.mkdtemp()
    n_steps = 3 if add_img_desc else 2
    try:
        if progress:
            print(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
        # The first element of the `ocr_pdf` result is used as the OCR output
        # directory for the remaining steps.
        ocr_dir = ocr_pdf(pdf_path, ocr_dst)[0]
        if progress:
            print(f"Step 2/{n_steps}: Fixing heading hierarchy...")
        fix_hdgs(ocr_dir, model=model)
        if add_img_desc:
            if progress:
                print(f"Step 3/{n_steps}: Adding image descriptions...")
            await add_img_descs(ocr_dir, dst=dst, model=model, progress=progress, **kwargs)
        elif Path(dst).resolve() != Path(ocr_dir).resolve():
            # Compare resolved paths rather than raw strings so that a `Path`
            # argument, a trailing slash or a relative spelling of the same
            # directory does not trigger a copy of the tree onto itself.
            shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
    finally:
        # Remove the temporary OCR directory even if a step above failed.
        if cleanup:
            shutil.rmtree(ocr_dst)
    if progress:
        print("Done!")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.3.1
3
+ Version: 0.3.2
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -1,7 +1,7 @@
1
1
  [DEFAULT]
2
2
  repo = mistocr
3
3
  lib_name = mistocr
4
- version = 0.3.1
4
+ version = 0.3.2
5
5
  min_python = 3.9
6
6
  license = apache2
7
7
  black_formatting = False
@@ -1 +0,0 @@
1
- __version__ = "0.3.1"
@@ -1,15 +0,0 @@
1
- """End-to-End Pipeline: PDF OCR, Markdown Heading Correction, and AI Image Descriptions"""
2
-
3
- # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
4
-
5
- # %% auto 0
6
- __all__ = []
7
-
8
- # %% ../nbs/02_pipeline.ipynb 3
9
- from fastcore.all import *
10
- from .core import read_pgs, ocr_pdf
11
- from .refine import add_img_descs, fix_hdgs
12
- from pathlib import Path
13
- from asyncio import Semaphore, gather, sleep
14
- import tempfile
15
- import os, json, shutil
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes