mistocr 0.2.12__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mistocr-0.2.12/mistocr.egg-info → mistocr-0.3.1}/PKG-INFO +2 -1
- mistocr-0.3.1/mistocr/__init__.py +1 -0
- {mistocr-0.2.12 → mistocr-0.3.1}/mistocr/_modidx.py +2 -1
- {mistocr-0.2.12 → mistocr-0.3.1}/mistocr/core.py +25 -2
- mistocr-0.3.1/mistocr/pipeline.py +15 -0
- {mistocr-0.2.12 → mistocr-0.3.1/mistocr.egg-info}/PKG-INFO +2 -1
- {mistocr-0.2.12 → mistocr-0.3.1}/mistocr.egg-info/requires.txt +1 -0
- {mistocr-0.2.12 → mistocr-0.3.1}/settings.ini +2 -2
- mistocr-0.2.12/mistocr/__init__.py +0 -1
- mistocr-0.2.12/mistocr/pipeline.py +0 -37
- {mistocr-0.2.12 → mistocr-0.3.1}/LICENSE +0 -0
- {mistocr-0.2.12 → mistocr-0.3.1}/MANIFEST.in +0 -0
- {mistocr-0.2.12 → mistocr-0.3.1}/README.md +0 -0
- {mistocr-0.2.12 → mistocr-0.3.1}/mistocr/refine.py +0 -0
- {mistocr-0.2.12 → mistocr-0.3.1}/mistocr.egg-info/SOURCES.txt +0 -0
- {mistocr-0.2.12 → mistocr-0.3.1}/mistocr.egg-info/dependency_links.txt +0 -0
- {mistocr-0.2.12 → mistocr-0.3.1}/mistocr.egg-info/entry_points.txt +0 -0
- {mistocr-0.2.12 → mistocr-0.3.1}/mistocr.egg-info/not-zip-safe +0 -0
- {mistocr-0.2.12 → mistocr-0.3.1}/mistocr.egg-info/top_level.txt +0 -0
- {mistocr-0.2.12 → mistocr-0.3.1}/pyproject.toml +0 -0
- {mistocr-0.2.12 → mistocr-0.3.1}/setup.cfg +0 -0
- {mistocr-0.2.12 → mistocr-0.3.1}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mistocr
|
|
3
|
-
Version: 0.2.12
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: Batch OCR for PDFs with heading restoration and visual content integration
|
|
5
5
|
Home-page: https://github.com/franckalbinet/mistocr
|
|
6
6
|
Author: Solveit
|
|
@@ -23,6 +23,7 @@ Requires-Dist: mistralai
|
|
|
23
23
|
Requires-Dist: pillow
|
|
24
24
|
Requires-Dist: dotenv
|
|
25
25
|
Requires-Dist: lisette
|
|
26
|
+
Requires-Dist: PyPDF2
|
|
26
27
|
Provides-Extra: dev
|
|
27
28
|
Dynamic: author
|
|
28
29
|
Dynamic: author-email
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.3.1"
|
|
@@ -18,9 +18,10 @@ d = { 'settings': { 'branch': 'main',
|
|
|
18
18
|
'mistocr.core.save_page': ('core.html#save_page', 'mistocr/core.py'),
|
|
19
19
|
'mistocr.core.save_pages': ('core.html#save_pages', 'mistocr/core.py'),
|
|
20
20
|
'mistocr.core.submit_batch': ('core.html#submit_batch', 'mistocr/core.py'),
|
|
21
|
+
'mistocr.core.subset_pdf': ('core.html#subset_pdf', 'mistocr/core.py'),
|
|
21
22
|
'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
|
|
22
23
|
'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
|
|
23
|
-
'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
|
|
24
|
+
'mistocr.pipeline': {},
|
|
24
25
|
'mistocr.refine': { 'mistocr.refine.HeadingCorrection': ('refine.html#headingcorrection', 'mistocr/refine.py'),
|
|
25
26
|
'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
|
|
26
27
|
'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
|
|
@@ -4,7 +4,8 @@
|
|
|
4
4
|
|
|
5
5
|
# %% auto 0
|
|
6
6
|
__all__ = ['ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch', 'submit_batch',
|
|
7
|
-
'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf', 'read_pgs']
|
|
7
|
+
'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf', 'read_pgs',
|
|
8
|
+
'subset_pdf']
|
|
8
9
|
|
|
9
10
|
# %% ../nbs/00_core.ipynb 3
|
|
10
11
|
from fastcore.all import *
|
|
@@ -13,6 +14,7 @@ from io import BytesIO
|
|
|
13
14
|
from pathlib import Path
|
|
14
15
|
from PIL import Image
|
|
15
16
|
from mistralai import Mistral
|
|
17
|
+
import PyPDF2
|
|
16
18
|
|
|
17
19
|
# %% ../nbs/00_core.ipynb 6
|
|
18
20
|
def get_api_key(
|
|
@@ -181,7 +183,7 @@ def ocr_pdf(
|
|
|
181
183
|
results = _run_batch(entries, c, poll_interval)
|
|
182
184
|
return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
|
|
183
185
|
|
|
184
|
-
# %% ../nbs/00_core.ipynb
|
|
186
|
+
# %% ../nbs/00_core.ipynb 50
|
|
185
187
|
def read_pgs(
|
|
186
188
|
path:str, # OCR output directory,
|
|
187
189
|
join:bool=True # Join pages into single string
|
|
@@ -191,3 +193,24 @@ def read_pgs(
|
|
|
191
193
|
pgs = sorted(path.glob('page_*.md'), key=lambda p: int(p.stem.split('_')[1]))
|
|
192
194
|
contents = L([p.read_text() for p in pgs])
|
|
193
195
|
return '\n\n'.join(contents) if join else contents
|
|
196
|
+
|
|
197
|
+
# %% ../nbs/00_core.ipynb 57
|
|
198
|
+
def subset_pdf(
|
|
199
|
+
path:str, # Path to PDF file
|
|
200
|
+
start:int=1, # Start page (1-based)
|
|
201
|
+
end:int=None, # End page (1-based, inclusive)
|
|
202
|
+
dst:str='.' # Output directory
|
|
203
|
+
) -> Path: # Path to subset PDF
|
|
204
|
+
"Extract page range from PDF and save with range suffix"
|
|
205
|
+
path = Path(path)
|
|
206
|
+
writer = PyPDF2.PdfWriter()
|
|
207
|
+
with open(path, 'rb') as f:
|
|
208
|
+
reader = PyPDF2.PdfReader(f)
|
|
209
|
+
n = len(reader.pages)
|
|
210
|
+
end = end or n
|
|
211
|
+
s, e = max(0, start-1), min(n, end) - 1
|
|
212
|
+
for i in range(s, e+1): writer.add_page(reader.pages[i])
|
|
213
|
+
suffix = f"_p{s+1}-{e+1}" if s>0 or e<n-1 else ""
|
|
214
|
+
out = Path(dst) / f"{path.stem}{suffix}.pdf"
|
|
215
|
+
with open(out, 'wb') as f: writer.write(f)
|
|
216
|
+
return out
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""End-to-End Pipeline: PDF OCR, Markdown Heading Correction, and AI Image Descriptions"""
|
|
2
|
+
|
|
3
|
+
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
|
|
4
|
+
|
|
5
|
+
# %% auto 0
|
|
6
|
+
__all__ = []
|
|
7
|
+
|
|
8
|
+
# %% ../nbs/02_pipeline.ipynb 3
|
|
9
|
+
from fastcore.all import *
|
|
10
|
+
from .core import read_pgs, ocr_pdf
|
|
11
|
+
from .refine import add_img_descs, fix_hdgs
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from asyncio import Semaphore, gather, sleep
|
|
14
|
+
import tempfile
|
|
15
|
+
import os, json, shutil
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mistocr
|
|
3
|
-
Version: 0.2.12
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: Batch OCR for PDFs with heading restoration and visual content integration
|
|
5
5
|
Home-page: https://github.com/franckalbinet/mistocr
|
|
6
6
|
Author: Solveit
|
|
@@ -23,6 +23,7 @@ Requires-Dist: mistralai
|
|
|
23
23
|
Requires-Dist: pillow
|
|
24
24
|
Requires-Dist: dotenv
|
|
25
25
|
Requires-Dist: lisette
|
|
26
|
+
Requires-Dist: PyPDF2
|
|
26
27
|
Provides-Extra: dev
|
|
27
28
|
Dynamic: author
|
|
28
29
|
Dynamic: author-email
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
[DEFAULT]
|
|
2
2
|
repo = mistocr
|
|
3
3
|
lib_name = mistocr
|
|
4
|
-
version = 0.2.12
|
|
4
|
+
version = 0.3.1
|
|
5
5
|
min_python = 3.9
|
|
6
6
|
license = apache2
|
|
7
7
|
black_formatting = False
|
|
@@ -27,7 +27,7 @@ keywords = nbdev jupyter notebook python
|
|
|
27
27
|
language = English
|
|
28
28
|
status = 3
|
|
29
29
|
user = franckalbinet
|
|
30
|
-
requirements = fastcore mistralai pillow dotenv lisette
|
|
30
|
+
requirements = fastcore mistralai pillow dotenv lisette PyPDF2
|
|
31
31
|
readme_nb = index.ipynb
|
|
32
32
|
allowed_metadata_keys =
|
|
33
33
|
allowed_cell_metadata_keys =
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.2.12"
|
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
"""End-to-End Pipeline: PDF OCR, Markdown Heading Correction, and AI Image Descriptions"""
|
|
2
|
-
|
|
3
|
-
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
|
|
4
|
-
|
|
5
|
-
# %% auto 0
|
|
6
|
-
__all__ = ['pdf_to_md']
|
|
7
|
-
|
|
8
|
-
# %% ../nbs/02_pipeline.ipynb 3
|
|
9
|
-
from fastcore.all import *
|
|
10
|
-
from .core import read_pgs, ocr_pdf
|
|
11
|
-
from .refine import add_img_descs, fix_hdgs
|
|
12
|
-
from pathlib import Path
|
|
13
|
-
from asyncio import Semaphore, gather, sleep
|
|
14
|
-
import os, json, shutil
|
|
15
|
-
|
|
16
|
-
# %% ../nbs/02_pipeline.ipynb 4
|
|
17
|
-
@delegates(add_img_descs)
|
|
18
|
-
async def pdf_to_md(
|
|
19
|
-
pdf_path:str, # Path to input PDF file
|
|
20
|
-
dst:str, # Destination directory for output markdown
|
|
21
|
-
ocr_output:str=None, # Optional OCR output directory (defaults to pdf_path stem)
|
|
22
|
-
model:str='claude-sonnet-4-5', # Model to use for heading fixes and image descriptions
|
|
23
|
-
add_img_desc:bool=True, # Whether to add image descriptions
|
|
24
|
-
progress:bool=True, # Whether to show progress messages
|
|
25
|
-
**kwargs):
|
|
26
|
-
"Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
|
|
27
|
-
n_steps = 3 if add_img_desc else 2
|
|
28
|
-
if progress: print(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
|
|
29
|
-
ocr_dirs = ocr_pdf(pdf_path, ocr_output or 'ocr_temp')
|
|
30
|
-
ocr_dir = ocr_dirs[0]
|
|
31
|
-
if progress: print(f"Step 2/{n_steps}: Fixing heading hierarchy...")
|
|
32
|
-
fix_hdgs(ocr_dir, model=model)
|
|
33
|
-
if add_img_desc:
|
|
34
|
-
if progress: print(f"Step 3/{n_steps}: Adding image descriptions...")
|
|
35
|
-
await add_img_descs(ocr_dir, dst=dst, model=model, progress=progress, **kwargs)
|
|
36
|
-
elif dst and Path(dst) != ocr_dir: shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
|
|
37
|
-
if progress: print("Done!")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|