mistocr 0.2.12__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mistocr/__init__.py +1 -1
- mistocr/_modidx.py +1 -0
- mistocr/core.py +25 -2
- {mistocr-0.2.12.dist-info → mistocr-0.3.0.dist-info}/METADATA +2 -1
- mistocr-0.3.0.dist-info/RECORD +11 -0
- mistocr-0.2.12.dist-info/RECORD +0 -11
- {mistocr-0.2.12.dist-info → mistocr-0.3.0.dist-info}/WHEEL +0 -0
- {mistocr-0.2.12.dist-info → mistocr-0.3.0.dist-info}/entry_points.txt +0 -0
- {mistocr-0.2.12.dist-info → mistocr-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {mistocr-0.2.12.dist-info → mistocr-0.3.0.dist-info}/top_level.txt +0 -0
mistocr/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.2.12"
|
|
1
|
+
__version__ = "0.3.0"
|
mistocr/_modidx.py
CHANGED
|
@@ -18,6 +18,7 @@ d = { 'settings': { 'branch': 'main',
|
|
|
18
18
|
'mistocr.core.save_page': ('core.html#save_page', 'mistocr/core.py'),
|
|
19
19
|
'mistocr.core.save_pages': ('core.html#save_pages', 'mistocr/core.py'),
|
|
20
20
|
'mistocr.core.submit_batch': ('core.html#submit_batch', 'mistocr/core.py'),
|
|
21
|
+
'mistocr.core.subset_pdf': ('core.html#subset_pdf', 'mistocr/core.py'),
|
|
21
22
|
'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
|
|
22
23
|
'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
|
|
23
24
|
'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
|
mistocr/core.py
CHANGED
|
@@ -4,7 +4,8 @@
|
|
|
4
4
|
|
|
5
5
|
# %% auto 0
|
|
6
6
|
__all__ = ['ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch', 'submit_batch',
|
|
7
|
-
'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf', 'read_pgs']
|
|
7
|
+
'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf', 'read_pgs',
|
|
8
|
+
'subset_pdf']
|
|
8
9
|
|
|
9
10
|
# %% ../nbs/00_core.ipynb 3
|
|
10
11
|
from fastcore.all import *
|
|
@@ -13,6 +14,7 @@ from io import BytesIO
|
|
|
13
14
|
from pathlib import Path
|
|
14
15
|
from PIL import Image
|
|
15
16
|
from mistralai import Mistral
|
|
17
|
+
import PyPDF2
|
|
16
18
|
|
|
17
19
|
# %% ../nbs/00_core.ipynb 6
|
|
18
20
|
def get_api_key(
|
|
@@ -181,7 +183,7 @@ def ocr_pdf(
|
|
|
181
183
|
results = _run_batch(entries, c, poll_interval)
|
|
182
184
|
return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
|
|
183
185
|
|
|
184
|
-
# %% ../nbs/00_core.ipynb
|
|
186
|
+
# %% ../nbs/00_core.ipynb 50
|
|
185
187
|
def read_pgs(
|
|
186
188
|
path:str, # OCR output directory,
|
|
187
189
|
join:bool=True # Join pages into single string
|
|
@@ -191,3 +193,24 @@ def read_pgs(
|
|
|
191
193
|
pgs = sorted(path.glob('page_*.md'), key=lambda p: int(p.stem.split('_')[1]))
|
|
192
194
|
contents = L([p.read_text() for p in pgs])
|
|
193
195
|
return '\n\n'.join(contents) if join else contents
|
|
196
|
+
|
|
197
|
+
# %% ../nbs/00_core.ipynb 57
|
|
198
|
+
def subset_pdf(
|
|
199
|
+
path:str, # Path to PDF file
|
|
200
|
+
start:int=1, # Start page (1-based)
|
|
201
|
+
end:int=None, # End page (1-based, inclusive)
|
|
202
|
+
dst:str='.' # Output directory
|
|
203
|
+
) -> Path: # Path to subset PDF
|
|
204
|
+
"Extract page range from PDF and save with range suffix"
|
|
205
|
+
path = Path(path)
|
|
206
|
+
writer = PyPDF2.PdfWriter()
|
|
207
|
+
with open(path, 'rb') as f:
|
|
208
|
+
reader = PyPDF2.PdfReader(f)
|
|
209
|
+
n = len(reader.pages)
|
|
210
|
+
end = end or n
|
|
211
|
+
s, e = max(0, start-1), min(n, end) - 1
|
|
212
|
+
for i in range(s, e+1): writer.add_page(reader.pages[i])
|
|
213
|
+
suffix = f"_p{s+1}-{e+1}" if s>0 or e<n-1 else ""
|
|
214
|
+
out = Path(dst) / f"{path.stem}{suffix}.pdf"
|
|
215
|
+
with open(out, 'wb') as f: writer.write(f)
|
|
216
|
+
return out
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mistocr
|
|
3
|
-
Version: 0.2.12
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Batch OCR for PDFs with heading restoration and visual content integration
|
|
5
5
|
Home-page: https://github.com/franckalbinet/mistocr
|
|
6
6
|
Author: Solveit
|
|
@@ -23,6 +23,7 @@ Requires-Dist: mistralai
|
|
|
23
23
|
Requires-Dist: pillow
|
|
24
24
|
Requires-Dist: dotenv
|
|
25
25
|
Requires-Dist: lisette
|
|
26
|
+
Requires-Dist: PyPDF2
|
|
26
27
|
Provides-Extra: dev
|
|
27
28
|
Dynamic: author
|
|
28
29
|
Dynamic: author-email
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
mistocr/__init__.py,sha256=VrXpHDu3erkzwl_WXrqINBm9xWkcyUy53IQOj042dOs,22
|
|
2
|
+
mistocr/_modidx.py,sha256=2rHVTcz3A3BfDCmGqUNsJFW3_n3Ch1vxSorrPVyLvMI,4256
|
|
3
|
+
mistocr/core.py,sha256=EQYQgpnX2skgSX123u3dYaJHc1oDk5Nhgt5uBdXnCKs,8386
|
|
4
|
+
mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
|
|
5
|
+
mistocr/refine.py,sha256=hQg4ZYRTkPFEiCNNBU2ykgXFfVLCNAp2IuwOwHBVQ2k,12983
|
|
6
|
+
mistocr-0.3.0.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
7
|
+
mistocr-0.3.0.dist-info/METADATA,sha256=XTduA6AA_UcHYrjeJo4oOl9tQYnIfqfiP3oNJvy_nTI,8438
|
|
8
|
+
mistocr-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
+
mistocr-0.3.0.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
10
|
+
mistocr-0.3.0.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
11
|
+
mistocr-0.3.0.dist-info/RECORD,,
|
mistocr-0.2.12.dist-info/RECORD
DELETED
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
mistocr/__init__.py,sha256=X4KG3FscE5AhbGbcdDDgdDC550CVpxNMwdNLcx6EQ7M,23
|
|
2
|
-
mistocr/_modidx.py,sha256=55k_EmMPrRmLa04qCE-_8lukJPCax66q5MALfpa8T4A,4154
|
|
3
|
-
mistocr/core.py,sha256=ohh2ru05gUKbIQCRHPMz_hw4ui39FtpoV3_S3n4bl_c,7592
|
|
4
|
-
mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
|
|
5
|
-
mistocr/refine.py,sha256=hQg4ZYRTkPFEiCNNBU2ykgXFfVLCNAp2IuwOwHBVQ2k,12983
|
|
6
|
-
mistocr-0.2.12.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
7
|
-
mistocr-0.2.12.dist-info/METADATA,sha256=MB_1D5vpqGUIA_jl-jNHj7E5LHJol6YGU8lw-ZCFj_4,8417
|
|
8
|
-
mistocr-0.2.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
-
mistocr-0.2.12.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
10
|
-
mistocr-0.2.12.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
11
|
-
mistocr-0.2.12.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|