mistocr 0.2.12__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
mistocr/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.2.12"
1
+ __version__ = "0.3.1"
mistocr/_modidx.py CHANGED
@@ -18,9 +18,10 @@ d = { 'settings': { 'branch': 'main',
18
18
  'mistocr.core.save_page': ('core.html#save_page', 'mistocr/core.py'),
19
19
  'mistocr.core.save_pages': ('core.html#save_pages', 'mistocr/core.py'),
20
20
  'mistocr.core.submit_batch': ('core.html#submit_batch', 'mistocr/core.py'),
21
+ 'mistocr.core.subset_pdf': ('core.html#subset_pdf', 'mistocr/core.py'),
21
22
  'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
22
23
  'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
23
- 'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
24
+ 'mistocr.pipeline': {},
24
25
  'mistocr.refine': { 'mistocr.refine.HeadingCorrection': ('refine.html#headingcorrection', 'mistocr/refine.py'),
25
26
  'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
26
27
  'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
mistocr/core.py CHANGED
@@ -4,7 +4,8 @@
4
4
 
5
5
  # %% auto 0
6
6
  __all__ = ['ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch', 'submit_batch',
7
- 'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf', 'read_pgs']
7
+ 'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf', 'read_pgs',
8
+ 'subset_pdf']
8
9
 
9
10
  # %% ../nbs/00_core.ipynb 3
10
11
  from fastcore.all import *
@@ -13,6 +14,7 @@ from io import BytesIO
13
14
  from pathlib import Path
14
15
  from PIL import Image
15
16
  from mistralai import Mistral
17
+ import PyPDF2
16
18
 
17
19
  # %% ../nbs/00_core.ipynb 6
18
20
  def get_api_key(
@@ -181,7 +183,7 @@ def ocr_pdf(
181
183
  results = _run_batch(entries, c, poll_interval)
182
184
  return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
183
185
 
184
- # %% ../nbs/00_core.ipynb 47
186
+ # %% ../nbs/00_core.ipynb 50
185
187
  def read_pgs(
186
188
  path:str, # OCR output directory,
187
189
  join:bool=True # Join pages into single string
@@ -191,3 +193,24 @@ def read_pgs(
191
193
  pgs = sorted(path.glob('page_*.md'), key=lambda p: int(p.stem.split('_')[1]))
192
194
  contents = L([p.read_text() for p in pgs])
193
195
  return '\n\n'.join(contents) if join else contents
196
+
197
+ # %% ../nbs/00_core.ipynb 57
198
+ def subset_pdf(
199
+ path:str, # Path to PDF file
200
+ start:int=1, # Start page (1-based)
201
+ end:int=None, # End page (1-based, inclusive)
202
+ dst:str='.' # Output directory
203
+ ) -> Path: # Path to subset PDF
204
+ "Extract page range from PDF and save with range suffix"
205
+ path = Path(path)
206
+ writer = PyPDF2.PdfWriter()
207
+ with open(path, 'rb') as f:
208
+ reader = PyPDF2.PdfReader(f)
209
+ n = len(reader.pages)
210
+ end = end or n
211
+ s, e = max(0, start-1), min(n, end) - 1
212
+ for i in range(s, e+1): writer.add_page(reader.pages[i])
213
+ suffix = f"_p{s+1}-{e+1}" if s>0 or e<n-1 else ""
214
+ out = Path(dst) / f"{path.stem}{suffix}.pdf"
215
+ with open(out, 'wb') as f: writer.write(f)
216
+ return out
mistocr/pipeline.py CHANGED
@@ -3,7 +3,7 @@
3
3
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
4
4
 
5
5
  # %% auto 0
6
- __all__ = ['pdf_to_md']
6
+ __all__ = []
7
7
 
8
8
  # %% ../nbs/02_pipeline.ipynb 3
9
9
  from fastcore.all import *
@@ -11,27 +11,5 @@ from .core import read_pgs, ocr_pdf
11
11
  from .refine import add_img_descs, fix_hdgs
12
12
  from pathlib import Path
13
13
  from asyncio import Semaphore, gather, sleep
14
+ import tempfile
14
15
  import os, json, shutil
15
-
16
- # %% ../nbs/02_pipeline.ipynb 4
17
- @delegates(add_img_descs)
18
- async def pdf_to_md(
19
- pdf_path:str, # Path to input PDF file
20
- dst:str, # Destination directory for output markdown
21
- ocr_output:str=None, # Optional OCR output directory (defaults to pdf_path stem)
22
- model:str='claude-sonnet-4-5', # Model to use for heading fixes and image descriptions
23
- add_img_desc:bool=True, # Whether to add image descriptions
24
- progress:bool=True, # Whether to show progress messages
25
- **kwargs):
26
- "Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
27
- n_steps = 3 if add_img_desc else 2
28
- if progress: print(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
29
- ocr_dirs = ocr_pdf(pdf_path, ocr_output or 'ocr_temp')
30
- ocr_dir = ocr_dirs[0]
31
- if progress: print(f"Step 2/{n_steps}: Fixing heading hierarchy...")
32
- fix_hdgs(ocr_dir, model=model)
33
- if add_img_desc:
34
- if progress: print(f"Step 3/{n_steps}: Adding image descriptions...")
35
- await add_img_descs(ocr_dir, dst=dst, model=model, progress=progress, **kwargs)
36
- elif dst and Path(dst) != ocr_dir: shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
37
- if progress: print("Done!")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.2.12
3
+ Version: 0.3.1
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -23,6 +23,7 @@ Requires-Dist: mistralai
23
23
  Requires-Dist: pillow
24
24
  Requires-Dist: dotenv
25
25
  Requires-Dist: lisette
26
+ Requires-Dist: PyPDF2
26
27
  Provides-Extra: dev
27
28
  Dynamic: author
28
29
  Dynamic: author-email
@@ -0,0 +1,11 @@
1
+ mistocr/__init__.py,sha256=r4xAFihOf72W9TD-lpMi6ntWSTKTP2SlzKP1ytkjRbI,22
2
+ mistocr/_modidx.py,sha256=4apaXSL_JCEHVA8B-tH3w_23jIZBzBflE3vnQ9TQDDo,4176
3
+ mistocr/core.py,sha256=EQYQgpnX2skgSX123u3dYaJHc1oDk5Nhgt5uBdXnCKs,8386
4
+ mistocr/pipeline.py,sha256=TDk8qMGFLpQReGelNyQ9mHnh2cIb-5P2YgpDHC0iqdI,438
5
+ mistocr/refine.py,sha256=hQg4ZYRTkPFEiCNNBU2ykgXFfVLCNAp2IuwOwHBVQ2k,12983
6
+ mistocr-0.3.1.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
+ mistocr-0.3.1.dist-info/METADATA,sha256=O6X89wifAKyOWhee-7SN5SuDkuZZ0W5FIOEEGZ_8J3M,8438
8
+ mistocr-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
+ mistocr-0.3.1.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
+ mistocr-0.3.1.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
+ mistocr-0.3.1.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- mistocr/__init__.py,sha256=X4KG3FscE5AhbGbcdDDgdDC550CVpxNMwdNLcx6EQ7M,23
2
- mistocr/_modidx.py,sha256=55k_EmMPrRmLa04qCE-_8lukJPCax66q5MALfpa8T4A,4154
3
- mistocr/core.py,sha256=ohh2ru05gUKbIQCRHPMz_hw4ui39FtpoV3_S3n4bl_c,7592
4
- mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
5
- mistocr/refine.py,sha256=hQg4ZYRTkPFEiCNNBU2ykgXFfVLCNAp2IuwOwHBVQ2k,12983
6
- mistocr-0.2.12.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
- mistocr-0.2.12.dist-info/METADATA,sha256=MB_1D5vpqGUIA_jl-jNHj7E5LHJol6YGU8lw-ZCFj_4,8417
8
- mistocr-0.2.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
- mistocr-0.2.12.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
- mistocr-0.2.12.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
- mistocr-0.2.12.dist-info/RECORD,,