PyPI - mistocr - Versions diffs - 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl - Mend

mistocr-0.0.3-py3-none-any.whl → mistocr-0.0.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

mistocr/__init__.py +1 -1
mistocr/_modidx.py +1 -0
mistocr/core.py +31 -21
{mistocr-0.0.3.dist-info → mistocr-0.0.4.dist-info}/METADATA +42 -38
mistocr-0.0.4.dist-info/RECORD +9 -0
mistocr-0.0.3.dist-info/RECORD +0 -9
{mistocr-0.0.3.dist-info → mistocr-0.0.4.dist-info}/WHEEL +0 -0
{mistocr-0.0.3.dist-info → mistocr-0.0.4.dist-info}/entry_points.txt +0 -0
{mistocr-0.0.3.dist-info → mistocr-0.0.4.dist-info}/licenses/LICENSE +0 -0
{mistocr-0.0.3.dist-info → mistocr-0.0.4.dist-info}/top_level.txt +0 -0

mistocr/__init__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.0.3"
1	+ __version__ = "0.0.4"

mistocr/_modidx.py CHANGED Viewed

@@ -13,6 +13,7 @@ d = { 'settings': { 'branch': 'main',
                               'mistocr.core.get_api_key': ('core.html#get_api_key', 'mistocr/core.py'),
                               'mistocr.core.ocr': ('core.html#ocr', 'mistocr/core.py'),
                               'mistocr.core.prep_pdf_batch': ('core.html#prep_pdf_batch', 'mistocr/core.py'),
+                              'mistocr.core.read_pgs': ('core.html#read_pgs', 'mistocr/core.py'),
                               'mistocr.core.save_images': ('core.html#save_images', 'mistocr/core.py'),
                               'mistocr.core.save_page': ('core.html#save_page', 'mistocr/core.py'),
                               'mistocr.core.save_pages': ('core.html#save_pages', 'mistocr/core.py'),

mistocr/core.py CHANGED Viewed

@@ -4,21 +4,17 @@
 # %% auto 0
 __all__ = ['ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch', 'submit_batch',
-           'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr']
+           'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr', 'read_pgs']
 # %% ../nbs/00_core.ipynb 3
 from fastcore.all import *
-from dotenv import load_dotenv
-import os, json, time, base64, tempfile
+import os, re, json, time, base64, tempfile, logging
 from io import BytesIO
 from pathlib import Path
 from PIL import Image
 from mistralai import Mistral
 # %% ../nbs/00_core.ipynb 6
-load_dotenv()
-# %% ../nbs/00_core.ipynb 7
 def get_api_key(
     key:str=None # Mistral API key
     ):
@@ -27,11 +23,11 @@ def get_api_key(
     if not key: raise ValueError("MISTRAL_API_KEY not found")
     return key
-# %% ../nbs/00_core.ipynb 8
+# %% ../nbs/00_core.ipynb 7
 ocr_model = "mistral-ocr-latest"
 ocr_endpoint = "/v1/ocr"
-# %% ../nbs/00_core.ipynb 11
+# %% ../nbs/00_core.ipynb 10
 def upload_pdf(
     path:str, # Path to PDF file
     key:str=None # Mistral API key
@@ -42,11 +38,11 @@ def upload_pdf(
     uploaded = c.files.upload(file=dict(file_name=path.stem, content=path.read_bytes()), purpose="ocr")
     return c.files.get_signed_url(file_id=uploaded.id).url, c
-# %% ../nbs/00_core.ipynb 16
+# %% ../nbs/00_core.ipynb 15
 def create_batch_entry(
     path:str, # Path to PDF file,
     url:str, # Mistral signed URL
-    cid:str=None, # Custom ID (by default using the file name without extention)
+    cid:str=None, # Custom ID (by default using the file name without extension)
     inc_img:bool=True # Include image in response
     ) -> dict[str, str | dict[str, str | bool]]: # Batch entry dict
     "Create a batch entry dict for OCR"
@@ -54,7 +50,7 @@ def create_batch_entry(
     if not cid: cid = path.stem
     return dict(custom_id=cid, body=dict(document=dict(type="document_url", document_url=url), include_image_base64=inc_img))
-# %% ../nbs/00_core.ipynb 18
+# %% ../nbs/00_core.ipynb 17
 def prep_pdf_batch(
     path:str, # Path to PDF file,
     cid:str=None, # Custom ID (by default using the file name without extention)
@@ -65,7 +61,7 @@ def prep_pdf_batch(
     url, c = upload_pdf(path, key)
     return create_batch_entry(path, url, cid, inc_img), c
-# %% ../nbs/00_core.ipynb 22
+# %% ../nbs/00_core.ipynb 21
 def submit_batch(
     entries:list[dict], # List of batch entries,
     c:Mistral=None, # Mistral client,
@@ -79,7 +75,7 @@ def submit_batch(
         batch_data = c.files.upload(file=dict(file_name="batch.jsonl", content=open(f.name, "rb")), purpose="batch")
     return c.batch.jobs.create(input_files=[batch_data.id], model=model, endpoint=endpoint)
-# %% ../nbs/00_core.ipynb 25
+# %% ../nbs/00_core.ipynb 24
 def wait_for_job(
     job:dict, # Job dict,
     c:Mistral=None, # Mistral client,
@@ -91,7 +87,7 @@ def wait_for_job(
         job = c.batch.jobs.get(job_id=job.id)
     return job
-# %% ../nbs/00_core.ipynb 27
+# %% ../nbs/00_core.ipynb 26
 def download_results(
     job:dict, # Job dict,
     c:Mistral=None # Mistral client
@@ -100,7 +96,7 @@ def download_results(
     content = c.files.download(file_id=job.output_file).read().decode('utf-8')
     return [json.loads(line) for line in content.strip().split('\n') if line]
-# %% ../nbs/00_core.ipynb 32
+# %% ../nbs/00_core.ipynb 31
 def save_images(
     page:dict, # Page dict,
     img_dir:str='img' # Directory to save images
@@ -111,7 +107,7 @@ def save_images(
             img_bytes = base64.b64decode(img['image_base64'].split(',')[1])
             Image.open(BytesIO(img_bytes)).save(img_dir / img['id'])
-# %% ../nbs/00_core.ipynb 33
+# %% ../nbs/00_core.ipynb 32
 def save_page(
     page:dict, # Page dict,
     out_dir:str, # Directory to save page
@@ -123,7 +119,7 @@ def save_page(
         img_dir.mkdir(exist_ok=True)
         save_images(page, img_dir)
-# %% ../nbs/00_core.ipynb 35
+# %% ../nbs/00_core.ipynb 34
 def save_pages(
     ocr_resp:dict, # OCR response,
     out_dir:str, # Directory to save pages,
@@ -136,7 +132,7 @@ def save_pages(
     for page in ocr_resp['pages']: save_page(page, out_dir, img_dir)
     return out_dir
-# %% ../nbs/00_core.ipynb 41
+# %% ../nbs/00_core.ipynb 40
 def _get_paths(path:str) -> list[Path]:
     "Get list of PDFs from file or folder"
     path = Path(path)
@@ -147,7 +143,7 @@ def _get_paths(path:str) -> list[Path]:
         return pdfs
     raise ValueError(f"Path not found: {path}")
-# %% ../nbs/00_core.ipynb 42
+# %% ../nbs/00_core.ipynb 41
 def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[dict], Mistral]:
     "Prepare batch entries for list of PDFs"
     entries, c = [], None
@@ -156,7 +152,7 @@ def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[
         entries.append(entry)
     return entries, c
-# %% ../nbs/00_core.ipynb 43
+# %% ../nbs/00_core.ipynb 42
 def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]:
     "Submit batch, wait for completion, and download results"
     job = submit_batch(entries, c)
@@ -164,7 +160,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
     if job.status != 'SUCCESS': raise Exception(f"Job failed with status: {job.status}")
     return download_results(job, c)
-# %% ../nbs/00_core.ipynb 44
+# %% ../nbs/00_core.ipynb 43
 def ocr(
     path:str, # Path to PDF file or folder,
     out_dir:str='md', # Directory to save markdown pages,
@@ -177,3 +173,17 @@ def ocr(
     entries, c = _prep_batch(pdfs, inc_img, key)
     results = _run_batch(entries, c, poll_interval)
     return L([save_pages(r['response']['body'], out_dir, r['custom_id']) for r in results])
+# %% ../nbs/00_core.ipynb 48
+def read_pgs(
+    path:str, # OCR output directory,
+    pg:int=None, # Page number
+    ) -> str:
+    "Read specific page or all pages from OCR output directory"
+    path = Path(path)
+    if pg:
+        pg_path = path / f'page_{pg}.md'
+        if not pg_path.exists(): raise ValueError(f"Page {pg} not found")
+        return pg_path.read_text()
+    pgs = sorted(path.glob('page_*.md'), key=lambda p: int(p.stem.split('_')[1]))
+    return '\n\n'.join([p.read_text() for p in pgs])

{mistocr-0.0.3.dist-info → mistocr-0.0.4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mistocr
-Version: 0.0.3
+Version: 0.0.4
 Summary: Simple batch OCR for PDFs using Mistral's state-of-the-art vision model
 Home-page: https://github.com/franckalbinet/mistocr
 Author: Solveit
@@ -54,10 +54,11 @@ for large document sets.
 **Cost savings**: Batch OCR mode reduces costs from \$1/1000 pages to
 \$0.50/1000 pages - a 50% reduction compared to synchronous processing.
-**Simplicity**: A single `ocr()` function handles everything -
-uploading, batch submission, polling for completion, and saving results
-as markdown with extracted images. Process one PDF or an entire folder
-with the same simple interface.
+**Simplicity**: A single
+[`ocr()`](https://franckalbinet.github.io/mistocr/core.html#ocr)
+function handles everything - uploading, batch submission, polling for
+completion, and saving results as markdown with extracted images.
+Process one PDF or an entire folder with the same simple interface.
 **Organized output**: Each PDF is automatically saved to its own folder
 with pages as separate markdown files and images in an `img` subfolder,
@@ -80,57 +81,60 @@ $ pip install mistocr
 ## How to use
+### Basic usage
+Process a single PDF:
 ``` python
 from mistocr.core import ocr
-```
-- **Process a single PDF:**
-<!-- -->
+fname = 'files/test/attention-is-all-you-need.pdf'
+result = ocr(fname)
+```
-    fname = 'files/test/attention-is-all-you-need.pdf'
-    result = ocr(fname)
+Or process an entire folder:
 ``` python
+results = ocr('files/test')
 ```
-    files/test/md/attention-is-all-you-need:
-    img/        page_11.md  page_14.md  page_3.md  page_6.md  page_9.md
-    page_1.md   page_12.md  page_15.md  page_4.md  page_7.md
-    page_10.md  page_13.md  page_2.md   page_5.md  page_8.md
+### Output structure
-    files/test/md/attention-is-all-you-need/img:
-    img-0.jpeg  img-1.jpeg  img-2.jpeg  img-3.jpeg  img-4.jpeg
+Each PDF is saved to its own folder with pages as separate markdown
+files and images in an `img` subfolder:
-- **Or process an entire folder:**
+    files/test/md/
+    ├── attention-is-all-you-need/
+    │   ├── img/
+    │   │   ├── img-0.jpeg
+    │   │   ├── img-1.jpeg
+    │   │   └── ...
+    │   ├── page_1.md
+    │   ├── page_2.md
+    │   └── ...
+    └── resnet/
+        ├── img/
+        └── ...
-``` python
-results = ocr('files/test')
-```
+### Reading results
-``` python
-```
+Read all pages from a processed PDF:
-    files/test/md:
-    attention-is-all-you-need/  resnet/
+``` python
+from mistocr.core import read_pgs
-    files/test/md/attention-is-all-you-need:
-    img/        page_11.md  page_14.md  page_3.md  page_6.md  page_9.md
-    page_1.md   page_12.md  page_15.md  page_4.md  page_7.md
-    page_10.md  page_13.md  page_2.md   page_5.md  page_8.md
+text = read_pgs('files/test/md/attention-is-all-you-need')
+```
-    files/test/md/attention-is-all-you-need/img:
-    img-0.jpeg  img-1.jpeg  img-2.jpeg  img-3.jpeg  img-4.jpeg
+Or read a specific page:
-    files/test/md/resnet:
-    img/       page_10.md  page_12.md  page_3.md  page_5.md  page_7.md  page_9.md
-    page_1.md  page_11.md  page_2.md   page_4.md  page_6.md  page_8.md
+``` python
+text = read_pgs('files/test/md/attention-is-all-you-need', 10)
+```
-    files/test/md/resnet/img:
-    img-0.jpeg  img-2.jpeg  img-4.jpeg  img-6.jpeg
-    img-1.jpeg  img-3.jpeg  img-5.jpeg
+### Customization
-- **Customize the output:**
+Customize output directory, image inclusion, and polling interval:
 ``` python
 results = ocr('files/test', out_dir='output', inc_img=False, poll_interval=5)

mistocr-0.0.4.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+mistocr/__init__.py,sha256=1mptEzQihbdyqqzMgdns_j5ZGK9gz7hR2bsgA_TnjO4,22
+mistocr/_modidx.py,sha256=zA12OvdPdNkQ7K_oQx8rzto1mWnpQa3kyz8N-az6kMw,1843
+mistocr/core.py,sha256=qMV6ZFqs3PNHNUL6o6612WkWzOQiiA1jIKreAaYwORg,7239
+mistocr-0.0.4.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
+mistocr-0.0.4.dist-info/METADATA,sha256=01uXdXnZhKv334UNN1ZNlWCxNeozrptZpvAN9MFYIF4,4825
+mistocr-0.0.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mistocr-0.0.4.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
+mistocr-0.0.4.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
+mistocr-0.0.4.dist-info/RECORD,,

mistocr-0.0.3.dist-info/RECORD DELETED Viewed

@@ -1,9 +0,0 @@
-mistocr/__init__.py,sha256=4GZKi13lDTD25YBkGakhZyEQZWTER_OWQMNPoH_UM2c,22
-mistocr/_modidx.py,sha256=gViY05_Y4LdQXC5l2yEPG3MX-9M93xf4FJEGh3ns2Fo,1745
-mistocr/core.py,sha256=Ur5R8NLvHxduvSVuWNkWwt8xgkrxpnL9cmJjQ5h9thM,6778
-mistocr-0.0.3.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
-mistocr-0.0.3.dist-info/METADATA,sha256=aWl_wHxvy5Qrsze7JtTWMQ6FD-l-1QEM-7GZfTeem88,5076
-mistocr-0.0.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mistocr-0.0.3.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
-mistocr-0.0.3.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
-mistocr-0.0.3.dist-info/RECORD,,

{mistocr-0.0.3.dist-info → mistocr-0.0.4.dist-info}/WHEEL RENAMED Viewed

File without changes

{mistocr-0.0.3.dist-info → mistocr-0.0.4.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{mistocr-0.0.3.dist-info → mistocr-0.0.4.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{mistocr-0.0.3.dist-info → mistocr-0.0.4.dist-info}/top_level.txt RENAMED Viewed

File without changes

mistocr 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl

mistocr-0.0.3-py3-none-any.whl → mistocr-0.0.4-py3-none-any.whl