PyPI - mistocr - Versions diffs - 0.4.1__tar.gz → 0.4.3__tar.gz - Mend

mistocr 0.4.1.tar.gz → 0.4.3.tar.gz

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Files changed (21) hide show

{mistocr-0.4.1/mistocr.egg-info → mistocr-0.4.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mistocr
-Version: 0.4.1
+Version: 0.4.3
 Summary: Batch OCR for PDFs with heading restoration and visual content integration
 Home-page: https://github.com/franckalbinet/mistocr
 Author: Solveit
@@ -113,7 +113,25 @@ from mistocr.pipeline import pdf_to_md
 await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
 ```
-This will (as indicated by the output):
+    mistocr.pipeline - INFO - Step 1/3: Running OCR on files/test/resnet.pdf...
+    mistocr.core - INFO - Waiting for batch job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 (initial status: QUEUED)
+    mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: QUEUED (elapsed: 0s)
+    mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
+    mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
+    mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
+    mistocr.core - INFO - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 completed with status: SUCCESS
+    mistocr.pipeline - INFO - Step 2/3: Fixing heading hierarchy...
+    mistocr.pipeline - INFO - Step 3/3: Adding image descriptions...
+    Describing 12 images...
+    mistocr.pipeline - INFO - Done!
+    Saved descriptions to /tmp/tmp62c7_ac1/resnet/img_descriptions.json
+    Adding descriptions to 12 pages...
+    Done! Enriched pages saved to files/test/md_test
+This will:
 1.  OCR the PDF using Mistral’s batch API
 2.  Fix heading hierarchy inconsistencies

{mistocr-0.4.1 → mistocr-0.4.3}/README.md RENAMED Viewed

@@ -72,7 +72,25 @@ from mistocr.pipeline import pdf_to_md
 await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
 ```
-This will (as indicated by the output):
+    mistocr.pipeline - INFO - Step 1/3: Running OCR on files/test/resnet.pdf...
+    mistocr.core - INFO - Waiting for batch job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 (initial status: QUEUED)
+    mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: QUEUED (elapsed: 0s)
+    mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
+    mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
+    mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
+    mistocr.core - INFO - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 completed with status: SUCCESS
+    mistocr.pipeline - INFO - Step 2/3: Fixing heading hierarchy...
+    mistocr.pipeline - INFO - Step 3/3: Adding image descriptions...
+    Describing 12 images...
+    mistocr.pipeline - INFO - Done!
+    Saved descriptions to /tmp/tmp62c7_ac1/resnet/img_descriptions.json
+    Adding descriptions to 12 pages...
+    Done! Enriched pages saved to files/test/md_test
+This will:
 1.  OCR the PDF using Mistral’s batch API
 2.  Fix heading hierarchy inconsistencies

mistocr-0.4.3/mistocr/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "0.4.3"

{mistocr-0.4.1 → mistocr-0.4.3}/mistocr/core.py RENAMED Viewed

@@ -51,14 +51,26 @@ def create_batch_entry(
     path:str, # Path to PDF file,
     url:str, # Mistral signed URL
     cid:str=None, # Custom ID (by default using the file name without extension)
-    inc_img:bool=True # Include image in response
+    inc_img:bool=True, # Include image in response
+    extract_header:bool=True, # Extract headers from document
+    extract_footer:bool=True # Extract footers from document
     ) -> dict[str, str | dict[str, str | bool]]: # Batch entry dict
     "Create a batch entry dict for OCR"
     path = Path(path)
     if not cid: cid = path.stem
-    return dict(custom_id=cid, body=dict(document=dict(type="document_url", document_url=url), include_image_base64=inc_img))
-# %% ../nbs/00_core.ipynb 18
+    return dict(
+        custom_id=cid,
+        body=dict(
+            document=dict(
+                type="document_url",
+                document_url=url),
+                include_image_base64=inc_img,
+                extract_header=extract_header,
+                extract_footer=extract_footer
+            )
+        )
+# %% ../nbs/00_core.ipynb 19
 def prep_pdf_batch(
     path:str, # Path to PDF file,
     cid:str=None, # Custom ID (by default using the file name without extension)
@@ -69,7 +81,7 @@ def prep_pdf_batch(
     url, c = upload_pdf(path, key)
     return create_batch_entry(path, url, cid, inc_img), c
-# %% ../nbs/00_core.ipynb 22
+# %% ../nbs/00_core.ipynb 23
 def submit_batch(
     entries:list[dict], # List of batch entries,
     c:Mistral=None, # Mistral client,
@@ -83,7 +95,7 @@ def submit_batch(
         batch_data = c.files.upload(file=dict(file_name="batch.jsonl", content=open(f.name, "rb")), purpose="batch")
     return c.batch.jobs.create(input_files=[batch_data.id], model=model, endpoint=endpoint)
-# %% ../nbs/00_core.ipynb 25
+# %% ../nbs/00_core.ipynb 26
 def _check_timeout(
     queued_time:int, # Time spent in QUEUED state (seconds)
     timeout:int, # Maximum allowed QUEUED time (seconds)
@@ -92,7 +104,7 @@ def _check_timeout(
     "Raise TimeoutError if job has been queued longer than timeout"
     if queued_time >= timeout: raise TimeoutError(f"Job {job_id} stayed in QUEUED for {queued_time}s, exceeding timeout of {timeout}s. Check your balance or Mistral Status.")
-# %% ../nbs/00_core.ipynb 26
+# %% ../nbs/00_core.ipynb 27
 def wait_for_job(
     job:dict, # Batch job from submit_batch
     c:Mistral=None, # Mistral client
@@ -111,7 +123,7 @@ def wait_for_job(
     if job.status != "SUCCESS": logger.warning(f"Job {job.id} finished with non-success status: {job.status}")
     return job
-# %% ../nbs/00_core.ipynb 28
+# %% ../nbs/00_core.ipynb 29
 def download_results(
     job:dict, # Job dict,
     c:Mistral=None # Mistral client
@@ -120,7 +132,7 @@ def download_results(
     content = c.files.download(file_id=job.output_file).read().decode('utf-8')
     return [json.loads(line) for line in content.strip().split('\n') if line]
-# %% ../nbs/00_core.ipynb 33
+# %% ../nbs/00_core.ipynb 34
 def save_images(
     page:dict, # Page dict,
     img_dir:str='img' # Directory to save images
@@ -131,7 +143,7 @@ def save_images(
             img_bytes = base64.b64decode(img['image_base64'].split(',')[1])
             Image.open(BytesIO(img_bytes)).save(img_dir / img['id'])
-# %% ../nbs/00_core.ipynb 34
+# %% ../nbs/00_core.ipynb 35
 def save_page(
     page:dict, # Page dict,
     dst:str, # Directory to save page
@@ -143,7 +155,7 @@ def save_page(
         img_dir.mkdir(exist_ok=True)
         save_images(page, img_dir)
-# %% ../nbs/00_core.ipynb 36
+# %% ../nbs/00_core.ipynb 37
 def save_pages(
     ocr_resp:dict, # OCR response,
     dst:str, # Directory to save pages,
@@ -156,7 +168,7 @@ def save_pages(
     for page in ocr_resp['pages']: save_page(page, dst, img_dir)
     return dst
-# %% ../nbs/00_core.ipynb 42
+# %% ../nbs/00_core.ipynb 43
 def _get_paths(path:str) -> list[Path]:
     "Get list of PDFs from file or folder"
     path = Path(path)
@@ -167,7 +179,7 @@ def _get_paths(path:str) -> list[Path]:
         return pdfs
     raise ValueError(f"Path not found: {path}")
-# %% ../nbs/00_core.ipynb 43
+# %% ../nbs/00_core.ipynb 44
 def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[dict], Mistral]:
     "Prepare batch entries for list of PDFs"
     entries, c = [], None
@@ -176,7 +188,7 @@ def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[
         entries.append(entry)
     return entries, c
-# %% ../nbs/00_core.ipynb 44
+# %% ../nbs/00_core.ipynb 45
 def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]:
     "Submit batch, wait for completion, and download results"
     job = submit_batch(entries, c)
@@ -184,7 +196,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
     if job.status != 'SUCCESS': raise Exception(f"Job failed with status: {job.status}")
     return download_results(job, c)
-# %% ../nbs/00_core.ipynb 45
+# %% ../nbs/00_core.ipynb 46
 def ocr_pdf(
     path:str, # Path to PDF file or folder,
     dst:str='md', # Directory to save markdown pages,
@@ -198,7 +210,7 @@ def ocr_pdf(
     results = _run_batch(entries, c, poll_interval)
     return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
-# %% ../nbs/00_core.ipynb 52
+# %% ../nbs/00_core.ipynb 53
 def read_pgs(
     path:str, # OCR output directory,
     join:bool=True # Join pages into single string
@@ -209,7 +221,7 @@ def read_pgs(
     contents = L([p.read_text() for p in pgs])
     return '\n\n'.join(contents) if join else contents
-# %% ../nbs/00_core.ipynb 59
+# %% ../nbs/00_core.ipynb 60
 def subset_pdf(
     path:str, # Path to PDF file
     start:int=1, # Start page (1-based)

{mistocr-0.4.1 → mistocr-0.4.3/mistocr.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mistocr
-Version: 0.4.1
+Version: 0.4.3
 Summary: Batch OCR for PDFs with heading restoration and visual content integration
 Home-page: https://github.com/franckalbinet/mistocr
 Author: Solveit
@@ -113,7 +113,25 @@ from mistocr.pipeline import pdf_to_md
 await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
 ```
-This will (as indicated by the output):
+    mistocr.pipeline - INFO - Step 1/3: Running OCR on files/test/resnet.pdf...
+    mistocr.core - INFO - Waiting for batch job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 (initial status: QUEUED)
+    mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: QUEUED (elapsed: 0s)
+    mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
+    mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
+    mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
+    mistocr.core - INFO - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 completed with status: SUCCESS
+    mistocr.pipeline - INFO - Step 2/3: Fixing heading hierarchy...
+    mistocr.pipeline - INFO - Step 3/3: Adding image descriptions...
+    Describing 12 images...
+    mistocr.pipeline - INFO - Done!
+    Saved descriptions to /tmp/tmp62c7_ac1/resnet/img_descriptions.json
+    Adding descriptions to 12 pages...
+    Done! Enriched pages saved to files/test/md_test
+This will:
 1.  OCR the PDF using Mistral’s batch API
 2.  Fix heading hierarchy inconsistencies

{mistocr-0.4.1 → mistocr-0.4.3}/settings.ini RENAMED Viewed

@@ -1,7 +1,7 @@
 [DEFAULT]
 repo = mistocr
 lib_name = mistocr
-version = 0.4.1
+version = 0.4.3
 min_python = 3.9
 license = apache2
 black_formatting = False