mistocr 0.4.1__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.4.1
3
+ Version: 0.4.2
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -113,7 +113,25 @@ from mistocr.pipeline import pdf_to_md
113
113
  await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
114
114
  ```
115
115
 
116
- This will (as indicated by the output):
116
+ mistocr.pipeline - INFO - Step 1/3: Running OCR on files/test/resnet.pdf...
117
+ mistocr.core - INFO - Waiting for batch job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 (initial status: QUEUED)
118
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: QUEUED (elapsed: 0s)
119
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
120
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
121
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
122
+ mistocr.core - INFO - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 completed with status: SUCCESS
123
+ mistocr.pipeline - INFO - Step 2/3: Fixing heading hierarchy...
124
+ mistocr.pipeline - INFO - Step 3/3: Adding image descriptions...
125
+
126
+ Describing 12 images...
127
+
128
+ mistocr.pipeline - INFO - Done!
129
+
130
+ Saved descriptions to /tmp/tmp62c7_ac1/resnet/img_descriptions.json
131
+ Adding descriptions to 12 pages...
132
+ Done! Enriched pages saved to files/test/md_test
133
+
134
+ This will:
117
135
 
118
136
  1. OCR the PDF using Mistral’s batch API
119
137
  2. Fix heading hierarchy inconsistencies
@@ -72,7 +72,25 @@ from mistocr.pipeline import pdf_to_md
72
72
  await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
73
73
  ```
74
74
 
75
- This will (as indicated by the output):
75
+ mistocr.pipeline - INFO - Step 1/3: Running OCR on files/test/resnet.pdf...
76
+ mistocr.core - INFO - Waiting for batch job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 (initial status: QUEUED)
77
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: QUEUED (elapsed: 0s)
78
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
79
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
80
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
81
+ mistocr.core - INFO - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 completed with status: SUCCESS
82
+ mistocr.pipeline - INFO - Step 2/3: Fixing heading hierarchy...
83
+ mistocr.pipeline - INFO - Step 3/3: Adding image descriptions...
84
+
85
+ Describing 12 images...
86
+
87
+ mistocr.pipeline - INFO - Done!
88
+
89
+ Saved descriptions to /tmp/tmp62c7_ac1/resnet/img_descriptions.json
90
+ Adding descriptions to 12 pages...
91
+ Done! Enriched pages saved to files/test/md_test
92
+
93
+ This will:
76
94
 
77
95
  1. OCR the PDF using Mistral’s batch API
78
96
  2. Fix heading hierarchy inconsistencies
@@ -0,0 +1 @@
1
+ __version__ = "0.4.2"
@@ -51,12 +51,24 @@ def create_batch_entry(
51
51
  path:str, # Path to PDF file,
52
52
  url:str, # Mistral signed URL
53
53
  cid:str=None, # Custom ID (by default using the file name without extension)
54
- inc_img:bool=True # Include image in response
54
+ inc_img:bool=True, # Include image in response
55
+ extract_header:bool=False, # Extract headers from document
56
+ extract_footer:bool=False # Extract footers from document
55
57
  ) -> dict[str, str | dict[str, str | bool]]: # Batch entry dict
56
58
  "Create a batch entry dict for OCR"
57
59
  path = Path(path)
58
60
  if not cid: cid = path.stem
59
- return dict(custom_id=cid, body=dict(document=dict(type="document_url", document_url=url), include_image_base64=inc_img))
61
+ return dict(
62
+ custom_id=cid,
63
+ body=dict(
64
+ document=dict(
65
+ type="document_url",
66
+ document_url=url),
67
+ include_image_base64=inc_img,
68
+ extract_header=extract_header,
69
+ extract_footer=extract_footer
70
+ )
71
+ )
60
72
 
61
73
  # %% ../nbs/00_core.ipynb 18
62
74
  def prep_pdf_batch(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.4.1
3
+ Version: 0.4.2
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -113,7 +113,25 @@ from mistocr.pipeline import pdf_to_md
113
113
  await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
114
114
  ```
115
115
 
116
- This will (as indicated by the output):
116
+ mistocr.pipeline - INFO - Step 1/3: Running OCR on files/test/resnet.pdf...
117
+ mistocr.core - INFO - Waiting for batch job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 (initial status: QUEUED)
118
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: QUEUED (elapsed: 0s)
119
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
120
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
121
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
122
+ mistocr.core - INFO - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 completed with status: SUCCESS
123
+ mistocr.pipeline - INFO - Step 2/3: Fixing heading hierarchy...
124
+ mistocr.pipeline - INFO - Step 3/3: Adding image descriptions...
125
+
126
+ Describing 12 images...
127
+
128
+ mistocr.pipeline - INFO - Done!
129
+
130
+ Saved descriptions to /tmp/tmp62c7_ac1/resnet/img_descriptions.json
131
+ Adding descriptions to 12 pages...
132
+ Done! Enriched pages saved to files/test/md_test
133
+
134
+ This will:
117
135
 
118
136
  1. OCR the PDF using Mistral’s batch API
119
137
  2. Fix heading hierarchy inconsistencies
@@ -1,7 +1,7 @@
1
1
  [DEFAULT]
2
2
  repo = mistocr
3
3
  lib_name = mistocr
4
- version = 0.4.1
4
+ version = 0.4.2
5
5
  min_python = 3.9
6
6
  license = apache2
7
7
  black_formatting = False
@@ -1 +0,0 @@
1
- __version__ = "0.4.1"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes