mistocr 0.4.1__tar.gz → 0.4.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mistocr-0.4.1/mistocr.egg-info → mistocr-0.4.3}/PKG-INFO +20 -2
- {mistocr-0.4.1 → mistocr-0.4.3}/README.md +19 -1
- mistocr-0.4.3/mistocr/__init__.py +1 -0
- {mistocr-0.4.1 → mistocr-0.4.3}/mistocr/core.py +29 -17
- {mistocr-0.4.1 → mistocr-0.4.3/mistocr.egg-info}/PKG-INFO +20 -2
- {mistocr-0.4.1 → mistocr-0.4.3}/settings.ini +1 -1
- mistocr-0.4.1/mistocr/__init__.py +0 -1
- {mistocr-0.4.1 → mistocr-0.4.3}/LICENSE +0 -0
- {mistocr-0.4.1 → mistocr-0.4.3}/MANIFEST.in +0 -0
- {mistocr-0.4.1 → mistocr-0.4.3}/mistocr/_modidx.py +0 -0
- {mistocr-0.4.1 → mistocr-0.4.3}/mistocr/pipeline.py +0 -0
- {mistocr-0.4.1 → mistocr-0.4.3}/mistocr/refine.py +0 -0
- {mistocr-0.4.1 → mistocr-0.4.3}/mistocr.egg-info/SOURCES.txt +0 -0
- {mistocr-0.4.1 → mistocr-0.4.3}/mistocr.egg-info/dependency_links.txt +0 -0
- {mistocr-0.4.1 → mistocr-0.4.3}/mistocr.egg-info/entry_points.txt +0 -0
- {mistocr-0.4.1 → mistocr-0.4.3}/mistocr.egg-info/not-zip-safe +0 -0
- {mistocr-0.4.1 → mistocr-0.4.3}/mistocr.egg-info/requires.txt +0 -0
- {mistocr-0.4.1 → mistocr-0.4.3}/mistocr.egg-info/top_level.txt +0 -0
- {mistocr-0.4.1 → mistocr-0.4.3}/pyproject.toml +0 -0
- {mistocr-0.4.1 → mistocr-0.4.3}/setup.cfg +0 -0
- {mistocr-0.4.1 → mistocr-0.4.3}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mistocr
|
|
3
|
-
Version: 0.4.1
|
|
3
|
+
Version: 0.4.3
|
|
4
4
|
Summary: Batch OCR for PDFs with heading restoration and visual content integration
|
|
5
5
|
Home-page: https://github.com/franckalbinet/mistocr
|
|
6
6
|
Author: Solveit
|
|
@@ -113,7 +113,25 @@ from mistocr.pipeline import pdf_to_md
|
|
|
113
113
|
await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
|
|
114
114
|
```
|
|
115
115
|
|
|
116
|
-
|
|
116
|
+
mistocr.pipeline - INFO - Step 1/3: Running OCR on files/test/resnet.pdf...
|
|
117
|
+
mistocr.core - INFO - Waiting for batch job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 (initial status: QUEUED)
|
|
118
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: QUEUED (elapsed: 0s)
|
|
119
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
|
|
120
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
|
|
121
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
|
|
122
|
+
mistocr.core - INFO - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 completed with status: SUCCESS
|
|
123
|
+
mistocr.pipeline - INFO - Step 2/3: Fixing heading hierarchy...
|
|
124
|
+
mistocr.pipeline - INFO - Step 3/3: Adding image descriptions...
|
|
125
|
+
|
|
126
|
+
Describing 12 images...
|
|
127
|
+
|
|
128
|
+
mistocr.pipeline - INFO - Done!
|
|
129
|
+
|
|
130
|
+
Saved descriptions to /tmp/tmp62c7_ac1/resnet/img_descriptions.json
|
|
131
|
+
Adding descriptions to 12 pages...
|
|
132
|
+
Done! Enriched pages saved to files/test/md_test
|
|
133
|
+
|
|
134
|
+
This will:
|
|
117
135
|
|
|
118
136
|
1. OCR the PDF using Mistral’s batch API
|
|
119
137
|
2. Fix heading hierarchy inconsistencies
|
|
@@ -72,7 +72,25 @@ from mistocr.pipeline import pdf_to_md
|
|
|
72
72
|
await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
|
|
73
73
|
```
|
|
74
74
|
|
|
75
|
-
|
|
75
|
+
mistocr.pipeline - INFO - Step 1/3: Running OCR on files/test/resnet.pdf...
|
|
76
|
+
mistocr.core - INFO - Waiting for batch job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 (initial status: QUEUED)
|
|
77
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: QUEUED (elapsed: 0s)
|
|
78
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
|
|
79
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
|
|
80
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
|
|
81
|
+
mistocr.core - INFO - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 completed with status: SUCCESS
|
|
82
|
+
mistocr.pipeline - INFO - Step 2/3: Fixing heading hierarchy...
|
|
83
|
+
mistocr.pipeline - INFO - Step 3/3: Adding image descriptions...
|
|
84
|
+
|
|
85
|
+
Describing 12 images...
|
|
86
|
+
|
|
87
|
+
mistocr.pipeline - INFO - Done!
|
|
88
|
+
|
|
89
|
+
Saved descriptions to /tmp/tmp62c7_ac1/resnet/img_descriptions.json
|
|
90
|
+
Adding descriptions to 12 pages...
|
|
91
|
+
Done! Enriched pages saved to files/test/md_test
|
|
92
|
+
|
|
93
|
+
This will:
|
|
76
94
|
|
|
77
95
|
1. OCR the PDF using Mistral’s batch API
|
|
78
96
|
2. Fix heading hierarchy inconsistencies
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.4.3"
|
|
@@ -51,14 +51,26 @@ def create_batch_entry(
|
|
|
51
51
|
path:str, # Path to PDF file,
|
|
52
52
|
url:str, # Mistral signed URL
|
|
53
53
|
cid:str=None, # Custom ID (by default using the file name without extension)
|
|
54
|
-
inc_img:bool=True # Include image in response
|
|
54
|
+
inc_img:bool=True, # Include image in response
|
|
55
|
+
extract_header:bool=True, # Extract headers from document
|
|
56
|
+
extract_footer:bool=True # Extract footers from document
|
|
55
57
|
) -> dict[str, str | dict[str, str | bool]]: # Batch entry dict
|
|
56
58
|
"Create a batch entry dict for OCR"
|
|
57
59
|
path = Path(path)
|
|
58
60
|
if not cid: cid = path.stem
|
|
59
|
-
return dict(
|
|
60
|
-
|
|
61
|
-
|
|
61
|
+
return dict(
|
|
62
|
+
custom_id=cid,
|
|
63
|
+
body=dict(
|
|
64
|
+
document=dict(
|
|
65
|
+
type="document_url",
|
|
66
|
+
document_url=url),
|
|
67
|
+
include_image_base64=inc_img,
|
|
68
|
+
extract_header=extract_header,
|
|
69
|
+
extract_footer=extract_footer
|
|
70
|
+
)
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
# %% ../nbs/00_core.ipynb 19
|
|
62
74
|
def prep_pdf_batch(
|
|
63
75
|
path:str, # Path to PDF file,
|
|
64
76
|
cid:str=None, # Custom ID (by default using the file name without extention)
|
|
@@ -69,7 +81,7 @@ def prep_pdf_batch(
|
|
|
69
81
|
url, c = upload_pdf(path, key)
|
|
70
82
|
return create_batch_entry(path, url, cid, inc_img), c
|
|
71
83
|
|
|
72
|
-
# %% ../nbs/00_core.ipynb
|
|
84
|
+
# %% ../nbs/00_core.ipynb 23
|
|
73
85
|
def submit_batch(
|
|
74
86
|
entries:list[dict], # List of batch entries,
|
|
75
87
|
c:Mistral=None, # Mistral client,
|
|
@@ -83,7 +95,7 @@ def submit_batch(
|
|
|
83
95
|
batch_data = c.files.upload(file=dict(file_name="batch.jsonl", content=open(f.name, "rb")), purpose="batch")
|
|
84
96
|
return c.batch.jobs.create(input_files=[batch_data.id], model=model, endpoint=endpoint)
|
|
85
97
|
|
|
86
|
-
# %% ../nbs/00_core.ipynb
|
|
98
|
+
# %% ../nbs/00_core.ipynb 26
|
|
87
99
|
def _check_timeout(
|
|
88
100
|
queued_time:int, # Time spent in QUEUED state (seconds)
|
|
89
101
|
timeout:int, # Maximum allowed QUEUED time (seconds)
|
|
@@ -92,7 +104,7 @@ def _check_timeout(
|
|
|
92
104
|
"Raise TimeoutError if job has been queued longer than timeout"
|
|
93
105
|
if queued_time >= timeout: raise TimeoutError(f"Job {job_id} stayed in QUEUED for {queued_time}s, exceeding timeout of {timeout}s. Check your balance or Mistral Status.")
|
|
94
106
|
|
|
95
|
-
# %% ../nbs/00_core.ipynb
|
|
107
|
+
# %% ../nbs/00_core.ipynb 27
|
|
96
108
|
def wait_for_job(
|
|
97
109
|
job:dict, # Batch job from submit_batch
|
|
98
110
|
c:Mistral=None, # Mistral client
|
|
@@ -111,7 +123,7 @@ def wait_for_job(
|
|
|
111
123
|
if job.status != "SUCCESS": logger.warning(f"Job {job.id} finished with non-success status: {job.status}")
|
|
112
124
|
return job
|
|
113
125
|
|
|
114
|
-
# %% ../nbs/00_core.ipynb
|
|
126
|
+
# %% ../nbs/00_core.ipynb 29
|
|
115
127
|
def download_results(
|
|
116
128
|
job:dict, # Job dict,
|
|
117
129
|
c:Mistral=None # Mistral client
|
|
@@ -120,7 +132,7 @@ def download_results(
|
|
|
120
132
|
content = c.files.download(file_id=job.output_file).read().decode('utf-8')
|
|
121
133
|
return [json.loads(line) for line in content.strip().split('\n') if line]
|
|
122
134
|
|
|
123
|
-
# %% ../nbs/00_core.ipynb
|
|
135
|
+
# %% ../nbs/00_core.ipynb 34
|
|
124
136
|
def save_images(
|
|
125
137
|
page:dict, # Page dict,
|
|
126
138
|
img_dir:str='img' # Directory to save images
|
|
@@ -131,7 +143,7 @@ def save_images(
|
|
|
131
143
|
img_bytes = base64.b64decode(img['image_base64'].split(',')[1])
|
|
132
144
|
Image.open(BytesIO(img_bytes)).save(img_dir / img['id'])
|
|
133
145
|
|
|
134
|
-
# %% ../nbs/00_core.ipynb
|
|
146
|
+
# %% ../nbs/00_core.ipynb 35
|
|
135
147
|
def save_page(
|
|
136
148
|
page:dict, # Page dict,
|
|
137
149
|
dst:str, # Directory to save page
|
|
@@ -143,7 +155,7 @@ def save_page(
|
|
|
143
155
|
img_dir.mkdir(exist_ok=True)
|
|
144
156
|
save_images(page, img_dir)
|
|
145
157
|
|
|
146
|
-
# %% ../nbs/00_core.ipynb
|
|
158
|
+
# %% ../nbs/00_core.ipynb 37
|
|
147
159
|
def save_pages(
|
|
148
160
|
ocr_resp:dict, # OCR response,
|
|
149
161
|
dst:str, # Directory to save pages,
|
|
@@ -156,7 +168,7 @@ def save_pages(
|
|
|
156
168
|
for page in ocr_resp['pages']: save_page(page, dst, img_dir)
|
|
157
169
|
return dst
|
|
158
170
|
|
|
159
|
-
# %% ../nbs/00_core.ipynb
|
|
171
|
+
# %% ../nbs/00_core.ipynb 43
|
|
160
172
|
def _get_paths(path:str) -> list[Path]:
|
|
161
173
|
"Get list of PDFs from file or folder"
|
|
162
174
|
path = Path(path)
|
|
@@ -167,7 +179,7 @@ def _get_paths(path:str) -> list[Path]:
|
|
|
167
179
|
return pdfs
|
|
168
180
|
raise ValueError(f"Path not found: {path}")
|
|
169
181
|
|
|
170
|
-
# %% ../nbs/00_core.ipynb
|
|
182
|
+
# %% ../nbs/00_core.ipynb 44
|
|
171
183
|
def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[dict], Mistral]:
|
|
172
184
|
"Prepare batch entries for list of PDFs"
|
|
173
185
|
entries, c = [], None
|
|
@@ -176,7 +188,7 @@ def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[
|
|
|
176
188
|
entries.append(entry)
|
|
177
189
|
return entries, c
|
|
178
190
|
|
|
179
|
-
# %% ../nbs/00_core.ipynb
|
|
191
|
+
# %% ../nbs/00_core.ipynb 45
|
|
180
192
|
def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]:
|
|
181
193
|
"Submit batch, wait for completion, and download results"
|
|
182
194
|
job = submit_batch(entries, c)
|
|
@@ -184,7 +196,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
|
|
|
184
196
|
if job.status != 'SUCCESS': raise Exception(f"Job failed with status: {job.status}")
|
|
185
197
|
return download_results(job, c)
|
|
186
198
|
|
|
187
|
-
# %% ../nbs/00_core.ipynb
|
|
199
|
+
# %% ../nbs/00_core.ipynb 46
|
|
188
200
|
def ocr_pdf(
|
|
189
201
|
path:str, # Path to PDF file or folder,
|
|
190
202
|
dst:str='md', # Directory to save markdown pages,
|
|
@@ -198,7 +210,7 @@ def ocr_pdf(
|
|
|
198
210
|
results = _run_batch(entries, c, poll_interval)
|
|
199
211
|
return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
|
|
200
212
|
|
|
201
|
-
# %% ../nbs/00_core.ipynb
|
|
213
|
+
# %% ../nbs/00_core.ipynb 53
|
|
202
214
|
def read_pgs(
|
|
203
215
|
path:str, # OCR output directory,
|
|
204
216
|
join:bool=True # Join pages into single string
|
|
@@ -209,7 +221,7 @@ def read_pgs(
|
|
|
209
221
|
contents = L([p.read_text() for p in pgs])
|
|
210
222
|
return '\n\n'.join(contents) if join else contents
|
|
211
223
|
|
|
212
|
-
# %% ../nbs/00_core.ipynb
|
|
224
|
+
# %% ../nbs/00_core.ipynb 60
|
|
213
225
|
def subset_pdf(
|
|
214
226
|
path:str, # Path to PDF file
|
|
215
227
|
start:int=1, # Start page (1-based)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mistocr
|
|
3
|
-
Version: 0.4.1
|
|
3
|
+
Version: 0.4.3
|
|
4
4
|
Summary: Batch OCR for PDFs with heading restoration and visual content integration
|
|
5
5
|
Home-page: https://github.com/franckalbinet/mistocr
|
|
6
6
|
Author: Solveit
|
|
@@ -113,7 +113,25 @@ from mistocr.pipeline import pdf_to_md
|
|
|
113
113
|
await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
|
|
114
114
|
```
|
|
115
115
|
|
|
116
|
-
|
|
116
|
+
mistocr.pipeline - INFO - Step 1/3: Running OCR on files/test/resnet.pdf...
|
|
117
|
+
mistocr.core - INFO - Waiting for batch job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 (initial status: QUEUED)
|
|
118
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: QUEUED (elapsed: 0s)
|
|
119
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
|
|
120
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
|
|
121
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
|
|
122
|
+
mistocr.core - INFO - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 completed with status: SUCCESS
|
|
123
|
+
mistocr.pipeline - INFO - Step 2/3: Fixing heading hierarchy...
|
|
124
|
+
mistocr.pipeline - INFO - Step 3/3: Adding image descriptions...
|
|
125
|
+
|
|
126
|
+
Describing 12 images...
|
|
127
|
+
|
|
128
|
+
mistocr.pipeline - INFO - Done!
|
|
129
|
+
|
|
130
|
+
Saved descriptions to /tmp/tmp62c7_ac1/resnet/img_descriptions.json
|
|
131
|
+
Adding descriptions to 12 pages...
|
|
132
|
+
Done! Enriched pages saved to files/test/md_test
|
|
133
|
+
|
|
134
|
+
This will:
|
|
117
135
|
|
|
118
136
|
1. OCR the PDF using Mistral’s batch API
|
|
119
137
|
2. Fix heading hierarchy inconsistencies
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.4.1"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|