mistocr 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mistocr/__init__.py +1 -1
- mistocr/_modidx.py +1 -0
- mistocr/core.py +31 -21
- {mistocr-0.0.3.dist-info → mistocr-0.0.4.dist-info}/METADATA +42 -38
- mistocr-0.0.4.dist-info/RECORD +9 -0
- mistocr-0.0.3.dist-info/RECORD +0 -9
- {mistocr-0.0.3.dist-info → mistocr-0.0.4.dist-info}/WHEEL +0 -0
- {mistocr-0.0.3.dist-info → mistocr-0.0.4.dist-info}/entry_points.txt +0 -0
- {mistocr-0.0.3.dist-info → mistocr-0.0.4.dist-info}/licenses/LICENSE +0 -0
- {mistocr-0.0.3.dist-info → mistocr-0.0.4.dist-info}/top_level.txt +0 -0
mistocr/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.0.3"
|
|
1
|
+
__version__ = "0.0.4"
|
mistocr/_modidx.py
CHANGED
|
@@ -13,6 +13,7 @@ d = { 'settings': { 'branch': 'main',
|
|
|
13
13
|
'mistocr.core.get_api_key': ('core.html#get_api_key', 'mistocr/core.py'),
|
|
14
14
|
'mistocr.core.ocr': ('core.html#ocr', 'mistocr/core.py'),
|
|
15
15
|
'mistocr.core.prep_pdf_batch': ('core.html#prep_pdf_batch', 'mistocr/core.py'),
|
|
16
|
+
'mistocr.core.read_pgs': ('core.html#read_pgs', 'mistocr/core.py'),
|
|
16
17
|
'mistocr.core.save_images': ('core.html#save_images', 'mistocr/core.py'),
|
|
17
18
|
'mistocr.core.save_page': ('core.html#save_page', 'mistocr/core.py'),
|
|
18
19
|
'mistocr.core.save_pages': ('core.html#save_pages', 'mistocr/core.py'),
|
mistocr/core.py
CHANGED
|
@@ -4,21 +4,17 @@
|
|
|
4
4
|
|
|
5
5
|
# %% auto 0
|
|
6
6
|
__all__ = ['ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch', 'submit_batch',
|
|
7
|
-
'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr']
|
|
7
|
+
'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr', 'read_pgs']
|
|
8
8
|
|
|
9
9
|
# %% ../nbs/00_core.ipynb 3
|
|
10
10
|
from fastcore.all import *
|
|
11
|
-
|
|
12
|
-
import os, json, time, base64, tempfile
|
|
11
|
+
import os, re, json, time, base64, tempfile, logging
|
|
13
12
|
from io import BytesIO
|
|
14
13
|
from pathlib import Path
|
|
15
14
|
from PIL import Image
|
|
16
15
|
from mistralai import Mistral
|
|
17
16
|
|
|
18
17
|
# %% ../nbs/00_core.ipynb 6
|
|
19
|
-
load_dotenv()
|
|
20
|
-
|
|
21
|
-
# %% ../nbs/00_core.ipynb 7
|
|
22
18
|
def get_api_key(
|
|
23
19
|
key:str=None # Mistral API key
|
|
24
20
|
):
|
|
@@ -27,11 +23,11 @@ def get_api_key(
|
|
|
27
23
|
if not key: raise ValueError("MISTRAL_API_KEY not found")
|
|
28
24
|
return key
|
|
29
25
|
|
|
30
|
-
# %% ../nbs/00_core.ipynb
|
|
26
|
+
# %% ../nbs/00_core.ipynb 7
|
|
31
27
|
ocr_model = "mistral-ocr-latest"
|
|
32
28
|
ocr_endpoint = "/v1/ocr"
|
|
33
29
|
|
|
34
|
-
# %% ../nbs/00_core.ipynb
|
|
30
|
+
# %% ../nbs/00_core.ipynb 10
|
|
35
31
|
def upload_pdf(
|
|
36
32
|
path:str, # Path to PDF file
|
|
37
33
|
key:str=None # Mistral API key
|
|
@@ -42,11 +38,11 @@ def upload_pdf(
|
|
|
42
38
|
uploaded = c.files.upload(file=dict(file_name=path.stem, content=path.read_bytes()), purpose="ocr")
|
|
43
39
|
return c.files.get_signed_url(file_id=uploaded.id).url, c
|
|
44
40
|
|
|
45
|
-
# %% ../nbs/00_core.ipynb
|
|
41
|
+
# %% ../nbs/00_core.ipynb 15
|
|
46
42
|
def create_batch_entry(
|
|
47
43
|
path:str, # Path to PDF file,
|
|
48
44
|
url:str, # Mistral signed URL
|
|
49
|
-
cid:str=None, # Custom ID (by default using the file name without
|
|
45
|
+
cid:str=None, # Custom ID (by default using the file name without extension)
|
|
50
46
|
inc_img:bool=True # Include image in response
|
|
51
47
|
) -> dict[str, str | dict[str, str | bool]]: # Batch entry dict
|
|
52
48
|
"Create a batch entry dict for OCR"
|
|
@@ -54,7 +50,7 @@ def create_batch_entry(
|
|
|
54
50
|
if not cid: cid = path.stem
|
|
55
51
|
return dict(custom_id=cid, body=dict(document=dict(type="document_url", document_url=url), include_image_base64=inc_img))
|
|
56
52
|
|
|
57
|
-
# %% ../nbs/00_core.ipynb
|
|
53
|
+
# %% ../nbs/00_core.ipynb 17
|
|
58
54
|
def prep_pdf_batch(
|
|
59
55
|
path:str, # Path to PDF file,
|
|
60
56
|
cid:str=None, # Custom ID (by default using the file name without extension)
|
|
@@ -65,7 +61,7 @@ def prep_pdf_batch(
|
|
|
65
61
|
url, c = upload_pdf(path, key)
|
|
66
62
|
return create_batch_entry(path, url, cid, inc_img), c
|
|
67
63
|
|
|
68
|
-
# %% ../nbs/00_core.ipynb
|
|
64
|
+
# %% ../nbs/00_core.ipynb 21
|
|
69
65
|
def submit_batch(
|
|
70
66
|
entries:list[dict], # List of batch entries,
|
|
71
67
|
c:Mistral=None, # Mistral client,
|
|
@@ -79,7 +75,7 @@ def submit_batch(
|
|
|
79
75
|
batch_data = c.files.upload(file=dict(file_name="batch.jsonl", content=open(f.name, "rb")), purpose="batch")
|
|
80
76
|
return c.batch.jobs.create(input_files=[batch_data.id], model=model, endpoint=endpoint)
|
|
81
77
|
|
|
82
|
-
# %% ../nbs/00_core.ipynb
|
|
78
|
+
# %% ../nbs/00_core.ipynb 24
|
|
83
79
|
def wait_for_job(
|
|
84
80
|
job:dict, # Job dict,
|
|
85
81
|
c:Mistral=None, # Mistral client,
|
|
@@ -91,7 +87,7 @@ def wait_for_job(
|
|
|
91
87
|
job = c.batch.jobs.get(job_id=job.id)
|
|
92
88
|
return job
|
|
93
89
|
|
|
94
|
-
# %% ../nbs/00_core.ipynb
|
|
90
|
+
# %% ../nbs/00_core.ipynb 26
|
|
95
91
|
def download_results(
|
|
96
92
|
job:dict, # Job dict,
|
|
97
93
|
c:Mistral=None # Mistral client
|
|
@@ -100,7 +96,7 @@ def download_results(
|
|
|
100
96
|
content = c.files.download(file_id=job.output_file).read().decode('utf-8')
|
|
101
97
|
return [json.loads(line) for line in content.strip().split('\n') if line]
|
|
102
98
|
|
|
103
|
-
# %% ../nbs/00_core.ipynb
|
|
99
|
+
# %% ../nbs/00_core.ipynb 31
|
|
104
100
|
def save_images(
|
|
105
101
|
page:dict, # Page dict,
|
|
106
102
|
img_dir:str='img' # Directory to save images
|
|
@@ -111,7 +107,7 @@ def save_images(
|
|
|
111
107
|
img_bytes = base64.b64decode(img['image_base64'].split(',')[1])
|
|
112
108
|
Image.open(BytesIO(img_bytes)).save(img_dir / img['id'])
|
|
113
109
|
|
|
114
|
-
# %% ../nbs/00_core.ipynb
|
|
110
|
+
# %% ../nbs/00_core.ipynb 32
|
|
115
111
|
def save_page(
|
|
116
112
|
page:dict, # Page dict,
|
|
117
113
|
out_dir:str, # Directory to save page
|
|
@@ -123,7 +119,7 @@ def save_page(
|
|
|
123
119
|
img_dir.mkdir(exist_ok=True)
|
|
124
120
|
save_images(page, img_dir)
|
|
125
121
|
|
|
126
|
-
# %% ../nbs/00_core.ipynb
|
|
122
|
+
# %% ../nbs/00_core.ipynb 34
|
|
127
123
|
def save_pages(
|
|
128
124
|
ocr_resp:dict, # OCR response,
|
|
129
125
|
out_dir:str, # Directory to save pages,
|
|
@@ -136,7 +132,7 @@ def save_pages(
|
|
|
136
132
|
for page in ocr_resp['pages']: save_page(page, out_dir, img_dir)
|
|
137
133
|
return out_dir
|
|
138
134
|
|
|
139
|
-
# %% ../nbs/00_core.ipynb
|
|
135
|
+
# %% ../nbs/00_core.ipynb 40
|
|
140
136
|
def _get_paths(path:str) -> list[Path]:
|
|
141
137
|
"Get list of PDFs from file or folder"
|
|
142
138
|
path = Path(path)
|
|
@@ -147,7 +143,7 @@ def _get_paths(path:str) -> list[Path]:
|
|
|
147
143
|
return pdfs
|
|
148
144
|
raise ValueError(f"Path not found: {path}")
|
|
149
145
|
|
|
150
|
-
# %% ../nbs/00_core.ipynb
|
|
146
|
+
# %% ../nbs/00_core.ipynb 41
|
|
151
147
|
def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[dict], Mistral]:
|
|
152
148
|
"Prepare batch entries for list of PDFs"
|
|
153
149
|
entries, c = [], None
|
|
@@ -156,7 +152,7 @@ def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[
|
|
|
156
152
|
entries.append(entry)
|
|
157
153
|
return entries, c
|
|
158
154
|
|
|
159
|
-
# %% ../nbs/00_core.ipynb
|
|
155
|
+
# %% ../nbs/00_core.ipynb 42
|
|
160
156
|
def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]:
|
|
161
157
|
"Submit batch, wait for completion, and download results"
|
|
162
158
|
job = submit_batch(entries, c)
|
|
@@ -164,7 +160,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
|
|
|
164
160
|
if job.status != 'SUCCESS': raise Exception(f"Job failed with status: {job.status}")
|
|
165
161
|
return download_results(job, c)
|
|
166
162
|
|
|
167
|
-
# %% ../nbs/00_core.ipynb
|
|
163
|
+
# %% ../nbs/00_core.ipynb 43
|
|
168
164
|
def ocr(
|
|
169
165
|
path:str, # Path to PDF file or folder,
|
|
170
166
|
out_dir:str='md', # Directory to save markdown pages,
|
|
@@ -177,3 +173,17 @@ def ocr(
|
|
|
177
173
|
entries, c = _prep_batch(pdfs, inc_img, key)
|
|
178
174
|
results = _run_batch(entries, c, poll_interval)
|
|
179
175
|
return L([save_pages(r['response']['body'], out_dir, r['custom_id']) for r in results])
|
|
176
|
+
|
|
177
|
+
# %% ../nbs/00_core.ipynb 48
|
|
178
|
+
def read_pgs(
    path:str, # OCR output directory
    pg:int=None, # Page number (all pages are read when None)
) -> str: # Markdown text of the requested page, or of all pages joined by blank lines
    """Read a specific page or all pages from an OCR output directory.

    Pages are expected to be stored as `page_<n>.md` files directly inside `path`.
    Raises `ValueError` when `pg` is given but `page_{pg}.md` does not exist.
    """
    path = Path(path)
    # Compare against None (not truthiness) so that `pg=0` is handled as an
    # explicit page request instead of silently returning every page.
    if pg is not None:
        pg_path = path / f'page_{pg}.md'
        if not pg_path.exists(): raise ValueError(f"Page {pg} not found")
        return pg_path.read_text()
    # Sort on the numeric suffix so that page_10 comes after page_9, not after page_1.
    pgs = sorted(path.glob('page_*.md'), key=lambda p: int(p.stem.split('_')[1]))
    return '\n\n'.join(p.read_text() for p in pgs)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mistocr
|
|
3
|
-
Version: 0.0.3
|
|
3
|
+
Version: 0.0.4
|
|
4
4
|
Summary: Simple batch OCR for PDFs using Mistral's state-of-the-art vision model
|
|
5
5
|
Home-page: https://github.com/franckalbinet/mistocr
|
|
6
6
|
Author: Solveit
|
|
@@ -54,10 +54,11 @@ for large document sets.
|
|
|
54
54
|
**Cost savings**: Batch OCR mode reduces costs from \$1/1000 pages to
|
|
55
55
|
\$0.50/1000 pages - a 50% reduction compared to synchronous processing.
|
|
56
56
|
|
|
57
|
-
**Simplicity**: A single
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
57
|
+
**Simplicity**: A single
|
|
58
|
+
[`ocr()`](https://franckalbinet.github.io/mistocr/core.html#ocr)
|
|
59
|
+
function handles everything - uploading, batch submission, polling for
|
|
60
|
+
completion, and saving results as markdown with extracted images.
|
|
61
|
+
Process one PDF or an entire folder with the same simple interface.
|
|
61
62
|
|
|
62
63
|
**Organized output**: Each PDF is automatically saved to its own folder
|
|
63
64
|
with pages as separate markdown files and images in an `img` subfolder,
|
|
@@ -80,57 +81,60 @@ $ pip install mistocr
|
|
|
80
81
|
|
|
81
82
|
## How to use
|
|
82
83
|
|
|
84
|
+
### Basic usage
|
|
85
|
+
|
|
86
|
+
Process a single PDF:
|
|
87
|
+
|
|
83
88
|
``` python
|
|
84
89
|
from mistocr.core import ocr
|
|
85
|
-
```
|
|
86
|
-
|
|
87
|
-
- **Process a single PDF:**
|
|
88
90
|
|
|
89
|
-
|
|
91
|
+
fname = 'files/test/attention-is-all-you-need.pdf'
|
|
92
|
+
result = ocr(fname)
|
|
93
|
+
```
|
|
90
94
|
|
|
91
|
-
|
|
92
|
-
result = ocr(fname)
|
|
95
|
+
Or process an entire folder:
|
|
93
96
|
|
|
94
97
|
``` python
|
|
98
|
+
results = ocr('files/test')
|
|
95
99
|
```
|
|
96
100
|
|
|
97
|
-
|
|
98
|
-
img/ page_11.md page_14.md page_3.md page_6.md page_9.md
|
|
99
|
-
page_1.md page_12.md page_15.md page_4.md page_7.md
|
|
100
|
-
page_10.md page_13.md page_2.md page_5.md page_8.md
|
|
101
|
+
### Output structure
|
|
101
102
|
|
|
102
|
-
|
|
103
|
-
|
|
103
|
+
Each PDF is saved to its own folder with pages as separate markdown
|
|
104
|
+
files and images in an `img` subfolder:
|
|
104
105
|
|
|
105
|
-
|
|
106
|
+
files/test/md/
|
|
107
|
+
├── attention-is-all-you-need/
|
|
108
|
+
│ ├── img/
|
|
109
|
+
│ │ ├── img-0.jpeg
|
|
110
|
+
│ │ ├── img-1.jpeg
|
|
111
|
+
│ │ └── ...
|
|
112
|
+
│ ├── page_1.md
|
|
113
|
+
│ ├── page_2.md
|
|
114
|
+
│ └── ...
|
|
115
|
+
└── resnet/
|
|
116
|
+
├── img/
|
|
117
|
+
└── ...
|
|
106
118
|
|
|
107
|
-
|
|
108
|
-
results = ocr('files/test')
|
|
109
|
-
```
|
|
119
|
+
### Reading results
|
|
110
120
|
|
|
111
|
-
|
|
112
|
-
```
|
|
121
|
+
Read all pages from a processed PDF:
|
|
113
122
|
|
|
114
|
-
|
|
115
|
-
|
|
123
|
+
``` python
|
|
124
|
+
from mistocr.core import read_pgs
|
|
116
125
|
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
page_1.md page_12.md page_15.md page_4.md page_7.md
|
|
120
|
-
page_10.md page_13.md page_2.md page_5.md page_8.md
|
|
126
|
+
text = read_pgs('files/test/md/attention-is-all-you-need')
|
|
127
|
+
```
|
|
121
128
|
|
|
122
|
-
|
|
123
|
-
img-0.jpeg img-1.jpeg img-2.jpeg img-3.jpeg img-4.jpeg
|
|
129
|
+
Or read a specific page:
|
|
124
130
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
131
|
+
``` python
|
|
132
|
+
text = read_pgs('files/test/md/attention-is-all-you-need', 10)
|
|
133
|
+
```
|
|
128
134
|
|
|
129
|
-
|
|
130
|
-
img-0.jpeg img-2.jpeg img-4.jpeg img-6.jpeg
|
|
131
|
-
img-1.jpeg img-3.jpeg img-5.jpeg
|
|
135
|
+
### Customization
|
|
132
136
|
|
|
133
|
-
|
|
137
|
+
Customize output directory, image inclusion, and polling interval:
|
|
134
138
|
|
|
135
139
|
``` python
|
|
136
140
|
results = ocr('files/test', out_dir='output', inc_img=False, poll_interval=5)
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
mistocr/__init__.py,sha256=1mptEzQihbdyqqzMgdns_j5ZGK9gz7hR2bsgA_TnjO4,22
|
|
2
|
+
mistocr/_modidx.py,sha256=zA12OvdPdNkQ7K_oQx8rzto1mWnpQa3kyz8N-az6kMw,1843
|
|
3
|
+
mistocr/core.py,sha256=qMV6ZFqs3PNHNUL6o6612WkWzOQiiA1jIKreAaYwORg,7239
|
|
4
|
+
mistocr-0.0.4.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
5
|
+
mistocr-0.0.4.dist-info/METADATA,sha256=01uXdXnZhKv334UNN1ZNlWCxNeozrptZpvAN9MFYIF4,4825
|
|
6
|
+
mistocr-0.0.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
7
|
+
mistocr-0.0.4.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
8
|
+
mistocr-0.0.4.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
9
|
+
mistocr-0.0.4.dist-info/RECORD,,
|
mistocr-0.0.3.dist-info/RECORD
DELETED
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
mistocr/__init__.py,sha256=4GZKi13lDTD25YBkGakhZyEQZWTER_OWQMNPoH_UM2c,22
|
|
2
|
-
mistocr/_modidx.py,sha256=gViY05_Y4LdQXC5l2yEPG3MX-9M93xf4FJEGh3ns2Fo,1745
|
|
3
|
-
mistocr/core.py,sha256=Ur5R8NLvHxduvSVuWNkWwt8xgkrxpnL9cmJjQ5h9thM,6778
|
|
4
|
-
mistocr-0.0.3.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
5
|
-
mistocr-0.0.3.dist-info/METADATA,sha256=aWl_wHxvy5Qrsze7JtTWMQ6FD-l-1QEM-7GZfTeem88,5076
|
|
6
|
-
mistocr-0.0.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
7
|
-
mistocr-0.0.3.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
8
|
-
mistocr-0.0.3.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
9
|
-
mistocr-0.0.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|