mistocr 0.4.2__tar.gz → 0.4.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mistocr-0.4.2/mistocr.egg-info → mistocr-0.4.3}/PKG-INFO +1 -1
- mistocr-0.4.3/mistocr/__init__.py +1 -0
- {mistocr-0.4.2 → mistocr-0.4.3}/mistocr/core.py +16 -16
- {mistocr-0.4.2 → mistocr-0.4.3/mistocr.egg-info}/PKG-INFO +1 -1
- {mistocr-0.4.2 → mistocr-0.4.3}/settings.ini +1 -1
- mistocr-0.4.2/mistocr/__init__.py +0 -1
- {mistocr-0.4.2 → mistocr-0.4.3}/LICENSE +0 -0
- {mistocr-0.4.2 → mistocr-0.4.3}/MANIFEST.in +0 -0
- {mistocr-0.4.2 → mistocr-0.4.3}/README.md +0 -0
- {mistocr-0.4.2 → mistocr-0.4.3}/mistocr/_modidx.py +0 -0
- {mistocr-0.4.2 → mistocr-0.4.3}/mistocr/pipeline.py +0 -0
- {mistocr-0.4.2 → mistocr-0.4.3}/mistocr/refine.py +0 -0
- {mistocr-0.4.2 → mistocr-0.4.3}/mistocr.egg-info/SOURCES.txt +0 -0
- {mistocr-0.4.2 → mistocr-0.4.3}/mistocr.egg-info/dependency_links.txt +0 -0
- {mistocr-0.4.2 → mistocr-0.4.3}/mistocr.egg-info/entry_points.txt +0 -0
- {mistocr-0.4.2 → mistocr-0.4.3}/mistocr.egg-info/not-zip-safe +0 -0
- {mistocr-0.4.2 → mistocr-0.4.3}/mistocr.egg-info/requires.txt +0 -0
- {mistocr-0.4.2 → mistocr-0.4.3}/mistocr.egg-info/top_level.txt +0 -0
- {mistocr-0.4.2 → mistocr-0.4.3}/pyproject.toml +0 -0
- {mistocr-0.4.2 → mistocr-0.4.3}/setup.cfg +0 -0
- {mistocr-0.4.2 → mistocr-0.4.3}/setup.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.4.3"
|
|
@@ -52,8 +52,8 @@ def create_batch_entry(
|
|
|
52
52
|
url:str, # Mistral signed URL
|
|
53
53
|
cid:str=None, # Custom ID (by default using the file name without extension)
|
|
54
54
|
inc_img:bool=True, # Include image in response
|
|
55
|
-
extract_header:bool=
|
|
56
|
-
extract_footer:bool=
|
|
55
|
+
extract_header:bool=True, # Extract headers from document
|
|
56
|
+
extract_footer:bool=True # Extract footers from document
|
|
57
57
|
) -> dict[str, str | dict[str, str | bool]]: # Batch entry dict
|
|
58
58
|
"Create a batch entry dict for OCR"
|
|
59
59
|
path = Path(path)
|
|
@@ -70,7 +70,7 @@ def create_batch_entry(
|
|
|
70
70
|
)
|
|
71
71
|
)
|
|
72
72
|
|
|
73
|
-
# %% ../nbs/00_core.ipynb
|
|
73
|
+
# %% ../nbs/00_core.ipynb 19
|
|
74
74
|
def prep_pdf_batch(
|
|
75
75
|
path:str, # Path to PDF file,
|
|
76
76
|
cid:str=None, # Custom ID (by default using the file name without extention)
|
|
@@ -81,7 +81,7 @@ def prep_pdf_batch(
|
|
|
81
81
|
url, c = upload_pdf(path, key)
|
|
82
82
|
return create_batch_entry(path, url, cid, inc_img), c
|
|
83
83
|
|
|
84
|
-
# %% ../nbs/00_core.ipynb
|
|
84
|
+
# %% ../nbs/00_core.ipynb 23
|
|
85
85
|
def submit_batch(
|
|
86
86
|
entries:list[dict], # List of batch entries,
|
|
87
87
|
c:Mistral=None, # Mistral client,
|
|
@@ -95,7 +95,7 @@ def submit_batch(
|
|
|
95
95
|
batch_data = c.files.upload(file=dict(file_name="batch.jsonl", content=open(f.name, "rb")), purpose="batch")
|
|
96
96
|
return c.batch.jobs.create(input_files=[batch_data.id], model=model, endpoint=endpoint)
|
|
97
97
|
|
|
98
|
-
# %% ../nbs/00_core.ipynb
|
|
98
|
+
# %% ../nbs/00_core.ipynb 26
|
|
99
99
|
def _check_timeout(
|
|
100
100
|
queued_time:int, # Time spent in QUEUED state (seconds)
|
|
101
101
|
timeout:int, # Maximum allowed QUEUED time (seconds)
|
|
@@ -104,7 +104,7 @@ def _check_timeout(
|
|
|
104
104
|
"Raise TimeoutError if job has been queued longer than timeout"
|
|
105
105
|
if queued_time >= timeout: raise TimeoutError(f"Job {job_id} stayed in QUEUED for {queued_time}s, exceeding timeout of {timeout}s. Check your balance or Mistral Status.")
|
|
106
106
|
|
|
107
|
-
# %% ../nbs/00_core.ipynb
|
|
107
|
+
# %% ../nbs/00_core.ipynb 27
|
|
108
108
|
def wait_for_job(
|
|
109
109
|
job:dict, # Batch job from submit_batch
|
|
110
110
|
c:Mistral=None, # Mistral client
|
|
@@ -123,7 +123,7 @@ def wait_for_job(
|
|
|
123
123
|
if job.status != "SUCCESS": logger.warning(f"Job {job.id} finished with non-success status: {job.status}")
|
|
124
124
|
return job
|
|
125
125
|
|
|
126
|
-
# %% ../nbs/00_core.ipynb
|
|
126
|
+
# %% ../nbs/00_core.ipynb 29
|
|
127
127
|
def download_results(
|
|
128
128
|
job:dict, # Job dict,
|
|
129
129
|
c:Mistral=None # Mistral client
|
|
@@ -132,7 +132,7 @@ def download_results(
|
|
|
132
132
|
content = c.files.download(file_id=job.output_file).read().decode('utf-8')
|
|
133
133
|
return [json.loads(line) for line in content.strip().split('\n') if line]
|
|
134
134
|
|
|
135
|
-
# %% ../nbs/00_core.ipynb
|
|
135
|
+
# %% ../nbs/00_core.ipynb 34
|
|
136
136
|
def save_images(
|
|
137
137
|
page:dict, # Page dict,
|
|
138
138
|
img_dir:str='img' # Directory to save images
|
|
@@ -143,7 +143,7 @@ def save_images(
|
|
|
143
143
|
img_bytes = base64.b64decode(img['image_base64'].split(',')[1])
|
|
144
144
|
Image.open(BytesIO(img_bytes)).save(img_dir / img['id'])
|
|
145
145
|
|
|
146
|
-
# %% ../nbs/00_core.ipynb
|
|
146
|
+
# %% ../nbs/00_core.ipynb 35
|
|
147
147
|
def save_page(
|
|
148
148
|
page:dict, # Page dict,
|
|
149
149
|
dst:str, # Directory to save page
|
|
@@ -155,7 +155,7 @@ def save_page(
|
|
|
155
155
|
img_dir.mkdir(exist_ok=True)
|
|
156
156
|
save_images(page, img_dir)
|
|
157
157
|
|
|
158
|
-
# %% ../nbs/00_core.ipynb
|
|
158
|
+
# %% ../nbs/00_core.ipynb 37
|
|
159
159
|
def save_pages(
|
|
160
160
|
ocr_resp:dict, # OCR response,
|
|
161
161
|
dst:str, # Directory to save pages,
|
|
@@ -168,7 +168,7 @@ def save_pages(
|
|
|
168
168
|
for page in ocr_resp['pages']: save_page(page, dst, img_dir)
|
|
169
169
|
return dst
|
|
170
170
|
|
|
171
|
-
# %% ../nbs/00_core.ipynb
|
|
171
|
+
# %% ../nbs/00_core.ipynb 43
|
|
172
172
|
def _get_paths(path:str) -> list[Path]:
|
|
173
173
|
"Get list of PDFs from file or folder"
|
|
174
174
|
path = Path(path)
|
|
@@ -179,7 +179,7 @@ def _get_paths(path:str) -> list[Path]:
|
|
|
179
179
|
return pdfs
|
|
180
180
|
raise ValueError(f"Path not found: {path}")
|
|
181
181
|
|
|
182
|
-
# %% ../nbs/00_core.ipynb
|
|
182
|
+
# %% ../nbs/00_core.ipynb 44
|
|
183
183
|
def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[dict], Mistral]:
|
|
184
184
|
"Prepare batch entries for list of PDFs"
|
|
185
185
|
entries, c = [], None
|
|
@@ -188,7 +188,7 @@ def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[
|
|
|
188
188
|
entries.append(entry)
|
|
189
189
|
return entries, c
|
|
190
190
|
|
|
191
|
-
# %% ../nbs/00_core.ipynb
|
|
191
|
+
# %% ../nbs/00_core.ipynb 45
|
|
192
192
|
def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]:
|
|
193
193
|
"Submit batch, wait for completion, and download results"
|
|
194
194
|
job = submit_batch(entries, c)
|
|
@@ -196,7 +196,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
|
|
|
196
196
|
if job.status != 'SUCCESS': raise Exception(f"Job failed with status: {job.status}")
|
|
197
197
|
return download_results(job, c)
|
|
198
198
|
|
|
199
|
-
# %% ../nbs/00_core.ipynb
|
|
199
|
+
# %% ../nbs/00_core.ipynb 46
|
|
200
200
|
def ocr_pdf(
|
|
201
201
|
path:str, # Path to PDF file or folder,
|
|
202
202
|
dst:str='md', # Directory to save markdown pages,
|
|
@@ -210,7 +210,7 @@ def ocr_pdf(
|
|
|
210
210
|
results = _run_batch(entries, c, poll_interval)
|
|
211
211
|
return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
|
|
212
212
|
|
|
213
|
-
# %% ../nbs/00_core.ipynb
|
|
213
|
+
# %% ../nbs/00_core.ipynb 53
|
|
214
214
|
def read_pgs(
|
|
215
215
|
path:str, # OCR output directory,
|
|
216
216
|
join:bool=True # Join pages into single string
|
|
@@ -221,7 +221,7 @@ def read_pgs(
|
|
|
221
221
|
contents = L([p.read_text() for p in pgs])
|
|
222
222
|
return '\n\n'.join(contents) if join else contents
|
|
223
223
|
|
|
224
|
-
# %% ../nbs/00_core.ipynb
|
|
224
|
+
# %% ../nbs/00_core.ipynb 60
|
|
225
225
|
def subset_pdf(
|
|
226
226
|
path:str, # Path to PDF file
|
|
227
227
|
start:int=1, # Start page (1-based)
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.4.2"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|