mistocr 0.3.2__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mistocr/__init__.py +1 -1
- mistocr/_modidx.py +2 -1
- mistocr/core.py +45 -30
- mistocr/pipeline.py +12 -5
- mistocr/refine.py +35 -29
- {mistocr-0.3.2.dist-info → mistocr-0.4.1.dist-info}/METADATA +1 -13
- mistocr-0.4.1.dist-info/RECORD +11 -0
- mistocr-0.3.2.dist-info/RECORD +0 -11
- {mistocr-0.3.2.dist-info → mistocr-0.4.1.dist-info}/WHEEL +0 -0
- {mistocr-0.3.2.dist-info → mistocr-0.4.1.dist-info}/entry_points.txt +0 -0
- {mistocr-0.3.2.dist-info → mistocr-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {mistocr-0.3.2.dist-info → mistocr-0.4.1.dist-info}/top_level.txt +0 -0
mistocr/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.3.2"
|
|
1
|
+
__version__ = "0.4.1"
|
mistocr/_modidx.py
CHANGED
|
@@ -5,7 +5,8 @@ d = { 'settings': { 'branch': 'main',
|
|
|
5
5
|
'doc_host': 'https://franckalbinet.github.io',
|
|
6
6
|
'git_url': 'https://github.com/franckalbinet/mistocr',
|
|
7
7
|
'lib_path': 'mistocr'},
|
|
8
|
-
'syms': { 'mistocr.core': { 'mistocr.core.
|
|
8
|
+
'syms': { 'mistocr.core': { 'mistocr.core._check_timeout': ('core.html#_check_timeout', 'mistocr/core.py'),
|
|
9
|
+
'mistocr.core._get_paths': ('core.html#_get_paths', 'mistocr/core.py'),
|
|
9
10
|
'mistocr.core._prep_batch': ('core.html#_prep_batch', 'mistocr/core.py'),
|
|
10
11
|
'mistocr.core._run_batch': ('core.html#_run_batch', 'mistocr/core.py'),
|
|
11
12
|
'mistocr.core.create_batch_entry': ('core.html#create_batch_entry', 'mistocr/core.py'),
|
mistocr/core.py
CHANGED
|
@@ -3,9 +3,9 @@
|
|
|
3
3
|
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb.
|
|
4
4
|
|
|
5
5
|
# %% auto 0
|
|
6
|
-
__all__ = ['ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch',
|
|
7
|
-
'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf',
|
|
8
|
-
'subset_pdf']
|
|
6
|
+
__all__ = ['logger', 'ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch',
|
|
7
|
+
'submit_batch', 'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf',
|
|
8
|
+
'read_pgs', 'subset_pdf']
|
|
9
9
|
|
|
10
10
|
# %% ../nbs/00_core.ipynb 3
|
|
11
11
|
from fastcore.all import *
|
|
@@ -15,8 +15,14 @@ from pathlib import Path
|
|
|
15
15
|
from PIL import Image
|
|
16
16
|
from mistralai import Mistral
|
|
17
17
|
import PyPDF2
|
|
18
|
+
import logging
|
|
18
19
|
|
|
19
|
-
# %% ../nbs/00_core.ipynb
|
|
20
|
+
# %% ../nbs/00_core.ipynb 4
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
logging.basicConfig(level=logging.WARNING, format='%(name)s - %(levelname)s - %(message)s')
|
|
23
|
+
logger.setLevel(logging.DEBUG)
|
|
24
|
+
|
|
25
|
+
# %% ../nbs/00_core.ipynb 7
|
|
20
26
|
def get_api_key(
|
|
21
27
|
key:str=None # Mistral API key
|
|
22
28
|
):
|
|
@@ -25,11 +31,11 @@ def get_api_key(
|
|
|
25
31
|
if not key: raise ValueError("MISTRAL_API_KEY not found")
|
|
26
32
|
return key
|
|
27
33
|
|
|
28
|
-
# %% ../nbs/00_core.ipynb
|
|
34
|
+
# %% ../nbs/00_core.ipynb 8
|
|
29
35
|
ocr_model = "mistral-ocr-latest"
|
|
30
36
|
ocr_endpoint = "/v1/ocr"
|
|
31
37
|
|
|
32
|
-
# %% ../nbs/00_core.ipynb
|
|
38
|
+
# %% ../nbs/00_core.ipynb 11
|
|
33
39
|
def upload_pdf(
|
|
34
40
|
path:str, # Path to PDF file
|
|
35
41
|
key:str=None # Mistral API key
|
|
@@ -40,7 +46,7 @@ def upload_pdf(
|
|
|
40
46
|
uploaded = c.files.upload(file=dict(file_name=path.stem, content=path.read_bytes()), purpose="ocr")
|
|
41
47
|
return c.files.get_signed_url(file_id=uploaded.id).url, c
|
|
42
48
|
|
|
43
|
-
# %% ../nbs/00_core.ipynb
|
|
49
|
+
# %% ../nbs/00_core.ipynb 16
|
|
44
50
|
def create_batch_entry(
|
|
45
51
|
path:str, # Path to PDF file,
|
|
46
52
|
url:str, # Mistral signed URL
|
|
@@ -52,7 +58,7 @@ def create_batch_entry(
|
|
|
52
58
|
if not cid: cid = path.stem
|
|
53
59
|
return dict(custom_id=cid, body=dict(document=dict(type="document_url", document_url=url), include_image_base64=inc_img))
|
|
54
60
|
|
|
55
|
-
# %% ../nbs/00_core.ipynb
|
|
61
|
+
# %% ../nbs/00_core.ipynb 18
|
|
56
62
|
def prep_pdf_batch(
|
|
57
63
|
path:str, # Path to PDF file,
|
|
58
64
|
cid:str=None, # Custom ID (by default using the file name without extention)
|
|
@@ -63,7 +69,7 @@ def prep_pdf_batch(
|
|
|
63
69
|
url, c = upload_pdf(path, key)
|
|
64
70
|
return create_batch_entry(path, url, cid, inc_img), c
|
|
65
71
|
|
|
66
|
-
# %% ../nbs/00_core.ipynb
|
|
72
|
+
# %% ../nbs/00_core.ipynb 22
|
|
67
73
|
def submit_batch(
|
|
68
74
|
entries:list[dict], # List of batch entries,
|
|
69
75
|
c:Mistral=None, # Mistral client,
|
|
@@ -77,26 +83,35 @@ def submit_batch(
|
|
|
77
83
|
batch_data = c.files.upload(file=dict(file_name="batch.jsonl", content=open(f.name, "rb")), purpose="batch")
|
|
78
84
|
return c.batch.jobs.create(input_files=[batch_data.id], model=model, endpoint=endpoint)
|
|
79
85
|
|
|
80
|
-
# %% ../nbs/00_core.ipynb
|
|
86
|
+
# %% ../nbs/00_core.ipynb 25
|
|
87
|
+
def _check_timeout(
|
|
88
|
+
queued_time:int, # Time spent in QUEUED state (seconds)
|
|
89
|
+
timeout:int, # Maximum allowed QUEUED time (seconds)
|
|
90
|
+
job_id:str # Batch job ID
|
|
91
|
+
):
|
|
92
|
+
"Raise TimeoutError if job has been queued longer than timeout"
|
|
93
|
+
if queued_time >= timeout: raise TimeoutError(f"Job {job_id} stayed in QUEUED for {queued_time}s, exceeding timeout of {timeout}s. Check your balance or Mistral Status.")
|
|
94
|
+
|
|
95
|
+
# %% ../nbs/00_core.ipynb 26
|
|
81
96
|
def wait_for_job(
|
|
82
|
-
job:dict, #
|
|
83
|
-
c:Mistral=None, # Mistral client
|
|
84
|
-
poll_interval:int=1, #
|
|
85
|
-
queued_timeout:int=300 #
|
|
86
|
-
) -> dict: #
|
|
97
|
+
job:dict, # Batch job from submit_batch
|
|
98
|
+
c:Mistral=None, # Mistral client
|
|
99
|
+
poll_interval:int=1, # Seconds between status checks
|
|
100
|
+
queued_timeout:int=300 # Max seconds in QUEUED before timeout
|
|
101
|
+
) -> dict: # Completed job dict
|
|
87
102
|
"Poll job until completion and return final job status"
|
|
103
|
+
logger.info(f"Waiting for batch job {job.id} (initial status: {job.status})")
|
|
88
104
|
queued_time = 0
|
|
89
105
|
while job.status in ["QUEUED", "RUNNING"]:
|
|
90
|
-
|
|
91
|
-
if job.status == "QUEUED":
|
|
92
|
-
queued_time += poll_interval
|
|
93
|
-
if queued_time >= queued_timeout:
|
|
94
|
-
raise TimeoutError(f"Job stayed in QUEUED status for {queued_time}s, exceeding timeout of {queued_timeout}s. Check your balance or Mistral Status.")
|
|
106
|
+
logger.debug(f"Job {job.id} status: {job.status} (elapsed: {queued_time}s)")
|
|
107
|
+
if job.status == "QUEUED": queued_time += poll_interval; _check_timeout(queued_time, queued_timeout, job.id)
|
|
95
108
|
time.sleep(poll_interval)
|
|
96
109
|
job = c.batch.jobs.get(job_id=job.id)
|
|
110
|
+
logger.info(f"Job {job.id} completed with status: {job.status}")
|
|
111
|
+
if job.status != "SUCCESS": logger.warning(f"Job {job.id} finished with non-success status: {job.status}")
|
|
97
112
|
return job
|
|
98
113
|
|
|
99
|
-
# %% ../nbs/00_core.ipynb
|
|
114
|
+
# %% ../nbs/00_core.ipynb 28
|
|
100
115
|
def download_results(
|
|
101
116
|
job:dict, # Job dict,
|
|
102
117
|
c:Mistral=None # Mistral client
|
|
@@ -105,7 +120,7 @@ def download_results(
|
|
|
105
120
|
content = c.files.download(file_id=job.output_file).read().decode('utf-8')
|
|
106
121
|
return [json.loads(line) for line in content.strip().split('\n') if line]
|
|
107
122
|
|
|
108
|
-
# %% ../nbs/00_core.ipynb
|
|
123
|
+
# %% ../nbs/00_core.ipynb 33
|
|
109
124
|
def save_images(
|
|
110
125
|
page:dict, # Page dict,
|
|
111
126
|
img_dir:str='img' # Directory to save images
|
|
@@ -116,7 +131,7 @@ def save_images(
|
|
|
116
131
|
img_bytes = base64.b64decode(img['image_base64'].split(',')[1])
|
|
117
132
|
Image.open(BytesIO(img_bytes)).save(img_dir / img['id'])
|
|
118
133
|
|
|
119
|
-
# %% ../nbs/00_core.ipynb
|
|
134
|
+
# %% ../nbs/00_core.ipynb 34
|
|
120
135
|
def save_page(
|
|
121
136
|
page:dict, # Page dict,
|
|
122
137
|
dst:str, # Directory to save page
|
|
@@ -128,7 +143,7 @@ def save_page(
|
|
|
128
143
|
img_dir.mkdir(exist_ok=True)
|
|
129
144
|
save_images(page, img_dir)
|
|
130
145
|
|
|
131
|
-
# %% ../nbs/00_core.ipynb
|
|
146
|
+
# %% ../nbs/00_core.ipynb 36
|
|
132
147
|
def save_pages(
|
|
133
148
|
ocr_resp:dict, # OCR response,
|
|
134
149
|
dst:str, # Directory to save pages,
|
|
@@ -141,7 +156,7 @@ def save_pages(
|
|
|
141
156
|
for page in ocr_resp['pages']: save_page(page, dst, img_dir)
|
|
142
157
|
return dst
|
|
143
158
|
|
|
144
|
-
# %% ../nbs/00_core.ipynb
|
|
159
|
+
# %% ../nbs/00_core.ipynb 42
|
|
145
160
|
def _get_paths(path:str) -> list[Path]:
|
|
146
161
|
"Get list of PDFs from file or folder"
|
|
147
162
|
path = Path(path)
|
|
@@ -152,7 +167,7 @@ def _get_paths(path:str) -> list[Path]:
|
|
|
152
167
|
return pdfs
|
|
153
168
|
raise ValueError(f"Path not found: {path}")
|
|
154
169
|
|
|
155
|
-
# %% ../nbs/00_core.ipynb
|
|
170
|
+
# %% ../nbs/00_core.ipynb 43
|
|
156
171
|
def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[dict], Mistral]:
|
|
157
172
|
"Prepare batch entries for list of PDFs"
|
|
158
173
|
entries, c = [], None
|
|
@@ -161,7 +176,7 @@ def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[
|
|
|
161
176
|
entries.append(entry)
|
|
162
177
|
return entries, c
|
|
163
178
|
|
|
164
|
-
# %% ../nbs/00_core.ipynb
|
|
179
|
+
# %% ../nbs/00_core.ipynb 44
|
|
165
180
|
def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]:
|
|
166
181
|
"Submit batch, wait for completion, and download results"
|
|
167
182
|
job = submit_batch(entries, c)
|
|
@@ -169,7 +184,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
|
|
|
169
184
|
if job.status != 'SUCCESS': raise Exception(f"Job failed with status: {job.status}")
|
|
170
185
|
return download_results(job, c)
|
|
171
186
|
|
|
172
|
-
# %% ../nbs/00_core.ipynb
|
|
187
|
+
# %% ../nbs/00_core.ipynb 45
|
|
173
188
|
def ocr_pdf(
|
|
174
189
|
path:str, # Path to PDF file or folder,
|
|
175
190
|
dst:str='md', # Directory to save markdown pages,
|
|
@@ -183,7 +198,7 @@ def ocr_pdf(
|
|
|
183
198
|
results = _run_batch(entries, c, poll_interval)
|
|
184
199
|
return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
|
|
185
200
|
|
|
186
|
-
# %% ../nbs/00_core.ipynb
|
|
201
|
+
# %% ../nbs/00_core.ipynb 52
|
|
187
202
|
def read_pgs(
|
|
188
203
|
path:str, # OCR output directory,
|
|
189
204
|
join:bool=True # Join pages into single string
|
|
@@ -194,7 +209,7 @@ def read_pgs(
|
|
|
194
209
|
contents = L([p.read_text() for p in pgs])
|
|
195
210
|
return '\n\n'.join(contents) if join else contents
|
|
196
211
|
|
|
197
|
-
# %% ../nbs/00_core.ipynb
|
|
212
|
+
# %% ../nbs/00_core.ipynb 59
|
|
198
213
|
def subset_pdf(
|
|
199
214
|
path:str, # Path to PDF file
|
|
200
215
|
start:int=1, # Start page (1-based)
|
mistocr/pipeline.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
|
|
4
4
|
|
|
5
5
|
# %% auto 0
|
|
6
|
-
__all__ = ['pdf_to_md']
|
|
6
|
+
__all__ = ['logger', 'pdf_to_md']
|
|
7
7
|
|
|
8
8
|
# %% ../nbs/02_pipeline.ipynb 3
|
|
9
9
|
from fastcore.all import *
|
|
@@ -13,8 +13,14 @@ from pathlib import Path
|
|
|
13
13
|
from asyncio import Semaphore, gather, sleep
|
|
14
14
|
import tempfile
|
|
15
15
|
import os, json, shutil
|
|
16
|
+
import logging
|
|
16
17
|
|
|
17
18
|
# %% ../nbs/02_pipeline.ipynb 4
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
logging.basicConfig(level=logging.WARNING, format='%(name)s - %(levelname)s - %(message)s')
|
|
21
|
+
logger.setLevel(logging.INFO)
|
|
22
|
+
|
|
23
|
+
# %% ../nbs/02_pipeline.ipynb 5
|
|
18
24
|
@delegates(add_img_descs)
|
|
19
25
|
async def pdf_to_md(
|
|
20
26
|
pdf_path:str, # Path to input PDF file
|
|
@@ -26,16 +32,17 @@ async def pdf_to_md(
|
|
|
26
32
|
**kwargs
|
|
27
33
|
):
|
|
28
34
|
"Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
|
|
35
|
+
"Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
|
|
29
36
|
cleanup = ocr_dst is None
|
|
30
37
|
if cleanup: ocr_dst = tempfile.mkdtemp()
|
|
31
38
|
n_steps = 3 if add_img_desc else 2
|
|
32
|
-
if progress:
|
|
39
|
+
if progress: logger.info(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
|
|
33
40
|
ocr_dir = ocr_pdf(pdf_path, ocr_dst)[0]
|
|
34
|
-
if progress:
|
|
41
|
+
if progress: logger.info(f"Step 2/{n_steps}: Fixing heading hierarchy...")
|
|
35
42
|
fix_hdgs(ocr_dir, model=model)
|
|
36
43
|
if add_img_desc:
|
|
37
|
-
if progress:
|
|
44
|
+
if progress: logger.info(f"Step 3/{n_steps}: Adding image descriptions...")
|
|
38
45
|
await add_img_descs(ocr_dir, dst=dst, model=model, progress=progress, **kwargs)
|
|
39
46
|
elif dst != str(ocr_dir): shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
|
|
40
47
|
if cleanup: shutil.rmtree(ocr_dst)
|
|
41
|
-
if progress:
|
|
48
|
+
if progress: logger.info("Done!")
|
mistocr/refine.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
|
|
4
4
|
|
|
5
5
|
# %% auto 0
|
|
6
|
-
__all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
|
|
6
|
+
__all__ = ['logger', 'prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
|
|
7
7
|
'HeadingCorrection', 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes',
|
|
8
8
|
'fix_hdgs', 'ImgDescription', 'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs',
|
|
9
9
|
'add_descs_to_pg', 'add_descs_to_pgs', 'add_img_descs']
|
|
@@ -20,8 +20,14 @@ import os
|
|
|
20
20
|
import json
|
|
21
21
|
import shutil
|
|
22
22
|
from asyncio import Semaphore, gather, sleep
|
|
23
|
+
import logging
|
|
23
24
|
|
|
24
|
-
# %% ../nbs/01_refine.ipynb
|
|
25
|
+
# %% ../nbs/01_refine.ipynb 4
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
logging.basicConfig(level=logging.WARNING, format='%(name)s - %(levelname)s - %(message)s')
|
|
28
|
+
logger.setLevel(logging.INFO)
|
|
29
|
+
|
|
30
|
+
# %% ../nbs/01_refine.ipynb 8
|
|
25
31
|
def get_hdgs(
|
|
26
32
|
md:str # Markdown file string
|
|
27
33
|
) -> L: # L of strings
|
|
@@ -32,7 +38,7 @@ def get_hdgs(
|
|
|
32
38
|
|
|
33
39
|
|
|
34
40
|
|
|
35
|
-
# %% ../nbs/01_refine.ipynb
|
|
41
|
+
# %% ../nbs/01_refine.ipynb 9
|
|
36
42
|
def add_pg_hdgs(
|
|
37
43
|
md:str, # Markdown file string,
|
|
38
44
|
n:int # Page number
|
|
@@ -42,7 +48,7 @@ def add_pg_hdgs(
|
|
|
42
48
|
def repl(m): return m.group(0) + f' ... page {n}'
|
|
43
49
|
return sub(r'^#{1,6} .+$', repl, md, flags=MULTILINE)
|
|
44
50
|
|
|
45
|
-
# %% ../nbs/01_refine.ipynb
|
|
51
|
+
# %% ../nbs/01_refine.ipynb 13
|
|
46
52
|
def read_pgs_pg(
|
|
47
53
|
path:str # Path to the markdown file
|
|
48
54
|
) -> L: # List of markdown pages
|
|
@@ -50,7 +56,7 @@ def read_pgs_pg(
|
|
|
50
56
|
pgs = read_pgs(path, join=False)
|
|
51
57
|
return L([add_pg_hdgs(pg, n) for n, pg in enumerate(pgs, 1)]).concat()
|
|
52
58
|
|
|
53
|
-
# %% ../nbs/01_refine.ipynb
|
|
59
|
+
# %% ../nbs/01_refine.ipynb 16
|
|
54
60
|
def fmt_hdgs_idx(
|
|
55
61
|
hdgs: list[str] # List of markdown headings
|
|
56
62
|
) -> str: # Formatted string with index
|
|
@@ -58,18 +64,18 @@ def fmt_hdgs_idx(
|
|
|
58
64
|
return '\n'.join(f"{i}. {h}" for i, h in enumerate(hdgs))
|
|
59
65
|
|
|
60
66
|
|
|
61
|
-
# %% ../nbs/01_refine.ipynb
|
|
67
|
+
# %% ../nbs/01_refine.ipynb 19
|
|
62
68
|
class HeadingCorrection(BaseModel):
|
|
63
69
|
"A single heading correction mapping an index to its corrected markdown heading"
|
|
64
70
|
index: int
|
|
65
71
|
corrected: str
|
|
66
72
|
|
|
67
|
-
# %% ../nbs/01_refine.ipynb
|
|
73
|
+
# %% ../nbs/01_refine.ipynb 20
|
|
68
74
|
class HeadingCorrections(BaseModel):
|
|
69
75
|
"Collection of heading corrections returned by the LLM"
|
|
70
76
|
corrections: list[HeadingCorrection]
|
|
71
77
|
|
|
72
|
-
# %% ../nbs/01_refine.ipynb
|
|
78
|
+
# %% ../nbs/01_refine.ipynb 22
|
|
73
79
|
prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
|
|
74
80
|
|
|
75
81
|
INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
|
|
@@ -109,7 +115,7 @@ Headings to analyze:
|
|
|
109
115
|
{headings_list}
|
|
110
116
|
"""
|
|
111
117
|
|
|
112
|
-
# %% ../nbs/01_refine.ipynb
|
|
118
|
+
# %% ../nbs/01_refine.ipynb 24
|
|
113
119
|
def fix_hdg_hierarchy(
|
|
114
120
|
hdgs: list[str], # List of markdown headings
|
|
115
121
|
prompt: str=None, # Prompt to use
|
|
@@ -125,7 +131,7 @@ def fix_hdg_hierarchy(
|
|
|
125
131
|
return {o['index']: o['corrected'] for o in fixes}
|
|
126
132
|
|
|
127
133
|
|
|
128
|
-
# %% ../nbs/01_refine.ipynb
|
|
134
|
+
# %% ../nbs/01_refine.ipynb 27
|
|
129
135
|
@delegates(fix_hdg_hierarchy)
|
|
130
136
|
def mk_fixes_lut(
|
|
131
137
|
hdgs: list[str], # List of markdown headings
|
|
@@ -138,7 +144,7 @@ def mk_fixes_lut(
|
|
|
138
144
|
fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
|
|
139
145
|
return {hdgs[k]:v for k,v in fixes.items()}
|
|
140
146
|
|
|
141
|
-
# %% ../nbs/01_refine.ipynb
|
|
147
|
+
# %% ../nbs/01_refine.ipynb 30
|
|
142
148
|
def apply_hdg_fixes(
|
|
143
149
|
p:str, # Page to fix
|
|
144
150
|
lut_fixes: dict[str, str], # Lookup table of fixes
|
|
@@ -147,7 +153,7 @@ def apply_hdg_fixes(
|
|
|
147
153
|
for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
|
|
148
154
|
return p
|
|
149
155
|
|
|
150
|
-
# %% ../nbs/01_refine.ipynb
|
|
156
|
+
# %% ../nbs/01_refine.ipynb 33
|
|
151
157
|
@delegates(mk_fixes_lut)
|
|
152
158
|
def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:str='img', **kwargs):
|
|
153
159
|
"Fix heading hierarchy in markdown document"
|
|
@@ -159,13 +165,13 @@ def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:st
|
|
|
159
165
|
lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model, **kwargs)
|
|
160
166
|
for i,p in enumerate(pgs_with_pg, 1): (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))
|
|
161
167
|
|
|
162
|
-
# %% ../nbs/01_refine.ipynb
|
|
168
|
+
# %% ../nbs/01_refine.ipynb 39
|
|
163
169
|
class ImgDescription(BaseModel):
|
|
164
170
|
"Image classification and description for OCR'd documents"
|
|
165
171
|
is_informative:bool # Whether image contains informative content (charts, diagrams, tables) vs decorative (logos, backgrounds)
|
|
166
172
|
description:str # Detailed description of the image content for RAG and accessibility
|
|
167
173
|
|
|
168
|
-
# %% ../nbs/01_refine.ipynb
|
|
174
|
+
# %% ../nbs/01_refine.ipynb 42
|
|
169
175
|
describe_img_prompt = """Analyze this image from an academic/technical document.
|
|
170
176
|
|
|
171
177
|
Step 1: Determine if this image is informative for understanding the document content.
|
|
@@ -178,7 +184,7 @@ Step 2:
|
|
|
178
184
|
|
|
179
185
|
Return your response as JSON with 'is_informative' (boolean) and 'description' (string) fields."""
|
|
180
186
|
|
|
181
|
-
# %% ../nbs/01_refine.ipynb
|
|
187
|
+
# %% ../nbs/01_refine.ipynb 43
|
|
182
188
|
async def describe_img(
|
|
183
189
|
img_path: Path, # Path to the image file
|
|
184
190
|
model: str = 'claude-sonnet-4-5', # Model to use
|
|
@@ -189,7 +195,7 @@ async def describe_img(
|
|
|
189
195
|
r = await chat([img_path.read_bytes(), prompt], response_format=ImgDescription)
|
|
190
196
|
return r
|
|
191
197
|
|
|
192
|
-
# %% ../nbs/01_refine.ipynb
|
|
198
|
+
# %% ../nbs/01_refine.ipynb 47
|
|
193
199
|
async def limit(
|
|
194
200
|
semaphore, # Semaphore for concurrency control
|
|
195
201
|
coro, # Coroutine to execute
|
|
@@ -201,14 +207,14 @@ async def limit(
|
|
|
201
207
|
if delay: await sleep(delay)
|
|
202
208
|
return r
|
|
203
209
|
|
|
204
|
-
# %% ../nbs/01_refine.ipynb
|
|
210
|
+
# %% ../nbs/01_refine.ipynb 49
|
|
205
211
|
def parse_r(
|
|
206
212
|
result # ModelResponse object from API call
|
|
207
213
|
): # Dictionary with 'is_informative' and 'description' keys
|
|
208
214
|
"Extract and parse JSON content from model response"
|
|
209
215
|
return json.loads(result.choices[0].message.content)
|
|
210
216
|
|
|
211
|
-
# %% ../nbs/01_refine.ipynb
|
|
217
|
+
# %% ../nbs/01_refine.ipynb 51
|
|
212
218
|
async def describe_imgs(
|
|
213
219
|
imgs: list[Path], # List of image file paths to describe
|
|
214
220
|
model: str = 'claude-sonnet-4-5', # Model to use for image description
|
|
@@ -221,7 +227,7 @@ async def describe_imgs(
|
|
|
221
227
|
results = await gather(*[limit(sem, describe_img(img, model, prompt), delay) for img in imgs])
|
|
222
228
|
return {img.name: parse_r(r) for img, r in zip(imgs, results)}
|
|
223
229
|
|
|
224
|
-
# %% ../nbs/01_refine.ipynb
|
|
230
|
+
# %% ../nbs/01_refine.ipynb 53
|
|
225
231
|
def save_img_descs(
|
|
226
232
|
descs: dict, # Dictionary of image descriptions
|
|
227
233
|
dst_fname: Path, # Path to save the JSON file
|
|
@@ -229,7 +235,7 @@ def save_img_descs(
|
|
|
229
235
|
"Save image descriptions to JSON file"
|
|
230
236
|
Path(dst_fname).write_text(json.dumps(descs, indent=2))
|
|
231
237
|
|
|
232
|
-
# %% ../nbs/01_refine.ipynb
|
|
238
|
+
# %% ../nbs/01_refine.ipynb 58
|
|
233
239
|
def add_descs_to_pg(
|
|
234
240
|
pg:str, # Page markdown content
|
|
235
241
|
descs:dict # Dictionary mapping image filenames to their descriptions
|
|
@@ -240,7 +246,7 @@ def add_descs_to_pg(
|
|
|
240
246
|
if fname in descs: pg = pg.replace(link, f"{link}\nAI-generated image description:\n___\n{descs[fname]['description']}\n___")
|
|
241
247
|
return pg
|
|
242
248
|
|
|
243
|
-
# %% ../nbs/01_refine.ipynb
|
|
249
|
+
# %% ../nbs/01_refine.ipynb 63
|
|
244
250
|
def add_descs_to_pgs(
|
|
245
251
|
pgs:list, # List of page markdown strings
|
|
246
252
|
descs:dict # Dictionary mapping image filenames to their descriptions
|
|
@@ -248,7 +254,7 @@ def add_descs_to_pgs(
|
|
|
248
254
|
"Add AI-generated descriptions to images in all pages"
|
|
249
255
|
return [add_descs_to_pg(pg, descs) for pg in pgs]
|
|
250
256
|
|
|
251
|
-
# %% ../nbs/01_refine.ipynb
|
|
257
|
+
# %% ../nbs/01_refine.ipynb 66
|
|
252
258
|
async def add_img_descs(
|
|
253
259
|
src:str, # Path to source markdown directory
|
|
254
260
|
dst:str=None, # Destination directory (defaults to src if None)
|
|
@@ -257,7 +263,7 @@ async def add_img_descs(
|
|
|
257
263
|
semaphore:int=2, # Max concurrent API requests
|
|
258
264
|
delay:float=1, # Delay in seconds between API calls
|
|
259
265
|
force:bool=False, # Force regeneration even if cache exists
|
|
260
|
-
progress:bool=True #
|
|
266
|
+
progress:bool=True # Log progress messages
|
|
261
267
|
):
|
|
262
268
|
"Describe all images in markdown document and insert descriptions inline"
|
|
263
269
|
src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
|
|
@@ -266,23 +272,23 @@ async def add_img_descs(
|
|
|
266
272
|
|
|
267
273
|
# Check if image folder exists
|
|
268
274
|
if not src_imgs.exists():
|
|
269
|
-
if progress:
|
|
275
|
+
if progress: logger.info(f"No images to describe in the document (no '{img_folder}' folder found)")
|
|
270
276
|
return
|
|
271
277
|
|
|
272
278
|
if src_imgs.exists() and dst_path != src_path: shutil.copytree(src_imgs, dst_path/img_folder, dirs_exist_ok=True)
|
|
273
279
|
desc_file = src_path/'img_descriptions.json'
|
|
274
280
|
if desc_file.exists() and not force:
|
|
275
|
-
if progress:
|
|
281
|
+
if progress: logger.info(f"Loading existing descriptions from {desc_file}")
|
|
276
282
|
descs = json.loads(desc_file.read_text())
|
|
277
283
|
else:
|
|
278
284
|
imgs = (src_path/img_folder).ls(file_exts=['.jpeg', '.jpg', '.png'])
|
|
279
|
-
if progress:
|
|
285
|
+
if progress: logger.info(f"Describing {len(imgs)} images...")
|
|
280
286
|
descs = await describe_imgs(imgs, model, semaphore=semaphore, delay=delay)
|
|
281
287
|
save_img_descs(descs, desc_file)
|
|
282
|
-
if progress:
|
|
288
|
+
if progress: logger.info(f"Saved descriptions to {desc_file}")
|
|
283
289
|
pgs = read_pgs(src_path, join=False)
|
|
284
|
-
if progress:
|
|
290
|
+
if progress: logger.info(f"Adding descriptions to {len(pgs)} pages...")
|
|
285
291
|
enriched = [add_descs_to_pg(pg, descs) for pg in pgs]
|
|
286
292
|
for i,pg in enumerate(enriched, 1): (dst_path/f'page_{i}.md').write_text(pg)
|
|
287
|
-
if progress:
|
|
293
|
+
if progress: logger.info(f"Done! Enriched pages saved to {dst_path}")
|
|
288
294
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mistocr
|
|
3
|
-
Version: 0.3.2
|
|
3
|
+
Version: 0.4.1
|
|
4
4
|
Summary: Batch OCR for PDFs with heading restoration and visual content integration
|
|
5
5
|
Home-page: https://github.com/franckalbinet/mistocr
|
|
6
6
|
Author: Solveit
|
|
@@ -113,18 +113,6 @@ from mistocr.pipeline import pdf_to_md
|
|
|
113
113
|
await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
|
|
114
114
|
```
|
|
115
115
|
|
|
116
|
-
Step 1/3: Running OCR on files/test/resnet.pdf...
|
|
117
|
-
Mistral batch job status: QUEUED
|
|
118
|
-
Mistral batch job status: RUNNING
|
|
119
|
-
Mistral batch job status: RUNNING
|
|
120
|
-
Step 2/3: Fixing heading hierarchy...
|
|
121
|
-
Step 3/3: Adding image descriptions...
|
|
122
|
-
Describing 7 images...
|
|
123
|
-
Saved descriptions to ocr_temp/resnet/img_descriptions.json
|
|
124
|
-
Adding descriptions to 12 pages...
|
|
125
|
-
Done! Enriched pages saved to files/test/md_test
|
|
126
|
-
Done!
|
|
127
|
-
|
|
128
116
|
This will (as indicated by the output):
|
|
129
117
|
|
|
130
118
|
1. OCR the PDF using Mistral’s batch API
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
mistocr/__init__.py,sha256=pMtTmSUht-XtbR_7Doz6bsQqopJJd8rZ8I8zy2HwwoA,22
|
|
2
|
+
mistocr/_modidx.py,sha256=LyFez7ndKOXQpF0unhg1imCrW7xcUwO106W82ttVnnk,4366
|
|
3
|
+
mistocr/core.py,sha256=-yXqEro_kTE66lXWBrewS73SRTl-Btt9uyKNxMnzjIw,9181
|
|
4
|
+
mistocr/pipeline.py,sha256=n8AHIHGZBXYy_k7LtEBZSiTL6HClT84-7K4QlTjOLvo,2107
|
|
5
|
+
mistocr/refine.py,sha256=Q14DhUUsT5FLMxP9oIJ2TGQ3qbxe7ulXfRMPKpsd4Wo,13232
|
|
6
|
+
mistocr-0.4.1.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
7
|
+
mistocr-0.4.1.dist-info/METADATA,sha256=cvASaYVhDfCJ9bzrosdmTRd5ECIAPAl84H7nN5P06zY,7992
|
|
8
|
+
mistocr-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
+
mistocr-0.4.1.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
10
|
+
mistocr-0.4.1.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
11
|
+
mistocr-0.4.1.dist-info/RECORD,,
|
mistocr-0.3.2.dist-info/RECORD
DELETED
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
mistocr/__init__.py,sha256=vNiWJ14r_cw5t_7UDqDQIVZvladKFGyHH2avsLpN7Vg,22
|
|
2
|
-
mistocr/_modidx.py,sha256=2rHVTcz3A3BfDCmGqUNsJFW3_n3Ch1vxSorrPVyLvMI,4256
|
|
3
|
-
mistocr/core.py,sha256=EQYQgpnX2skgSX123u3dYaJHc1oDk5Nhgt5uBdXnCKs,8386
|
|
4
|
-
mistocr/pipeline.py,sha256=hVXpxRYtshaiUm9qXgfSLlyHCAxHZ6nAfPzoGXGmJMQ,1769
|
|
5
|
-
mistocr/refine.py,sha256=hQg4ZYRTkPFEiCNNBU2ykgXFfVLCNAp2IuwOwHBVQ2k,12983
|
|
6
|
-
mistocr-0.3.2.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
7
|
-
mistocr-0.3.2.dist-info/METADATA,sha256=igTgaDeBu00u_xJYtIcGlQswQCj2gIrdBi6NLiN5NNU,8438
|
|
8
|
-
mistocr-0.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
-
mistocr-0.3.2.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
10
|
-
mistocr-0.3.2.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
11
|
-
mistocr-0.3.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|