mistocr 0.2.7__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mistocr-0.2.7/mistocr.egg-info → mistocr-0.4.1}/PKG-INFO +2 -13
- {mistocr-0.2.7 → mistocr-0.4.1}/README.md +0 -12
- mistocr-0.4.1/mistocr/__init__.py +1 -0
- {mistocr-0.2.7 → mistocr-0.4.1}/mistocr/_modidx.py +3 -1
- {mistocr-0.2.7 → mistocr-0.4.1}/mistocr/core.py +67 -23
- mistocr-0.4.1/mistocr/pipeline.py +48 -0
- {mistocr-0.2.7 → mistocr-0.4.1}/mistocr/refine.py +84 -28
- {mistocr-0.2.7 → mistocr-0.4.1/mistocr.egg-info}/PKG-INFO +2 -13
- {mistocr-0.2.7 → mistocr-0.4.1}/mistocr.egg-info/requires.txt +1 -0
- {mistocr-0.2.7 → mistocr-0.4.1}/settings.ini +2 -2
- mistocr-0.2.7/mistocr/__init__.py +0 -1
- mistocr-0.2.7/mistocr/pipeline.py +0 -37
- {mistocr-0.2.7 → mistocr-0.4.1}/LICENSE +0 -0
- {mistocr-0.2.7 → mistocr-0.4.1}/MANIFEST.in +0 -0
- {mistocr-0.2.7 → mistocr-0.4.1}/mistocr.egg-info/SOURCES.txt +0 -0
- {mistocr-0.2.7 → mistocr-0.4.1}/mistocr.egg-info/dependency_links.txt +0 -0
- {mistocr-0.2.7 → mistocr-0.4.1}/mistocr.egg-info/entry_points.txt +0 -0
- {mistocr-0.2.7 → mistocr-0.4.1}/mistocr.egg-info/not-zip-safe +0 -0
- {mistocr-0.2.7 → mistocr-0.4.1}/mistocr.egg-info/top_level.txt +0 -0
- {mistocr-0.2.7 → mistocr-0.4.1}/pyproject.toml +0 -0
- {mistocr-0.2.7 → mistocr-0.4.1}/setup.cfg +0 -0
- {mistocr-0.2.7 → mistocr-0.4.1}/setup.py +0 -0
{mistocr-0.2.7/mistocr.egg-info → mistocr-0.4.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mistocr
-Version: 0.2.7
+Version: 0.4.1
 Summary: Batch OCR for PDFs with heading restoration and visual content integration
 Home-page: https://github.com/franckalbinet/mistocr
 Author: Solveit

@@ -23,6 +23,7 @@ Requires-Dist: mistralai
 Requires-Dist: pillow
 Requires-Dist: dotenv
 Requires-Dist: lisette
+Requires-Dist: PyPDF2
 Provides-Extra: dev
 Dynamic: author
 Dynamic: author-email

@@ -112,18 +113,6 @@ from mistocr.pipeline import pdf_to_md
 await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
 ```
 
-Step 1/3: Running OCR on files/test/resnet.pdf...
-Mistral batch job status: QUEUED
-Mistral batch job status: RUNNING
-Mistral batch job status: RUNNING
-Step 2/3: Fixing heading hierarchy...
-Step 3/3: Adding image descriptions...
-Describing 7 images...
-Saved descriptions to ocr_temp/resnet/img_descriptions.json
-Adding descriptions to 12 pages...
-Done! Enriched pages saved to files/test/md_test
-Done!
-
 This will (as indicated by the output):
 
 1. OCR the PDF using Mistral’s batch API

{mistocr-0.2.7 → mistocr-0.4.1}/README.md

@@ -72,18 +72,6 @@ from mistocr.pipeline import pdf_to_md
 await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
 ```
 
-Step 1/3: Running OCR on files/test/resnet.pdf...
-Mistral batch job status: QUEUED
-Mistral batch job status: RUNNING
-Mistral batch job status: RUNNING
-Step 2/3: Fixing heading hierarchy...
-Step 3/3: Adding image descriptions...
-Describing 7 images...
-Saved descriptions to ocr_temp/resnet/img_descriptions.json
-Adding descriptions to 12 pages...
-Done! Enriched pages saved to files/test/md_test
-Done!
-
 This will (as indicated by the output):
 
 1. OCR the PDF using Mistral’s batch API

mistocr-0.4.1/mistocr/__init__.py (new file)

@@ -0,0 +1 @@
+__version__ = "0.4.1"

{mistocr-0.2.7 → mistocr-0.4.1}/mistocr/_modidx.py

@@ -5,7 +5,8 @@ d = { 'settings': { 'branch': 'main',
             'doc_host': 'https://franckalbinet.github.io',
             'git_url': 'https://github.com/franckalbinet/mistocr',
             'lib_path': 'mistocr'},
-  'syms': { 'mistocr.core': { 'mistocr.core.
+  'syms': { 'mistocr.core': { 'mistocr.core._check_timeout': ('core.html#_check_timeout', 'mistocr/core.py'),
+                              'mistocr.core._get_paths': ('core.html#_get_paths', 'mistocr/core.py'),
                               'mistocr.core._prep_batch': ('core.html#_prep_batch', 'mistocr/core.py'),
                               'mistocr.core._run_batch': ('core.html#_run_batch', 'mistocr/core.py'),
                               'mistocr.core.create_batch_entry': ('core.html#create_batch_entry', 'mistocr/core.py'),

@@ -18,6 +19,7 @@ d = { 'settings': { 'branch': 'main',
                               'mistocr.core.save_page': ('core.html#save_page', 'mistocr/core.py'),
                               'mistocr.core.save_pages': ('core.html#save_pages', 'mistocr/core.py'),
                               'mistocr.core.submit_batch': ('core.html#submit_batch', 'mistocr/core.py'),
+                              'mistocr.core.subset_pdf': ('core.html#subset_pdf', 'mistocr/core.py'),
                               'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
                               'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
             'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},

{mistocr-0.2.7 → mistocr-0.4.1}/mistocr/core.py

@@ -3,8 +3,9 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb.
 
 # %% auto 0
-__all__ = ['ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch',
-           'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf',
+__all__ = ['logger', 'ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch',
+           'submit_batch', 'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf',
+           'read_pgs', 'subset_pdf']
 
 # %% ../nbs/00_core.ipynb 3
 from fastcore.all import *

@@ -13,8 +14,15 @@ from io import BytesIO
 from pathlib import Path
 from PIL import Image
 from mistralai import Mistral
+import PyPDF2
+import logging
 
-# %% ../nbs/00_core.ipynb
+# %% ../nbs/00_core.ipynb 4
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.WARNING, format='%(name)s - %(levelname)s - %(message)s')
+logger.setLevel(logging.DEBUG)
+
+# %% ../nbs/00_core.ipynb 7
 def get_api_key(
     key:str=None # Mistral API key
 ):

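core.py now routes its progress output through the standard logging module and sets its own logger to DEBUG at import time. A minimal sketch (not part of the package) of adjusting that from calling code:

```python
# Sketch only: mistocr.core calls logging.basicConfig(level=WARNING) and sets its
# module logger to DEBUG on import; raise the level per module if that is too chatty.
import logging

logging.getLogger('mistocr.core').setLevel(logging.INFO)  # hide per-poll DEBUG messages, keep job-level INFO
```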
@@ -23,11 +31,11 @@ def get_api_key(
     if not key: raise ValueError("MISTRAL_API_KEY not found")
     return key
 
-# %% ../nbs/00_core.ipynb
+# %% ../nbs/00_core.ipynb 8
 ocr_model = "mistral-ocr-latest"
 ocr_endpoint = "/v1/ocr"
 
-# %% ../nbs/00_core.ipynb
+# %% ../nbs/00_core.ipynb 11
 def upload_pdf(
     path:str, # Path to PDF file
     key:str=None # Mistral API key

@@ -38,7 +46,7 @@ def upload_pdf(
     uploaded = c.files.upload(file=dict(file_name=path.stem, content=path.read_bytes()), purpose="ocr")
     return c.files.get_signed_url(file_id=uploaded.id).url, c
 
-# %% ../nbs/00_core.ipynb
+# %% ../nbs/00_core.ipynb 16
 def create_batch_entry(
     path:str, # Path to PDF file,
     url:str, # Mistral signed URL

@@ -50,7 +58,7 @@ def create_batch_entry(
     if not cid: cid = path.stem
     return dict(custom_id=cid, body=dict(document=dict(type="document_url", document_url=url), include_image_base64=inc_img))
 
-# %% ../nbs/00_core.ipynb
+# %% ../nbs/00_core.ipynb 18
 def prep_pdf_batch(
     path:str, # Path to PDF file,
     cid:str=None, # Custom ID (by default using the file name without extention)

@@ -61,7 +69,7 @@ def prep_pdf_batch(
     url, c = upload_pdf(path, key)
     return create_batch_entry(path, url, cid, inc_img), c
 
-# %% ../nbs/00_core.ipynb
+# %% ../nbs/00_core.ipynb 22
 def submit_batch(
     entries:list[dict], # List of batch entries,
     c:Mistral=None, # Mistral client,

@@ -75,20 +83,35 @@ def submit_batch(
     batch_data = c.files.upload(file=dict(file_name="batch.jsonl", content=open(f.name, "rb")), purpose="batch")
     return c.batch.jobs.create(input_files=[batch_data.id], model=model, endpoint=endpoint)
 
-# %% ../nbs/00_core.ipynb
+# %% ../nbs/00_core.ipynb 25
+def _check_timeout(
+    queued_time:int, # Time spent in QUEUED state (seconds)
+    timeout:int, # Maximum allowed QUEUED time (seconds)
+    job_id:str # Batch job ID
+):
+    "Raise TimeoutError if job has been queued longer than timeout"
+    if queued_time >= timeout: raise TimeoutError(f"Job {job_id} stayed in QUEUED for {queued_time}s, exceeding timeout of {timeout}s. Check your balance or Mistral Status.")
+
+# %% ../nbs/00_core.ipynb 26
 def wait_for_job(
-    job:dict, #
-    c:Mistral=None, # Mistral client
-    poll_interval:int=1 #
-):
+    job:dict, # Batch job from submit_batch
+    c:Mistral=None, # Mistral client
+    poll_interval:int=1, # Seconds between status checks
+    queued_timeout:int=300 # Max seconds in QUEUED before timeout
+) -> dict: # Completed job dict
     "Poll job until completion and return final job status"
+    logger.info(f"Waiting for batch job {job.id} (initial status: {job.status})")
+    queued_time = 0
     while job.status in ["QUEUED", "RUNNING"]:
-
+        logger.debug(f"Job {job.id} status: {job.status} (elapsed: {queued_time}s)")
+        if job.status == "QUEUED": queued_time += poll_interval; _check_timeout(queued_time, queued_timeout, job.id)
         time.sleep(poll_interval)
         job = c.batch.jobs.get(job_id=job.id)
+    logger.info(f"Job {job.id} completed with status: {job.status}")
+    if job.status != "SUCCESS": logger.warning(f"Job {job.id} finished with non-success status: {job.status}")
     return job
 
-# %% ../nbs/00_core.ipynb
+# %% ../nbs/00_core.ipynb 28
 def download_results(
     job:dict, # Job dict,
     c:Mistral=None # Mistral client

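wait_for_job now counts the time a job spends in QUEUED and aborts through the new _check_timeout guard instead of polling forever. A minimal usage sketch (the PDF path is hypothetical, and MISTRAL_API_KEY is assumed to be set):

```python
# Sketch: run one PDF through the batch OCR flow with a tighter QUEUED timeout.
from mistocr.core import prep_pdf_batch, submit_batch, wait_for_job, download_results

entry, client = prep_pdf_batch('doc.pdf')          # upload the PDF and build a batch entry
job = submit_batch([entry], client)                # create the batch OCR job
try:
    job = wait_for_job(job, client, poll_interval=2, queued_timeout=120)
    results = download_results(job, client)        # one parsed JSON result per entry
except TimeoutError as err:
    print(err)                                     # raised when the job sits in QUEUED past the limit
```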
@@ -97,7 +120,7 @@ def download_results(
     content = c.files.download(file_id=job.output_file).read().decode('utf-8')
     return [json.loads(line) for line in content.strip().split('\n') if line]
 
-# %% ../nbs/00_core.ipynb
+# %% ../nbs/00_core.ipynb 33
 def save_images(
     page:dict, # Page dict,
     img_dir:str='img' # Directory to save images

@@ -108,7 +131,7 @@ def save_images(
         img_bytes = base64.b64decode(img['image_base64'].split(',')[1])
         Image.open(BytesIO(img_bytes)).save(img_dir / img['id'])
 
-# %% ../nbs/00_core.ipynb
+# %% ../nbs/00_core.ipynb 34
 def save_page(
     page:dict, # Page dict,
     dst:str, # Directory to save page

@@ -120,7 +143,7 @@ def save_page(
     img_dir.mkdir(exist_ok=True)
     save_images(page, img_dir)
 
-# %% ../nbs/00_core.ipynb
+# %% ../nbs/00_core.ipynb 36
 def save_pages(
     ocr_resp:dict, # OCR response,
     dst:str, # Directory to save pages,

@@ -133,7 +156,7 @@ def save_pages(
     for page in ocr_resp['pages']: save_page(page, dst, img_dir)
     return dst
 
-# %% ../nbs/00_core.ipynb
+# %% ../nbs/00_core.ipynb 42
 def _get_paths(path:str) -> list[Path]:
     "Get list of PDFs from file or folder"
     path = Path(path)

@@ -144,7 +167,7 @@ def _get_paths(path:str) -> list[Path]:
         return pdfs
     raise ValueError(f"Path not found: {path}")
 
-# %% ../nbs/00_core.ipynb
+# %% ../nbs/00_core.ipynb 43
 def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[dict], Mistral]:
     "Prepare batch entries for list of PDFs"
     entries, c = [], None

@@ -153,7 +176,7 @@ def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[
         entries.append(entry)
     return entries, c
 
-# %% ../nbs/00_core.ipynb
+# %% ../nbs/00_core.ipynb 44
 def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]:
     "Submit batch, wait for completion, and download results"
     job = submit_batch(entries, c)

@@ -161,7 +184,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
     if job.status != 'SUCCESS': raise Exception(f"Job failed with status: {job.status}")
     return download_results(job, c)
 
-# %% ../nbs/00_core.ipynb
+# %% ../nbs/00_core.ipynb 45
 def ocr_pdf(
     path:str, # Path to PDF file or folder,
     dst:str='md', # Directory to save markdown pages,

@@ -175,7 +198,7 @@ def ocr_pdf(
     results = _run_batch(entries, c, poll_interval)
     return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
 
-# %% ../nbs/00_core.ipynb
+# %% ../nbs/00_core.ipynb 52
 def read_pgs(
     path:str, # OCR output directory,
     join:bool=True # Join pages into single string

@@ -185,3 +208,24 @@ def read_pgs(
     pgs = sorted(path.glob('page_*.md'), key=lambda p: int(p.stem.split('_')[1]))
     contents = L([p.read_text() for p in pgs])
     return '\n\n'.join(contents) if join else contents
+
+# %% ../nbs/00_core.ipynb 59
+def subset_pdf(
+    path:str, # Path to PDF file
+    start:int=1, # Start page (1-based)
+    end:int=None, # End page (1-based, inclusive)
+    dst:str='.' # Output directory
+) -> Path: # Path to subset PDF
+    "Extract page range from PDF and save with range suffix"
+    path = Path(path)
+    writer = PyPDF2.PdfWriter()
+    with open(path, 'rb') as f:
+        reader = PyPDF2.PdfReader(f)
+        n = len(reader.pages)
+        end = end or n
+        s, e = max(0, start-1), min(n, end) - 1
+        for i in range(s, e+1): writer.add_page(reader.pages[i])
+    suffix = f"_p{s+1}-{e+1}" if s>0 or e<n-1 else ""
+    out = Path(dst) / f"{path.stem}{suffix}.pdf"
+    with open(out, 'wb') as f: writer.write(f)
+    return out

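subset_pdf is the new helper behind the PyPDF2 dependency: it writes a page range of a PDF into a chosen directory. A small sketch with a hypothetical report.pdf:

```python
# Sketch: cut pages 1-3 out of a PDF before sending it to OCR.
from mistocr.core import subset_pdf

small = subset_pdf('report.pdf', start=1, end=3)
print(small)  # report_p1-3.pdf in the current directory; no suffix when the full range is kept
```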
mistocr-0.4.1/mistocr/pipeline.py (new file)

@@ -0,0 +1,48 @@
+"""End-to-End Pipeline: PDF OCR, Markdown Heading Correction, and AI Image Descriptions"""
+
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
+
+# %% auto 0
+__all__ = ['logger', 'pdf_to_md']
+
+# %% ../nbs/02_pipeline.ipynb 3
+from fastcore.all import *
+from .core import read_pgs, ocr_pdf
+from .refine import add_img_descs, fix_hdgs
+from pathlib import Path
+from asyncio import Semaphore, gather, sleep
+import tempfile
+import os, json, shutil
+import logging
+
+# %% ../nbs/02_pipeline.ipynb 4
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.WARNING, format='%(name)s - %(levelname)s - %(message)s')
+logger.setLevel(logging.INFO)
+
+# %% ../nbs/02_pipeline.ipynb 5
+@delegates(add_img_descs)
+async def pdf_to_md(
+    pdf_path:str, # Path to input PDF file
+    dst:str, # Destination directory for output markdown
+    ocr_dst:str=None, # Optional OCR output directory
+    model:str='claude-sonnet-4-5', # Model to use for heading fixes and image descriptions
+    add_img_desc:bool=True, # Whether to add image descriptions
+    progress:bool=True, # Whether to show progress messages
+    **kwargs
+):
+    "Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
+    "Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
+    cleanup = ocr_dst is None
+    if cleanup: ocr_dst = tempfile.mkdtemp()
+    n_steps = 3 if add_img_desc else 2
+    if progress: logger.info(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
+    ocr_dir = ocr_pdf(pdf_path, ocr_dst)[0]
+    if progress: logger.info(f"Step 2/{n_steps}: Fixing heading hierarchy...")
+    fix_hdgs(ocr_dir, model=model)
+    if add_img_desc:
+        if progress: logger.info(f"Step 3/{n_steps}: Adding image descriptions...")
+        await add_img_descs(ocr_dir, dst=dst, model=model, progress=progress, **kwargs)
+    elif dst != str(ocr_dir): shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
+    if cleanup: shutil.rmtree(ocr_dst)
+    if progress: logger.info("Done!")

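The rewritten pipeline module logs progress instead of printing it, stores intermediate OCR output in a temporary directory unless ocr_dst is given, and cleans that directory up afterwards. A minimal driver sketch mirroring the README example:

```python
# Sketch: run the full pipeline from a plain script; pdf_to_md is a coroutine,
# so await it in a notebook or wrap it in asyncio.run() in a script.
import asyncio
from mistocr.pipeline import pdf_to_md

asyncio.run(pdf_to_md('files/test/resnet.pdf', 'files/test/md_test'))
```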
{mistocr-0.2.7 → mistocr-0.4.1}/mistocr/refine.py

@@ -3,10 +3,10 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
 
 # %% auto 0
-__all__ = ['describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
-           'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes',
-           '
-           'add_img_descs']
+__all__ = ['logger', 'prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
+           'HeadingCorrection', 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes',
+           'fix_hdgs', 'ImgDescription', 'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs',
+           'add_descs_to_pg', 'add_descs_to_pgs', 'add_img_descs']
 
 # %% ../nbs/01_refine.ipynb 3
 from fastcore.all import *

@@ -20,8 +20,14 @@ import os
 import json
 import shutil
 from asyncio import Semaphore, gather, sleep
+import logging
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 4
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.WARNING, format='%(name)s - %(levelname)s - %(message)s')
+logger.setLevel(logging.INFO)
+
+# %% ../nbs/01_refine.ipynb 8
 def get_hdgs(
     md:str # Markdown file string
 ) -> L: # L of strings

@@ -32,7 +38,7 @@ def get_hdgs(
 
 
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 9
 def add_pg_hdgs(
     md:str, # Markdown file string,
     n:int # Page number

@@ -42,7 +48,7 @@ def add_pg_hdgs(
     def repl(m): return m.group(0) + f' ... page {n}'
     return sub(r'^#{1,6} .+$', repl, md, flags=MULTILINE)
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 13
 def read_pgs_pg(
     path:str # Path to the markdown file
 ) -> L: # List of markdown pages

@@ -50,7 +56,7 @@ def read_pgs_pg(
     pgs = read_pgs(path, join=False)
     return L([add_pg_hdgs(pg, n) for n, pg in enumerate(pgs, 1)]).concat()
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 16
 def fmt_hdgs_idx(
     hdgs: list[str] # List of markdown headings
 ) -> str: # Formatted string with index

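get_hdgs, add_pg_hdgs and read_pgs_pg prepare the input for heading correction: every heading gets a " ... page N" suffix so fixes can be traced back to their page. A toy sketch of that tagging step (the markdown string is made up):

```python
# Toy sketch of the page-tagging step used before asking the model for corrections.
from mistocr.refine import add_pg_hdgs, get_hdgs

md = "# ResNet\n\nSome text.\n\n#### 1.1 Background\n"
tagged = add_pg_hdgs(md, 3)   # appends " ... page 3" to every heading
print(get_hdgs(tagged))       # both headings, now ending in " ... page 3"
```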
@@ -58,15 +64,58 @@ def fmt_hdgs_idx(
     return '\n'.join(f"{i}. {h}" for i, h in enumerate(hdgs))
 
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 19
 class HeadingCorrection(BaseModel):
+    "A single heading correction mapping an index to its corrected markdown heading"
     index: int
     corrected: str
 
+# %% ../nbs/01_refine.ipynb 20
 class HeadingCorrections(BaseModel):
+    "Collection of heading corrections returned by the LLM"
     corrections: list[HeadingCorrection]
 
 # %% ../nbs/01_refine.ipynb 22
+prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
+
+INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
+
+ANALYSIS STEPS (think through these before outputting corrections):
+1. For each numbered heading (e.g., "4.1", "2.a", "A.1"), identify its parent heading (e.g., "4", "2", "A")
+2. Verify the child heading is exactly one # deeper than its parent
+3. If not, mark it for correction
+
+RULES - Apply these fixes in order:
+
+1. **Single H1 rule**: Documents must have exactly ONE # heading (typically the document title at the top)
+   - If index 0 is already #, then all subsequent headings (index 1+) must be ## or deeper
+   - If no H1 exists, the first major heading should be #, and all others ## or deeper
+   - NO exceptions: appendices, references, and all sections are ## or deeper after the title
+
+2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
+   - Parent section (e.g., "1", "2", "A") MUST be shallower than child (e.g., "1.1", "2.a", "A.1")
+   - Child section MUST be exactly one # deeper than parent
+   - Works with any numbering: "1/1.1/1.1.1", "A/A.1/A.1.a", "I/I.A/I.A.1", etc.
+
+3. **Level jumps**: Headings can only increase by one # at a time when moving deeper
+   - Wrong: ## Section → ##### Subsection
+   - Fixed: ## Section → ### Subsection
+
+4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
+
+5. **Unnumbered headings in numbered documents**: If the document uses numbered headings consistently, any unnumbered heading appearing within that structure is likely misclassified bold text and should be converted to regular text (output the heading text without any # symbols in the corrected field)
+
+OUTPUT: Return a list of corrections, where each correction has:
+- index: the heading's index number
+- corrected: the fixed heading text (without the index prefix), or empty string "" to remove the heading entirely
+IMPORTANT: Preserve the " ... page N" suffix in all corrected headings.
+Only include headings that need changes.
+
+Headings to analyze:
+{headings_list}
+"""
+
+# %% ../nbs/01_refine.ipynb 24
 def fix_hdg_hierarchy(
     hdgs: list[str], # List of markdown headings
     prompt: str=None, # Prompt to use

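The heading-repair prompt is now exported as prompt_fix_hdgs, presumably as the default used by fix_hdg_hierarchy. A sketch of running the whole heading-correction step over an OCR output folder (paths hypothetical, model credentials assumed configured):

```python
# Sketch: rewrite page_*.md files with corrected heading levels, in place (dst=None).
from mistocr.refine import fix_hdgs

fix_hdgs('ocr_out/resnet', model='claude-sonnet-4-5')
```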
@@ -82,7 +131,7 @@ def fix_hdg_hierarchy(
     return {o['index']: o['corrected'] for o in fixes}
 
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 27
 @delegates(fix_hdg_hierarchy)
 def mk_fixes_lut(
     hdgs: list[str], # List of markdown headings

@@ -95,7 +144,7 @@ def mk_fixes_lut(
     fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
     return {hdgs[k]:v for k,v in fixes.items()}
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 30
 def apply_hdg_fixes(
     p:str, # Page to fix
     lut_fixes: dict[str, str], # Lookup table of fixes

@@ -104,7 +153,7 @@ def apply_hdg_fixes(
     for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
     return p
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 33
 @delegates(mk_fixes_lut)
 def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:str='img', **kwargs):
     "Fix heading hierarchy in markdown document"

@@ -116,13 +165,13 @@ def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:st
     lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model, **kwargs)
     for i,p in enumerate(pgs_with_pg, 1): (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 39
 class ImgDescription(BaseModel):
     "Image classification and description for OCR'd documents"
     is_informative:bool # Whether image contains informative content (charts, diagrams, tables) vs decorative (logos, backgrounds)
     description:str # Detailed description of the image content for RAG and accessibility
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 42
 describe_img_prompt = """Analyze this image from an academic/technical document.
 
 Step 1: Determine if this image is informative for understanding the document content.

@@ -135,7 +184,7 @@ Step 2:
 
 Return your response as JSON with 'is_informative' (boolean) and 'description' (string) fields."""
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 43
 async def describe_img(
     img_path: Path, # Path to the image file
     model: str = 'claude-sonnet-4-5', # Model to use

@@ -146,7 +195,7 @@ async def describe_img(
     r = await chat([img_path.read_bytes(), prompt], response_format=ImgDescription)
     return r
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 47
 async def limit(
     semaphore, # Semaphore for concurrency control
     coro, # Coroutine to execute

@@ -158,14 +207,14 @@ async def limit(
     if delay: await sleep(delay)
     return r
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 49
 def parse_r(
     result # ModelResponse object from API call
 ): # Dictionary with 'is_informative' and 'description' keys
     "Extract and parse JSON content from model response"
     return json.loads(result.choices[0].message.content)
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 51
 async def describe_imgs(
     imgs: list[Path], # List of image file paths to describe
     model: str = 'claude-sonnet-4-5', # Model to use for image description

@@ -178,7 +227,7 @@ async def describe_imgs(
     results = await gather(*[limit(sem, describe_img(img, model, prompt), delay) for img in imgs])
     return {img.name: parse_r(r) for img, r in zip(imgs, results)}
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 53
 def save_img_descs(
     descs: dict, # Dictionary of image descriptions
     dst_fname: Path, # Path to save the JSON file

@@ -186,7 +235,7 @@ def save_img_descs(
     "Save image descriptions to JSON file"
     Path(dst_fname).write_text(json.dumps(descs, indent=2))
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 58
 def add_descs_to_pg(
     pg:str, # Page markdown content
     descs:dict # Dictionary mapping image filenames to their descriptions

@@ -197,7 +246,7 @@ def add_descs_to_pg(
         if fname in descs: pg = pg.replace(link, f"{link}\nAI-generated image description:\n___\n{descs[fname]['description']}\n___")
     return pg
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 63
 def add_descs_to_pgs(
     pgs:list, # List of page markdown strings
     descs:dict # Dictionary mapping image filenames to their descriptions

@@ -205,7 +254,7 @@ def add_descs_to_pgs(
     "Add AI-generated descriptions to images in all pages"
     return [add_descs_to_pg(pg, descs) for pg in pgs]
 
-# %% ../nbs/01_refine.ipynb
+# %% ../nbs/01_refine.ipynb 66
 async def add_img_descs(
     src:str, # Path to source markdown directory
     dst:str=None, # Destination directory (defaults to src if None)

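describe_img and describe_imgs call the vision model behind a semaphore-plus-delay limiter, and parse_r turns each response into a plain dict with is_informative and description. A sketch of describing a folder of extracted figures (paths hypothetical):

```python
# Sketch: describe every JPEG the OCR step extracted, two requests at a time, 1s apart.
import asyncio
from pathlib import Path
from mistocr.refine import describe_imgs, save_img_descs

imgs = sorted(Path('ocr_out/resnet/img').glob('*.jpeg'))
descs = asyncio.run(describe_imgs(imgs, model='claude-sonnet-4-5', semaphore=2, delay=1))
save_img_descs(descs, Path('ocr_out/resnet/img_descriptions.json'))
```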
@@ -214,25 +263,32 @@
     semaphore:int=2, # Max concurrent API requests
     delay:float=1, # Delay in seconds between API calls
     force:bool=False, # Force regeneration even if cache exists
-    progress:bool=True #
+    progress:bool=True # Log progress messages
 ):
     "Describe all images in markdown document and insert descriptions inline"
     src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
     if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
     src_imgs = src_path/img_folder
+
+    # Check if image folder exists
+    if not src_imgs.exists():
+        if progress: logger.info(f"No images to describe in the document (no '{img_folder}' folder found)")
+        return
+
     if src_imgs.exists() and dst_path != src_path: shutil.copytree(src_imgs, dst_path/img_folder, dirs_exist_ok=True)
     desc_file = src_path/'img_descriptions.json'
     if desc_file.exists() and not force:
-        if progress:
+        if progress: logger.info(f"Loading existing descriptions from {desc_file}")
         descs = json.loads(desc_file.read_text())
     else:
         imgs = (src_path/img_folder).ls(file_exts=['.jpeg', '.jpg', '.png'])
-        if progress:
+        if progress: logger.info(f"Describing {len(imgs)} images...")
         descs = await describe_imgs(imgs, model, semaphore=semaphore, delay=delay)
         save_img_descs(descs, desc_file)
-    if progress:
+    if progress: logger.info(f"Saved descriptions to {desc_file}")
     pgs = read_pgs(src_path, join=False)
-    if progress:
+    if progress: logger.info(f"Adding descriptions to {len(pgs)} pages...")
     enriched = [add_descs_to_pg(pg, descs) for pg in pgs]
     for i,pg in enumerate(enriched, 1): (dst_path/f'page_{i}.md').write_text(pg)
-    if progress:
+    if progress: logger.info(f"Done! Enriched pages saved to {dst_path}")
+

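add_img_descs now returns early, with a log message, when the OCR output has no image folder, and it caches descriptions in img_descriptions.json. A standalone usage sketch (directory names hypothetical):

```python
# Sketch: enrich already-OCR'd pages with image descriptions, writing the result elsewhere.
import asyncio
from mistocr.refine import add_img_descs

asyncio.run(add_img_descs('ocr_out/resnet', dst='md_out', semaphore=2, delay=1))
# A second run reuses ocr_out/resnet/img_descriptions.json unless force=True.
```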
{mistocr-0.2.7 → mistocr-0.4.1/mistocr.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mistocr
-Version: 0.2.7
+Version: 0.4.1
 Summary: Batch OCR for PDFs with heading restoration and visual content integration
 Home-page: https://github.com/franckalbinet/mistocr
 Author: Solveit

@@ -23,6 +23,7 @@ Requires-Dist: mistralai
 Requires-Dist: pillow
 Requires-Dist: dotenv
 Requires-Dist: lisette
+Requires-Dist: PyPDF2
 Provides-Extra: dev
 Dynamic: author
 Dynamic: author-email

@@ -112,18 +113,6 @@ from mistocr.pipeline import pdf_to_md
 await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
 ```
 
-Step 1/3: Running OCR on files/test/resnet.pdf...
-Mistral batch job status: QUEUED
-Mistral batch job status: RUNNING
-Mistral batch job status: RUNNING
-Step 2/3: Fixing heading hierarchy...
-Step 3/3: Adding image descriptions...
-Describing 7 images...
-Saved descriptions to ocr_temp/resnet/img_descriptions.json
-Adding descriptions to 12 pages...
-Done! Enriched pages saved to files/test/md_test
-Done!
-
 This will (as indicated by the output):
 
 1. OCR the PDF using Mistral’s batch API

{mistocr-0.2.7 → mistocr-0.4.1}/settings.ini

@@ -1,7 +1,7 @@
 [DEFAULT]
 repo = mistocr
 lib_name = mistocr
-version = 0.2.7
+version = 0.4.1
 min_python = 3.9
 license = apache2
 black_formatting = False

@@ -27,7 +27,7 @@ keywords = nbdev jupyter notebook python
 language = English
 status = 3
 user = franckalbinet
-requirements = fastcore mistralai pillow dotenv lisette
+requirements = fastcore mistralai pillow dotenv lisette PyPDF2
 readme_nb = index.ipynb
 allowed_metadata_keys =
 allowed_cell_metadata_keys =

mistocr-0.2.7/mistocr/__init__.py (deleted)

@@ -1 +0,0 @@
-__version__ = "0.2.7"

mistocr-0.2.7/mistocr/pipeline.py (deleted)

@@ -1,37 +0,0 @@
-"""End-to-End Pipeline: PDF OCR, Markdown Heading Correction, and AI Image Descriptions"""
-
-# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
-
-# %% auto 0
-__all__ = ['pdf_to_md']
-
-# %% ../nbs/02_pipeline.ipynb 3
-from fastcore.all import *
-from .core import read_pgs, ocr_pdf
-from .refine import add_img_descs, fix_hdgs
-from pathlib import Path
-from asyncio import Semaphore, gather, sleep
-import os, json, shutil
-
-# %% ../nbs/02_pipeline.ipynb 4
-@delegates(add_img_descs)
-async def pdf_to_md(
-    pdf_path:str, # Path to input PDF file
-    dst:str, # Destination directory for output markdown
-    ocr_output:str=None, # Optional OCR output directory (defaults to pdf_path stem)
-    model:str='claude-sonnet-4-5', # Model to use for heading fixes and image descriptions
-    add_img_desc:bool=True, # Whether to add image descriptions
-    progress:bool=True, # Whether to show progress messages
-    **kwargs):
-    "Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
-    n_steps = 3 if add_img_desc else 2
-    if progress: print(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
-    ocr_dirs = ocr_pdf(pdf_path, ocr_output or 'ocr_temp')
-    ocr_dir = ocr_dirs[0]
-    if progress: print(f"Step 2/{n_steps}: Fixing heading hierarchy...")
-    fix_hdgs(ocr_dir, model=model)
-    if add_img_desc:
-        if progress: print(f"Step 3/{n_steps}: Adding image descriptions...")
-        await add_img_descs(ocr_dir, dst=dst, model=model, progress=progress, **kwargs)
-    elif dst and Path(dst) != ocr_dir: shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
-    if progress: print("Done!")

The 10 remaining files listed above (LICENSE, MANIFEST.in, the egg-info metadata files, pyproject.toml, setup.cfg, setup.py) are unchanged between 0.2.7 and 0.4.1.