vlmparse 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vlmparse/base_model.py +16 -0
- vlmparse/build_doc.py +70 -0
- vlmparse/cli.py +409 -0
- vlmparse/converter.py +179 -0
- vlmparse/converter_with_server.py +122 -0
- vlmparse/registries.py +174 -0
- vlmparse/utils.py +41 -0
- vlmparse-0.1.0.dist-info/METADATA +184 -0
- vlmparse-0.1.0.dist-info/RECORD +13 -0
- vlmparse-0.1.0.dist-info/WHEEL +5 -0
- vlmparse-0.1.0.dist-info/entry_points.txt +2 -0
- vlmparse-0.1.0.dist-info/licenses/LICENSE +21 -0
- vlmparse-0.1.0.dist-info/top_level.txt +1 -0
vlmparse/base_model.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from pydantic import BaseModel, ConfigDict
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class VLMParseBaseModel(BaseModel):
    """Shared pydantic base class for all vlmparse models.

    Centralizes the pydantic configuration so every model in the package
    validates the same way, and renders a readable ``repr`` via devtools.
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True,  # fields may hold non-pydantic objects (e.g. images)
        validate_assignment=True,  # re-validate whenever an attribute is assigned
        strict=True,  # no implicit type coercion during validation
        extra="allow",  # keep unknown fields instead of raising
    )

    def __repr__(self):
        # Imported lazily so devtools is only needed when a repr is rendered.
        from devtools import PrettyFormat

        pformat = PrettyFormat()
        # highlight=False keeps the output free of ANSI escape sequences.
        return pformat(self, highlight=False)
|
vlmparse/build_doc.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import PIL
|
|
5
|
+
import pypdfium2 as pdfium
|
|
6
|
+
from loguru import logger
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def convert_pdfium(file_path, dpi):
    """Render every page of a PDF to a PIL image using pdfium.

    Args:
        file_path: Path to the PDF file.
        dpi: Target resolution; pdfium renders at 72 dpi natively, so pages
            are rendered with ``scale=dpi / 72``.

    Returns:
        List of PIL images, one per page, in document order.
    """
    pdf = pdfium.PdfDocument(file_path)
    try:
        return [page.render(scale=dpi / 72).to_pil() for page in pdf]
    finally:
        # Close the document even if a page fails to render; previously the
        # handle leaked when render() raised.
        pdf.close()
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def custom_ceil(a, precision=0):
    """Round *a* upward at the given decimal precision.

    Adds half a unit of the last significant digit before rounding, which
    biases ``np.round`` toward the ceiling at that precision.
    """
    bump = 0.5 * 10 ** (-precision)
    return np.round(a + bump, precision)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def convert_pdfium_to_images(file_path, dpi=175):
    """Render a PDF to a list of RGB PIL images, retrying at a lower DPI
    when the page size trips Pillow's decompression-bomb guard.

    Args:
        file_path: Path to the PDF file.
        dpi: Initial rendering resolution.

    Returns:
        List of PIL images, one per page.
    """
    try:
        images = convert_pdfium(file_path, dpi=dpi)
        # Normalize to RGB; going through grayscale ("L") first flattens
        # palette/alpha modes before expanding back to three channels.
        images = [
            img.convert("L").convert("RGB") if img.mode != "RGB" else img
            for img in images
        ]

    except PIL.Image.DecompressionBombError as e:
        logger.exception(f"Got problem size document with {file_path}")
        # Pillow's message embeds the offending pixel count and the limit.
        # NOTE(review): assumes the message contains exactly two integers —
        # confirm against the Pillow version in use.
        cur_size, limit_size = map(int, re.findall(r"\d+", str(e)))
        factor = custom_ceil(cur_size / limit_size, precision=1)
        logger.warning(
            f"Try again by reducing DPI for doc {file_path} from {dpi} to {dpi//factor}"
        )
        # factor is a numpy float, so dpi becomes a float here.
        dpi = dpi // factor
        # NOTE(review): the RGB normalization above is not reapplied on this
        # retry path — pages may come back in non-RGB modes; confirm intended.
        images = convert_pdfium(file_path, dpi=dpi)

    return images
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def convert_specific_page_to_image(file_path, page_number, dpi=175):
    """Render a single PDF page to an RGB PIL image.

    Args:
        file_path: Path to the PDF file.
        page_number: Zero-based page index.
        dpi: Rendering resolution (pdfium's native base is 72 dpi).

    Returns:
        A PIL image in RGB mode.
    """
    pdf = pdfium.PdfDocument(file_path)
    try:
        page = pdf.get_page(page_number)
        image = page.render(scale=dpi / 72).to_pil()
        # Flatten palette/alpha modes through grayscale before expanding to RGB.
        image = image.convert("L").convert("RGB") if image.mode != "RGB" else image
    finally:
        # Close the document even when rendering raises; previously the
        # handle leaked on error.
        pdf.close()
    return image
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def resize_image(image, max_image_size):
    """Downscale *image* so its longest side fits *max_image_size*.

    Aspect ratio is preserved. The image is returned unchanged when
    max_image_size is None or the image already fits.
    """
    if max_image_size is None:
        return image
    scale = max_image_size / max(image.size)
    if scale >= 1:
        return image
    new_size = (
        int(image.size[0] * scale),
        int(image.size[1] * scale),
    )
    image = image.resize(new_size)
    logger.info(f"Resized image to {new_size}")
    return image
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def get_page_count(file_path):
    """Return the number of pages in the PDF at *file_path*."""
    document = pdfium.PdfDocument(file_path)
    num_pages = len(document)
    document.close()
    return num_pages
|
vlmparse/cli.py
ADDED
|
@@ -0,0 +1,409 @@
|
|
|
1
|
+
from typing import Literal
|
|
2
|
+
|
|
3
|
+
from loguru import logger
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class DParseCLI:
|
|
7
|
+
def serve(self, model: str, port: int | None = None, gpus: str | None = None):
    """Deploy a VLLM server in a Docker container.

    Args:
        model: Model name
        port: VLLM server port (default: 8056)
        gpus: Comma-separated GPU device IDs (e.g., "0" or "0,1,2"). If not specified, all GPUs will be used.
    """
    if port is None:
        port = 8056

    from vlmparse.registries import docker_config_registry

    docker_config = docker_config_registry.get(model)
    if docker_config is None:
        # Nothing is deployed for unregistered models. The old message
        # claimed a "default configuration" would be used, which never
        # happened — it was misleading.
        logger.warning(
            f"No Docker configuration found for model: {model}, nothing to deploy"
        )
        return

    docker_config.docker_port = port

    # Only override GPU configuration if explicitly specified.
    # This preserves CPU-only settings from the config.
    if gpus is not None:
        docker_config.gpu_device_ids = [g.strip() for g in str(gpus).split(",")]
    server = docker_config.get_server(auto_stop=False)

    # Deploy the server and leave it running (no auto cleanup).
    logger.info(
        f"Deploying VLLM server for {docker_config.model_name} on port {port}..."
    )

    base_url, container = server.start()

    logger.info(f"✓ VLLM server ready at {base_url}")
    logger.info(f"✓ Container ID: {container.id}")
    logger.info(f"✓ Container name: {container.name}")
|
|
45
|
+
|
|
46
|
+
def convert(
    self,
    inputs: str | list[str],
    out_folder: str = ".",
    model: str | None = None,
    uri: str | None = None,
    gpus: str | None = None,
    mode: Literal["document", "md", "md_page"] = "document",
    with_vllm_server: bool = False,
    concurrency: int = 10,
    dpi: int | None = None,
):
    """Parse PDF documents and save results.

    Args:
        inputs: Files, folders or glob patterns to process
        out_folder: Output folder for parsed documents
        model: Model name. If not specified, the model will be inferred from the URI.
        uri: URI of the server, if not specified a local server may be deployed
        gpus: Comma-separated GPU device IDs (e.g., "0" or "0,1,2"). If not specified, all GPUs will be used.
        mode: Output mode - "document" (save as JSON zip), "md" (save as markdown file), "md_page" (save as folder of markdown pages)
        with_vllm_server: If True, a local VLLM server will be deployed if the model is not found in the registry. Note that if the model is in the registry and the uri is None, the server will be anyway deployed.
        concurrency: Maximum number of files/pages processed concurrently
        dpi: DPI to use for the conversion. If not specified, the default DPI will be used.
    """
    # The actual server/client orchestration lives in ConverterWithServer;
    # this CLI method only forwards arguments. (A large block of dead,
    # commented-out legacy orchestration code was removed from here.)
    from vlmparse.converter_with_server import ConverterWithServer

    converter_with_server = ConverterWithServer(
        model=model,
        uri=uri,
        gpus=gpus,
        with_vllm_server=with_vllm_server,
        concurrency=concurrency,
    )

    return converter_with_server.parse(
        inputs=inputs, out_folder=out_folder, mode=mode, dpi=dpi
    )
|
|
174
|
+
def list(self):
    """List all containers whose name begins with vlmparse."""
    # NOTE: named `list` to match the CLI verb; intentionally shadows the
    # builtin within this class namespace only.
    import docker

    try:
        client = docker.from_env()
        containers = client.containers.list()

        if not containers:
            logger.info("No running containers found")
            return

        # Filter for containers whose name begins with "vlmparse"
        vlmparse_containers = [
            container
            for container in containers
            if container.name.startswith("vlmparse")
        ]

        if not vlmparse_containers:
            logger.info("No vlmparse containers found")
            return

        # Prepare table data
        table_data = []
        for container in vlmparse_containers:
            # Extract host-side port mappings
            ports = []
            if container.ports:
                for _, host_bindings in container.ports.items():
                    if host_bindings:
                        for binding in host_bindings:
                            ports.append(f"{binding['HostPort']}")

            # set() de-duplicates host ports; display order is unspecified.
            port_str = ", ".join(set(ports)) if ports else "N/A"
            # Labels are written by the server launcher at container start.
            uri = container.labels.get("vlmparse_uri", "N/A")
            gpu = container.labels.get("vlmparse_gpus", "N/A")

            table_data.append(
                [
                    container.name,
                    container.status,
                    port_str,
                    gpu,
                    uri,
                ]
            )

        # Display as table
        from tabulate import tabulate

        headers = ["Name", "Status", "Port(s)", "GPU", "URI"]
        table = tabulate(table_data, headers=headers, tablefmt="grid")

        logger.info(f"\nFound {len(vlmparse_containers)} vlmparse container(s):\n")
        print(table)

    except docker.errors.DockerException as e:
        logger.error(f"Failed to connect to Docker: {e}")
        logger.error(
            "Make sure Docker is running and you have the necessary permissions"
        )
|
|
237
|
+
def stop(self, container: str | None = None):
    """Stop a Docker container by its ID or name.

    Args:
        container: Container ID or name to stop. If not specified, automatically stops the container if only one vlmparse container is running.
    """
    import docker

    try:
        client = docker.from_env()

        # If no container specified, try to auto-select
        if container is None:
            containers = client.containers.list()
            vlmparse_containers = [
                c for c in containers if c.name.startswith("vlmparse")
            ]

            if len(vlmparse_containers) == 0:
                logger.error("No vlmparse containers found")
                return
            elif len(vlmparse_containers) > 1:
                # Ambiguous: refuse to guess and list the candidates instead.
                logger.error(
                    f"Multiple vlmparse containers found ({len(vlmparse_containers)}). "
                    "Please specify a container ID or name:"
                )
                for c in vlmparse_containers:
                    logger.info(f" - {c.name} ({c.short_id})")
                return
            else:
                target_container = vlmparse_containers[0]
        else:
            # Try to get the specified container
            try:
                target_container = client.containers.get(container)
            except docker.errors.NotFound:
                logger.error(f"Container not found: {container}")
                return

        # Stop the container
        logger.info(
            f"Stopping container: {target_container.name} ({target_container.short_id})"
        )
        target_container.stop()
        logger.info("✓ Container stopped successfully")

    except docker.errors.DockerException as e:
        logger.error(f"Failed to connect to Docker: {e}")
        logger.error(
            "Make sure Docker is running and you have the necessary permissions"
        )
|
|
289
|
+
def log(self, container: str | None = None, follow: bool = True, tail: int = 500):
    """Show logs from a Docker container.

    Args:
        container: Container ID or name. If not specified, automatically selects the container if only one vlmparse container is running.
        follow: If True, follow log output (stream logs in real-time)
        tail: Number of lines to show from the end of the logs
    """
    import docker

    try:
        client = docker.from_env()

        # If no container specified, try to auto-select
        if container is None:
            containers = client.containers.list()
            vlmparse_containers = [
                c for c in containers if c.name.startswith("vlmparse")
            ]

            if len(vlmparse_containers) == 0:
                logger.error("No vlmparse containers found")
                return
            elif len(vlmparse_containers) > 1:
                # Ambiguous: refuse to guess and list the candidates instead.
                logger.error(
                    f"Multiple vlmparse containers found ({len(vlmparse_containers)}). "
                    "Please specify a container ID or name:"
                )
                for c in vlmparse_containers:
                    logger.info(f" - {c.name} ({c.short_id})")
                return
            else:
                target_container = vlmparse_containers[0]
                logger.info(
                    f"Showing logs for: {target_container.name} ({target_container.short_id})"
                )
        else:
            # Try to get the specified container
            try:
                target_container = client.containers.get(container)
            except docker.errors.NotFound:
                logger.error(f"Container not found: {container}")
                return

        # Get and display logs
        if follow:
            logger.info("Following logs (press Ctrl+C to stop)...")
            try:
                for log_line in target_container.logs(
                    stream=True, follow=True, tail=tail
                ):
                    print(log_line.decode("utf-8", errors="replace"), end="")
            except KeyboardInterrupt:
                logger.info("\nStopped following logs")
        else:
            # NOTE(review): `tail` is not applied in this non-follow branch —
            # the full log is printed; confirm whether that is intended.
            logs = target_container.logs().decode("utf-8", errors="replace")
            print(logs)

    except docker.errors.DockerException as e:
        logger.error(f"Failed to connect to Docker: {e}")
        logger.error(
            "Make sure Docker is running and you have the necessary permissions"
        )
|
|
353
|
+
def list_register(self):
    """List all model keys registered in client and server registries."""
    from vlmparse.registries import (
        converter_config_registry,
        docker_config_registry,
    )

    # Print each registry's keys under its own heading, sorted for stable output.
    sections = [
        ("\nClient Models Registry:", converter_config_registry),
        ("\nServer Models Registry:", docker_config_registry),
    ]
    for heading, registry in sections:
        print(heading)
        for model in sorted(registry.list_models()):
            print(f" - {model}")
|
|
370
|
+
|
|
371
|
+
def view(self, folder):
    """Open the Streamlit result viewer on *folder*.

    If already running inside a Streamlit runtime, render in-process;
    otherwise re-launch the viewer script through ``streamlit run``.
    """
    import subprocess
    import sys

    from streamlit import runtime

    from vlmparse.st_viewer.st_viewer import __file__ as st_viewer_file
    from vlmparse.st_viewer.st_viewer import run_streamlit

    if runtime.exists():
        # Already inside a Streamlit server: render directly.
        run_streamlit(folder)
    else:
        try:
            # Re-exec the viewer via the Streamlit CLI; "--" separates
            # Streamlit's own arguments from the script's arguments.
            subprocess.run(
                [
                    sys.executable,
                    "-m",
                    "streamlit",
                    "run",
                    st_viewer_file,
                    "--",
                    folder,
                ],
                check=True,
            )
        except KeyboardInterrupt:
            print("\nStreamlit app terminated by user.")
        except subprocess.CalledProcessError as e:
            print(f"Error while running Streamlit: {e}")
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def main():
    """Console-script entry point: dispatch CLI commands via python-fire."""
    import fire

    fire.Fire(DParseCLI)


if __name__ == "__main__":
    main()
|
vlmparse/converter.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import threading
|
|
3
|
+
import time
|
|
4
|
+
import traceback
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Literal
|
|
7
|
+
|
|
8
|
+
from loguru import logger
|
|
9
|
+
from pydantic import Field
|
|
10
|
+
|
|
11
|
+
from .base_model import VLMParseBaseModel
|
|
12
|
+
from .build_doc import convert_specific_page_to_image, get_page_count, resize_image
|
|
13
|
+
from .data_model.document import Document, Page, ProcessingError
|
|
14
|
+
|
|
15
|
+
# Add a lock to ensure PDFium is accessed by only one thread/task at a time
# (page rendering runs in asyncio.to_thread worker threads).
PDFIUM_LOCK = threading.Lock()
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ConverterConfig(VLMParseBaseModel):
    """Base configuration for a document converter."""

    # Alternative registry names this converter answers to.
    aliases: list[str] = Field(default_factory=list)
    # PDF page rendering resolution.
    dpi: int = 175
    # Longest image side in pixels; None disables downscaling.
    max_image_size: int | None = 4000

    def get_client(self, **kwargs) -> "BaseConverter":
        """Build a BaseConverter bound to this configuration."""
        return BaseConverter(config=self, **kwargs)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class BaseConverter:
|
|
29
|
+
def __init__(
    self,
    config: ConverterConfig,
    num_concurrent_files: int = 10,
    num_concurrent_pages: int = 10,
    save_folder: str | None = None,
    save_mode: Literal["document", "md", "md_page"] = "document",
    debug: bool = False,
    return_documents_in_batch_mode: bool = False,
    save_page_images: bool = False,
):
    """Create a converter.

    Args:
        config: Rendering/conversion settings (dpi, max image size, ...).
        num_concurrent_files: Max files processed at once in batch mode.
        num_concurrent_pages: Max pages processed at once per document.
        save_folder: If set, processed documents are saved under this folder.
        save_mode: "document" (zip), "md" (single file) or "md_page" (folder).
        debug: Re-raise exceptions instead of recording them on the document.
        return_documents_in_batch_mode: If False, batch() returns None.
        save_page_images: Keep rendered page images on the pages instead of
            replacing them with a render-recipe dict.
    """
    self.config = config
    self.num_concurrent_files = num_concurrent_files
    self.num_concurrent_pages = num_concurrent_pages
    self.save_folder = save_folder
    self.save_mode = save_mode
    self.debug = debug
    self.return_documents_in_batch_mode = return_documents_in_batch_mode
    self.save_page_images = save_page_images
|
|
48
|
+
|
|
49
|
+
async def async_call_inside_page(self, page: Page) -> Page:
    """Run the model on a single rendered page; subclasses must implement."""
    raise NotImplementedError
|
|
51
|
+
|
|
52
|
+
def add_page_image(self, page: Page, file_path, page_idx):
    """Render page *page_idx* of *file_path* and attach the image to *page*.

    Runs in asyncio.to_thread worker threads; PDFIUM_LOCK serializes all
    PDFium access across those threads.
    """
    with PDFIUM_LOCK:
        image = convert_specific_page_to_image(
            file_path,
            page_idx,
            dpi=self.config.dpi,
        )
        image = resize_image(image, self.config.max_image_size)
        page.buffer_image = image
    return page
|
|
62
|
+
|
|
63
|
+
async def async_call(self, file_path: str | Path) -> Document:
    """Convert one file into a Document, processing pages concurrently.

    Page-level and document-level failures are recorded on the page/document
    (unless self.debug, in which case they re-raise). The document is saved
    via _save_document when save_folder is configured.
    """
    tic = time.perf_counter()
    document = Document(file_path=str(file_path))
    try:
        num_pages = get_page_count(file_path)
        document.pages = [Page() for _ in range(num_pages)]

        # Bounds how many pages of this document are in flight at once.
        semaphore = asyncio.Semaphore(self.num_concurrent_pages)

        async def worker(page_idx: int, page: Page):
            async with semaphore:
                try:
                    # Rendering is blocking (PDFium) — run it off the loop.
                    page = await asyncio.to_thread(
                        self.add_page_image, page, file_path, page_idx
                    )

                    tic = time.perf_counter()  # shadows outer tic; local to worker
                    page = await self.async_call_inside_page(page)
                    toc = time.perf_counter()
                    page.latency = toc - tic
                    logger.debug(f"Time taken: {page.latency} seconds")
                except KeyboardInterrupt:
                    raise
                except Exception:
                    if self.debug:
                        raise
                    else:
                        logger.exception(traceback.format_exc())
                        page.error = ProcessingError.from_class(self)
                # Replace the heavy image with a recipe dict describing how to
                # re-render it, unless page images were requested explicitly.
                if not self.save_page_images:
                    page.buffer_image = dict(
                        file_path=str(file_path),
                        page_idx=page_idx,
                        dpi=self.config.dpi,
                        max_image_size=self.config.max_image_size,
                    )

        tasks = [
            asyncio.create_task(worker(i, page))
            for i, page in enumerate(document.pages)
        ]
        await asyncio.gather(*tasks)
    except KeyboardInterrupt:
        raise
    except Exception:
        if self.debug:
            raise
        else:
            logger.exception(traceback.format_exc())
            document.error = ProcessingError.from_class(self)
            # Errored documents skip latency bookkeeping and saving.
            return document
    toc = time.perf_counter()
    document.latency = toc - tic
    logger.debug(f"Time taken to process the document: {document.latency} seconds")
    if self.save_folder is not None:
        self._save_document(document)

    return document
|
|
121
|
+
|
|
122
|
+
def _save_document(self, document: Document):
    """Save document according to save_mode."""
    # Errored documents are kept in a separate folder so reruns can
    # retry only the failures.
    if document.is_error:
        save_folder = Path(self.save_folder) / "errors"

    else:
        save_folder = Path(self.save_folder) / "results"

    save_folder.mkdir(parents=True, exist_ok=True)
    doc_name = Path(document.file_path).stem

    if self.save_mode == "document":
        # Full document (pages, metadata, errors) serialized into a zip.
        zip_path = save_folder / f"{doc_name}.zip"
        document.to_zip(zip_path)
        logger.info(f"Saved document to {zip_path}")

    elif self.save_mode == "md":
        # Single markdown file; pages joined with a blank line.
        md_path = save_folder / f"{doc_name}.md"
        text_content = "\n\n".join([page.text or "" for page in document.pages])
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(text_content)
        logger.info(f"Saved markdown to {md_path}")

    elif self.save_mode == "md_page":
        # One markdown file per page, 1-based zero-padded numbering.
        doc_folder = save_folder / doc_name
        doc_folder.mkdir(parents=True, exist_ok=True)
        for i, page in enumerate(document.pages, start=1):
            page_text = page.text if page.text else ""
            page_path = doc_folder / f"page_{i:04d}.md"
            with open(page_path, "w", encoding="utf-8") as f:
                f.write(page_text)
        logger.info(f"Saved {len(document.pages)} pages to {doc_folder}")

    else:
        logger.warning(f"Unknown save_mode: {self.save_mode}, skipping save")
|
|
157
|
+
|
|
158
|
+
def __call__(self, file_path: str | Path):
    """Synchronous entry point: process one file via asyncio.run."""
    return asyncio.run(self.async_call(file_path))
|
|
160
|
+
|
|
161
|
+
async def async_batch(self, file_paths: list[str | Path]) -> list[Document] | None:
    """Process multiple files concurrently with semaphore limit."""
    semaphore = asyncio.Semaphore(self.num_concurrent_files)

    async def worker(file_path: str | Path) -> Document:
        async with semaphore:
            if self.return_documents_in_batch_mode:
                return await self.async_call(file_path)
            else:
                # Result deliberately not returned; saving (when configured)
                # happens inside async_call via _save_document.
                await self.async_call(file_path)

    tasks = [asyncio.create_task(worker(file_path)) for file_path in file_paths]
    documents = await asyncio.gather(*tasks)
    # Returns None (implicitly) when documents are not collected.
    if self.return_documents_in_batch_mode:
        return documents
|
|
176
|
+
|
|
177
|
+
def batch(self, file_paths: list[str | Path]) -> list[Document] | None:
    """Synchronous wrapper for async_batch.

    Returns the processed documents, or None when
    return_documents_in_batch_mode is False.
    """
    return asyncio.run(self.async_batch(file_paths))
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Literal
|
|
5
|
+
|
|
6
|
+
from loguru import logger
|
|
7
|
+
|
|
8
|
+
from vlmparse.servers.utils import get_model_from_uri
|
|
9
|
+
from vlmparse.utils import get_file_paths
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ConverterWithServer:
|
|
13
|
+
def __init__(
    self,
    model: str | None,
    uri: str | None = None,
    gpus: str | None = None,
    port: int | None = None,
    with_vllm_server: bool = False,
    concurrency: int = 10,
):
    """Build a converter client, deploying a local Docker server if needed.

    Args:
        model: Model name; may be None when *uri* is given (inferred below).
        uri: Existing server URI; when None a local server may be started.
        gpus: Comma-separated GPU device IDs for a locally started server.
        port: Host port for a locally started server.
        with_vllm_server: Use a default server config for unregistered models.
        concurrency: Stored for parse(); caps concurrent files/pages.
    """
    from vlmparse.registries import (
        converter_config_registry,
        docker_config_registry,
    )

    self.model = model
    self.uri = uri
    self.port = port
    self.gpus = gpus
    self.with_vllm_server = with_vllm_server
    self.concurrency = concurrency

    # Infer the model from the running server when only the URI is given.
    if self.uri is not None and self.model is None:
        self.model = get_model_from_uri(self.uri)

    gpu_device_ids = None
    if self.gpus is not None:
        gpu_device_ids = [g.strip() for g in self.gpus.split(",")]

    if self.uri is None:
        docker_config = docker_config_registry.get(
            self.model, default=self.with_vllm_server
        )
        if self.port is not None:
            # NOTE(review): this runs before the None check below — raises
            # AttributeError if docker_config is None and a port was given.
            docker_config.docker_port = self.port

        if docker_config is not None:
            docker_config.gpu_device_ids = gpu_device_ids
            # auto_stop=True: the container is stopped when this process ends.
            server = docker_config.get_server(auto_stop=True)
            server.start()

            self.client = docker_config.get_client()
        else:
            self.client = converter_config_registry.get(self.model).get_client()

    else:
        client_config = converter_config_registry.get(self.model, uri=self.uri)
        self.client = client_config.get_client()
|
|
60
|
+
|
|
61
|
+
def parse(
    self,
    inputs: str | list[str],
    out_folder: str = ".",
    mode: Literal["document", "md", "md_page"] = "document",
    dpi: int | None = None,
    debug: bool = False,
    retrylast: bool = False,
):
    """Parse *inputs* with the configured client and save results.

    Args:
        inputs: Files, folders or glob patterns to process.
        out_folder: Root output folder; a timestamped sub-folder is created
            per run (unless retrylast is set).
        mode: "document" (JSON zip), "md" (one markdown file) or "md_page"
            (folder of per-page markdown files).
        dpi: Override the converter's rendering DPI.
        debug: Re-raise conversion errors and process sequentially.
        retrylast: Skip files already present in the most recent run's
            "results" folder.

    Returns:
        The processed documents, or None when the client does not return
        documents in batch mode.

    Raises:
        ValueError: If out_folder is None, or retrylast is set with no
            previous run available.
    """
    file_paths = get_file_paths(inputs)
    # Previously an `assert` (stripped under `python -O`, and its message
    # wrongly mentioned retrylast); validate explicitly instead.
    if out_folder is None:
        raise ValueError("out_folder must be provided")
    if retrylast:
        retry = Path(out_folder)
        previous_runs = sorted(os.listdir(retry))
        if len(previous_runs) > 0:
            # Timestamped run-folder names sort chronologically.
            retry = retry / previous_runs[-1]
        else:
            raise ValueError(
                "No previous runs found, do not use the retrylast flag"
            )
        already_processed = [
            f.removesuffix(".zip") for f in os.listdir(retry / "results")
        ]
        file_paths = [
            f
            for f in file_paths
            if Path(f).name.removesuffix(".pdf") not in already_processed
        ]

        logger.debug(f"Number of files after filtering: {len(file_paths)}")
        # NOTE(review): in this branch results are written to the root
        # out_folder rather than the previous run's folder — confirm intended.

    else:
        out_folder = Path(out_folder) / (
            datetime.datetime.now().strftime("%Y-%m-%dT%Hh%Mm%Ss")
        )

    if dpi is not None:
        self.client.config.dpi = int(dpi)

    if debug:
        self.client.debug = debug

    self.client.save_folder = out_folder
    self.client.save_mode = mode
    # Debug mode forces sequential processing for reproducible tracebacks.
    self.client.num_concurrent_files = self.concurrency if not debug else 1
    self.client.num_concurrent_pages = self.concurrency if not debug else 1

    logger.info(f"Processing {len(file_paths)} files with {self.model} converter")

    documents = self.client.batch(file_paths)

    if documents is not None:
        logger.info(f"Processed {len(documents)} documents to {out_folder}")
    else:
        logger.info(f"Processed {len(file_paths)} documents to {out_folder}")

    return documents
|
|
120
|
+
|
|
121
|
+
def get_out_folder(self) -> Path:
    """Return the client's current save folder.

    NOTE(review): annotated Path, but save_folder may hold whatever was
    assigned in parse() (a str in the retrylast branch) — confirm.
    """
    return self.client.save_folder
|
vlmparse/registries.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from collections.abc import Callable
|
|
3
|
+
|
|
4
|
+
from vlmparse.clients.chandra import ChandraConverterConfig, ChandraDockerServerConfig
|
|
5
|
+
from vlmparse.clients.deepseekocr import (
|
|
6
|
+
DeepSeekOCRConverterConfig,
|
|
7
|
+
DeepSeekOCRDockerServerConfig,
|
|
8
|
+
)
|
|
9
|
+
from vlmparse.clients.docling import DoclingConverterConfig, DoclingDockerServerConfig
|
|
10
|
+
from vlmparse.clients.dotsocr import DotsOCRConverterConfig, DotsOCRDockerServerConfig
|
|
11
|
+
from vlmparse.clients.granite_docling import (
|
|
12
|
+
GraniteDoclingConverterConfig,
|
|
13
|
+
GraniteDoclingDockerServerConfig,
|
|
14
|
+
)
|
|
15
|
+
from vlmparse.clients.hunyuanocr import (
|
|
16
|
+
HunyuanOCRConverterConfig,
|
|
17
|
+
HunyuanOCRDockerServerConfig,
|
|
18
|
+
)
|
|
19
|
+
from vlmparse.clients.lightonocr import (
|
|
20
|
+
LightOnOCRConverterConfig,
|
|
21
|
+
LightOnOCRDockerServerConfig,
|
|
22
|
+
)
|
|
23
|
+
from vlmparse.clients.mineru import MinerUConverterConfig, MinerUDockerServerConfig
|
|
24
|
+
from vlmparse.clients.nanonetocr import (
|
|
25
|
+
NanonetOCR2ConverterConfig,
|
|
26
|
+
NanonetOCR2DockerServerConfig,
|
|
27
|
+
)
|
|
28
|
+
from vlmparse.clients.olmocr import OlmOCRConverterConfig, OlmOCRDockerServerConfig
|
|
29
|
+
from vlmparse.clients.openai_converter import LLMParams, OpenAIConverterConfig
|
|
30
|
+
from vlmparse.clients.paddleocrvl import (
|
|
31
|
+
PaddleOCRVLConverterConfig,
|
|
32
|
+
PaddleOCRVLDockerServerConfig,
|
|
33
|
+
)
|
|
34
|
+
from vlmparse.servers.docker_server import DEFAULT_MODEL_NAME, docker_config_registry
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def get_default(cls, field_name):
|
|
38
|
+
field_info = cls.model_fields.get(field_name)
|
|
39
|
+
if field_info is None:
|
|
40
|
+
return [] if field_name == "aliases" else None
|
|
41
|
+
if field_info.default_factory:
|
|
42
|
+
return field_info.default_factory()
|
|
43
|
+
return field_info.default
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
for server_config_cls in [
|
|
47
|
+
ChandraDockerServerConfig,
|
|
48
|
+
LightOnOCRDockerServerConfig,
|
|
49
|
+
DotsOCRDockerServerConfig,
|
|
50
|
+
PaddleOCRVLDockerServerConfig,
|
|
51
|
+
NanonetOCR2DockerServerConfig,
|
|
52
|
+
HunyuanOCRDockerServerConfig,
|
|
53
|
+
DoclingDockerServerConfig,
|
|
54
|
+
OlmOCRDockerServerConfig,
|
|
55
|
+
MinerUDockerServerConfig,
|
|
56
|
+
DeepSeekOCRDockerServerConfig,
|
|
57
|
+
GraniteDoclingDockerServerConfig,
|
|
58
|
+
]:
|
|
59
|
+
aliases = get_default(server_config_cls, "aliases") or []
|
|
60
|
+
model_name = get_default(server_config_cls, "model_name")
|
|
61
|
+
names = [n for n in aliases + [model_name] if isinstance(n, str)]
|
|
62
|
+
for name in names:
|
|
63
|
+
docker_config_registry.register(name, lambda cls=server_config_cls: cls())
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class ConverterConfigRegistry:
|
|
67
|
+
"""Registry for mapping model names to their Docker configurations."""
|
|
68
|
+
|
|
69
|
+
def __init__(self):
|
|
70
|
+
self._registry = dict()
|
|
71
|
+
|
|
72
|
+
def register(
|
|
73
|
+
self,
|
|
74
|
+
model_name: str,
|
|
75
|
+
config_factory: Callable[[str], OpenAIConverterConfig | None],
|
|
76
|
+
):
|
|
77
|
+
"""Register a config factory for a model name."""
|
|
78
|
+
self._registry[model_name] = config_factory
|
|
79
|
+
|
|
80
|
+
def get(
|
|
81
|
+
self, model_name: str, uri: str | None = None
|
|
82
|
+
) -> OpenAIConverterConfig | None:
|
|
83
|
+
"""Get config for a model name. Returns default if not registered."""
|
|
84
|
+
if model_name in self._registry:
|
|
85
|
+
return self._registry[model_name](uri=uri)
|
|
86
|
+
# Fallback to OpenAIConverterConfig for unregistered models
|
|
87
|
+
if uri is not None:
|
|
88
|
+
return OpenAIConverterConfig(
|
|
89
|
+
llm_params=LLMParams(base_url=uri, model_name=model_name)
|
|
90
|
+
)
|
|
91
|
+
return OpenAIConverterConfig(llm_params=LLMParams(model_name=model_name))
|
|
92
|
+
|
|
93
|
+
def list_models(self) -> list[str]:
|
|
94
|
+
"""List all registered model names."""
|
|
95
|
+
return list(self._registry.keys())
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
# Global registry instance
|
|
99
|
+
converter_config_registry = ConverterConfigRegistry()
|
|
100
|
+
GOOGLE_API_BASE_URL = (
|
|
101
|
+
os.getenv("GOOGLE_API_BASE_URL")
|
|
102
|
+
or "https://generativelanguage.googleapis.com/v1beta/openai/"
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
for gemini_model in [
|
|
107
|
+
"gemini-2.5-pro",
|
|
108
|
+
"gemini-2.5-flash",
|
|
109
|
+
"gemini-2.5-flash-lite",
|
|
110
|
+
"gemini-3-pro-preview",
|
|
111
|
+
]:
|
|
112
|
+
converter_config_registry.register(
|
|
113
|
+
gemini_model,
|
|
114
|
+
lambda uri=None, model=gemini_model: OpenAIConverterConfig(
|
|
115
|
+
llm_params=LLMParams(
|
|
116
|
+
model_name=model,
|
|
117
|
+
base_url=GOOGLE_API_BASE_URL if uri is None else uri,
|
|
118
|
+
api_key=os.getenv("GOOGLE_API_KEY"),
|
|
119
|
+
)
|
|
120
|
+
),
|
|
121
|
+
)
|
|
122
|
+
for openai_model in [
|
|
123
|
+
"gpt-5.1",
|
|
124
|
+
"gpt-5.1-mini",
|
|
125
|
+
"gpt-5.1-nano",
|
|
126
|
+
"gpt-5",
|
|
127
|
+
"gpt-5-mini",
|
|
128
|
+
"gpt-5-nano",
|
|
129
|
+
]:
|
|
130
|
+
converter_config_registry.register(
|
|
131
|
+
openai_model,
|
|
132
|
+
lambda uri=None, model=openai_model: OpenAIConverterConfig(
|
|
133
|
+
llm_params=LLMParams(
|
|
134
|
+
model_name=model,
|
|
135
|
+
base_url=None,
|
|
136
|
+
api_key=os.getenv("OPENAI_API_KEY"),
|
|
137
|
+
)
|
|
138
|
+
),
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
for converter_config_cls in [
|
|
142
|
+
ChandraConverterConfig,
|
|
143
|
+
LightOnOCRConverterConfig,
|
|
144
|
+
DotsOCRConverterConfig,
|
|
145
|
+
PaddleOCRVLConverterConfig,
|
|
146
|
+
NanonetOCR2ConverterConfig,
|
|
147
|
+
HunyuanOCRConverterConfig,
|
|
148
|
+
DeepSeekOCRConverterConfig,
|
|
149
|
+
GraniteDoclingConverterConfig,
|
|
150
|
+
OlmOCRConverterConfig,
|
|
151
|
+
]:
|
|
152
|
+
aliases = get_default(converter_config_cls, "aliases") or []
|
|
153
|
+
model_name = get_default(converter_config_cls, "model_name")
|
|
154
|
+
names = [n for n in aliases + [model_name] if isinstance(n, str)]
|
|
155
|
+
for name in names:
|
|
156
|
+
converter_config_registry.register(
|
|
157
|
+
name,
|
|
158
|
+
lambda uri, cls=converter_config_cls: cls(
|
|
159
|
+
llm_params=LLMParams(
|
|
160
|
+
base_url=uri,
|
|
161
|
+
model_name=DEFAULT_MODEL_NAME,
|
|
162
|
+
api_key="",
|
|
163
|
+
)
|
|
164
|
+
),
|
|
165
|
+
)
|
|
166
|
+
for converter_config_cls in [MinerUConverterConfig, DoclingConverterConfig]:
|
|
167
|
+
aliases = get_default(converter_config_cls, "aliases") or []
|
|
168
|
+
model_name = get_default(converter_config_cls, "model_name")
|
|
169
|
+
names = [n for n in aliases + [model_name] if isinstance(n, str)]
|
|
170
|
+
for name in names:
|
|
171
|
+
converter_config_registry.register(
|
|
172
|
+
name,
|
|
173
|
+
lambda uri, cls=converter_config_cls: cls(base_url=uri),
|
|
174
|
+
)
|
vlmparse/utils.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import os
|
|
3
|
+
from glob import glob
|
|
4
|
+
from io import BytesIO
|
|
5
|
+
|
|
6
|
+
from loguru import logger
|
|
7
|
+
from PIL import Image
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def to_base64(image: Image, extension="PNG"):
|
|
11
|
+
img_byte_arr = BytesIO()
|
|
12
|
+
image.save(img_byte_arr, format=extension)
|
|
13
|
+
img_byte_arr = img_byte_arr.getvalue()
|
|
14
|
+
return base64.b64encode(img_byte_arr).decode("utf-8")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def from_base64(base64_str: str):
|
|
18
|
+
image_data = base64.b64decode(base64_str)
|
|
19
|
+
return Image.open(BytesIO(image_data))
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def get_file_paths(inputs: str | list[str]):
|
|
23
|
+
# Expand file paths from glob patterns
|
|
24
|
+
file_paths = []
|
|
25
|
+
if isinstance(inputs, str):
|
|
26
|
+
inputs = [inputs]
|
|
27
|
+
for pattern in inputs:
|
|
28
|
+
if "*" in pattern or "?" in pattern:
|
|
29
|
+
file_paths.extend(glob(pattern, recursive=True))
|
|
30
|
+
elif os.path.isdir(pattern):
|
|
31
|
+
file_paths.extend(glob(os.path.join(pattern, "*.pdf"), recursive=True))
|
|
32
|
+
elif os.path.isfile(pattern):
|
|
33
|
+
file_paths.append(pattern)
|
|
34
|
+
else:
|
|
35
|
+
logger.error(f"Invalid input: {pattern}")
|
|
36
|
+
file_paths = [f for f in file_paths if os.path.exists(f) and f.endswith(".pdf")]
|
|
37
|
+
|
|
38
|
+
if not file_paths:
|
|
39
|
+
logger.error("No PDF files found matching the inputs patterns")
|
|
40
|
+
|
|
41
|
+
return file_paths
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vlmparse
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Requires-Python: >=3.12.0
|
|
5
|
+
Description-Content-Type: text/markdown
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Dist: devtools>=0.12.2
|
|
8
|
+
Requires-Dist: docker>=7.1.0
|
|
9
|
+
Requires-Dist: html-to-markdown>=1.9.0
|
|
10
|
+
Requires-Dist: httpx>=0.27.0
|
|
11
|
+
Requires-Dist: loguru>=0.7.3
|
|
12
|
+
Requires-Dist: nest-asyncio>=1.6.0
|
|
13
|
+
Requires-Dist: numpy>=2.3.2
|
|
14
|
+
Requires-Dist: openai>=1.102.0
|
|
15
|
+
Requires-Dist: orjson>=3.11.3
|
|
16
|
+
Requires-Dist: pillow>=11.3.0
|
|
17
|
+
Requires-Dist: pydantic
|
|
18
|
+
Requires-Dist: pypdfium2>=4.30.0
|
|
19
|
+
Requires-Dist: fire>=0.7.1
|
|
20
|
+
Requires-Dist: lxml>=6.0.2
|
|
21
|
+
Requires-Dist: tabulate>=0.9.0
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: jupyter; extra == "dev"
|
|
24
|
+
Provides-Extra: docling-core
|
|
25
|
+
Requires-Dist: docling-core; extra == "docling-core"
|
|
26
|
+
Provides-Extra: st-app
|
|
27
|
+
Requires-Dist: streamlit>=1.49.0; extra == "st-app"
|
|
28
|
+
Provides-Extra: bench
|
|
29
|
+
Requires-Dist: html-to-markdown>=1.9.0; extra == "bench"
|
|
30
|
+
Requires-Dist: loguru>=0.7.3; extra == "bench"
|
|
31
|
+
Requires-Dist: nest-asyncio>=1.6.0; extra == "bench"
|
|
32
|
+
Requires-Dist: numpy>=2.3.2; extra == "bench"
|
|
33
|
+
Requires-Dist: pillow>=11.3.0; extra == "bench"
|
|
34
|
+
Requires-Dist: pydantic; extra == "bench"
|
|
35
|
+
Requires-Dist: rapidfuzz>=3.14.0; extra == "bench"
|
|
36
|
+
Requires-Dist: unidecode>=1.4.0; extra == "bench"
|
|
37
|
+
Requires-Dist: fire>=0.7.1; extra == "bench"
|
|
38
|
+
Requires-Dist: lxml>=6.0.2; extra == "bench"
|
|
39
|
+
Requires-Dist: datasets>=4.4.1; extra == "bench"
|
|
40
|
+
Requires-Dist: openpyxl>=3.1.5; extra == "bench"
|
|
41
|
+
Requires-Dist: joblib>=1.5.2; extra == "bench"
|
|
42
|
+
Requires-Dist: playwright; extra == "bench"
|
|
43
|
+
Requires-Dist: fuzzysearch>=0.8.1; extra == "bench"
|
|
44
|
+
Provides-Extra: test
|
|
45
|
+
Requires-Dist: pre-commit; extra == "test"
|
|
46
|
+
Requires-Dist: pytest; extra == "test"
|
|
47
|
+
Requires-Dist: pytest-cov; extra == "test"
|
|
48
|
+
Requires-Dist: pytest-mock; extra == "test"
|
|
49
|
+
Requires-Dist: ruff; extra == "test"
|
|
50
|
+
Requires-Dist: isort; extra == "test"
|
|
51
|
+
Requires-Dist: pre-commit; extra == "test"
|
|
52
|
+
Dynamic: license-file
|
|
53
|
+
|
|
54
|
+
# vlmparse
|
|
55
|
+
|
|
56
|
+
A unified wrapper for Vision Language Models (VLM) and OCR solutions to parse PDF documents into Markdown.
|
|
57
|
+
|
|
58
|
+
Features:
|
|
59
|
+
|
|
60
|
+
- ⚡ Async/concurrent processing for high throughput
|
|
61
|
+
- 🐳 Automatic Docker server management for local models
|
|
62
|
+
- 🔄 Unified interface across all VLM/OCR providers
|
|
63
|
+
- 📊 Built-in result visualization with Streamlit
|
|
64
|
+
|
|
65
|
+
Supported Converters:
|
|
66
|
+
|
|
67
|
+
- **Open Source Small VLMs**: `lightonocr`, `mineru2.5`, `hunyuanocr`, `paddleocrvl`, `granite-docling`, `olmocr2-fp8`, `dotsocr`, `chandra`, `deepseekocr`, `nanonets/Nanonets-OCR2-3B`
|
|
68
|
+
- **Open Source Generalist VLMs**: such as the Qwen family.
|
|
69
|
+
- **Pipelines**: `docling`
|
|
70
|
+
- **Proprietary LLMs**: `gemini`, `gpt`
|
|
71
|
+
|
|
72
|
+
## Installation
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
uv sync
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
With optional dependencies:
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
uv sync --all-extras
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Activate the virtual environment:
|
|
85
|
+
```bash
|
|
86
|
+
source .venv/bin/activate
|
|
87
|
+
```
|
|
88
|
+
Other solution: append uv run to all the commands below.
|
|
89
|
+
|
|
90
|
+
## CLI Usage
|
|
91
|
+
|
|
92
|
+
### Convert PDFs
|
|
93
|
+
|
|
94
|
+
With a general VLM (requires setting your api key as an environment variable):
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
vlmparse convert --input "*.pdf" --out_folder ./output --model gemini-2.5-flash-lite
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
Convert with auto deployment of a small vlm (or any huggingface VLM model, requires a gpu + docker installation):
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
vlmparse convert --input "*.pdf" --out_folder ./output --model nanonets/Nanonets-OCR2-3B
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Deploy a local model server
|
|
107
|
+
|
|
108
|
+
Deployment (requires a gpu + docker installation):
|
|
109
|
+
- You need a gpu dedicated for this.
|
|
110
|
+
- Check that the port is not used by another service.
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
vlmparse serve --model lightonocr --port 8000 --gpus 1
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
then convert:
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
vlmparse convert --input "*.pdf" --out_folder ./output --model lightonocr --uri http://localhost:8000/v1
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
You can also list all running servers:
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
vlmparse list
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
Show logs of a server (if only one server is running, the container name is not needed):
|
|
129
|
+
```bash
|
|
130
|
+
vlmparse log <container_name>
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
Stop a server (if only one server is running, the container name is not needed):
|
|
134
|
+
```bash
|
|
135
|
+
vlmparse stop <container_name>
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### View conversion results with Streamlit
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
vlmparse view ./output
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## Configuration
|
|
145
|
+
|
|
146
|
+
Set API keys as environment variables:
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
export GOOGLE_API_KEY="your-key"
|
|
150
|
+
export OPENAI_API_KEY="your-key"
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## Python API
|
|
154
|
+
|
|
155
|
+
Client interface:
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
from vlmparse.registries import converter_config_registry
|
|
159
|
+
|
|
160
|
+
# Get a converter configuration
|
|
161
|
+
config = converter_config_registry.get("gemini-2.5-flash-lite")
|
|
162
|
+
client = config.get_client()
|
|
163
|
+
|
|
164
|
+
# Convert a single PDF
|
|
165
|
+
document = client("path/to/document.pdf")
|
|
166
|
+
print(document.to_markdown())
|
|
167
|
+
|
|
168
|
+
# Batch convert multiple PDFs
|
|
169
|
+
documents = client.batch(["file1.pdf", "file2.pdf"])
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
Docker server interface:
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
from vlmparse.registries import docker_config_registry
|
|
176
|
+
|
|
177
|
+
config = docker_config_registry.get("lightonocr")
|
|
178
|
+
server = config.get_server()
|
|
179
|
+
server.start()
|
|
180
|
+
|
|
181
|
+
# Client calls...
|
|
182
|
+
|
|
183
|
+
server.stop()
|
|
184
|
+
```
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
vlmparse/base_model.py,sha256=4U4UPe8SNArliKnUf8pp8zQugWYsnhg9okylt7mrW1U,381
|
|
2
|
+
vlmparse/build_doc.py,sha256=9evdU6GwVAZU15dZ1Qght6hNo_QxBQN8X3gmYdU2ltg,1965
|
|
3
|
+
vlmparse/cli.py,sha256=tQma1IkOsFnqPKqqHVO1PJh18n1w82gp4ewA7oraJkE,15855
|
|
4
|
+
vlmparse/converter.py,sha256=5wTA_cFyDMDSY8YgLzZV9SVBKmHjEbJCW8KPoJjmVVA,6880
|
|
5
|
+
vlmparse/converter_with_server.py,sha256=9yoqfv8akB0xZZ7Snjq3aHW5NPNam2AgbK7_rfFqNkk,3909
|
|
6
|
+
vlmparse/registries.py,sha256=TdSR1fx1Tz3roGk4Tk5ckIK6Iz-e4UD4erWUk96fFpQ,5846
|
|
7
|
+
vlmparse/utils.py,sha256=jZWbNMwpZSZL--ZzvL8wPG_7mwpw9Pi36qTO9TjvHZU,1239
|
|
8
|
+
vlmparse-0.1.0.dist-info/licenses/LICENSE,sha256=3TKJHk8hPBR5dbLWZ3IpfCftl-_m-iyBwpYQGZYxj14,1080
|
|
9
|
+
vlmparse-0.1.0.dist-info/METADATA,sha256=4xxtT6rE3pJyqfqbVjl8Llj7C5Az99TeusxXmMHzMMQ,4788
|
|
10
|
+
vlmparse-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
11
|
+
vlmparse-0.1.0.dist-info/entry_points.txt,sha256=gD5berP6HwE2wNIkls-Lw5goiceA8uMgPEd7ifnFJXs,47
|
|
12
|
+
vlmparse-0.1.0.dist-info/top_level.txt,sha256=k4ni-GNH_iAX7liQEsk_KY_c3xgZgt8k9fsSs9IXLXs,9
|
|
13
|
+
vlmparse-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2022 WisTex TechSero Ltd. Co.
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
vlmparse
|