vlmparse 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vlmparse/cli.py +26 -96
- vlmparse/clients/chandra.py +1 -1
- vlmparse/clients/deepseekocr.py +51 -51
- vlmparse/clients/docling.py +2 -2
- vlmparse/clients/dotsocr.py +20 -7
- vlmparse/clients/hunyuanocr.py +2 -1
- vlmparse/clients/mineru.py +18 -19
- vlmparse/clients/olmocr.py +1 -1
- vlmparse/clients/openai_converter.py +14 -4
- vlmparse/clients/paddleocrvl.py +2 -1
- vlmparse/converter_with_server.py +38 -11
- vlmparse/data_model/document.py +11 -1
- vlmparse/registries.py +3 -7
- vlmparse/servers/docker_server.py +16 -2
- vlmparse/servers/utils.py +3 -2
- {vlmparse-0.1.4.dist-info → vlmparse-0.1.6.dist-info}/METADATA +22 -6
- vlmparse-0.1.6.dist-info/RECORD +36 -0
- vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +0 -1763
- vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
- vlmparse/benchpdf2md/create_dataset.py +0 -60
- vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +0 -1
- vlmparse/benchpdf2md/olmocrbench/katex/render.py +0 -592
- vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +0 -175
- vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +0 -256
- vlmparse/benchpdf2md/olmocrbench/tests.py +0 -1334
- vlmparse/benchpdf2md/run_benchmark.py +0 -296
- vlmparse/benchpdf2md/st_visu_benchmark/app.py +0 -271
- vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +0 -117
- vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +0 -95
- vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +0 -20
- vlmparse/benchpdf2md/st_visu_benchmark/utils.py +0 -50
- vlmparse/benchpdf2md/utils.py +0 -56
- vlmparse-0.1.4.dist-info/RECORD +0 -51
- {vlmparse-0.1.4.dist-info → vlmparse-0.1.6.dist-info}/WHEEL +0 -0
- {vlmparse-0.1.4.dist-info → vlmparse-0.1.6.dist-info}/entry_points.txt +0 -0
- {vlmparse-0.1.4.dist-info → vlmparse-0.1.6.dist-info}/licenses/LICENSE +0 -0
- {vlmparse-0.1.4.dist-info → vlmparse-0.1.6.dist-info}/top_level.txt +0 -0
vlmparse/cli.py
CHANGED
@@ -4,13 +4,24 @@ from loguru import logger
 
 
 class DParseCLI:
-
+    """Parsing of pdf to text using VLMs: typ in vlmparse to get the command lists, then `vlmparse <command> --help` to get help on a specific command."""
+
+    def serve(
+        self,
+        model: str,
+        port: int | None = None,
+        gpus: str | None = None,
+        vllm_kwargs: dict | None = None,
+        forget_predefined_vllm_kwargs: bool = False,
+    ):
         """Deploy a VLLM server in a Docker container.
 
         Args:
             model: Model name
             port: VLLM server port (default: 8056)
             gpus: Comma-separated GPU device IDs (e.g., "0" or "0,1,2"). If not specified, all GPUs will be used.
+            vllm_kwargs: Additional keyword arguments to pass to the VLLM server.
+            forget_predefined_vllm_kwargs: If True, the predefined VLLM kwargs from the docker config will be replaced by vllm_kwargs otherwise the predefined kwargs will be updated with vllm_kwargs with a risk of collision of argument names.
         """
         if port is None:
             port = 8056
@@ -32,6 +43,10 @@ class DParseCLI:
         docker_config.gpu_device_ids = [g.strip() for g in str(gpus).split(",")]
         server = docker_config.get_server(auto_stop=False)
 
+        if server is None:
+            logger.error(f"Model server not found for model: {model}")
+            return
+
         # Deploy server and leave it running (cleanup=False)
         logger.info(
             f"Deploying VLLM server for {docker_config.model_name} on port {port}..."
@@ -54,6 +69,8 @@ class DParseCLI:
         with_vllm_server: bool = False,
         concurrency: int = 10,
         dpi: int | None = None,
+        vllm_kwargs: dict | None = None,
+        debug: bool = False,
     ):
         """Parse PDF documents and save results.
 
@@ -67,109 +84,22 @@ class DParseCLI:
             mode: Output mode - "document" (save as JSON zip), "md" (save as markdown file), "md_page" (save as folder of markdown pages)
             with_vllm_server: If True, a local VLLM server will be deployed if the model is not found in the registry. Note that if the model is in the registry and the uri is None, the server will be anyway deployed.
             dpi: DPI to use for the conversion. If not specified, the default DPI will be used.
+            vllm_kwargs: Additional keyword arguments to pass to the VLLM server.
+            debug: If True, run in debug mode (single-threaded, no concurrency)
         """
         from vlmparse.converter_with_server import ConverterWithServer
 
-
+        with ConverterWithServer(
             model=model,
             uri=uri,
             gpus=gpus,
            with_vllm_server=with_vllm_server,
            concurrency=concurrency,
-
-
-
-
-
-        # from vlmparse.registries import converter_config_registry
-
-        # # Infer model from URI if provided
-        # if uri is not None and model is None:
-        #     import docker
-
-        #     try:
-        #         docker_client = docker.from_env()
-        #         containers = docker_client.containers.list()
-        #         for container in containers:
-        #             # Check both exact match and match with/without trailing slash
-        #             container_uri = container.labels.get("vlmparse_uri", "")
-        #             if container_uri and (
-        #                 container_uri == uri
-        #                 or container_uri.rstrip("/") == uri.rstrip("/")
-        #             ):
-        #                 inferred_model = container.labels.get("vlmparse_model_name")
-        #                 if inferred_model:
-        #                     logger.info(
-        #                         f"Inferred model {inferred_model} from URI {uri}"
-        #                     )
-        #                     model = inferred_model
-        #                     break
-        #     except Exception:
-        #         # If Docker is not available or fails, just proceed with provided arguments
-        #         pass
-
-        # if mode not in ["document", "md", "md_page"]:
-        #     logger.error(f"Invalid mode: {mode}. Must be one of: document, md, md_page")
-        #     return
-
-        # # Expand file paths from glob patterns
-        # file_paths = []
-        # if isinstance(inputs, str):
-        #     inputs = [inputs]
-        # for pattern in inputs:
-        #     if "*" in pattern or "?" in pattern:
-        #         file_paths.extend(glob(pattern, recursive=True))
-        #     elif os.path.isdir(pattern):
-        #         file_paths.extend(glob(os.path.join(pattern, "*.pdf"), recursive=True))
-        #     elif os.path.isfile(pattern):
-        #         file_paths.append(pattern)
-        #     else:
-        #         logger.error(f"Invalid input: {pattern}")
-
-        # # Filter to only existing PDF files
-        # file_paths = [f for f in file_paths if os.path.exists(f) and f.endswith(".pdf")]
-
-        # if not file_paths:
-        #     logger.error("No PDF files found matching the inputs patterns")
-        #     return
-
-        # logger.info(f"Processing {len(file_paths)} files with {model} converter")
-
-        # gpu_device_ids = None
-        # if gpus is not None:
-        #     gpu_device_ids = [g.strip() for g in gpus.split(",")]
-
-        # if uri is None:
-        #     from vlmparse.registries import docker_config_registry
-
-        #     docker_config = docker_config_registry.get(model, default=with_vllm_server)
-
-        #     if docker_config is not None:
-        #         docker_config.gpu_device_ids = gpu_device_ids
-        #         server = docker_config.get_server(auto_stop=True)
-        #         server.start()
-
-        #         client = docker_config.get_client(
-        #             save_folder=out_folder, save_mode=mode
-        #         )
-        #     else:
-        #         client = converter_config_registry.get(model).get_client(
-        #             save_folder=out_folder, save_mode=mode
-        #         )
-
-        # else:
-        #     client_config = converter_config_registry.get(model, uri=uri)
-        #     client = client_config.get_client(save_folder=out_folder, save_mode=mode)
-        # client.num_concurrent_files = concurrency
-        # client.num_concurrent_pages = concurrency
-        # if dpi is not None:
-        #     client.config.dpi = int(dpi)
-        # documents = client.batch(file_paths)
-
-        # if documents is not None:
-        #     logger.info(f"Processed {len(documents)} documents to {out_folder}")
-        # else:
-        #     logger.info(f"Processed {len(file_paths)} documents to {out_folder}")
+            vllm_kwargs=vllm_kwargs,
+        ) as converter_with_server:
+            return converter_with_server.parse(
+                inputs=inputs, out_folder=out_folder, mode=mode, dpi=dpi, debug=debug
+            )
 
     def list(self):
         """List all containers whose name begins with vlmparse."""
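The rewritten `parse` command above is now a thin wrapper around `ConverterWithServer` used as a context manager. A minimal sketch of the equivalent Python-level call, mirroring the new implementation; the model alias, glob pattern, output folder, and the key format inside `vllm_kwargs` are illustrative assumptions rather than values taken from the package:

```python
from vlmparse.converter_with_server import ConverterWithServer

# Sketch: what the new `parse` command does under the hood. __enter__ starts
# the Docker/VLLM server when one is needed, __exit__ stops it again.
with ConverterWithServer(
    model="deepseekocr",                            # a registered alias (assumed to be accepted here)
    with_vllm_server=True,                          # deploy a local VLLM server if the model has a docker config
    vllm_kwargs={"gpu-memory-utilization": "0.9"},  # extra server args; exact key format is an assumption
) as converter_with_server:
    documents = converter_with_server.parse(
        inputs="./pdfs/*.pdf",                      # illustrative glob
        out_folder="./parsed",                      # illustrative output folder
        mode="md",
        debug=False,
    )
```

Per the new `serve` docstring, passing `forget_predefined_vllm_kwargs=True` makes `vllm_kwargs` replace the predefined server arguments from the docker config instead of being merged into them.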
vlmparse/clients/chandra.py
CHANGED
@@ -194,7 +194,7 @@ class ChandraConverterConfig(OpenAIConverterConfig):
     model_name: str = "datalab-to/chandra"
     prompt_type: str = "ocr"  # Default prompt type
     bbox_scale: int = 1024
-    max_retries: int =
+    max_retries: int = 0
     max_failure_retries: int = None
     completion_kwargs: dict = Field(
         default_factory=lambda: {
vlmparse/clients/deepseekocr.py
CHANGED
@@ -15,6 +15,57 @@ from vlmparse.servers.docker_server import VLLMDockerServerConfig
 from vlmparse.utils import to_base64
 
 
+class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
+    """Configuration for DeepSeekOCR model."""
+
+    model_name: str = "deepseek-ai/DeepSeek-OCR"
+    command_args: list[str] = Field(
+        default_factory=lambda: [
+            "--limit-mm-per-prompt",
+            '{"image": 1}',
+            "--async-scheduling",
+            "--logits_processors",
+            "vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor",
+            "--no-enable-prefix-caching",
+            "--mm-processor-cache-gb",
+            "0",
+        ]
+    )
+    aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
+
+    @property
+    def client_config(self):
+        return DeepSeekOCRConverterConfig(llm_params=self.llm_params)
+
+
+class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
+    """DeepSeekOCR converter - backward compatibility alias."""
+
+    model_name: str = "deepseek-ai/DeepSeek-OCR"
+    aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
+
+    prompt_mode: Literal["layout", "ocr"] = "ocr"
+    completion_kwargs: dict | None = {
+        "temperature": 0.0,
+        "max_tokens": 8181,
+        "extra_body": {
+            "skip_special_tokens": False,
+            # args used to control custom logits processor
+            "vllm_xargs": {
+                "ngram_size": 30,
+                "window_size": 90,
+                # whitelist: <td>, </td>
+                "whitelist_token_ids": [128821, 128822],
+            },
+        },
+    }
+    dpi: int = 200
+    aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
+
+    def get_client(self, **kwargs) -> "DeepSeekOCRConverterClient":
+        return DeepSeekOCRConverterClient(config=self, **kwargs)
+
+
 def re_match(text):
     pattern = r"(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)"
     matches = re.findall(pattern, text, re.DOTALL)
@@ -150,54 +201,3 @@ class DeepSeekOCRConverterClient(OpenAIConverterClient):
         logger.debug(page.text)
 
         return page
-
-
-class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
-    """Configuration for DeepSeekOCR model."""
-
-    model_name: str = "deepseek-ai/DeepSeek-OCR"
-    command_args: list[str] = Field(
-        default_factory=lambda: [
-            "--limit-mm-per-prompt",
-            '{"image": 1}',
-            "--async-scheduling",
-            "--logits_processors",
-            "vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor",
-            "--no-enable-prefix-caching",
-            "--mm-processor-cache-gb",
-            "0",
-        ]
-    )
-    aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
-
-    @property
-    def client_config(self):
-        return DeepSeekOCRConverterConfig(llm_params=self.llm_params)
-
-
-class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
-    """DeepSeekOCR converter - backward compatibility alias."""
-
-    model_name: str = "deepseek-ai/DeepSeek-OCR"
-    aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
-
-    prompt_mode: Literal["layout", "ocr"] = "ocr"
-    completion_kwargs: dict | None = {
-        "temperature": 0.0,
-        "max_tokens": 8181,
-        "extra_body": {
-            "skip_special_tokens": False,
-            # args used to control custom logits processor
-            "vllm_xargs": {
-                "ngram_size": 30,
-                "window_size": 90,
-                # whitelist: <td>, </td>
-                "whitelist_token_ids": [128821, 128822],
-            },
-        },
-    }
-    dpi: int = 200
-    aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
-
-    def get_client(self, **kwargs) -> "DeepSeekOCRConverterClient":
-        return DeepSeekOCRConverterClient(config=self, **kwargs)
vlmparse/clients/docling.py
CHANGED
@@ -34,7 +34,7 @@ class DoclingDockerServerConfig(DockerServerConfig):
             "LOG_LEVEL": "DEBUG",  # Enable verbose logging
             # Performance Tuning
             # "UVICORN_WORKERS": "4",  # Increase web server workers (Default: 1)
-
+            "DOCLING_SERVE_ENG_LOC_NUM_WORKERS": "16",  # Increase processing workers (Default: 2)
             "DOCLING_NUM_THREADS": "32",  # Increase torch threads (Default: 4)
         }
     )
@@ -62,8 +62,8 @@ class DoclingDockerServerConfig(DockerServerConfig):
 class DoclingConverterConfig(ConverterConfig):
     """Configuration for Docling converter client."""
 
+    base_url: str
     model_name: str = "docling"
-    base_url: str = "http://localhost:5001"
     timeout: int = 300
     api_kwargs: dict = {"output_format": "markdown", "image_export_mode": "referenced"}
 
vlmparse/clients/dotsocr.py
CHANGED
@@ -8,6 +8,7 @@ from PIL import Image
 from pydantic import Field
 
 from vlmparse.clients.openai_converter import (
+    LLMParams,
     OpenAIConverterClient,
     OpenAIConverterConfig,
 )
@@ -28,6 +29,7 @@ class DotsOCRDockerServerConfig(DockerServerConfig):
     dockerfile_dir: str = str(DOCKERFILE_DIR / "dotsocr")
     command_args: list[str] = Field(
         default_factory=lambda: [
+            "/workspace/weights/DotsOCR",
            "--tensor-parallel-size",
            "1",
            "--gpu-memory-utilization",
@@ -44,12 +46,19 @@ class DotsOCRDockerServerConfig(DockerServerConfig):
             # "16384",
         ]
     )
-    add_model_key_to_server: bool =
+    add_model_key_to_server: bool = True
     aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
 
     @property
     def client_config(self):
-        return DotsOCRConverterConfig(
+        return DotsOCRConverterConfig(
+            llm_params=LLMParams(
+                base_url=f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}",
+            )
+        )
+
+    def get_base_url_suffix(self) -> str:
+        return "/v1"
 
 
 class DotsOCRConverterConfig(OpenAIConverterConfig):
@@ -228,7 +237,7 @@ class DotsOCRConverter(OpenAIConverterClient):
         )
         prompt = self.PROMPTS[prompt_mode]
 
-        response = await self._async_inference_with_vllm(image, prompt)
+        response, usage = await self._async_inference_with_vllm(image, prompt)
 
         if prompt_mode in ["prompt_layout_all_en"]:
             try:
@@ -239,17 +248,17 @@ class DotsOCRConverter(OpenAIConverterClient):
                     image.width,
                     image.height,
                 )
-                return {}, cells, False
+                return {}, cells, False, usage
             except Exception as e:
                 logger.warning(f"cells post process error: {e}, returning raw response")
-                return {}, response, True
+                return {}, response, True, usage
         else:
-            return {}, response, None
+            return {}, response, None, usage
 
     async def async_call_inside_page(self, page: Page) -> Page:
         image = page.image
 
-        _, response, _ = await self._parse_image_vllm(
+        _, response, _, usage = await self._parse_image_vllm(
             image, prompt_mode=self.config.prompt_mode
         )
         logger.info("Response: " + str(response))
@@ -274,4 +283,8 @@ class DotsOCRConverter(OpenAIConverterClient):
         text = clean_response(response)
         text = html_to_md_keep_tables(text)
         page.text = text
+
+        page.completion_tokens = usage.completion_tokens
+        page.prompt_tokens = usage.prompt_tokens
+        page.reasoning_tokens = usage.reasoning_tokens
         return page
vlmparse/clients/hunyuanocr.py
CHANGED
@@ -39,7 +39,8 @@ class HunyuanOCRConverterConfig(OpenAIConverterConfig):
     completion_kwargs: dict | None = {
         "temperature": 0.0,
         "extra_body": {"top_k": 1, "repetition_penalty": 1.0},
+        "max_completion_tokens": 16384,  # max token len used in training according to the technical report is 32000, but in practice the model breaks earlier
     }
-    max_image_size: int | None = 1540
     dpi: int = 200
     aliases: list[str] = Field(default_factory=lambda: ["hunyuanocr"])
+    stream: bool = True
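This config (and the PaddleOCR-VL config further down) now enables `stream: bool = True` together with an explicit `max_completion_tokens` cap. For reference, a generic, minimal sketch of consuming a streamed OpenAI-compatible chat completion from a local server; the base URL, served model name, and message layout are assumptions, not vlmparse internals:

```python
from openai import AsyncOpenAI


async def stream_page_text(image_url: str, prompt: str) -> str:
    # Generic OpenAI-compatible streaming call; local vLLM servers accept any api_key.
    client = AsyncOpenAI(base_url="http://localhost:8056/v1", api_key="EMPTY")
    stream = await client.chat.completions.create(
        model="<served-model-name>",  # placeholder
        messages=[{
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": image_url}},
                {"type": "text", "text": prompt},
            ],
        }],
        temperature=0.0,
        max_completion_tokens=16384,
        stream=True,
    )
    parts: list[str] = []
    async for chunk in stream:
        # Some chunks (e.g. a final usage chunk) carry no choices.
        if chunk.choices and chunk.choices[0].delta.content:
            parts.append(chunk.choices[0].delta.content)
    return "".join(parts)
```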
vlmparse/clients/mineru.py
CHANGED
@@ -1,6 +1,5 @@
 import asyncio
 import io
-import os
 
 import orjson
 from loguru import logger
@@ -20,18 +19,21 @@ class MinerUDockerServerConfig(DockerServerConfig):
     docker_image: str = "pulsia/mineru25apipulsia:latest"
     docker_port: int = 4299
     container_port: int = 8000
+    server_ready_indicators: list[str] = Field(
+        default_factory=lambda: ["Uvicorn running"]
+    )
 
     @property
     def client_config(self):
-        return MinerUConverterConfig(
+        return MinerUConverterConfig(base_url=f"http://localhost:{self.docker_port}")
 
 
 class MinerUConverterConfig(ConverterConfig):
     """Configuration for MinerU API converter."""
 
-    base_url: str
-
-    )
+    base_url: str
+    model_name: str = "opendatalab/MinerU2.5-2509-1.2B"
+    aliases: list[str] = Field(default_factory=lambda: ["mineru25"])
     timeout: int = 600
 
     def get_client(self, **kwargs) -> "MinerUConverter":
@@ -50,25 +52,22 @@ class MinerUConverter(BaseConverter):
 
     config: MinerUConverterConfig
 
-    def __init__(self, config: MinerUConverterConfig, **kwargs):
-        super().__init__(config=config, **kwargs)
-        from httpx import AsyncClient
-
-        self.client = AsyncClient(base_url=config.api_url, timeout=config.timeout)
-
     async def _async_inference_with_api(self, image) -> list:
         """Run async inference with MinerU API."""
+        from httpx import AsyncClient
 
-
-
-
-
-
-
+        async with AsyncClient(
+            base_url=self.config.base_url, timeout=self.config.timeout
+        ) as client:
+            img_byte_arr = await asyncio.to_thread(to_bytes_io, image)
+            response = await client.post(
+                "process-image",
+                files={"image": ("image.png", img_byte_arr, "image/png")},
+            )
 
-
+        response.raise_for_status()
 
-
+        res = orjson.loads(response.content)
 
         return res
 
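The converter no longer builds a long-lived `AsyncClient` in `__init__`; each call to `_async_inference_with_api` now opens and closes its own `httpx.AsyncClient`. A small, generic sketch of that per-request pattern (the helper name is hypothetical; the endpoint and form field mirror the diff above):

```python
import httpx


async def post_page_image(base_url: str, png_bytes: bytes, timeout: float = 600) -> dict:
    # Per-request client: no connection state outlives the single call.
    async with httpx.AsyncClient(base_url=base_url, timeout=timeout) as client:
        response = await client.post(
            "process-image",
            files={"image": ("image.png", png_bytes, "image/png")},
        )
        response.raise_for_status()
        return response.json()
```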
vlmparse/clients/olmocr.py
CHANGED
@@ -41,6 +41,6 @@ class OlmOCRConverterConfig(OpenAIConverterConfig):
         "temperature": 0.1,
         "max_tokens": 8000,
     }
-    max_image_size: int | None = 1288
+    # max_image_size: int | None = 1288
     dpi: int = 200
     aliases: list[str] = Field(default_factory=lambda: ["olmocr-2-fp8"])
vlmparse/clients/openai_converter.py
CHANGED
@@ -40,6 +40,8 @@ def get_llm_params(model_name: str, uri: str | None = None):
     ]:
         base_url = None
         api_key = os.getenv("OPENAI_API_KEY")
+        if api_key is None:
+            raise ValueError("OPENAI_API_KEY environment variable not set")
     else:
         if model_name in [
             "gemini-2.5-flash-lite",
@@ -48,6 +50,8 @@ def get_llm_params(model_name: str, uri: str | None = None):
         ]:
             base_url = GOOGLE_API_BASE_URL
             api_key = os.getenv("GOOGLE_API_KEY")
+            if api_key is None:
+                raise ValueError("GOOGLE_API_KEY environment variable not set")
         else:
             return None
     return LLMParams(base_url=base_url, model_name=model_name, api_key=api_key)
@@ -92,11 +96,12 @@ class OpenAIConverterClient(BaseConverter):
             base_url=self.config.llm_params.base_url,
             api_key=self.config.llm_params.api_key,
             timeout=self.config.llm_params.timeout,
+            max_retries=self.config.llm_params.max_retries,
         )
 
     async def _get_chat_completion(
         self, messages: list[dict], completion_kwargs: dict | None = None
-    ) -> str:
+    ) -> tuple[str, "CompletionUsage"]:  # noqa: F821
         """Helper to handle chat completion with optional streaming."""
         if completion_kwargs is None:
             completion_kwargs = self.config.completion_kwargs
@@ -125,7 +130,8 @@ class OpenAIConverterClient(BaseConverter):
                 "Response is None, finish reason: "
                 + response_obj.choices[0].finish_reason
             )
-
+
+        return response_obj.choices[0].message.content, response_obj.usage
 
     async def async_call_inside_page(self, page: Page) -> Page:
         """Process a single page using OpenAI-compatible API."""
@@ -162,12 +168,16 @@ class OpenAIConverterClient(BaseConverter):
             },
         ]
 
-        response = await self._get_chat_completion(messages)
-        logger.
+        response, usage = await self._get_chat_completion(messages)
+        logger.debug("Response: " + str(response))
         page.raw_response = response
         text = clean_response(response)
 
         text = html_to_md_keep_tables(text)
         page.text = text
+        page.prompt_tokens = usage.prompt_tokens
+        page.completion_tokens = usage.completion_tokens
+        if hasattr(usage, "reasoning_tokens"):
+            page.reasoning_tokens = usage.reasoning_tokens
 
         return page
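`_get_chat_completion` now returns the message content together with the response's `usage` object, and each page records prompt, completion and (when the server reports them) reasoning token counts. A minimal illustration of where those numbers live on an OpenAI-compatible, non-streaming response; the helper name is hypothetical:

```python
from openai import AsyncOpenAI


async def completion_with_usage(client: AsyncOpenAI, model: str, messages: list[dict]):
    response = await client.chat.completions.create(model=model, messages=messages)
    content = response.choices[0].message.content
    usage = response.usage  # CompletionUsage: prompt_tokens, completion_tokens, total_tokens
    # Reasoning tokens are not part of the base schema; guard as the converter does.
    reasoning = getattr(usage, "reasoning_tokens", None)
    return content, usage.prompt_tokens, usage.completion_tokens, reasoning
```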
vlmparse/clients/paddleocrvl.py
CHANGED
@@ -42,7 +42,8 @@ class PaddleOCRVLConverterConfig(OpenAIConverterConfig):
     postprompt: str | None = TASKS["ocr"]
     completion_kwargs: dict | None = {
         "temperature": 0.0,
+        "max_completion_tokens": 16384,
     }
-    max_image_size: int | None = 1540
     dpi: int = 200
     aliases: list[str] = Field(default_factory=lambda: ["paddleocrvl"])
+    stream: bool = True
vlmparse/converter_with_server.py
CHANGED
@@ -12,28 +12,35 @@ from vlmparse.utils import get_file_paths
 class ConverterWithServer:
     def __init__(
         self,
-        model: str,
+        model: str | None = None,
         uri: str | None = None,
         gpus: str | None = None,
         port: int | None = None,
         with_vllm_server: bool = False,
         concurrency: int = 10,
+        vllm_kwargs: dict | None = None,
+        forget_predefined_vllm_kwargs: bool = False,
     ):
-        from vlmparse.registries import (
-            converter_config_registry,
-            docker_config_registry,
-        )
-
         self.model = model
         self.uri = uri
         self.port = port
         self.gpus = gpus
         self.with_vllm_server = with_vllm_server
         self.concurrency = concurrency
+        self.vllm_kwargs = vllm_kwargs
+        self.forget_predefined_vllm_kwargs = forget_predefined_vllm_kwargs
+        self.server = None
+        self.client = None
 
         if self.uri is not None and self.model is None:
             self.model = get_model_from_uri(self.uri)
 
+    def start_server_and_client(self):
+        from vlmparse.registries import (
+            converter_config_registry,
+            docker_config_registry,
+        )
+
         gpu_device_ids = None
         if self.gpus is not None:
             gpu_device_ids = [g.strip() for g in self.gpus.split(",")]
@@ -42,13 +49,18 @@ class ConverterWithServer:
             docker_config = docker_config_registry.get(
                 self.model, default=self.with_vllm_server
             )
-            if self.port is not None:
-                docker_config.docker_port = self.port
 
             if docker_config is not None:
+                if self.port is not None:
+                    docker_config.docker_port = self.port
                 docker_config.gpu_device_ids = gpu_device_ids
-
-
+                docker_config.update_command_args(
+                    self.vllm_kwargs,
+                    forget_predefined_vllm_kwargs=self.forget_predefined_vllm_kwargs,
+                )
+                self.server = docker_config.get_server(auto_stop=True)
+
+                self.server.start()
 
                 self.client = docker_config.get_client()
             else:
@@ -56,8 +68,20 @@ class ConverterWithServer:
 
         else:
             client_config = converter_config_registry.get(self.model, uri=self.uri)
+
             self.client = client_config.get_client()
 
+    def stop_server(self):
+        if self.server is not None and self.server.auto_stop:
+            self.server.stop()
+
+    def __enter__(self):
+        self.start_server_and_client()
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.stop_server()
+
     def parse(
         self,
         inputs: str | list[str],
@@ -67,6 +91,9 @@ class ConverterWithServer:
         debug: bool = False,
         retrylast: bool = False,
     ):
+        assert (
+            self.client is not None
+        ), "Client not initialized. Call start_server_and_client() first."
         file_paths = get_file_paths(inputs)
         assert (
             out_folder is not None
@@ -118,5 +145,5 @@ class ConverterWithServer:
 
         return documents
 
-    def get_out_folder(self) ->
+    def get_out_folder(self) -> str | None:
         return self.client.save_folder
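`ConverterWithServer` is now an explicit context manager: `__enter__` calls `start_server_and_client()` and `__exit__` calls `stop_server()`. Besides the `with` form used by `cli.py`, the split also permits a manual lifecycle; a short sketch under assumed values (URI, file name, output folder):

```python
from vlmparse.converter_with_server import ConverterWithServer

# Manual lifecycle, equivalent to the context-manager form used by the CLI.
converter = ConverterWithServer(uri="http://localhost:8056/v1", concurrency=4)
converter.start_server_and_client()   # with a URI given, only the client is built; no Docker server starts
try:
    converter.parse(inputs=["doc.pdf"], out_folder="./parsed", mode="document")
finally:
    converter.stop_server()           # no-op unless a server was started with auto_stop=True
```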