vlmparse 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. vlmparse/cli.py +26 -96
  2. vlmparse/clients/chandra.py +1 -1
  3. vlmparse/clients/deepseekocr.py +51 -51
  4. vlmparse/clients/docling.py +2 -2
  5. vlmparse/clients/dotsocr.py +20 -7
  6. vlmparse/clients/hunyuanocr.py +2 -1
  7. vlmparse/clients/mineru.py +18 -19
  8. vlmparse/clients/olmocr.py +1 -1
  9. vlmparse/clients/openai_converter.py +14 -4
  10. vlmparse/clients/paddleocrvl.py +2 -1
  11. vlmparse/converter_with_server.py +38 -11
  12. vlmparse/data_model/document.py +11 -1
  13. vlmparse/registries.py +3 -7
  14. vlmparse/servers/docker_server.py +16 -2
  15. vlmparse/servers/utils.py +3 -2
  16. {vlmparse-0.1.4.dist-info → vlmparse-0.1.6.dist-info}/METADATA +22 -6
  17. vlmparse-0.1.6.dist-info/RECORD +36 -0
  18. vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +0 -1763
  19. vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
  20. vlmparse/benchpdf2md/create_dataset.py +0 -60
  21. vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +0 -1
  22. vlmparse/benchpdf2md/olmocrbench/katex/render.py +0 -592
  23. vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +0 -175
  24. vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +0 -256
  25. vlmparse/benchpdf2md/olmocrbench/tests.py +0 -1334
  26. vlmparse/benchpdf2md/run_benchmark.py +0 -296
  27. vlmparse/benchpdf2md/st_visu_benchmark/app.py +0 -271
  28. vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +0 -117
  29. vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +0 -95
  30. vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +0 -20
  31. vlmparse/benchpdf2md/st_visu_benchmark/utils.py +0 -50
  32. vlmparse/benchpdf2md/utils.py +0 -56
  33. vlmparse-0.1.4.dist-info/RECORD +0 -51
  34. {vlmparse-0.1.4.dist-info → vlmparse-0.1.6.dist-info}/WHEEL +0 -0
  35. {vlmparse-0.1.4.dist-info → vlmparse-0.1.6.dist-info}/entry_points.txt +0 -0
  36. {vlmparse-0.1.4.dist-info → vlmparse-0.1.6.dist-info}/licenses/LICENSE +0 -0
  37. {vlmparse-0.1.4.dist-info → vlmparse-0.1.6.dist-info}/top_level.txt +0 -0
vlmparse/cli.py CHANGED
@@ -4,13 +4,24 @@ from loguru import logger
 
 
 class DParseCLI:
-    def serve(self, model: str, port: int | None = None, gpus: str | None = None):
+    """Parse PDFs to text using VLMs: type vlmparse to get the command list, then `vlmparse <command> --help` for help on a specific command."""
+
+    def serve(
+        self,
+        model: str,
+        port: int | None = None,
+        gpus: str | None = None,
+        vllm_kwargs: dict | None = None,
+        forget_predefined_vllm_kwargs: bool = False,
+    ):
         """Deploy a VLLM server in a Docker container.
 
         Args:
             model: Model name
             port: VLLM server port (default: 8056)
             gpus: Comma-separated GPU device IDs (e.g., "0" or "0,1,2"). If not specified, all GPUs will be used.
+            vllm_kwargs: Additional keyword arguments to pass to the VLLM server.
+            forget_predefined_vllm_kwargs: If True, the predefined VLLM kwargs from the docker config are replaced by vllm_kwargs; otherwise the predefined kwargs are updated with vllm_kwargs, with a risk of argument-name collisions.
         """
         if port is None:
             port = 8056
@@ -32,6 +43,10 @@ class DParseCLI:
         docker_config.gpu_device_ids = [g.strip() for g in str(gpus).split(",")]
         server = docker_config.get_server(auto_stop=False)
 
+        if server is None:
+            logger.error(f"Model server not found for model: {model}")
+            return
+
         # Deploy server and leave it running (cleanup=False)
         logger.info(
             f"Deploying VLLM server for {docker_config.model_name} on port {port}..."
@@ -54,6 +69,8 @@ class DParseCLI:
         with_vllm_server: bool = False,
         concurrency: int = 10,
         dpi: int | None = None,
+        vllm_kwargs: dict | None = None,
+        debug: bool = False,
     ):
         """Parse PDF documents and save results.
 
@@ -67,109 +84,22 @@ class DParseCLI:
             mode: Output mode - "document" (save as JSON zip), "md" (save as markdown file), "md_page" (save as folder of markdown pages)
             with_vllm_server: If True, a local VLLM server will be deployed if the model is not found in the registry. Note that if the model is in the registry and the uri is None, the server will be deployed anyway.
             dpi: DPI to use for the conversion. If not specified, the default DPI will be used.
+            vllm_kwargs: Additional keyword arguments to pass to the VLLM server.
+            debug: If True, run in debug mode (single-threaded, no concurrency).
         """
         from vlmparse.converter_with_server import ConverterWithServer
 
-        converter_with_server = ConverterWithServer(
+        with ConverterWithServer(
             model=model,
             uri=uri,
             gpus=gpus,
             with_vllm_server=with_vllm_server,
             concurrency=concurrency,
-        )
-
-        return converter_with_server.parse(
-            inputs=inputs, out_folder=out_folder, mode=mode, dpi=dpi
-        )
-        # from vlmparse.registries import converter_config_registry
-
-        # # Infer model from URI if provided
-        # if uri is not None and model is None:
-        #     import docker
-
-        #     try:
-        #         docker_client = docker.from_env()
-        #         containers = docker_client.containers.list()
-        #         for container in containers:
-        #             # Check both exact match and match with/without trailing slash
-        #             container_uri = container.labels.get("vlmparse_uri", "")
-        #             if container_uri and (
-        #                 container_uri == uri
-        #                 or container_uri.rstrip("/") == uri.rstrip("/")
-        #             ):
-        #                 inferred_model = container.labels.get("vlmparse_model_name")
-        #                 if inferred_model:
-        #                     logger.info(
-        #                         f"Inferred model {inferred_model} from URI {uri}"
-        #                     )
-        #                     model = inferred_model
-        #                     break
-        #     except Exception:
-        #         # If Docker is not available or fails, just proceed with provided arguments
-        #         pass
-
-        # if mode not in ["document", "md", "md_page"]:
-        #     logger.error(f"Invalid mode: {mode}. Must be one of: document, md, md_page")
-        #     return
-
-        # # Expand file paths from glob patterns
-        # file_paths = []
-        # if isinstance(inputs, str):
-        #     inputs = [inputs]
-        # for pattern in inputs:
-        #     if "*" in pattern or "?" in pattern:
-        #         file_paths.extend(glob(pattern, recursive=True))
-        #     elif os.path.isdir(pattern):
-        #         file_paths.extend(glob(os.path.join(pattern, "*.pdf"), recursive=True))
-        #     elif os.path.isfile(pattern):
-        #         file_paths.append(pattern)
-        #     else:
-        #         logger.error(f"Invalid input: {pattern}")
-
-        # # Filter to only existing PDF files
-        # file_paths = [f for f in file_paths if os.path.exists(f) and f.endswith(".pdf")]
-
-        # if not file_paths:
-        #     logger.error("No PDF files found matching the inputs patterns")
-        #     return
-
-        # logger.info(f"Processing {len(file_paths)} files with {model} converter")
-
-        # gpu_device_ids = None
-        # if gpus is not None:
-        #     gpu_device_ids = [g.strip() for g in gpus.split(",")]
-
-        # if uri is None:
-        #     from vlmparse.registries import docker_config_registry
-
-        #     docker_config = docker_config_registry.get(model, default=with_vllm_server)
-
-        #     if docker_config is not None:
-        #         docker_config.gpu_device_ids = gpu_device_ids
-        #         server = docker_config.get_server(auto_stop=True)
-        #         server.start()
-
-        #         client = docker_config.get_client(
-        #             save_folder=out_folder, save_mode=mode
-        #         )
-        #     else:
-        #         client = converter_config_registry.get(model).get_client(
-        #             save_folder=out_folder, save_mode=mode
-        #         )
-
-        # else:
-        #     client_config = converter_config_registry.get(model, uri=uri)
-        #     client = client_config.get_client(save_folder=out_folder, save_mode=mode)
-        # client.num_concurrent_files = concurrency
-        # client.num_concurrent_pages = concurrency
-        # if dpi is not None:
-        #     client.config.dpi = int(dpi)
-        # documents = client.batch(file_paths)
-
-        # if documents is not None:
-        #     logger.info(f"Processed {len(documents)} documents to {out_folder}")
-        # else:
-        #     logger.info(f"Processed {len(file_paths)} documents to {out_folder}")
+            vllm_kwargs=vllm_kwargs,
+        ) as converter_with_server:
+            return converter_with_server.parse(
+                inputs=inputs, out_folder=out_folder, mode=mode, dpi=dpi, debug=debug
+            )
 
     def list(self):
         """List all containers whose name begins with vlmparse."""
vlmparse/clients/chandra.py CHANGED
@@ -194,7 +194,7 @@ class ChandraConverterConfig(OpenAIConverterConfig):
     model_name: str = "datalab-to/chandra"
     prompt_type: str = "ocr"  # Default prompt type
     bbox_scale: int = 1024
-    max_retries: int = 6
+    max_retries: int = 0
     max_failure_retries: int = None
     completion_kwargs: dict = Field(
         default_factory=lambda: {
vlmparse/clients/deepseekocr.py CHANGED
@@ -15,6 +15,57 @@ from vlmparse.servers.docker_server import VLLMDockerServerConfig
 from vlmparse.utils import to_base64
 
 
+class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
+    """Configuration for DeepSeekOCR model."""
+
+    model_name: str = "deepseek-ai/DeepSeek-OCR"
+    command_args: list[str] = Field(
+        default_factory=lambda: [
+            "--limit-mm-per-prompt",
+            '{"image": 1}',
+            "--async-scheduling",
+            "--logits_processors",
+            "vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor",
+            "--no-enable-prefix-caching",
+            "--mm-processor-cache-gb",
+            "0",
+        ]
+    )
+    aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
+
+    @property
+    def client_config(self):
+        return DeepSeekOCRConverterConfig(llm_params=self.llm_params)
+
+
+class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
+    """DeepSeekOCR converter - backward compatibility alias."""
+
+    model_name: str = "deepseek-ai/DeepSeek-OCR"
+    aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
+
+    prompt_mode: Literal["layout", "ocr"] = "ocr"
+    completion_kwargs: dict | None = {
+        "temperature": 0.0,
+        "max_tokens": 8181,
+        "extra_body": {
+            "skip_special_tokens": False,
+            # args used to control custom logits processor
+            "vllm_xargs": {
+                "ngram_size": 30,
+                "window_size": 90,
+                # whitelist: <td>, </td>
+                "whitelist_token_ids": [128821, 128822],
+            },
+        },
+    }
+    dpi: int = 200
+    aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
+
+    def get_client(self, **kwargs) -> "DeepSeekOCRConverterClient":
+        return DeepSeekOCRConverterClient(config=self, **kwargs)
+
+
 def re_match(text):
     pattern = r"(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)"
     matches = re.findall(pattern, text, re.DOTALL)
@@ -150,54 +201,3 @@ class DeepSeekOCRConverterClient(OpenAIConverterClient):
         logger.debug(page.text)
 
         return page
-
-
-class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
-    """Configuration for DeepSeekOCR model."""
-
-    model_name: str = "deepseek-ai/DeepSeek-OCR"
-    command_args: list[str] = Field(
-        default_factory=lambda: [
-            "--limit-mm-per-prompt",
-            '{"image": 1}',
-            "--async-scheduling",
-            "--logits_processors",
-            "vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor",
-            "--no-enable-prefix-caching",
-            "--mm-processor-cache-gb",
-            "0",
-        ]
-    )
-    aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
-
-    @property
-    def client_config(self):
-        return DeepSeekOCRConverterConfig(llm_params=self.llm_params)
-
-
-class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
-    """DeepSeekOCR converter - backward compatibility alias."""
-
-    model_name: str = "deepseek-ai/DeepSeek-OCR"
-    aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
-
-    prompt_mode: Literal["layout", "ocr"] = "ocr"
-    completion_kwargs: dict | None = {
-        "temperature": 0.0,
-        "max_tokens": 8181,
-        "extra_body": {
-            "skip_special_tokens": False,
-            # args used to control custom logits processor
-            "vllm_xargs": {
-                "ngram_size": 30,
-                "window_size": 90,
-                # whitelist: <td>, </td>
-                "whitelist_token_ids": [128821, 128822],
-            },
-        },
-    }
-    dpi: int = 200
-    aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
-
-    def get_client(self, **kwargs) -> "DeepSeekOCRConverterClient":
-        return DeepSeekOCRConverterClient(config=self, **kwargs)
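The relocated `DeepSeekOCRConverterConfig` shows how the per-request n-gram repetition guard is driven through vLLM's OpenAI-compatible API: everything under `extra_body` is forwarded to the server, and `vllm_xargs` reaches the `NGramPerReqLogitsProcessor` registered via `--logits_processors`. A minimal sketch of the equivalent raw request, assuming a local server started from the config above; the port, API key, and prompt text are illustrative assumptions:

```python
# Sketch only: the completion_kwargs above, expressed as a direct call to
# a vLLM OpenAI-compatible endpoint. base_url, api_key, and the prompt
# string are placeholders, not values taken from this diff.
from openai import AsyncOpenAI

client = AsyncOpenAI(base_url="http://localhost:8056/v1", api_key="EMPTY")

async def ocr_page(image_b64: str) -> str:
    response = await client.chat.completions.create(
        model="deepseek-ai/DeepSeek-OCR",
        messages=[{
            "role": "user",
            "content": [
                {"type": "image_url",
                 "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
                {"type": "text", "text": "Free OCR."},  # assumed prompt
            ],
        }],
        temperature=0.0,
        max_tokens=8181,
        extra_body={
            "skip_special_tokens": False,
            # per-request knobs consumed by NGramPerReqLogitsProcessor
            "vllm_xargs": {
                "ngram_size": 30,
                "window_size": 90,
                "whitelist_token_ids": [128821, 128822],  # <td>, </td>
            },
        },
    )
    return response.choices[0].message.content
```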
vlmparse/clients/docling.py CHANGED
@@ -34,7 +34,7 @@ class DoclingDockerServerConfig(DockerServerConfig):
             "LOG_LEVEL": "DEBUG",  # Enable verbose logging
             # Performance Tuning
             # "UVICORN_WORKERS": "4",  # Increase web server workers (Default: 1)
-            # "DOCLING_SERVE_ENG_LOC_NUM_WORKERS": "4",  # Increase processing workers (Default: 2)
+            "DOCLING_SERVE_ENG_LOC_NUM_WORKERS": "16",  # Increase processing workers (Default: 2)
             "DOCLING_NUM_THREADS": "32",  # Increase torch threads (Default: 4)
         }
     )
@@ -62,8 +62,8 @@ class DoclingDockerServerConfig(DockerServerConfig):
 class DoclingConverterConfig(ConverterConfig):
     """Configuration for Docling converter client."""
 
+    base_url: str
     model_name: str = "docling"
-    base_url: str = "http://localhost:5001"
     timeout: int = 300
     api_kwargs: dict = {"output_format": "markdown", "image_export_mode": "referenced"}
 
vlmparse/clients/dotsocr.py CHANGED
@@ -8,6 +8,7 @@ from PIL import Image
 from pydantic import Field
 
 from vlmparse.clients.openai_converter import (
+    LLMParams,
     OpenAIConverterClient,
     OpenAIConverterConfig,
 )
@@ -28,6 +29,7 @@ class DotsOCRDockerServerConfig(DockerServerConfig):
     dockerfile_dir: str = str(DOCKERFILE_DIR / "dotsocr")
     command_args: list[str] = Field(
         default_factory=lambda: [
+            "/workspace/weights/DotsOCR",
            "--tensor-parallel-size",
             "1",
             "--gpu-memory-utilization",
@@ -44,12 +46,19 @@ class DotsOCRDockerServerConfig(DockerServerConfig):
             # "16384",
         ]
     )
-    add_model_key_to_server: bool = False
+    add_model_key_to_server: bool = True
     aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
 
     @property
     def client_config(self):
-        return DotsOCRConverterConfig(llm_params=self.llm_params)
+        return DotsOCRConverterConfig(
+            llm_params=LLMParams(
+                base_url=f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}",
+            )
+        )
+
+    def get_base_url_suffix(self) -> str:
+        return "/v1"
 
 
 class DotsOCRConverterConfig(OpenAIConverterConfig):
@@ -228,7 +237,7 @@ class DotsOCRConverter(OpenAIConverterClient):
         )
         prompt = self.PROMPTS[prompt_mode]
 
-        response = await self._async_inference_with_vllm(image, prompt)
+        response, usage = await self._async_inference_with_vllm(image, prompt)
 
         if prompt_mode in ["prompt_layout_all_en"]:
             try:
@@ -239,17 +248,17 @@ class DotsOCRConverter(OpenAIConverterClient):
                     image.width,
                     image.height,
                 )
-                return {}, cells, False
+                return {}, cells, False, usage
             except Exception as e:
                 logger.warning(f"cells post process error: {e}, returning raw response")
-                return {}, response, True
+                return {}, response, True, usage
         else:
-            return {}, response, None
+            return {}, response, None, usage
 
     async def async_call_inside_page(self, page: Page) -> Page:
         image = page.image
 
-        _, response, _ = await self._parse_image_vllm(
+        _, response, _, usage = await self._parse_image_vllm(
             image, prompt_mode=self.config.prompt_mode
         )
         logger.info("Response: " + str(response))
@@ -274,4 +283,8 @@ class DotsOCRConverter(OpenAIConverterClient):
         text = clean_response(response)
         text = html_to_md_keep_tables(text)
         page.text = text
+
+        page.completion_tokens = usage.completion_tokens
+        page.prompt_tokens = usage.prompt_tokens
+        page.reasoning_tokens = usage.reasoning_tokens
         return page
vlmparse/clients/hunyuanocr.py CHANGED
@@ -39,7 +39,8 @@ class HunyuanOCRConverterConfig(OpenAIConverterConfig):
     completion_kwargs: dict | None = {
         "temperature": 0.0,
         "extra_body": {"top_k": 1, "repetition_penalty": 1.0},
+        "max_completion_tokens": 16384,  # max token length used in training according to the technical report is 32000, but in practice the model breaks earlier
     }
-    max_image_size: int | None = 1540
     dpi: int = 200
     aliases: list[str] = Field(default_factory=lambda: ["hunyuanocr"])
+    stream: bool = True
vlmparse/clients/mineru.py CHANGED
@@ -1,6 +1,5 @@
 import asyncio
 import io
-import os
 
 import orjson
 from loguru import logger
@@ -20,18 +19,21 @@ class MinerUDockerServerConfig(DockerServerConfig):
     docker_image: str = "pulsia/mineru25apipulsia:latest"
     docker_port: int = 4299
     container_port: int = 8000
+    server_ready_indicators: list[str] = Field(
+        default_factory=lambda: ["Uvicorn running"]
+    )
 
     @property
     def client_config(self):
-        return MinerUConverterConfig(api_url=f"http://localhost:{self.docker_port}")
+        return MinerUConverterConfig(base_url=f"http://localhost:{self.docker_port}")
 
 
 class MinerUConverterConfig(ConverterConfig):
     """Configuration for MinerU API converter."""
 
-    base_url: str = Field(
-        default_factory=lambda: os.getenv("MINERU_API_URL", "http://localhost:4299")
-    )
+    base_url: str
+    model_name: str = "opendatalab/MinerU2.5-2509-1.2B"
+    aliases: list[str] = Field(default_factory=lambda: ["mineru25"])
     timeout: int = 600
 
     def get_client(self, **kwargs) -> "MinerUConverter":
@@ -50,25 +52,22 @@ class MinerUConverter(BaseConverter):
 
     config: MinerUConverterConfig
 
-    def __init__(self, config: MinerUConverterConfig, **kwargs):
-        super().__init__(config=config, **kwargs)
-        from httpx import AsyncClient
-
-        self.client = AsyncClient(base_url=config.api_url, timeout=config.timeout)
-
     async def _async_inference_with_api(self, image) -> list:
         """Run async inference with MinerU API."""
+        from httpx import AsyncClient
 
-        img_byte_arr = await asyncio.to_thread(to_bytes_io, image)
-
-        response = await self.client.post(
-            "process-image",
-            files={"image": ("image.png", img_byte_arr, "image/png")},
-        )
+        async with AsyncClient(
+            base_url=self.config.base_url, timeout=self.config.timeout
+        ) as client:
+            img_byte_arr = await asyncio.to_thread(to_bytes_io, image)
+            response = await client.post(
+                "process-image",
+                files={"image": ("image.png", img_byte_arr, "image/png")},
+            )
 
-        response.raise_for_status()
+            response.raise_for_status()
 
-        res = orjson.loads(response.content)
+            res = orjson.loads(response.content)
 
         return res
 
vlmparse/clients/olmocr.py CHANGED
@@ -41,6 +41,6 @@ class OlmOCRConverterConfig(OpenAIConverterConfig):
         "temperature": 0.1,
         "max_tokens": 8000,
     }
-    max_image_size: int | None = 1288
+    # max_image_size: int | None = 1288
     dpi: int = 200
     aliases: list[str] = Field(default_factory=lambda: ["olmocr-2-fp8"])
vlmparse/clients/openai_converter.py CHANGED
@@ -40,6 +40,8 @@ def get_llm_params(model_name: str, uri: str | None = None):
     ]:
         base_url = None
         api_key = os.getenv("OPENAI_API_KEY")
+        if api_key is None:
+            raise ValueError("OPENAI_API_KEY environment variable not set")
     else:
         if model_name in [
             "gemini-2.5-flash-lite",
@@ -48,6 +50,8 @@ def get_llm_params(model_name: str, uri: str | None = None):
         ]:
             base_url = GOOGLE_API_BASE_URL
             api_key = os.getenv("GOOGLE_API_KEY")
+            if api_key is None:
+                raise ValueError("GOOGLE_API_KEY environment variable not set")
         else:
             return None
     return LLMParams(base_url=base_url, model_name=model_name, api_key=api_key)
@@ -92,11 +96,12 @@ class OpenAIConverterClient(BaseConverter):
             base_url=self.config.llm_params.base_url,
             api_key=self.config.llm_params.api_key,
             timeout=self.config.llm_params.timeout,
+            max_retries=self.config.llm_params.max_retries,
         )
 
     async def _get_chat_completion(
         self, messages: list[dict], completion_kwargs: dict | None = None
-    ) -> str:
+    ) -> tuple[str, "CompletionUsage"]:  # noqa: F821
        """Helper to handle chat completion with optional streaming."""
         if completion_kwargs is None:
             completion_kwargs = self.config.completion_kwargs
@@ -125,7 +130,8 @@ class OpenAIConverterClient(BaseConverter):
                 "Response is None, finish reason: "
                 + response_obj.choices[0].finish_reason
             )
-        return response_obj.choices[0].message.content
+
+        return response_obj.choices[0].message.content, response_obj.usage
 
     async def async_call_inside_page(self, page: Page) -> Page:
         """Process a single page using OpenAI-compatible API."""
@@ -162,12 +168,16 @@ class OpenAIConverterClient(BaseConverter):
             },
         ]
 
-        response = await self._get_chat_completion(messages)
-        logger.info("Response: " + str(response))
+        response, usage = await self._get_chat_completion(messages)
+        logger.debug("Response: " + str(response))
         page.raw_response = response
         text = clean_response(response)
 
         text = html_to_md_keep_tables(text)
         page.text = text
+        page.prompt_tokens = usage.prompt_tokens
+        page.completion_tokens = usage.completion_tokens
+        if hasattr(usage, "reasoning_tokens"):
+            page.reasoning_tokens = usage.reasoning_tokens
 
         return page
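Changing `_get_chat_completion` from returning `str` to returning `(content, usage)` is a breaking change for every caller, which is why `DotsOCRConverter` above gained a fourth tuple element. A sketch of the updated calling convention; `client`, `page`, and `messages` stand in for the real objects, and only the `Page` token fields are taken from the diff:

```python
# Sketch: consuming the new (text, usage) return value. The hasattr guard
# mirrors the diff, since not every OpenAI-compatible server reports
# reasoning tokens on the usage object.
async def convert_page(client, page, messages: list[dict]):
    response, usage = await client._get_chat_completion(messages)

    page.raw_response = response
    page.prompt_tokens = usage.prompt_tokens
    page.completion_tokens = usage.completion_tokens
    if hasattr(usage, "reasoning_tokens"):
        page.reasoning_tokens = usage.reasoning_tokens
    return page
```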
vlmparse/clients/paddleocrvl.py CHANGED
@@ -42,7 +42,8 @@ class PaddleOCRVLConverterConfig(OpenAIConverterConfig):
     postprompt: str | None = TASKS["ocr"]
     completion_kwargs: dict | None = {
         "temperature": 0.0,
+        "max_completion_tokens": 16384,
     }
-    max_image_size: int | None = 1540
     dpi: int = 200
     aliases: list[str] = Field(default_factory=lambda: ["paddleocrvl"])
+    stream: bool = True
vlmparse/converter_with_server.py CHANGED
@@ -12,28 +12,35 @@ from vlmparse.utils import get_file_paths
 class ConverterWithServer:
     def __init__(
         self,
-        model: str,
+        model: str | None = None,
         uri: str | None = None,
         gpus: str | None = None,
         port: int | None = None,
         with_vllm_server: bool = False,
         concurrency: int = 10,
+        vllm_kwargs: dict | None = None,
+        forget_predefined_vllm_kwargs: bool = False,
     ):
-        from vlmparse.registries import (
-            converter_config_registry,
-            docker_config_registry,
-        )
-
         self.model = model
         self.uri = uri
         self.port = port
         self.gpus = gpus
         self.with_vllm_server = with_vllm_server
         self.concurrency = concurrency
+        self.vllm_kwargs = vllm_kwargs
+        self.forget_predefined_vllm_kwargs = forget_predefined_vllm_kwargs
+        self.server = None
+        self.client = None
 
         if self.uri is not None and self.model is None:
             self.model = get_model_from_uri(self.uri)
 
+    def start_server_and_client(self):
+        from vlmparse.registries import (
+            converter_config_registry,
+            docker_config_registry,
+        )
+
         gpu_device_ids = None
         if self.gpus is not None:
             gpu_device_ids = [g.strip() for g in self.gpus.split(",")]
@@ -42,13 +49,18 @@ class ConverterWithServer:
             docker_config = docker_config_registry.get(
                 self.model, default=self.with_vllm_server
             )
-            if self.port is not None:
-                docker_config.docker_port = self.port
 
             if docker_config is not None:
+                if self.port is not None:
+                    docker_config.docker_port = self.port
                 docker_config.gpu_device_ids = gpu_device_ids
-                server = docker_config.get_server(auto_stop=True)
-                server.start()
+                docker_config.update_command_args(
+                    self.vllm_kwargs,
+                    forget_predefined_vllm_kwargs=self.forget_predefined_vllm_kwargs,
+                )
+                self.server = docker_config.get_server(auto_stop=True)
+
+                self.server.start()
 
                 self.client = docker_config.get_client()
             else:
@@ -56,8 +68,20 @@ class ConverterWithServer:
 
         else:
             client_config = converter_config_registry.get(self.model, uri=self.uri)
+
             self.client = client_config.get_client()
 
+    def stop_server(self):
+        if self.server is not None and self.server.auto_stop:
+            self.server.stop()
+
+    def __enter__(self):
+        self.start_server_and_client()
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.stop_server()
+
     def parse(
         self,
         inputs: str | list[str],
@@ -67,6 +91,9 @@ class ConverterWithServer:
         debug: bool = False,
         retrylast: bool = False,
     ):
+        assert (
+            self.client is not None
+        ), "Client not initialized. Call start_server_and_client() first."
         file_paths = get_file_paths(inputs)
         assert (
             out_folder is not None
@@ -118,5 +145,5 @@ class ConverterWithServer:
 
         return documents
 
-    def get_out_folder(self) -> Path:
+    def get_out_folder(self) -> str | None:
         return self.client.save_folder
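With `__enter__`/`__exit__` in place, server startup and teardown are tied to a `with` block rather than object construction, and `parse` now asserts that `start_server_and_client()` has run. A usage sketch; the registry alias, the flag names inside `vllm_kwargs`, and the paths are illustrative assumptions:

```python
# Sketch of the new context-manager lifecycle. __enter__ calls
# start_server_and_client(); __exit__ stops the server when auto_stop is set.
from vlmparse.converter_with_server import ConverterWithServer

with ConverterWithServer(
    model="deepseekocr",            # registry alias from this diff
    gpus="0",
    with_vllm_server=True,
    vllm_kwargs={"gpu-memory-utilization": "0.9"},  # assumed key convention
) as converter:
    documents = converter.parse(
        inputs=["reports/*.pdf"],   # globs resolved by get_file_paths
        out_folder="parsed",
        mode="md",
    )
```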