vlmparse 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vlmparse/cli.py CHANGED
@@ -4,44 +4,42 @@ from loguru import logger
 
 
 class DParseCLI:
-    def serve(self, model: str, port: int | None = None, gpus: str | None = None):
+    """Parse PDFs to text using VLMs: type `vlmparse` to get the command list, then `vlmparse <command> --help` for help on a specific command."""
+
+    def serve(
+        self,
+        model: str,
+        port: int | None = None,
+        gpus: str | None = None,
+        vllm_args: list[str] | None = None,
+        forget_predefined_vllm_args: bool = False,
+    ):
         """Deploy a VLLM server in a Docker container.
 
         Args:
            model: Model name
            port: VLLM server port (default: 8056)
            gpus: Comma-separated GPU device IDs (e.g., "0" or "0,1,2"). If not specified, all GPUs will be used.
+           vllm_args: Additional command-line arguments to pass to the VLLM server.
+           forget_predefined_vllm_args: If True, the predefined VLLM kwargs from the docker config are replaced by vllm_args; otherwise the predefined kwargs are extended with vllm_args, with a risk of argument-name collisions.
        """
-        if port is None:
-            port = 8056
-
-        from vlmparse.registries import docker_config_registry
-
-        docker_config = docker_config_registry.get(model)
-        if docker_config is None:
-            logger.warning(
-                f"No Docker configuration found for model: {model}, using default configuration"
-            )
-            return
-
-        docker_config.docker_port = port
 
-        # Only override GPU configuration if explicitly specified
-        # This preserves CPU-only settings from the config
-        if gpus is not None:
-            docker_config.gpu_device_ids = [g.strip() for g in str(gpus).split(",")]
-        server = docker_config.get_server(auto_stop=False)
+        from vlmparse.converter_with_server import start_server
 
-        # Deploy server and leave it running (cleanup=False)
-        logger.info(
-            f"Deploying VLLM server for {docker_config.model_name} on port {port}..."
+        base_url, container, _, _ = start_server(
+            model=model,
+            gpus=gpus,
+            port=port,
+            with_vllm_server=True,
+            vllm_args=vllm_args,
+            forget_predefined_vllm_args=forget_predefined_vllm_args,
+            auto_stop=False,
        )
 
-        base_url, container = server.start()
-
        logger.info(f"✓ VLLM server ready at {base_url}")
-        logger.info(f"✓ Container ID: {container.id}")
-        logger.info(f"✓ Container name: {container.name}")
+        if container is not None:
+            logger.info(f"✓ Container ID: {container.id}")
+            logger.info(f"✓ Container name: {container.name}")
 
     def convert(
        self,
@@ -54,6 +52,7 @@ class DParseCLI:
        with_vllm_server: bool = False,
        concurrency: int = 10,
        dpi: int | None = None,
+        debug: bool = False,
    ):
        """Parse PDF documents and save results.
 
@@ -67,109 +66,20 @@ class DParseCLI:
            mode: Output mode - "document" (save as JSON zip), "md" (save as markdown file), "md_page" (save as folder of markdown pages)
            with_vllm_server: If True, a local VLLM server will be deployed if the model is not found in the registry. Note that if the model is in the registry and the uri is None, the server will be deployed anyway.
            dpi: DPI to use for the conversion. If not specified, the default DPI will be used.
+            debug: If True, run in debug mode (single-threaded, no concurrency)
        """
        from vlmparse.converter_with_server import ConverterWithServer
 
-        converter_with_server = ConverterWithServer(
+        with ConverterWithServer(
            model=model,
            uri=uri,
            gpus=gpus,
            with_vllm_server=with_vllm_server,
            concurrency=concurrency,
-        )
-
-        return converter_with_server.parse(
-            inputs=inputs, out_folder=out_folder, mode=mode, dpi=dpi
-        )
-        # from vlmparse.registries import converter_config_registry
-
-        # # Infer model from URI if provided
-        # if uri is not None and model is None:
-        #     import docker
-
-        #     try:
-        #         docker_client = docker.from_env()
-        #         containers = docker_client.containers.list()
-        #         for container in containers:
-        #             # Check both exact match and match with/without trailing slash
-        #             container_uri = container.labels.get("vlmparse_uri", "")
-        #             if container_uri and (
-        #                 container_uri == uri
-        #                 or container_uri.rstrip("/") == uri.rstrip("/")
-        #             ):
-        #                 inferred_model = container.labels.get("vlmparse_model_name")
-        #                 if inferred_model:
-        #                     logger.info(
-        #                         f"Inferred model {inferred_model} from URI {uri}"
-        #                     )
-        #                     model = inferred_model
-        #                     break
-        #     except Exception:
-        #         # If Docker is not available or fails, just proceed with provided arguments
-        #         pass
-
-        # if mode not in ["document", "md", "md_page"]:
-        #     logger.error(f"Invalid mode: {mode}. Must be one of: document, md, md_page")
-        #     return
-
-        # # Expand file paths from glob patterns
-        # file_paths = []
-        # if isinstance(inputs, str):
-        #     inputs = [inputs]
-        # for pattern in inputs:
-        #     if "*" in pattern or "?" in pattern:
-        #         file_paths.extend(glob(pattern, recursive=True))
-        #     elif os.path.isdir(pattern):
-        #         file_paths.extend(glob(os.path.join(pattern, "*.pdf"), recursive=True))
-        #     elif os.path.isfile(pattern):
-        #         file_paths.append(pattern)
-        #     else:
-        #         logger.error(f"Invalid input: {pattern}")
-
-        # # Filter to only existing PDF files
-        # file_paths = [f for f in file_paths if os.path.exists(f) and f.endswith(".pdf")]
-
-        # if not file_paths:
-        #     logger.error("No PDF files found matching the inputs patterns")
-        #     return
-
-        # logger.info(f"Processing {len(file_paths)} files with {model} converter")
-
-        # gpu_device_ids = None
-        # if gpus is not None:
-        #     gpu_device_ids = [g.strip() for g in gpus.split(",")]
-
-        # if uri is None:
-        #     from vlmparse.registries import docker_config_registry
-
-        #     docker_config = docker_config_registry.get(model, default=with_vllm_server)
-
-        #     if docker_config is not None:
-        #         docker_config.gpu_device_ids = gpu_device_ids
-        #         server = docker_config.get_server(auto_stop=True)
-        #         server.start()
-
-        #         client = docker_config.get_client(
-        #             save_folder=out_folder, save_mode=mode
-        #         )
-        #     else:
-        #         client = converter_config_registry.get(model).get_client(
-        #             save_folder=out_folder, save_mode=mode
-        #         )
-
-        # else:
-        #     client_config = converter_config_registry.get(model, uri=uri)
-        #     client = client_config.get_client(save_folder=out_folder, save_mode=mode)
-        # client.num_concurrent_files = concurrency
-        # client.num_concurrent_pages = concurrency
-        # if dpi is not None:
-        #     client.config.dpi = int(dpi)
-        # documents = client.batch(file_paths)
-
-        # if documents is not None:
-        #     logger.info(f"Processed {len(documents)} documents to {out_folder}")
-        # else:
-        #     logger.info(f"Processed {len(file_paths)} documents to {out_folder}")
+        ) as converter_with_server:
+            return converter_with_server.parse(
+                inputs=inputs, out_folder=out_folder, mode=mode, dpi=dpi, debug=debug
+            )
 
    def list(self):
        """List all containers whose name begins with vlmparse."""
vlmparse/clients/chandra.py CHANGED
@@ -194,7 +194,7 @@ class ChandraConverterConfig(OpenAIConverterConfig):
     model_name: str = "datalab-to/chandra"
     prompt_type: str = "ocr"  # Default prompt type
     bbox_scale: int = 1024
-    max_retries: int = 6
+    max_retries: int = 0
     max_failure_retries: int = None
     completion_kwargs: dict = Field(
         default_factory=lambda: {
vlmparse/clients/deepseekocr.py CHANGED
@@ -15,6 +15,57 @@ from vlmparse.servers.docker_server import VLLMDockerServerConfig
 from vlmparse.utils import to_base64
 
 
+class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
+    """Configuration for DeepSeekOCR model."""
+
+    model_name: str = "deepseek-ai/DeepSeek-OCR"
+    command_args: list[str] = Field(
+        default_factory=lambda: [
+            "--limit-mm-per-prompt",
+            '{"image": 1}',
+            "--async-scheduling",
+            "--logits_processors",
+            "vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor",
+            "--no-enable-prefix-caching",
+            "--mm-processor-cache-gb",
+            "0",
+        ]
+    )
+    aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
+
+    @property
+    def client_config(self):
+        return DeepSeekOCRConverterConfig(llm_params=self.llm_params)
+
+
+class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
+    """DeepSeekOCR converter - backward compatibility alias."""
+
+    model_name: str = "deepseek-ai/DeepSeek-OCR"
+    aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
+
+    prompt_mode: Literal["layout", "ocr"] = "ocr"
+    completion_kwargs: dict | None = {
+        "temperature": 0.0,
+        "max_tokens": 8181,
+        "extra_body": {
+            "skip_special_tokens": False,
+            # args used to control custom logits processor
+            "vllm_xargs": {
+                "ngram_size": 30,
+                "window_size": 90,
+                # whitelist: <td>, </td>
+                "whitelist_token_ids": [128821, 128822],
+            },
+        },
+    }
+    dpi: int = 200
+    aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
+
+    def get_client(self, **kwargs) -> "DeepSeekOCRConverterClient":
+        return DeepSeekOCRConverterClient(config=self, **kwargs)
+
+
 def re_match(text):
     pattern = r"(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)"
     matches = re.findall(pattern, text, re.DOTALL)
@@ -150,54 +201,3 @@ class DeepSeekOCRConverterClient(OpenAIConverterClient):
         logger.debug(page.text)
 
         return page
-
-
-class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
-    """Configuration for DeepSeekOCR model."""
-
-    model_name: str = "deepseek-ai/DeepSeek-OCR"
-    command_args: list[str] = Field(
-        default_factory=lambda: [
-            "--limit-mm-per-prompt",
-            '{"image": 1}',
-            "--async-scheduling",
-            "--logits_processors",
-            "vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor",
-            "--no-enable-prefix-caching",
-            "--mm-processor-cache-gb",
-            "0",
-        ]
-    )
-    aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
-
-    @property
-    def client_config(self):
-        return DeepSeekOCRConverterConfig(llm_params=self.llm_params)
-
-
-class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
-    """DeepSeekOCR converter - backward compatibility alias."""
-
-    model_name: str = "deepseek-ai/DeepSeek-OCR"
-    aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
-
-    prompt_mode: Literal["layout", "ocr"] = "ocr"
-    completion_kwargs: dict | None = {
-        "temperature": 0.0,
-        "max_tokens": 8181,
-        "extra_body": {
-            "skip_special_tokens": False,
-            # args used to control custom logits processor
-            "vllm_xargs": {
-                "ngram_size": 30,
-                "window_size": 90,
-                # whitelist: <td>, </td>
-                "whitelist_token_ids": [128821, 128822],
-            },
-        },
-    }
-    dpi: int = 200
-    aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
-
-    def get_client(self, **kwargs) -> "DeepSeekOCRConverterClient":
-        return DeepSeekOCRConverterClient(config=self, **kwargs)
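
A minimal sketch of wiring the relocated classes together, assuming the server config's defaults (including its `llm_params`) are sufficient to construct it:

```python
# Sketch: the docker server config exposes its matching client config,
# which in turn builds the converter client.
from vlmparse.clients.deepseekocr import DeepSeekOCRDockerServerConfig

server_config = DeepSeekOCRDockerServerConfig()
client_config = server_config.client_config  # DeepSeekOCRConverterConfig
client = client_config.get_client()          # DeepSeekOCRConverterClient
```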
vlmparse/clients/dotsocr.py CHANGED
@@ -237,7 +237,7 @@ class DotsOCRConverter(OpenAIConverterClient):
         )
         prompt = self.PROMPTS[prompt_mode]
 
-        response = await self._async_inference_with_vllm(image, prompt)
+        response, usage = await self._async_inference_with_vllm(image, prompt)
 
         if prompt_mode in ["prompt_layout_all_en"]:
             try:
@@ -248,17 +248,17 @@ class DotsOCRConverter(OpenAIConverterClient):
                     image.width,
                     image.height,
                 )
-                return {}, cells, False
+                return {}, cells, False, usage
             except Exception as e:
                 logger.warning(f"cells post process error: {e}, returning raw response")
-                return {}, response, True
+                return {}, response, True, usage
         else:
-            return {}, response, None
+            return {}, response, None, usage
 
     async def async_call_inside_page(self, page: Page) -> Page:
         image = page.image
 
-        _, response, _ = await self._parse_image_vllm(
+        _, response, _, usage = await self._parse_image_vllm(
             image, prompt_mode=self.config.prompt_mode
         )
         logger.info("Response: " + str(response))
@@ -283,4 +283,8 @@ class DotsOCRConverter(OpenAIConverterClient):
         text = clean_response(response)
         text = html_to_md_keep_tables(text)
         page.text = text
+
+        page.completion_tokens = usage.completion_tokens
+        page.prompt_tokens = usage.prompt_tokens
+        page.reasoning_tokens = usage.reasoning_tokens
         return page
vlmparse/clients/granite_docling.py CHANGED
@@ -34,6 +34,7 @@ class GraniteDoclingDockerServerConfig(VLLMDockerServerConfig):
 class GraniteDoclingConverterConfig(OpenAIConverterConfig):
     """Granite Docling converter configuration."""
 
+    model_name: str = "ibm-granite/granite-docling-258M"
     preprompt: str | None = None
     postprompt: str | None = "Convert this page to docling."
     completion_kwargs: dict | None = {
vlmparse/clients/hunyuanocr.py CHANGED
@@ -39,7 +39,8 @@ class HunyuanOCRConverterConfig(OpenAIConverterConfig):
     completion_kwargs: dict | None = {
         "temperature": 0.0,
         "extra_body": {"top_k": 1, "repetition_penalty": 1.0},
+        "max_completion_tokens": 16384,  # max token len used in training according to the technical report is 32000, but in practice the model breaks earlier
     }
-    max_image_size: int | None = 1540
     dpi: int = 200
     aliases: list[str] = Field(default_factory=lambda: ["hunyuanocr"])
+    stream: bool = True
vlmparse/clients/mineru.py CHANGED
@@ -52,24 +52,22 @@ class MinerUConverter(BaseConverter):
 
     config: MinerUConverterConfig
 
-    def __init__(self, config: MinerUConverterConfig, **kwargs):
-        super().__init__(config=config, **kwargs)
-        from httpx import AsyncClient
-
-        self.client = AsyncClient(base_url=config.base_url, timeout=config.timeout)
-
     async def _async_inference_with_api(self, image) -> list:
         """Run async inference with MinerU API."""
+        from httpx import AsyncClient
 
-        img_byte_arr = await asyncio.to_thread(to_bytes_io, image)
-        response = await self.client.post(
-            "process-image",
-            files={"image": ("image.png", img_byte_arr, "image/png")},
-        )
+        async with AsyncClient(
+            base_url=self.config.base_url, timeout=self.config.timeout
+        ) as client:
+            img_byte_arr = await asyncio.to_thread(to_bytes_io, image)
+            response = await client.post(
+                "process-image",
+                files={"image": ("image.png", img_byte_arr, "image/png")},
+            )
 
-        response.raise_for_status()
+            response.raise_for_status()
 
-        res = orjson.loads(response.content)
+            res = orjson.loads(response.content)
 
         return res
 
vlmparse/clients/olmocr.py CHANGED
@@ -41,6 +41,6 @@ class OlmOCRConverterConfig(OpenAIConverterConfig):
         "temperature": 0.1,
         "max_tokens": 8000,
     }
-    max_image_size: int | None = 1288
+    # max_image_size: int | None = 1288
     dpi: int = 200
     aliases: list[str] = Field(default_factory=lambda: ["olmocr-2-fp8"])
vlmparse/clients/openai_converter.py CHANGED
@@ -1,5 +1,5 @@
 import os
-from typing import Literal
+from typing import Literal, Optional
 
 from loguru import logger
 from pydantic import Field
@@ -40,6 +40,8 @@ def get_llm_params(model_name: str, uri: str | None = None):
     ]:
         base_url = None
         api_key = os.getenv("OPENAI_API_KEY")
+        if api_key is None:
+            raise ValueError("OPENAI_API_KEY environment variable not set")
     else:
         if model_name in [
             "gemini-2.5-flash-lite",
@@ -48,6 +50,8 @@ def get_llm_params(model_name: str, uri: str | None = None):
         ]:
             base_url = GOOGLE_API_BASE_URL
             api_key = os.getenv("GOOGLE_API_KEY")
+            if api_key is None:
+                raise ValueError("GOOGLE_API_KEY environment variable not set")
         else:
             return None
     return LLMParams(base_url=base_url, model_name=model_name, api_key=api_key)
@@ -97,7 +101,7 @@ class OpenAIConverterClient(BaseConverter):
 
     async def _get_chat_completion(
         self, messages: list[dict], completion_kwargs: dict | None = None
-    ) -> str:
+    ) -> tuple[str, Optional["CompletionUsage"]]:  # noqa: F821
         """Helper to handle chat completion with optional streaming."""
         if completion_kwargs is None:
             completion_kwargs = self.config.completion_kwargs
@@ -113,7 +117,8 @@ class OpenAIConverterClient(BaseConverter):
             async for chunk in response_stream:
                 if chunk.choices and chunk.choices[0].delta.content:
                     response_parts.append(chunk.choices[0].delta.content)
-            return "".join(response_parts)
+
+            return "".join(response_parts), None
         else:
             response_obj = await self.model.chat.completions.create(
                 model=self.config.llm_params.model_name,
@@ -126,7 +131,8 @@ class OpenAIConverterClient(BaseConverter):
                     "Response is None, finish reason: "
                     + response_obj.choices[0].finish_reason
                 )
-            return response_obj.choices[0].message.content
+
+            return response_obj.choices[0].message.content, response_obj.usage
 
     async def async_call_inside_page(self, page: Page) -> Page:
         """Process a single page using OpenAI-compatible API."""
@@ -163,12 +169,17 @@ class OpenAIConverterClient(BaseConverter):
             },
         ]
 
-        response = await self._get_chat_completion(messages)
-        logger.info("Response: " + str(response))
+        response, usage = await self._get_chat_completion(messages)
+        logger.debug("Response: " + str(response))
         page.raw_response = response
         text = clean_response(response)
 
         text = html_to_md_keep_tables(text)
         page.text = text
+        if usage is not None:
+            page.prompt_tokens = usage.prompt_tokens
+            page.completion_tokens = usage.completion_tokens
+            if hasattr(usage, "reasoning_tokens"):
+                page.reasoning_tokens = usage.reasoning_tokens
 
         return page
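
A small sketch of consuming the new usage fields downstream; they remain `None` on the streaming path, where usage is not collected:

```python
# Sketch: aggregate token usage over the pages of a parsed document.
def count_tokens(pages) -> tuple[int, int]:
    prompt = sum(p.prompt_tokens or 0 for p in pages)
    # completion_tokens includes reasoning tokens, per the Page model docstring
    completion = sum(p.completion_tokens or 0 for p in pages)
    return prompt, completion
```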
vlmparse/clients/paddleocrvl.py CHANGED
@@ -42,7 +42,8 @@ class PaddleOCRVLConverterConfig(OpenAIConverterConfig):
     postprompt: str | None = TASKS["ocr"]
     completion_kwargs: dict | None = {
         "temperature": 0.0,
+        "max_completion_tokens": 16384,
     }
-    max_image_size: int | None = 1540
     dpi: int = 200
     aliases: list[str] = Field(default_factory=lambda: ["paddleocrvl"])
+    stream: bool = True
vlmparse/constants.py CHANGED
@@ -1,2 +1,5 @@
+import os
+
 IMAGE_EXTENSIONS = [".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp", ".gif", ".webp"]
 PDF_EXTENSION = ".pdf"
+DEFAULT_SERVER_PORT = os.getenv("VLMPARSE_DEFAULT_PORT", 8056)
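
Since `os.getenv` returns a string whenever the variable is set, a sketch of the override behavior (assuming `vlmparse.constants` has not been imported yet in the process):

```python
# Sketch: overriding the default server port via the new environment variable.
import os

os.environ["VLMPARSE_DEFAULT_PORT"] = "9000"
from vlmparse.constants import DEFAULT_SERVER_PORT

print(DEFAULT_SERVER_PORT)  # "9000" (a str here; the 8056 fallback stays an int)
```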
vlmparse/converter_with_server.py CHANGED
@@ -5,51 +5,102 @@ from typing import Literal
 
 from loguru import logger
 
+from vlmparse.constants import DEFAULT_SERVER_PORT
 from vlmparse.servers.utils import get_model_from_uri
 from vlmparse.utils import get_file_paths
 
 
+def start_server(
+    model: str,
+    gpus: str,
+    port: None | int = None,
+    with_vllm_server: bool = True,
+    vllm_args: list[str] = {},
+    forget_predefined_vllm_args: bool = False,
+    auto_stop: bool = False,
+):
+    from vlmparse.registries import docker_config_registry
+
+    base_url = ""
+    container = None
+    docker_config = docker_config_registry.get(model, default=with_vllm_server)
+
+    if port is None:
+        port = DEFAULT_SERVER_PORT
+
+    if docker_config is None:
+        logger.warning(
+            f"No Docker configuration found for model: {model}, using default configuration"
+        )
+        return "", container, None, docker_config
+
+    gpu_device_ids = None
+    if gpus is not None:
+        gpu_device_ids = [g.strip() for g in str(gpus).split(",")]
+
+    if docker_config is not None:
+        if port is not None:
+            docker_config.docker_port = port
+        docker_config.gpu_device_ids = gpu_device_ids
+        docker_config.update_command_args(
+            vllm_args,
+            forget_predefined_vllm_args=forget_predefined_vllm_args,
+        )
+
+    logger.info(
+        f"Deploying VLLM server for {docker_config.model_name} on port {port}..."
+    )
+    server = docker_config.get_server(auto_stop=auto_stop)
+    if server is None:
+        logger.error(f"Model server not found for model: {model}")
+        return "", container, None, docker_config
+
+    base_url, container = server.start()
+
+    return base_url, container, server, docker_config
+
+
 class ConverterWithServer:
     def __init__(
         self,
-        model: str,
+        model: str | None = None,
         uri: str | None = None,
         gpus: str | None = None,
         port: int | None = None,
         with_vllm_server: bool = False,
         concurrency: int = 10,
+        vllm_args: dict | None = None,
+        forget_predefined_vllm_args: bool = False,
     ):
-        from vlmparse.registries import (
-            converter_config_registry,
-            docker_config_registry,
-        )
-
         self.model = model
         self.uri = uri
         self.port = port
         self.gpus = gpus
         self.with_vllm_server = with_vllm_server
         self.concurrency = concurrency
+        self.vllm_args = vllm_args
+        self.forget_predefined_vllm_args = forget_predefined_vllm_args
+        self.server = None
+        self.client = None
 
         if self.uri is not None and self.model is None:
             self.model = get_model_from_uri(self.uri)
 
-        gpu_device_ids = None
-        if self.gpus is not None:
-            gpu_device_ids = [g.strip() for g in self.gpus.split(",")]
+    def start_server_and_client(self):
+        from vlmparse.registries import converter_config_registry
 
         if self.uri is None:
-            docker_config = docker_config_registry.get(
-                self.model, default=self.with_vllm_server
+            _, _, self.server, docker_config = start_server(
+                model=self.model,
+                gpus=self.gpus,
+                port=self.port,
+                with_vllm_server=self.with_vllm_server,
+                vllm_args=self.vllm_args,
+                forget_predefined_vllm_args=self.forget_predefined_vllm_args,
+                auto_stop=True,
             )
 
             if docker_config is not None:
-                if self.port is not None:
-                    docker_config.docker_port = self.port
-                docker_config.gpu_device_ids = gpu_device_ids
-                self.server = docker_config.get_server(auto_stop=True)
-                self.server.start()
-
                 self.client = docker_config.get_client()
             else:
                 self.client = converter_config_registry.get(self.model).get_client()
@@ -59,6 +110,17 @@ class ConverterWithServer:
 
         self.client = client_config.get_client()
 
+    def stop_server(self):
+        if self.server is not None and self.server.auto_stop:
+            self.server.stop()
+
+    def __enter__(self):
+        self.start_server_and_client()
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.stop_server()
+
     def parse(
         self,
         inputs: str | list[str],
@@ -68,6 +130,9 @@ class ConverterWithServer:
         debug: bool = False,
         retrylast: bool = False,
     ):
+        assert (
+            self.client is not None
+        ), "Client not initialized. Call start_server_and_client() first."
         file_paths = get_file_paths(inputs)
         assert (
             out_folder is not None
@@ -119,5 +184,5 @@ class ConverterWithServer:
 
         return documents
 
-    def get_out_folder(self) -> Path:
+    def get_out_folder(self) -> str | None:
         return self.client.save_folder
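
A minimal sketch of the URI path through the new context manager, assuming a vlmparse-labelled vLLM container is already running at the given address; the model is inferred via `get_model_from_uri` and no server is started, so `__exit__` has nothing to stop:

```python
# Sketch: reuse an already-running server instead of deploying one.
from vlmparse.converter_with_server import ConverterWithServer

with ConverterWithServer(uri="http://localhost:8056/v1") as converter:
    documents = converter.parse(inputs=["report.pdf"], out_folder="./out", mode="md")
```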
vlmparse/data_model/document.py CHANGED
@@ -41,6 +41,10 @@ class Page(VLMParseBaseModel):
     buffer_image: Optional[Image.Image | str | dict] = None
     latency: Optional[float] = None
     """Time taken to process the page in seconds."""
+    prompt_tokens: Optional[int] = None
+    completion_tokens: Optional[int] = None
+    """Includes reasoning tokens"""
+    reasoning_tokens: Optional[int] = None
 
     @property
     def image(self):
@@ -66,7 +70,7 @@ class Page(VLMParseBaseModel):
 
         image = self.image
 
-        if layout:
+        if layout and image is not None:
             if self.items is None:
                 return image
             items = self.items
@@ -85,6 +89,9 @@ class Page(VLMParseBaseModel):
         )
         return image
 
+    def to_markdown(self, **kwargs):
+        return self.text if self.text is not None else ""
+
 
 class Document(VLMParseBaseModel):
     file_path: str
@@ -104,6 +111,9 @@ class Document(VLMParseBaseModel):
             page.error is not None for page in self.pages
         )
 
+    def to_markdown(self, **kwargs):
+        return "\n\n".join([page.to_markdown(**kwargs) for page in self.pages])
+
     def to_zip(
         self,
         file_path,
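
A small sketch using the new helper; `document` is a `Document` returned by `parse`, and pages without text contribute empty strings:

```python
# Sketch: dump a parsed Document to a single markdown file.
def save_markdown(document, path: str) -> None:
    with open(path, "w", encoding="utf-8") as f:
        f.write(document.to_markdown())  # pages joined by blank lines
```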
vlmparse/registries.py CHANGED
@@ -77,9 +77,7 @@ class ConverterConfigRegistry:
         """Register a config factory for a model name."""
         self._registry[model_name] = config_factory
 
-    def get(
-        self, model_name: str, uri: str | None = None
-    ) -> OpenAIConverterConfig | None:
+    def get(self, model_name: str, uri: str | None = None) -> OpenAIConverterConfig:
         """Get config for a model name. Returns default if not registered."""
         if model_name in self._registry:
             return self._registry[model_name](uri=uri)
vlmparse/servers/docker_server.py CHANGED
@@ -47,6 +47,19 @@ class DockerServerConfig(BaseModel):
         """Build command for container. Override in subclasses for specific logic."""
         return self.command_args if self.command_args else None
 
+    def update_command_args(
+        self,
+        vllm_args: dict | None = None,
+        forget_predefined_vllm_args: bool = False,
+    ) -> list[str]:
+        if vllm_args is not None:
+            if forget_predefined_vllm_args:
+                self.command_args = vllm_args
+            else:
+                self.command_args.extend(vllm_args)
+
+        return self.command_args
+
     def get_volumes(self) -> dict | None:
         """Setup volumes for container. Override in subclasses for specific logic."""
         return self.volumes
@@ -144,7 +157,7 @@ class ConverterServer:
         """Start the Docker server."""
         if self._server_context is not None:
             logger.warning("Server already started")
-            return self.base_url
+            return self.base_url, self._container
 
         # Use the generic docker_server for all server types
         self._server_context = docker_server(config=self.config, cleanup=self.auto_stop)
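
A sketch of the two `update_command_args` modes, assuming `DeepSeekOCRDockerServerConfig` inherits the method through `VLLMDockerServerConfig`:

```python
# Sketch: extend vs. replace the predefined vLLM command-line flags.
from vlmparse.clients.deepseekocr import DeepSeekOCRDockerServerConfig

cfg = DeepSeekOCRDockerServerConfig()
cfg.update_command_args(["--max-num-seqs", "8"])  # appended after predefined flags
cfg.update_command_args(["--enforce-eager"], forget_predefined_vllm_args=True)  # replaces them all
```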
vlmparse/servers/utils.py CHANGED
@@ -2,6 +2,7 @@ import getpass
 import time
 from contextlib import contextmanager
 from pathlib import Path
+from urllib.parse import parse_qsl, urlparse
 
 import docker
 from loguru import logger
@@ -222,25 +223,52 @@ def docker_server(
     logger.info("Container stopped")
 
 
+def normalize_uri(uri: str) -> tuple:
+    u = urlparse(uri)
+
+    # --- Normalize scheme ---
+    scheme = (u.scheme or "http").lower()
+
+    # --- Normalize host ---
+    host = (u.hostname or "").lower()
+    if host in ("localhost", "0.0.0.0"):
+        host = "localhost"
+
+    # --- Normalize port (apply defaults) ---
+    if u.port:
+        port = u.port
+    else:
+        port = 443 if scheme == "https" else 80
+
+    # --- Normalize path ---
+    # Treat empty path as "/" and remove trailing slash (except root)
+    path = u.path or "/"
+    if path != "/" and path.endswith("/"):
+        path = path.rstrip("/")
+
+    # Collapse duplicate slashes
+    while "//" in path:
+        path = path.replace("//", "/")
+
+    # --- Normalize query parameters (sorted) ---
+    query_pairs = parse_qsl(u.query, keep_blank_values=True)
+    query = "&".join(f"{k}={v}" for k, v in sorted(query_pairs))
+
+    return (scheme, host, port, path, query)
+
+
 def get_model_from_uri(uri: str) -> str:
     model = None
     client = docker.from_env()
     containers = client.containers.list()
+
+    uri = normalize_uri(uri)
+
     for container in containers:
         c_uri = container.labels.get("vlmparse_uri")
         c_model = container.labels.get("vlmparse_model_name")
-        if c_uri is not None:
-            c_uri = c_uri.replace("localhost", "0.0.0.0")
-
-        # Check if user URI matches container URI (ignoring /v1 suffix if missing)
-        if c_uri and (
-            c_uri == uri or c_uri.startswith(uri.rstrip("/")) or uri.startswith(c_uri)
-        ):
-            # Update URI to the correct one from container (likely has /v1)
-            if len(c_uri) > len(uri.rstrip("/")):
-                logger.info(f"Updating URI from {uri} to {c_uri}")
-                uri = c_uri
 
+        if c_uri and uri == normalize_uri(c_uri):
             # Infer model if not provided
             if model is None and c_model:
                 logger.info(f"Inferred model {c_model} from container")
vlmparse-0.1.7.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vlmparse
-Version: 0.1.5
+Version: 0.1.7
 Requires-Python: >=3.11.0
 Description-Content-Type: text/markdown
 License-File: LICENSE
@@ -199,11 +199,13 @@ server.stop()
 ```
 
 
-Converter with automatic server deployment:
+Converter with automatic server management:
 
 ```python
 from vlmparse.converter_with_server import ConverterWithServer
 
-converter_with_server = ConverterWithServer(model="mineru2.5")
-documents = converter_with_server.parse(inputs=["file1.pdf", "file2.pdf"], out_folder="./output")
+with ConverterWithServer(model="mineru2.5") as converter_with_server:
+    documents = converter_with_server.parse(inputs=["file1.pdf", "file2.pdf"], out_folder="./output")
 ```
+
+Note that if you pass a URI of a vLLM server to `ConverterWithServer`, the model name is inferred automatically and no server is started.
vlmparse-0.1.7.dist-info/RECORD ADDED
@@ -0,0 +1,36 @@
+vlmparse/base_model.py,sha256=4U4UPe8SNArliKnUf8pp8zQugWYsnhg9okylt7mrW1U,381
+vlmparse/build_doc.py,sha256=LAWrnFrqamN5PwJo57AUtQOPrMFGnCGw4gBjEKZ6pYo,2127
+vlmparse/cli.py,sha256=JfR6gk0pdYAavJgFTVx4OcgWdiLktGoKJ8TcVcD_IHw,12235
+vlmparse/constants.py,sha256=DYaK7KtTW8p9MPb3iPvoP5H1r7ICRuIFo89P01q4uCI,184
+vlmparse/converter.py,sha256=F0JSY9sFYUggCvaUCb27kKGJJpnZKW2FStMDVJoIOeQ,7383
+vlmparse/converter_with_server.py,sha256=zpUHDpHbDBs4Cj7dcVjvUQw0-U_InRNDC5Ekb_gehRM,6022
+vlmparse/registries.py,sha256=yBVrrhy61rSoLwdNV-z0C4lqIpTbLoWab3V6u7aSyNM,5797
+vlmparse/utils.py,sha256=rcVrtPiQVj_8HAmFQOu___72uYIapp_X89yxrMNCBow,1236
+vlmparse/clients/chandra.py,sha256=EulsCZdwOtm0pQ6CDm320U96k8aWFN4wKqCm1Xo7VCE,9775
+vlmparse/clients/deepseekocr.py,sha256=Uw6tPvP2KVsPDlz1ZUgYdbgQSjmFPuYeFDrGMMOTBAo,6501
+vlmparse/clients/docling.py,sha256=SAkLsqseuWfkuiel8FWR1G0Z5s-SZU3dE2JbsOvF4SA,5328
+vlmparse/clients/dotsocr.py,sha256=uGJoYEiDkP3-rmfdkAnMeAX-T4RZyEPoh6jmow5_-J8,10336
+vlmparse/clients/granite_docling.py,sha256=LMJAFjpSxcgLhsVxknSqrCC35MUTmklsE9PJZvMK2O8,4691
+vlmparse/clients/hunyuanocr.py,sha256=UFqaS4b8UM9EtizyrZIxlqcYlESmxm8xrQZP7lL6tkE,1857
+vlmparse/clients/lightonocr.py,sha256=wx1Im8Z3wlRWwYbPqnSd3LqTtdAU8CnX5mzu1BuCUY8,1314
+vlmparse/clients/mineru.py,sha256=6jZ1sKn2kGwUvD8gVs4PqEDH7uUXYK8pAB5Fr1JeqnY,3617
+vlmparse/clients/nanonetocr.py,sha256=BT5vaeerCsK5agvOaHK3NvLUqWd1FfDmrMmDYbp646I,1543
+vlmparse/clients/olmocr.py,sha256=A4Vl0meYpU5QPTML_OxyyRM07xCxtfrMZedgGMYEcuU,1851
+vlmparse/clients/openai_converter.py,sha256=bIDpR7Yn70eEp0pmzFoG2dDwY-mxCj3kH1IZS9BvXVQ,6266
+vlmparse/clients/paddleocrvl.py,sha256=qFBDj_UQocyq3WCh24tUOx9Ud7S9DfSm-1n3ztikY2s,1402
+vlmparse/clients/prompts.py,sha256=-J60lqxgRzlkQ9VsQLxmWsIMaDt-gNqWqWoqHIw9CLc,4228
+vlmparse/clients/pipe_utils/cleaner.py,sha256=oxBkBTOkluN1lmeNbzajRIe0_D__ZGwUOBaI_Ph0uxE,2396
+vlmparse/clients/pipe_utils/html_to_md_conversion.py,sha256=cFFqzD2jCNw_968_eu3Wt--Ox7iJj2Rn5UoP_DZWosU,4112
+vlmparse/clients/pipe_utils/utils.py,sha256=935ecIO446I0pstszE_1nrIPHn1Ffrxunq7fVd0dsd8,315
+vlmparse/data_model/box.py,sha256=lJsh4qhjgYXZF5vTSJ1qMXD5GVlBi2_SBedBMlfJikU,16868
+vlmparse/data_model/document.py,sha256=xheaMeStOj2c9GZKmdtxcEl_Dj44V5JyVp6JnTrSpH0,4615
+vlmparse/servers/docker_server.py,sha256=UVU7VDloJ8Yfqj-WUv3Trti9AODcdC9JyTzW3sCM-l4,7032
+vlmparse/servers/utils.py,sha256=tIXhgbF9EVOJy2nYEguVq69gn9ATxtya_1F4wZSt68o,9454
+vlmparse/st_viewer/fs_nav.py,sha256=7GNH68h2Loh5pQ64Pe72-D2cs2BLhqRXevEmKdFmPX0,1616
+vlmparse/st_viewer/st_viewer.py,sha256=m2rQTtk5rlwErNmivNAg-4rkHkvNkvLhoJZxFQi7Dwk,2105
+vlmparse-0.1.7.dist-info/licenses/LICENSE,sha256=3TKJHk8hPBR5dbLWZ3IpfCftl-_m-iyBwpYQGZYxj14,1080
+vlmparse-0.1.7.dist-info/METADATA,sha256=DP--8aCeLxAgvo6vvaDog7xzzMzvZywVvCrMiAKhDbo,5597
+vlmparse-0.1.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+vlmparse-0.1.7.dist-info/entry_points.txt,sha256=gD5berP6HwE2wNIkls-Lw5goiceA8uMgPEd7ifnFJXs,47
+vlmparse-0.1.7.dist-info/top_level.txt,sha256=k4ni-GNH_iAX7liQEsk_KY_c3xgZgt8k9fsSs9IXLXs,9
+vlmparse-0.1.7.dist-info/RECORD,,
vlmparse-0.1.5.dist-info/RECORD REMOVED
@@ -1,36 +0,0 @@
-vlmparse/base_model.py,sha256=4U4UPe8SNArliKnUf8pp8zQugWYsnhg9okylt7mrW1U,381
-vlmparse/build_doc.py,sha256=LAWrnFrqamN5PwJo57AUtQOPrMFGnCGw4gBjEKZ6pYo,2127
-vlmparse/cli.py,sha256=tQma1IkOsFnqPKqqHVO1PJh18n1w82gp4ewA7oraJkE,15855
-vlmparse/constants.py,sha256=7-47S01n4MI2ebR09bpdOo3_P16d-z-NVGsm6KJP8ls,110
-vlmparse/converter.py,sha256=F0JSY9sFYUggCvaUCb27kKGJJpnZKW2FStMDVJoIOeQ,7383
-vlmparse/converter_with_server.py,sha256=G393O7vU_lJz6Vz-qYVkrjFhf0Vmpjjl8OjPKQe2blU,3928
-vlmparse/registries.py,sha256=6bEUKTkTjc8C7c1R1ZvAHSF5NCXmAuhNpw0qNnuQ7-A,5818
-vlmparse/utils.py,sha256=rcVrtPiQVj_8HAmFQOu___72uYIapp_X89yxrMNCBow,1236
-vlmparse/clients/chandra.py,sha256=zfu-A6Slh-fIAyrtrlVoCb6QHLBimnimefap_K9YwYw,9775
-vlmparse/clients/deepseekocr.py,sha256=rQvaOaPPoDiZ0MzXqfqqH9BgUBfjmlfHu3NlMjSDgiQ,6501
-vlmparse/clients/docling.py,sha256=SAkLsqseuWfkuiel8FWR1G0Z5s-SZU3dE2JbsOvF4SA,5328
-vlmparse/clients/dotsocr.py,sha256=w2T-xkhlw1AfT-CUYoF0ectr2jDYHe9239B24XKB1UQ,10139
-vlmparse/clients/granite_docling.py,sha256=EQpsv5qSJG0HtMSacmJStER2sq4TGf1EMU5_NmJsl4g,4634
-vlmparse/clients/hunyuanocr.py,sha256=Xw0Q1l-3pQzaEgFngnfM8vrSWpnT3I99QvDaGZ8XooM,1712
-vlmparse/clients/lightonocr.py,sha256=wx1Im8Z3wlRWwYbPqnSd3LqTtdAU8CnX5mzu1BuCUY8,1314
-vlmparse/clients/mineru.py,sha256=bilDPcUoLk2rcFVqMk4q2Hx2txilc3GDUbjAEoMM_BI,3671
-vlmparse/clients/nanonetocr.py,sha256=BT5vaeerCsK5agvOaHK3NvLUqWd1FfDmrMmDYbp646I,1543
-vlmparse/clients/olmocr.py,sha256=mQEDpfyLY8a80Zlps5mG0QaWytIgnNQZVEVWKWjPIjk,1849
-vlmparse/clients/openai_converter.py,sha256=j2H0iAQTADRRpu1Zy1b-1OFfWyXuqCvrQKy2UcwggTA,5696
-vlmparse/clients/paddleocrvl.py,sha256=tmaqg3boV4edywiiiNiNiI3dBHi111wz4dFb52OISXw,1376
-vlmparse/clients/prompts.py,sha256=-J60lqxgRzlkQ9VsQLxmWsIMaDt-gNqWqWoqHIw9CLc,4228
-vlmparse/clients/pipe_utils/cleaner.py,sha256=oxBkBTOkluN1lmeNbzajRIe0_D__ZGwUOBaI_Ph0uxE,2396
-vlmparse/clients/pipe_utils/html_to_md_conversion.py,sha256=cFFqzD2jCNw_968_eu3Wt--Ox7iJj2Rn5UoP_DZWosU,4112
-vlmparse/clients/pipe_utils/utils.py,sha256=935ecIO446I0pstszE_1nrIPHn1Ffrxunq7fVd0dsd8,315
-vlmparse/data_model/box.py,sha256=lJsh4qhjgYXZF5vTSJ1qMXD5GVlBi2_SBedBMlfJikU,16868
-vlmparse/data_model/document.py,sha256=pdCZvWzRFkez53ZJpNaB4ezUW-OVUlbR3_SBmmgVzGQ,4217
-vlmparse/servers/docker_server.py,sha256=qOoZcWSHrK7kK7tAL61RJSW-Jmee93It2SEfWG3jGrc,6633
-vlmparse/servers/utils.py,sha256=qy2-rnQTCQKt6CeTV5H74tvRTXyzBV2KswQiYW8Tf-k,8908
-vlmparse/st_viewer/fs_nav.py,sha256=7GNH68h2Loh5pQ64Pe72-D2cs2BLhqRXevEmKdFmPX0,1616
-vlmparse/st_viewer/st_viewer.py,sha256=m2rQTtk5rlwErNmivNAg-4rkHkvNkvLhoJZxFQi7Dwk,2105
-vlmparse-0.1.5.dist-info/licenses/LICENSE,sha256=3TKJHk8hPBR5dbLWZ3IpfCftl-_m-iyBwpYQGZYxj14,1080
-vlmparse-0.1.5.dist-info/METADATA,sha256=LN4W1cvXJvL22hwLAgeSwd3PGTmlrt6lgqNi-tL9pes,5446
-vlmparse-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-vlmparse-0.1.5.dist-info/entry_points.txt,sha256=gD5berP6HwE2wNIkls-Lw5goiceA8uMgPEd7ifnFJXs,47
-vlmparse-0.1.5.dist-info/top_level.txt,sha256=k4ni-GNH_iAX7liQEsk_KY_c3xgZgt8k9fsSs9IXLXs,9
-vlmparse-0.1.5.dist-info/RECORD,,