vlmparse 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,14 +1,13 @@
  import json
  import math
  from pathlib import Path
- from typing import ClassVar, Literal
+ from typing import ClassVar

  from loguru import logger
  from PIL import Image
  from pydantic import Field

  from vlmparse.clients.openai_converter import (
-     LLMParams,
      OpenAIConverterClient,
      OpenAIConverterConfig,
  )
@@ -48,12 +47,13 @@ class DotsOCRDockerServerConfig(DockerServerConfig):
      )
      add_model_key_to_server: bool = True
      aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
+     default_model_name: str = DEFAULT_MODEL_NAME

      @property
      def client_config(self):
          return DotsOCRConverterConfig(
-             llm_params=LLMParams(
-                 base_url=f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}",
+             **self._create_client_kwargs(
+                 f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
              )
          )

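Note: every Docker server config in this release swaps the old `llm_params=LLMParams(...)` construction for `**self._create_client_kwargs(url)`. The helper's body is not part of this diff; a plausible sketch, assuming it simply packages the base URL with the server's default model name (fields that now live directly on the converter config):

    # Hypothetical sketch of DockerServerConfig._create_client_kwargs -- the real
    # implementation is not shown in this diff. Field names are inferred from
    # the new OpenAIConverterConfig (base_url, default_model_name).
    def _create_client_kwargs(self, base_url: str) -> dict:
        return {
            "base_url": base_url,
            "default_model_name": self.default_model_name,
        }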
@@ -65,29 +65,7 @@ class DotsOCRConverterConfig(OpenAIConverterConfig):
      model_name: str = "rednote-hilab/dots.ocr"
      preprompt: str | None = ""
      postprompt: str | None = None
-     completion_kwargs: dict | None = {
-         "temperature": 0.1,
-         "top_p": 1.0,
-         "max_completion_tokens": 16384,
-     }
-     aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
-     dpi: int = 200
-     prompt_mode: Literal["prompt_layout_all_en", "prompt_ocr"] = "prompt_ocr"
-
-     def get_client(self, **kwargs) -> "DotsOCRConverter":
-         return DotsOCRConverter(config=self, **kwargs)
-
-
- class DotsOCRConverter(OpenAIConverterClient):
-     """DotsOCR VLLM converter."""
-
-     # Constants
-     MIN_PIXELS: ClassVar[int] = 3136
-     MAX_PIXELS: ClassVar[int] = 11289600
-     IMAGE_FACTOR: ClassVar[int] = 28
-
-     # Prompts
-     PROMPTS: ClassVar[dict] = {
+     prompts: dict[str, str] = {
          "prompt_layout_all_en": """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.

  1. Bbox format: [x1, y1, x2, y2]
@@ -108,6 +86,30 @@ class DotsOCRConverter(OpenAIConverterClient):
  """,
          "prompt_ocr": """Extract the text content from this image.""",
      }
+     prompt_mode_map: dict[str, str] = {
+         "ocr": "prompt_ocr",
+         "ocr_layout": "prompt_layout_all_en",
+         "table": "prompt_layout_all_en",
+     }
+     completion_kwargs: dict | None = {
+         "temperature": 0.1,
+         "top_p": 1.0,
+         "max_completion_tokens": 16384,
+     }
+     aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
+     dpi: int = 200
+
+     def get_client(self, **kwargs) -> "DotsOCRConverter":
+         return DotsOCRConverter(config=self, **kwargs)
+
+
+ class DotsOCRConverter(OpenAIConverterClient):
+     """DotsOCR VLLM converter."""
+
+     # Constants
+     MIN_PIXELS: ClassVar[int] = 3136
+     MAX_PIXELS: ClassVar[int] = 11289600
+     IMAGE_FACTOR: ClassVar[int] = 28

      @staticmethod
      def round_by_factor(number: int, factor: int) -> int:
@@ -235,7 +237,7 @@ class DotsOCRConverter(OpenAIConverterClient):
          image = self.fetch_image(
              origin_image, min_pixels=self.MIN_PIXELS, max_pixels=self.MAX_PIXELS
          )
-         prompt = self.PROMPTS[prompt_mode]
+         prompt = self.config.prompts[prompt_mode]

          response, usage = await self._async_inference_with_vllm(image, prompt)

@@ -258,13 +260,15 @@ class DotsOCRConverter(OpenAIConverterClient):
      async def async_call_inside_page(self, page: Page) -> Page:
          image = page.image

+         prompt_key = self.get_prompt_key() or "prompt_ocr"
+
          _, response, _, usage = await self._parse_image_vllm(
-             image, prompt_mode=self.config.prompt_mode
+             image, prompt_mode=prompt_key
          )
          logger.info("Response: " + str(response))

          items = None
-         if self.config.prompt_mode == "prompt_layout_all_en":
+         if prompt_key == "prompt_layout_all_en":
              text = "\n\n".join([item.get("text", "") for item in response])

              items = []
@@ -286,5 +290,4 @@ class DotsOCRConverter(OpenAIConverterClient):

          page.completion_tokens = usage.completion_tokens
          page.prompt_tokens = usage.prompt_tokens
-         page.reasoning_tokens = usage.reasoning_tokens
          return page
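With the `Literal`-typed `prompt_mode` field gone, DotsOCR now derives its prompt key from the shared conversion-mode machinery on `OpenAIConverterClient` (see `get_prompt_key` later in this diff), falling back to "prompt_ocr". An illustrative resolution, under the assumption that `conversion_mode` is a settable field on the config:

    # Illustration only: "table" is not itself a prompt key, so it resolves
    # through prompt_mode_map to "prompt_layout_all_en".
    config = DotsOCRConverterConfig(conversion_mode="table")  # conversion_mode assumed
    client = config.get_client()
    assert client.get_prompt_key() == "prompt_layout_all_en"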
@@ -28,12 +28,17 @@ class GraniteDoclingDockerServerConfig(VLLMDockerServerConfig):

      @property
      def client_config(self):
-         return GraniteDoclingConverterConfig(llm_params=self.llm_params)
+         return GraniteDoclingConverterConfig(
+             **self._create_client_kwargs(
+                 f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+             )
+         )


  class GraniteDoclingConverterConfig(OpenAIConverterConfig):
      """Granite Docling converter configuration."""

+     model_name: str = "ibm-granite/granite-docling-258M"
      preprompt: str | None = None
      postprompt: str | None = "Convert this page to docling."
      completion_kwargs: dict | None = {
@@ -69,49 +74,18 @@ class GraniteDoclingConverter(OpenAIConverterClient):
              }
          ]

-         doctags = await self._get_chat_completion_adaptive(
+         doctags, usage = await self._get_chat_completion(
              messages, completion_kwargs=self.config.completion_kwargs
          )
          doctags = clean_response(doctags)

          page.raw_response = doctags
          page.text = _doctags_to_markdown(doctags, image)
+         if usage is not None:
+             page.prompt_tokens = usage.prompt_tokens
+             page.completion_tokens = usage.completion_tokens
          return page

-     async def _get_chat_completion_adaptive(
-         self, messages: list[dict], completion_kwargs: dict | None
-     ) -> str:
-         """
-         vLLM enforces input+output <= model context length. If `max_tokens` is too
-         high (especially for multimodal prompts), retry with progressively smaller
-         `max_tokens`.
-         """
-         kwargs = (completion_kwargs or {}).copy()
-         max_tokens = kwargs.get("max_tokens") or kwargs.get("max_completion_tokens")
-
-         for _ in range(6):
-             try:
-                 return await self._get_chat_completion(
-                     messages, completion_kwargs=kwargs
-                 )
-             except Exception as e:
-                 msg = str(e)
-                 too_large = (
-                     "max_tokens" in msg
-                     and "maximum context length" in msg
-                     and "is too large" in msg
-                 )
-                 if not too_large or not isinstance(max_tokens, int):
-                     raise
-
-                 max_tokens = max(256, int(max_tokens * 0.75))
-                 if "max_tokens" in kwargs:
-                     kwargs["max_tokens"] = max_tokens
-                 if "max_completion_tokens" in kwargs:
-                     kwargs["max_completion_tokens"] = max_tokens
-
-         return await self._get_chat_completion(messages, completion_kwargs=kwargs)
-

  def _doctags_to_markdown(doctags: str, image):
      try:
@@ -25,7 +25,11 @@ class HunyuanOCRDockerServerConfig(VLLMDockerServerConfig):

      @property
      def client_config(self):
-         return HunyuanOCRConverterConfig(llm_params=self.llm_params)
+         return HunyuanOCRConverterConfig(
+             **self._create_client_kwargs(
+                 f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+             )
+         )


  class HunyuanOCRConverterConfig(OpenAIConverterConfig):
@@ -25,7 +25,11 @@ class LightOnOCRDockerServerConfig(VLLMDockerServerConfig):

      @property
      def client_config(self):
-         return LightOnOCRConverterConfig(llm_params=self.llm_params)
+         return LightOnOCRConverterConfig(
+             **self._create_client_kwargs(
+                 f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+             )
+         )


  class LightOnOCRConverterConfig(OpenAIConverterConfig):
@@ -41,3 +45,21 @@ class LightOnOCRConverterConfig(OpenAIConverterConfig):
      }
      dpi: int = 200
      aliases: list[str] = Field(default_factory=lambda: ["lightonocr"])
+
+
+ class LightonOCR21BServerConfig(LightOnOCRDockerServerConfig):
+     model_name: str = "lightonai/LightOnOCR-2-1B"
+     aliases: list[str] = Field(default_factory=lambda: ["lightonocr2"])
+
+     @property
+     def client_config(self):
+         return LightonOCR21BConverterConfig(
+             **self._create_client_kwargs(
+                 f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+             )
+         )
+
+
+ class LightonOCR21BConverterConfig(LightOnOCRConverterConfig):
+     model_name: str = "lightonai/LightOnOCR-2-1B"
+     aliases: list[str] = Field(default_factory=lambda: ["lightonocr2"])
@@ -31,7 +31,6 @@ class MinerUDockerServerConfig(DockerServerConfig):
  class MinerUConverterConfig(ConverterConfig):
      """Configuration for MinerU API converter."""

-     base_url: str
      model_name: str = "opendatalab/MinerU2.5-2509-1.2B"
      aliases: list[str] = Field(default_factory=lambda: ["mineru25"])
      timeout: int = 600
@@ -0,0 +1,85 @@
+ import os
+
+ import httpx
+ import orjson
+ from loguru import logger
+ from pydantic import Field
+
+ from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
+ from vlmparse.clients.pipe_utils.utils import clean_response
+ from vlmparse.converter import BaseConverter, ConverterConfig
+ from vlmparse.data_model.document import Page
+ from vlmparse.utils import to_base64
+
+
+ class MistralOCRConverterConfig(ConverterConfig):
+     """Configuration for Mistral OCR converter."""
+
+     base_url: str = "https://api.mistral.ai/v1"
+     model_name: str = "mistral-ocr-latest"
+     api_key: str | None = None
+     timeout: int = 300
+     aliases: list[str] = Field(
+         default_factory=lambda: ["mistral-ocr-latest", "mistral-ocr"]
+     )
+
+     def get_client(self, **kwargs) -> "MistralOCRConverter":
+         return MistralOCRConverter(config=self, **kwargs)
+
+
+ class MistralOCRConverter(BaseConverter):
+     """Client for Mistral OCR API."""
+
+     config: MistralOCRConverterConfig
+
+     def __init__(self, config: MistralOCRConverterConfig, **kwargs):
+         super().__init__(config=config, **kwargs)
+         if not self.config.api_key:
+             self.config.api_key = os.getenv("MISTRAL_API_KEY")
+         if not self.config.api_key:
+             raise ValueError("MISTRAL_API_KEY environment variable not set")
+         self._base_url = self.config.base_url.rstrip("/")
+
+     async def _async_ocr(self, image) -> httpx.Response:
+         payload = {
+             "model": self.config.model_name,
+             "document": {
+                 "type": "image_url",
+                 "image_url": f"data:image/png;base64,{to_base64(image)}",
+             },
+         }
+         headers = {"Authorization": f"Bearer {self.config.api_key}"}
+
+         async with httpx.AsyncClient(timeout=self.config.timeout) as client:
+             response = await client.post(
+                 f"{self._base_url}/ocr",
+                 json=payload,
+                 headers=headers,
+             )
+             response.raise_for_status()
+             return response
+
+     async def async_call_inside_page(self, page: Page) -> Page:
+         response = await self._async_ocr(page.image)
+         page.raw_response = response.text
+
+         try:
+             data = response.json()
+         except ValueError:
+             logger.warning("Mistral OCR returned non-JSON response")
+             page.text = clean_response(response.text)
+             return page
+
+         pages = data.get("pages") or []
+         if pages:
+             page_data = pages[0]
+             text = page_data.get("markdown") or page_data.get("text") or ""
+         else:
+             text = (
+                 data.get("markdown") or data.get("text") or orjson.dumps(data).decode()
+             )
+
+         text = clean_response(text)
+         text = html_to_md_keep_tables(text)
+         page.text = text
+         return page
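The new Mistral OCR converter posts a base64-encoded PNG to the `/ocr` endpoint and prefers the per-page `markdown` field of the response. A usage sketch, assuming a `Page` can be built directly from a PIL image and `MISTRAL_API_KEY` is exported:

    # Usage sketch; Page(image=...) construction is an assumption based on the
    # page.image attribute used in the code above.
    import asyncio
    from PIL import Image

    config = MistralOCRConverterConfig()  # defaults to mistral-ocr-latest
    converter = config.get_client()

    async def ocr_one(path: str) -> str:
        page = Page(image=Image.open(path))
        page = await converter.async_call_inside_page(page)
        return page.text

    print(asyncio.run(ocr_one("scan.png")))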
@@ -12,7 +12,11 @@ class NanonetOCR2DockerServerConfig(VLLMDockerServerConfig):

      @property
      def client_config(self):
-         return NanonetOCR2ConverterConfig(llm_params=self.llm_params)
+         return NanonetOCR2ConverterConfig(
+             **self._create_client_kwargs(
+                 f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+             )
+         )


  class NanonetOCR2ConverterConfig(OpenAIConverterConfig):
@@ -23,7 +23,11 @@ class OlmOCRDockerServerConfig(VLLMDockerServerConfig):

      @property
      def client_config(self):
-         return OlmOCRConverterConfig(llm_params=self.llm_params)
+         return OlmOCRConverterConfig(
+             **self._create_client_kwargs(
+                 f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+             )
+         )


  class OlmOCRConverterConfig(OpenAIConverterConfig):
@@ -37,7 +41,7 @@ class OlmOCRConverterConfig(OpenAIConverterConfig):
          "Return your output as markdown, with a front matter section on top specifying values for the primary_language, is_rotation_valid, rotation_correction, is_table, and is_diagram parameters."
      )
      postprompt: str | None = None
-     completion_kwargs: dict | None = {
+     completion_kwargs: dict = {
          "temperature": 0.1,
          "max_tokens": 8000,
      }
@@ -1,15 +1,13 @@
- import os
- from typing import Literal
+ import asyncio
+ from typing import Literal, Optional

  from loguru import logger
  from pydantic import Field

- from vlmparse.base_model import VLMParseBaseModel
  from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
  from vlmparse.clients.pipe_utils.utils import clean_response
  from vlmparse.converter import BaseConverter, ConverterConfig
  from vlmparse.data_model.document import Page
- from vlmparse.servers.docker_server import DEFAULT_MODEL_NAME
  from vlmparse.utils import to_base64

  from .prompts import PDF2MD_PROMPT
@@ -17,50 +15,14 @@ from .prompts import PDF2MD_PROMPT
  GOOGLE_API_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"


- class LLMParams(VLMParseBaseModel):
+ class OpenAIConverterConfig(ConverterConfig):
      api_key: str = ""
-     base_url: str | None = None
-     model_name: str = DEFAULT_MODEL_NAME
      timeout: int | None = 500
      max_retries: int = 1
-
-
- def get_llm_params(model_name: str, uri: str | None = None):
-     if uri is not None:
-         return LLMParams(base_url=uri, model_name="vllm-model", api_key="")
-     if model_name in [
-         "gpt-4o",
-         "gpt-4o-mini",
-         "gpt-4.1",
-         "gpt-4.1-mini",
-         "gpt-4.1-nano",
-         "gpt-5",
-         "gpt-5-mini",
-         "gpt-5-nano",
-     ]:
-         base_url = None
-         api_key = os.getenv("OPENAI_API_KEY")
-         if api_key is None:
-             raise ValueError("OPENAI_API_KEY environment variable not set")
-     else:
-         if model_name in [
-             "gemini-2.5-flash-lite",
-             "gemini-2.5-flash",
-             "gemini-2.5-pro",
-         ]:
-             base_url = GOOGLE_API_BASE_URL
-             api_key = os.getenv("GOOGLE_API_KEY")
-             if api_key is None:
-                 raise ValueError("GOOGLE_API_KEY environment variable not set")
-         else:
-             return None
-     return LLMParams(base_url=base_url, model_name=model_name, api_key=api_key)
-
-
- class OpenAIConverterConfig(ConverterConfig):
-     llm_params: LLMParams
      preprompt: str | None = None
-     postprompt: str | None = PDF2MD_PROMPT
+     postprompt: str | dict[str, str] | None = PDF2MD_PROMPT
+     prompts: dict[str, str] = Field(default_factory=dict)
+     prompt_mode_map: dict[str, str] = Field(default_factory=dict)
      completion_kwargs: dict = Field(default_factory=dict)
      stream: bool = False

@@ -71,6 +33,33 @@ class OpenAIConverterConfig(ConverterConfig):
  class OpenAIConverterClient(BaseConverter):
      """Client for OpenAI-compatible API servers."""

+     def get_prompt_key(self) -> str | None:
+         """Resolve a prompt key from conversion_mode using class mappings."""
+         mode = getattr(self.config, "conversion_mode", None) or "ocr"
+         prompts = self._get_prompts()
+         if mode in prompts:
+             return mode
+         mapped = self._get_prompt_mode_map().get(mode)
+         if mapped in prompts:
+             return mapped
+         return None
+
+     def get_prompt_for_mode(self) -> str | None:
+         key = self.get_prompt_key()
+         if key is None:
+             return None
+         return self._get_prompts().get(key)
+
+     def _get_prompts(self) -> dict[str, str]:
+         if self.config.prompts:
+             return self.config.prompts
+         if isinstance(self.config.postprompt, dict):
+             return self.config.postprompt
+         return {}
+
+     def _get_prompt_mode_map(self) -> dict[str, str]:
+         return self.config.prompt_mode_map or {}
+
      def __init__(
          self,
          config: OpenAIConverterConfig,
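Resolution order for the new prompt machinery: an exact `conversion_mode` match in the prompt table wins, then the `prompt_mode_map` alias, otherwise `None` (which lets a plain-string `postprompt` act as the fallback). A small sketch, assuming `conversion_mode` is accepted by the config and the client's other constructor arguments default:

    # Values mirror the PaddleOCR-VL config later in this diff.
    cfg = OpenAIConverterConfig(
        postprompt={"ocr": "OCR:", "table": "Table Recognition:"},  # dict form
        prompt_mode_map={"ocr_layout": "ocr"},
        conversion_mode="ocr_layout",  # assumed field
    )
    client = OpenAIConverterClient(config=cfg)
    assert client.get_prompt_key() == "ocr"        # alias resolved via prompt_mode_map
    assert client.get_prompt_for_mode() == "OCR:"  # looked up in the dict postprompt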
@@ -90,25 +79,67 @@ class OpenAIConverterClient(BaseConverter):
              debug=debug,
              return_documents_in_batch_mode=return_documents_in_batch_mode,
          )
-         from openai import AsyncOpenAI
-
-         self.model = AsyncOpenAI(
-             base_url=self.config.llm_params.base_url,
-             api_key=self.config.llm_params.api_key,
-             timeout=self.config.llm_params.timeout,
-             max_retries=self.config.llm_params.max_retries,
-         )
+         self._model = None
+         self._model_loop = None
+
+     async def _get_async_model(self):
+         loop = asyncio.get_running_loop()
+         if self._model is None or self._model_loop is not loop:
+             await self._close_model()
+             from openai import AsyncOpenAI
+
+             self._model = AsyncOpenAI(
+                 base_url=self.config.base_url,
+                 api_key=self.config.api_key,
+                 timeout=self.config.timeout,
+                 max_retries=self.config.max_retries,
+             )
+             self._model_loop = loop
+         return self._model
+
+     async def _close_model(self):
+         """Close the async OpenAI client if it exists."""
+         if self._model is not None:
+             try:
+                 await self._model.close()
+             except RuntimeError:
+                 # Event loop may already be closed
+                 pass
+             finally:
+                 self._model = None
+                 self._model_loop = None
+
+     async def aclose(self):
+         """Close the converter and release resources."""
+         await self._close_model()
+
+     def close(self):
+         """Synchronously close the converter if possible."""
+         if self._model is not None:
+             try:
+                 loop = asyncio.get_running_loop()
+                 loop.create_task(self._close_model())
+             except RuntimeError:
+                 # No running loop, try to close synchronously
+                 try:
+                     asyncio.run(self._close_model())
+                 except RuntimeError:
+                     # Event loop already closed, force cleanup
+                     self._model = None
+                     self._model_loop = None

      async def _get_chat_completion(
          self, messages: list[dict], completion_kwargs: dict | None = None
-     ) -> tuple[str, "CompletionUsage"]:  # noqa: F821
+     ) -> tuple[str, Optional["CompletionUsage"]]:  # noqa: F821
          """Helper to handle chat completion with optional streaming."""
          if completion_kwargs is None:
              completion_kwargs = self.config.completion_kwargs

+         model = await self._get_async_model()
+
          if self.config.stream:
-             response_stream = await self.model.chat.completions.create(
-                 model=self.config.llm_params.model_name,
+             response_stream = await model.chat.completions.create(
+                 model=self.config.default_model_name,
                  messages=messages,
                  stream=True,
                  **completion_kwargs,
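The eager `AsyncOpenAI` built in `__init__` is replaced by a lazily created, per-event-loop client: `AsyncOpenAI` binds its internal `httpx` transport to the loop that creates it, so a cached instance breaks as soon as a second `asyncio.run()` starts a fresh loop. A self-contained sketch of the same caching pattern, with hypothetical names:

    import asyncio

    class LoopBoundResource:
        """Toy stand-in for AsyncOpenAI: usable only on its creating loop."""
        def __init__(self):
            self.loop = asyncio.get_running_loop()

    class Holder:
        # Same idea as _get_async_model() above; names are illustrative.
        def __init__(self):
            self._res = None
            self._res_loop = None

        async def get(self) -> LoopBoundResource:
            loop = asyncio.get_running_loop()
            if self._res is None or self._res_loop is not loop:
                self._res = LoopBoundResource()  # rebuilt for the current loop
                self._res_loop = loop
            return self._res

    holder = Holder()
    r1 = asyncio.run(holder.get())
    r2 = asyncio.run(holder.get())  # new loop -> new resource
    assert r1 is not r2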
@@ -117,10 +148,11 @@ class OpenAIConverterClient(BaseConverter):
              async for chunk in response_stream:
                  if chunk.choices and chunk.choices[0].delta.content:
                      response_parts.append(chunk.choices[0].delta.content)
-             return "".join(response_parts)
+
+             return "".join(response_parts), None
          else:
-             response_obj = await self.model.chat.completions.create(
-                 model=self.config.llm_params.model_name,
+             response_obj = await model.chat.completions.create(
+                 model=self.config.default_model_name,
                  messages=messages,
                  **completion_kwargs,
              )
@@ -146,11 +178,15 @@ class OpenAIConverterClient(BaseConverter):
          else:
              preprompt = []

-         postprompt = (
-             [{"type": "text", "text": self.config.postprompt}]
-             if self.config.postprompt
-             else []
-         )
+         selected_prompt = self.get_prompt_for_mode()
+         if selected_prompt is not None:
+             postprompt = [{"type": "text", "text": selected_prompt}]
+         else:
+             postprompt = (
+                 [{"type": "text", "text": self.config.postprompt}]
+                 if isinstance(self.config.postprompt, str) and self.config.postprompt
+                 else []
+             )

          messages = [
              *preprompt,
@@ -175,9 +211,10 @@ class OpenAIConverterClient(BaseConverter):

          text = html_to_md_keep_tables(text)
          page.text = text
-         page.prompt_tokens = usage.prompt_tokens
-         page.completion_tokens = usage.completion_tokens
-         if hasattr(usage, "reasoning_tokens"):
-             page.reasoning_tokens = usage.reasoning_tokens
+         if usage is not None:
+             page.prompt_tokens = usage.prompt_tokens
+             page.completion_tokens = usage.completion_tokens
+             if hasattr(usage, "reasoning_tokens"):
+                 page.reasoning_tokens = usage.reasoning_tokens

          return page
@@ -22,7 +22,11 @@ class PaddleOCRVLDockerServerConfig(VLLMDockerServerConfig):

      @property
      def client_config(self):
-         return PaddleOCRVLConverterConfig(llm_params=self.llm_params)
+         return PaddleOCRVLConverterConfig(
+             **self._create_client_kwargs(
+                 f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+             )
+         )


  # Task-specific base prompts
@@ -39,7 +43,10 @@ class PaddleOCRVLConverterConfig(OpenAIConverterConfig):

      model_name: str = "PaddlePaddle/PaddleOCR-VL"
      preprompt: str | None = None
-     postprompt: str | None = TASKS["ocr"]
+     postprompt: dict[str, str] = TASKS
+     prompt_mode_map: dict[str, str] = {
+         "ocr_layout": "ocr",
+     }
      completion_kwargs: dict | None = {
          "temperature": 0.0,
          "max_completion_tokens": 16384,
vlmparse/constants.py CHANGED
@@ -1,2 +1,5 @@
+ import os
+
  IMAGE_EXTENSIONS = [".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp", ".gif", ".webp"]
  PDF_EXTENSION = ".pdf"
+ DEFAULT_SERVER_PORT = os.getenv("VLMPARSE_DEFAULT_PORT", 8056)
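One caveat with the new constant: `os.getenv` returns the environment value as a `str`, so `DEFAULT_SERVER_PORT` is the `int` 8056 by default but a `str` whenever `VLMPARSE_DEFAULT_PORT` is set. Callers that need an integer port would have to coerce it; a defensive variant:

    import os

    # Coerce once so the constant has a single type either way.
    DEFAULT_SERVER_PORT = int(os.getenv("VLMPARSE_DEFAULT_PORT", "8056"))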