vlmparse 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. vlmparse/build_doc.py +20 -19
  2. vlmparse/cli.py +439 -270
  3. vlmparse/clients/chandra.py +176 -60
  4. vlmparse/clients/deepseekocr.py +193 -12
  5. vlmparse/clients/docling.py +0 -1
  6. vlmparse/clients/dotsocr.py +34 -31
  7. vlmparse/clients/glmocr.py +243 -0
  8. vlmparse/clients/granite_docling.py +9 -36
  9. vlmparse/clients/hunyuanocr.py +5 -1
  10. vlmparse/clients/lightonocr.py +23 -1
  11. vlmparse/clients/mineru.py +0 -1
  12. vlmparse/clients/mistral_converter.py +85 -0
  13. vlmparse/clients/nanonetocr.py +5 -1
  14. vlmparse/clients/olmocr.py +6 -2
  15. vlmparse/clients/openai_converter.py +95 -60
  16. vlmparse/clients/paddleocrvl.py +195 -40
  17. vlmparse/converter.py +51 -11
  18. vlmparse/converter_with_server.py +92 -19
  19. vlmparse/registries.py +107 -89
  20. vlmparse/servers/base_server.py +127 -0
  21. vlmparse/servers/docker_compose_deployment.py +489 -0
  22. vlmparse/servers/docker_compose_server.py +39 -0
  23. vlmparse/servers/docker_run_deployment.py +226 -0
  24. vlmparse/servers/docker_server.py +17 -109
  25. vlmparse/servers/model_identity.py +48 -0
  26. vlmparse/servers/server_registry.py +42 -0
  27. vlmparse/servers/utils.py +83 -219
  28. vlmparse/st_viewer/st_viewer.py +1 -1
  29. vlmparse/utils.py +15 -2
  30. {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/METADATA +13 -3
  31. vlmparse-0.1.9.dist-info/RECORD +44 -0
  32. {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/WHEEL +1 -1
  33. vlmparse-0.1.7.dist-info/RECORD +0 -36
  34. {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/entry_points.txt +0 -0
  35. {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/licenses/LICENSE +0 -0
  36. {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,243 @@
1
+ import asyncio
2
+ import os
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ import httpx
7
+ import orjson
8
+ from loguru import logger
9
+ from pydantic import Field
10
+
11
+ from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
12
+ from vlmparse.clients.pipe_utils.utils import clean_response
13
+ from vlmparse.converter import BaseConverter, ConverterConfig
14
+ from vlmparse.data_model.document import BoundingBox, Item, Page
15
+ from vlmparse.servers.docker_compose_server import DockerComposeServerConfig
16
+ from vlmparse.utils import to_base64
17
+
18
# Location of the bundled docker-compose pipeline assets for GLM-OCR:
# <repo root>/docker_pipelines/glmocr
DOCKER_PIPELINE_DIR = Path(__file__).parents[2] / "docker_pipelines" / "glmocr"
21
+
22
+
23
class GLMOCRDockerServerConfig(DockerComposeServerConfig):
    """Docker Compose configuration for the GLM-OCR server stack."""

    model_name: str = "GLM-OCR"
    aliases: list[str] = Field(default_factory=lambda: ["glmocr", "glm-ocr"])
    # Compose file shipped with the package.
    compose_file: str = str(DOCKER_PIPELINE_DIR / "compose.yaml")
    # Service exposing the HTTP API, plus the full set of services to start.
    server_service: str = "glmocr-api"
    compose_services: list[str] = Field(
        default_factory=lambda: ["glmocr-api", "glmocr-vllm-server"]
    )
    # Only the vLLM backend needs GPU access.
    gpu_service_names: list[str] = Field(default_factory=lambda: ["glmocr-vllm-server"])
    docker_port: int = 5002
    container_port: int = 5002
    environment: dict[str, str] = Field(
        default_factory=lambda: {
            "VLM_BACKEND": "vllm",
            "API_PORT": "8080",
        }
    )
    environment_services: list[str] = Field(default_factory=lambda: ["glmocr-api"])
    # Log lines that signal the API server is ready to accept requests.
    server_ready_indicators: list[str] = Field(
        default_factory=lambda: ["Running on", "Application startup complete"]
    )

    def model_post_init(self, __context):
        """Pick up compose-level overrides from the host environment.

        Only applies when no explicit ``compose_env`` was supplied on the
        config, so programmatic configuration always wins.
        """
        if not self.compose_env:
            overrides = {
                key: value
                for key in (
                    "API_IMAGE_TAG_SUFFIX",
                    "VLM_IMAGE_TAG_SUFFIX",
                    "VLM_BACKEND",
                )
                if (value := os.getenv(key))
            }
            if overrides:
                self.compose_env = overrides

    @property
    def client_config(self):
        """Converter configuration pointing at the locally exposed API port."""
        return GLMOCRConverterConfig(
            **self._create_client_kwargs(f"http://localhost:{self.docker_port}")
        )
66
+
67
+
68
class GLMOCRConverterConfig(ConverterConfig):
    """Configuration for the GLM-OCR HTTP API client."""

    model_name: str = "GLM-OCR"
    aliases: list[str] = Field(default_factory=lambda: ["glmocr", "glm-ocr"])
    timeout: int = 600

    # Path of the parse endpoint on the GLM-OCR server.
    endpoint_parse: str = "/glmocr/parse"

    # --- GLM-OCR specific configuration ---

    # Output format requested from the server: "json", "markdown", or "both".
    output_format: str = "both"
    # Enable layout detection (PP-DocLayout).
    enable_layout: bool = True

    # Model sampling and image preprocessing parameters.
    max_tokens: int = 16384
    temperature: float = 0.01
    image_format: str = "JPEG"
    min_pixels: int = 12544
    max_pixels: int = 71372800

    # Backward-compat escape hatch: if set, applied last to the payload.
    request_overrides: dict[str, Any] = Field(default_factory=dict)

    def get_client(self, **kwargs) -> "GLMOCRConverter":
        """Instantiate a converter bound to this configuration."""
        return GLMOCRConverter(config=self, **kwargs)
97
+
98
+
99
class GLMOCRConverter(BaseConverter):
    """GLM-OCR HTTP API converter."""

    config: GLMOCRConverterConfig

    def _build_parse_payload(self, file_content_b64: str) -> dict:
        """Build the request payload for the GLM-OCR parse endpoint.

        Args:
            file_content_b64: Base64 encoded image content

        Returns:
            Dictionary payload for the API request
        """
        # Wrap base64 in data URI format as expected by GLM-OCR:
        # data:image/png;base64,<base64_data>
        data_uri = f"data:image/png;base64,{file_content_b64}"

        # GLM-OCR expects a list of images.
        payload: dict[str, Any] = {"images": [data_uri]}

        # Apply request overrides last so they win over the defaults.
        if self.config.request_overrides:
            payload.update(self.config.request_overrides)

        return payload

    async def _post_json(self, endpoint: str, payload: dict) -> dict:
        """Make a POST request to the GLM-OCR API.

        Args:
            endpoint: API endpoint path
            payload: Request payload

        Returns:
            Parsed JSON response

        Raises:
            RuntimeError: If the API returns an error payload
            httpx.HTTPStatusError: If the server responds with an error status
        """
        headers = {}

        async with httpx.AsyncClient(
            base_url=self.config.base_url, timeout=self.config.timeout, headers=headers
        ) as client:
            response = await client.post(endpoint, json=payload)

        response.raise_for_status()
        data = response.json()

        # The server may report application-level errors in the body even
        # with a 2xx status.
        if "error" in data:
            raise RuntimeError(data.get("error", "Unknown error"))

        return data

    def _apply_markdown(self, page: Page, markdown_text: str | None):
        """Apply markdown text to the page.

        Args:
            page: Page object to update
            markdown_text: Markdown content from GLM-OCR
        """
        text = markdown_text or ""
        text = clean_response(text)
        # Normalize HTML output to markdown while preserving tables.
        text = html_to_md_keep_tables(text)
        logger.debug(f"Converted markdown text: {text[:100]}...")
        page.text = text

    def _apply_items(self, page: Page, json_result: list[dict] | None):
        """Apply structured items to the page from JSON result.

        Args:
            page: Page object to update
            json_result: List of detected regions from GLM-OCR
        """
        if not json_result:
            return

        items: list[Item] = []
        for block in json_result:
            bbox = block.get("bbox_2d")
            # Skip regions without a usable 4-value bounding box.
            if not bbox or len(bbox) != 4:
                continue

            x1, y1, x2, y2 = bbox
            items.append(
                Item(
                    text=block.get("content") or "",
                    box=BoundingBox(l=x1, t=y1, r=x2, b=y2),
                    category=block.get("label") or "",
                )
            )

        page.items = items

    async def async_call_inside_page(self, page: Page) -> Page:
        """Process a single page through the GLM-OCR API.

        Args:
            page: Page object containing the image to process

        Returns:
            Updated Page object with OCR results
        """
        image = page.image

        # PNG encoding is CPU-bound; run it off the event loop.
        file_content_b64 = await asyncio.to_thread(to_base64, image, "PNG")

        payload = self._build_parse_payload(file_content_b64)
        data = await self._post_json(self.config.endpoint_parse, payload)

        # GLM-OCR returns one result per submitted document; we send a single
        # image, so the top-level fields correspond to this page.
        markdown_result = data.get("markdown_result")
        if markdown_result:
            self._apply_markdown(page, markdown_result)

        # Structured layout items are independent of the markdown output:
        # apply them even when markdown is absent (e.g. output_format="json").
        json_result = data.get("json_result")
        if json_result and isinstance(json_result, list) and len(json_result) > 0:
            # json_result may be a list of pages; take the first page.
            page_result = (
                json_result[0] if isinstance(json_result[0], list) else json_result
            )
            self._apply_items(page, page_result)

        # Always persist the full raw server response for debugging, not only
        # the markdown subset and not only when markdown is present.
        page.raw_response = orjson.dumps(data).decode("utf-8")

        return page
@@ -28,7 +28,11 @@ class GraniteDoclingDockerServerConfig(VLLMDockerServerConfig):
28
28
 
29
29
  @property
30
30
  def client_config(self):
31
- return GraniteDoclingConverterConfig(llm_params=self.llm_params)
31
+ return GraniteDoclingConverterConfig(
32
+ **self._create_client_kwargs(
33
+ f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
34
+ )
35
+ )
32
36
 
33
37
 
34
38
  class GraniteDoclingConverterConfig(OpenAIConverterConfig):
@@ -70,49 +74,18 @@ class GraniteDoclingConverter(OpenAIConverterClient):
70
74
  }
71
75
  ]
72
76
 
73
- doctags = await self._get_chat_completion_adaptive(
77
+ doctags, usage = await self._get_chat_completion(
74
78
  messages, completion_kwargs=self.config.completion_kwargs
75
79
  )
76
80
  doctags = clean_response(doctags)
77
81
 
78
82
  page.raw_response = doctags
79
83
  page.text = _doctags_to_markdown(doctags, image)
84
+ if usage is not None:
85
+ page.prompt_tokens = usage.prompt_tokens
86
+ page.completion_tokens = usage.completion_tokens
80
87
  return page
81
88
 
82
- async def _get_chat_completion_adaptive(
83
- self, messages: list[dict], completion_kwargs: dict | None
84
- ) -> str:
85
- """
86
- vLLM enforces input+output <= model context length. If `max_tokens` is too
87
- high (especially for multimodal prompts), retry with progressively smaller
88
- `max_tokens`.
89
- """
90
- kwargs = (completion_kwargs or {}).copy()
91
- max_tokens = kwargs.get("max_tokens") or kwargs.get("max_completion_tokens")
92
-
93
- for _ in range(6):
94
- try:
95
- return await self._get_chat_completion(
96
- messages, completion_kwargs=kwargs
97
- )
98
- except Exception as e:
99
- msg = str(e)
100
- too_large = (
101
- "max_tokens" in msg
102
- and "maximum context length" in msg
103
- and "is too large" in msg
104
- )
105
- if not too_large or not isinstance(max_tokens, int):
106
- raise
107
-
108
- max_tokens = max(256, int(max_tokens * 0.75))
109
- if "max_tokens" in kwargs:
110
- kwargs["max_tokens"] = max_tokens
111
- if "max_completion_tokens" in kwargs:
112
- kwargs["max_completion_tokens"] = max_tokens
113
-
114
- return await self._get_chat_completion(messages, completion_kwargs=kwargs)
115
-
116
89
 
117
90
  def _doctags_to_markdown(doctags: str, image):
118
91
  try:
@@ -25,7 +25,11 @@ class HunyuanOCRDockerServerConfig(VLLMDockerServerConfig):
25
25
 
26
26
  @property
27
27
  def client_config(self):
28
- return HunyuanOCRConverterConfig(llm_params=self.llm_params)
28
+ return HunyuanOCRConverterConfig(
29
+ **self._create_client_kwargs(
30
+ f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
31
+ )
32
+ )
29
33
 
30
34
 
31
35
  class HunyuanOCRConverterConfig(OpenAIConverterConfig):
@@ -25,7 +25,11 @@ class LightOnOCRDockerServerConfig(VLLMDockerServerConfig):
25
25
 
26
26
  @property
27
27
  def client_config(self):
28
- return LightOnOCRConverterConfig(llm_params=self.llm_params)
28
+ return LightOnOCRConverterConfig(
29
+ **self._create_client_kwargs(
30
+ f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
31
+ )
32
+ )
29
33
 
30
34
 
31
35
  class LightOnOCRConverterConfig(OpenAIConverterConfig):
@@ -41,3 +45,21 @@ class LightOnOCRConverterConfig(OpenAIConverterConfig):
41
45
  }
42
46
  dpi: int = 200
43
47
  aliases: list[str] = Field(default_factory=lambda: ["lightonocr"])
48
+
49
+
50
class LightonOCR21BServerConfig(LightOnOCRDockerServerConfig):
    """Server configuration for the LightOnOCR-2 1B model."""

    model_name: str = "lightonai/LightOnOCR-2-1B"
    aliases: list[str] = Field(default_factory=lambda: ["lightonocr2"])

    @property
    def client_config(self):
        """Converter configuration pointing at the locally exposed endpoint."""
        base_url = f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
        return LightonOCR21BConverterConfig(**self._create_client_kwargs(base_url))
61
+
62
+
63
class LightonOCR21BConverterConfig(LightOnOCRConverterConfig):
    """Converter configuration for the LightOnOCR-2 1B model."""

    model_name: str = "lightonai/LightOnOCR-2-1B"
    aliases: list[str] = Field(default_factory=lambda: ["lightonocr2"])
@@ -31,7 +31,6 @@ class MinerUDockerServerConfig(DockerServerConfig):
31
31
  class MinerUConverterConfig(ConverterConfig):
32
32
  """Configuration for MinerU API converter."""
33
33
 
34
- base_url: str
35
34
  model_name: str = "opendatalab/MinerU2.5-2509-1.2B"
36
35
  aliases: list[str] = Field(default_factory=lambda: ["mineru25"])
37
36
  timeout: int = 600
@@ -0,0 +1,85 @@
1
+ import os
2
+
3
+ import httpx
4
+ import orjson
5
+ from loguru import logger
6
+ from pydantic import Field
7
+
8
+ from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
9
+ from vlmparse.clients.pipe_utils.utils import clean_response
10
+ from vlmparse.converter import BaseConverter, ConverterConfig
11
+ from vlmparse.data_model.document import Page
12
+ from vlmparse.utils import to_base64
13
+
14
+
15
class MistralOCRConverterConfig(ConverterConfig):
    """Configuration for the Mistral OCR converter."""

    base_url: str = "https://api.mistral.ai/v1"
    model_name: str = "mistral-ocr-latest"
    # Resolved from MISTRAL_API_KEY at client construction when left unset.
    api_key: str | None = None
    timeout: int = 300
    aliases: list[str] = Field(
        default_factory=lambda: ["mistral-ocr-latest", "mistral-ocr"]
    )

    def get_client(self, **kwargs) -> "MistralOCRConverter":
        """Instantiate a converter bound to this configuration."""
        return MistralOCRConverter(config=self, **kwargs)
28
+
29
+
30
class MistralOCRConverter(BaseConverter):
    """Client for Mistral OCR API."""

    config: MistralOCRConverterConfig

    def __init__(self, config: MistralOCRConverterConfig, **kwargs):
        super().__init__(config=config, **kwargs)
        # Fall back to the environment when no key is configured explicitly.
        if not self.config.api_key:
            self.config.api_key = os.getenv("MISTRAL_API_KEY")
        if not self.config.api_key:
            raise ValueError("MISTRAL_API_KEY environment variable not set")
        self._base_url = self.config.base_url.rstrip("/")

    async def _async_ocr(self, image) -> httpx.Response:
        """POST the page image to the /ocr endpoint and return the response.

        Raises:
            httpx.HTTPStatusError: If the API responds with an error status.
        """
        headers = {"Authorization": f"Bearer {self.config.api_key}"}
        payload = {
            "model": self.config.model_name,
            "document": {
                "type": "image_url",
                "image_url": f"data:image/png;base64,{to_base64(image)}",
            },
        }

        async with httpx.AsyncClient(timeout=self.config.timeout) as client:
            response = await client.post(
                f"{self._base_url}/ocr",
                json=payload,
                headers=headers,
            )
        response.raise_for_status()
        return response

    async def async_call_inside_page(self, page: Page) -> Page:
        """Run OCR on the page image and populate ``page.text``."""
        response = await self._async_ocr(page.image)
        page.raw_response = response.text

        try:
            data = response.json()
        except ValueError:
            # Best effort: keep whatever text the server sent back.
            logger.warning("Mistral OCR returned non-JSON response")
            page.text = clean_response(response.text)
            return page

        pages = data.get("pages") or []
        if pages:
            first = pages[0]
            text = first.get("markdown") or first.get("text") or ""
        else:
            # No per-page payload: fall back to top-level fields, then to the
            # serialized body so the caller still sees something useful.
            text = (
                data.get("markdown") or data.get("text") or orjson.dumps(data).decode()
            )

        page.text = html_to_md_keep_tables(clean_response(text))
        return page
@@ -12,7 +12,11 @@ class NanonetOCR2DockerServerConfig(VLLMDockerServerConfig):
12
12
 
13
13
  @property
14
14
  def client_config(self):
15
- return NanonetOCR2ConverterConfig(llm_params=self.llm_params)
15
+ return NanonetOCR2ConverterConfig(
16
+ **self._create_client_kwargs(
17
+ f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
18
+ )
19
+ )
16
20
 
17
21
 
18
22
  class NanonetOCR2ConverterConfig(OpenAIConverterConfig):
@@ -23,7 +23,11 @@ class OlmOCRDockerServerConfig(VLLMDockerServerConfig):
23
23
 
24
24
  @property
25
25
  def client_config(self):
26
- return OlmOCRConverterConfig(llm_params=self.llm_params)
26
+ return OlmOCRConverterConfig(
27
+ **self._create_client_kwargs(
28
+ f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
29
+ )
30
+ )
27
31
 
28
32
 
29
33
  class OlmOCRConverterConfig(OpenAIConverterConfig):
@@ -37,7 +41,7 @@ class OlmOCRConverterConfig(OpenAIConverterConfig):
37
41
  "Return your output as markdown, with a front matter section on top specifying values for the primary_language, is_rotation_valid, rotation_correction, is_table, and is_diagram parameters."
38
42
  )
39
43
  postprompt: str | None = None
40
- completion_kwargs: dict | None = {
44
+ completion_kwargs: dict = {
41
45
  "temperature": 0.1,
42
46
  "max_tokens": 8000,
43
47
  }