vlmparse-0.1.8-py3-none-any.whl → vlmparse-0.1.9-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- vlmparse/cli.py +439 -286
- vlmparse/clients/deepseekocr.py +170 -0
- vlmparse/clients/glmocr.py +243 -0
- vlmparse/clients/paddleocrvl.py +191 -43
- vlmparse/converter_with_server.py +53 -16
- vlmparse/registries.py +20 -10
- vlmparse/servers/base_server.py +127 -0
- vlmparse/servers/docker_compose_deployment.py +489 -0
- vlmparse/servers/docker_compose_server.py +39 -0
- vlmparse/servers/docker_run_deployment.py +226 -0
- vlmparse/servers/docker_server.py +9 -125
- vlmparse/servers/server_registry.py +42 -0
- vlmparse/servers/utils.py +83 -219
- vlmparse/st_viewer/st_viewer.py +1 -1
- {vlmparse-0.1.8.dist-info → vlmparse-0.1.9.dist-info}/METADATA +3 -3
- {vlmparse-0.1.8.dist-info → vlmparse-0.1.9.dist-info}/RECORD +20 -14
- {vlmparse-0.1.8.dist-info → vlmparse-0.1.9.dist-info}/WHEEL +0 -0
- {vlmparse-0.1.8.dist-info → vlmparse-0.1.9.dist-info}/entry_points.txt +0 -0
- {vlmparse-0.1.8.dist-info → vlmparse-0.1.9.dist-info}/licenses/LICENSE +0 -0
- {vlmparse-0.1.8.dist-info → vlmparse-0.1.9.dist-info}/top_level.txt +0 -0
vlmparse/clients/deepseekocr.py
CHANGED

@@ -13,6 +13,10 @@ from vlmparse.data_model.document import Item, Page
 from vlmparse.servers.docker_server import VLLMDockerServerConfig
 from vlmparse.utils import to_base64
 
+# ==============================================================================
+# DeepSeek-OCR (v1)
+# ==============================================================================
+
 
 class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
     """Configuration for DeepSeekOCR model."""
@@ -212,3 +216,169 @@ class DeepSeekOCRConverterClient(OpenAIConverterClient):
             page.completion_tokens = usage.completion_tokens
 
         return page
+
+
+# ==============================================================================
+# DeepSeek-OCR-2
+# ==============================================================================
+
+
+class DeepSeekOCR2DockerServerConfig(VLLMDockerServerConfig):
+    """Configuration for DeepSeek-OCR-2 model.
+
+    DeepSeek-OCR-2 uses a custom architecture that requires:
+    - Custom model registration via hf_overrides
+    - NoRepeatNGram logits processor with specific whitelist tokens
+    - Custom image processor (DeepseekOCR2Processor)
+    """
+
+    docker_image: str = "vllm/vllm-openai:nightly"
+    model_name: str = "deepseek-ai/DeepSeek-OCR-2"
+    command_args: list[str] = Field(
+        default_factory=lambda: [
+            "--limit-mm-per-prompt",
+            '{"image": 1}',
+            "--hf-overrides",
+            '{"architectures": ["DeepseekOCR2ForCausalLM"]}',
+            "--block-size",
+            "256",
+            "--trust-remote-code",
+            "--max-model-len",
+            "8192",
+            "--swap-space",
+            "0",
+            "--gpu-memory-utilization",
+            "0.9",
+            "--logits_processors",
+            "vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor",
+        ]
+    )
+    aliases: list[str] = Field(
+        default_factory=lambda: ["deepseekocr2", "DeepSeek-OCR-2"]
+    )
+
+    @property
+    def client_config(self):
+        return DeepSeekOCR2ConverterConfig(
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+            )
+        )
+
+
+class DeepSeekOCR2ConverterConfig(OpenAIConverterConfig):
+    """DeepSeek-OCR-2 converter configuration.
+
+    Key differences from DeepSeek-OCR v1:
+    - Uses DeepseekOCR2ForCausalLM architecture
+    - Different logits processor parameters (ngram_size=20, window_size=50)
+    - Supports cropping mode for image processing
+    """
+
+    model_name: str = "deepseek-ai/DeepSeek-OCR-2"
+    aliases: list[str] = Field(
+        default_factory=lambda: ["deepseekocr2", "DeepSeek-OCR-2"]
+    )
+    postprompt: str | None = None
+    prompts: dict[str, str] = {
+        "layout": "<|grounding|>Convert the document to markdown.",
+        "ocr": "Free OCR.",
+        "image_description": "Describe this image in detail.",
+    }
+    prompt_mode_map: dict[str, str] = {
+        "ocr_layout": "layout",
+        "table": "layout",
+    }
+
+    completion_kwargs: dict | None = {
+        "temperature": 0.0,
+        "max_tokens": 8180,
+        "extra_body": {
+            "skip_special_tokens": False,
+            # args used to control custom logits processor
+            "vllm_xargs": {
+                "ngram_size": 20,
+                "window_size": 50,
+                # whitelist: <td>, </td>
+                "whitelist_token_ids": [128821, 128822],
+            },
+        },
+    }
+    dpi: int = 144  # Default DPI used in reference implementation
+
+    def get_client(self, **kwargs) -> "DeepSeekOCR2ConverterClient":
+        return DeepSeekOCR2ConverterClient(config=self, **kwargs)
+
+
+class DeepSeekOCR2ConverterClient(DeepSeekOCRConverterClient):
+    """Client for DeepSeek-OCR-2 with specific post-processing.
+
+    Inherits from DeepSeekOCRConverterClient as the post-processing logic
+    for parsing grounding references and extracting items is the same.
+    The main differences are in the model configuration and logits processor.
+    """
+
+    async def async_call_inside_page(self, page: Page) -> Page:
+        # Prepare messages as in parent class
+        image = page.image
+
+        prompt_key = self.get_prompt_key() or "ocr"
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/png;base64,{to_base64(image)}"
+                        },
+                    },
+                    {"type": "text", "text": self.config.prompts[prompt_key]},
+                ],
+            },
+        ]
+
+        # Get raw response using parent's method
+        response, usage = await self._get_chat_completion(messages)
+        logger.info("Response length: " + str(len(response)))
+        page.raw_response = response
+
+        if prompt_key == "layout":
+            # Post-processing
+            matches, matches_image, matches_other = re_match(response)
+
+            # Extract items (bounding boxes)
+            page.items = self.extract_items(page.image, matches)
+
+            # Clean text
+            outputs = response
+
+            # Check for sentence end marker (indicates successful completion)
+            # If not present, it might be due to repetition detection
+            if "<|end▁of▁sentence|>" in outputs:
+                outputs = outputs.replace("<|end▁of▁sentence|>", "")
+
+            # Replace image references with a placeholder
+            for a_match_image in matches_image:
+                outputs = outputs.replace(a_match_image, "![image]")
+
+            # Replace other references (text grounding) and cleanup
+            for a_match_other in matches_other:
+                outputs = (
+                    outputs.replace(a_match_other, "")
+                    .replace("\\coloneqq", ":=")
+                    .replace("\\eqqcolon", "=:")
+                    .replace("\n\n\n\n", "\n\n")
+                    .replace("\n\n\n", "\n\n")
+                )
+        else:
+            outputs = response
+
+        page.text = outputs.strip()
+        logger.debug(page.text)
+        if usage is not None:
+            page.prompt_tokens = usage.prompt_tokens
+            page.completion_tokens = usage.completion_tokens
+
+        return page
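A minimal usage sketch for the new DeepSeek-OCR-2 classes above, assuming the vLLM container is already running on `docker_port` and that `Page` can be built from a PIL image; neither of those details is shown in this diff, so treat them as assumptions rather than documented API:

```python
# Hypothetical driver for the DeepSeek-OCR-2 config/client added in this diff.
# Assumes the Docker server is already up and Page(image=...) is a valid
# constructor; both are assumptions, not confirmed by the diff.
import asyncio

from PIL import Image

from vlmparse.clients.deepseekocr import DeepSeekOCR2DockerServerConfig
from vlmparse.data_model.document import Page


async def main() -> None:
    server_cfg = DeepSeekOCR2DockerServerConfig()
    client = server_cfg.client_config.get_client()  # DeepSeekOCR2ConverterClient

    page = Page(image=Image.open("scan.png"))  # assumed constructor
    page = await client.async_call_inside_page(page)

    print(page.text)        # cleaned markdown in "layout" mode
    print(len(page.items))  # grounded bounding boxes ("layout" mode only)


asyncio.run(main())
```

The `client_config` property wires the converter to the locally mapped port, so the base URL never has to be passed by hand.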
vlmparse/clients/glmocr.py
ADDED

@@ -0,0 +1,243 @@
+import asyncio
+import os
+from pathlib import Path
+from typing import Any
+
+import httpx
+import orjson
+from loguru import logger
+from pydantic import Field
+
+from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
+from vlmparse.clients.pipe_utils.utils import clean_response
+from vlmparse.converter import BaseConverter, ConverterConfig
+from vlmparse.data_model.document import BoundingBox, Item, Page
+from vlmparse.servers.docker_compose_server import DockerComposeServerConfig
+from vlmparse.utils import to_base64
+
+DOCKER_PIPELINE_DIR = (
+    Path(__file__).parent.parent.parent / "docker_pipelines" / "glmocr"
+)
+
+
+class GLMOCRDockerServerConfig(DockerComposeServerConfig):
+    """Docker Compose configuration for GLM-OCR server."""
+
+    model_name: str = "GLM-OCR"
+    aliases: list[str] = Field(default_factory=lambda: ["glmocr", "glm-ocr"])
+    compose_file: str = str(DOCKER_PIPELINE_DIR / "compose.yaml")
+    server_service: str = "glmocr-api"
+    compose_services: list[str] = Field(
+        default_factory=lambda: ["glmocr-api", "glmocr-vllm-server"]
+    )
+    gpu_service_names: list[str] = Field(default_factory=lambda: ["glmocr-vllm-server"])
+    docker_port: int = 5002
+    container_port: int = 5002
+    environment: dict[str, str] = Field(
+        default_factory=lambda: {
+            "VLM_BACKEND": "vllm",
+            "API_PORT": "8080",
+        }
+    )
+    environment_services: list[str] = Field(default_factory=lambda: ["glmocr-api"])
+    server_ready_indicators: list[str] = Field(
+        default_factory=lambda: ["Running on", "Application startup complete"]
+    )
+
+    def model_post_init(self, __context):
+        if not self.compose_env:
+            compose_env = {}
+            for key in [
+                "API_IMAGE_TAG_SUFFIX",
+                "VLM_IMAGE_TAG_SUFFIX",
+                "VLM_BACKEND",
+            ]:
+                value = os.getenv(key)
+                if value:
+                    compose_env[key] = value
+            if compose_env:
+                self.compose_env = compose_env
+
+    @property
+    def client_config(self):
+        return GLMOCRConverterConfig(
+            **self._create_client_kwargs(f"http://localhost:{self.docker_port}")
+        )
+
+
+class GLMOCRConverterConfig(ConverterConfig):
+    """Configuration for GLM-OCR API client."""
+
+    model_name: str = "GLM-OCR"
+    aliases: list[str] = Field(default_factory=lambda: ["glmocr", "glm-ocr"])
+    timeout: int = 600
+
+    endpoint_parse: str = "/glmocr/parse"
+
+    # GLM-OCR specific configuration
+
+    # Output format: "json", "markdown", or "both"
+    output_format: str = "both"
+
+    # Enable layout detection (PP-DocLayout)
+    enable_layout: bool = True
+
+    # GLM-OCR model parameters
+    max_tokens: int = 16384
+    temperature: float = 0.01
+    image_format: str = "JPEG"
+    min_pixels: int = 12544
+    max_pixels: int = 71372800
+
+    # Backward-compat escape hatch: if set, applied last to the payload.
+    request_overrides: dict[str, Any] = Field(default_factory=dict)
+
+    def get_client(self, **kwargs) -> "GLMOCRConverter":
+        return GLMOCRConverter(config=self, **kwargs)
+
+
+class GLMOCRConverter(BaseConverter):
+    """GLM-OCR HTTP API converter."""
+
+    config: GLMOCRConverterConfig
+
+    def _build_parse_payload(self, file_content_b64: str) -> dict:
+        """Build the request payload for the GLM-OCR parse endpoint.
+
+        Args:
+            file_content_b64: Base64 encoded image content
+
+        Returns:
+            Dictionary payload for the API request
+        """
+        # Wrap base64 in data URI format as expected by GLM-OCR
+        # Format: data:image/png;base64,<base64_data>
+        data_uri = f"data:image/png;base64,{file_content_b64}"
+
+        payload: dict[str, Any] = {
+            "images": [data_uri]  # GLM-OCR expects a list
+        }
+
+        # Apply any request overrides
+        if self.config.request_overrides:
+            payload.update(self.config.request_overrides)
+
+        return payload
+
+    async def _post_json(self, endpoint: str, payload: dict) -> dict:
+        """Make a POST request to the GLM-OCR API.
+
+        Args:
+            endpoint: API endpoint path
+            payload: Request payload
+
+        Returns:
+            Parsed JSON response
+
+        Raises:
+            RuntimeError: If the API returns an error
+        """
+        headers = {}
+
+        async with httpx.AsyncClient(
+            base_url=self.config.base_url, timeout=self.config.timeout, headers=headers
+        ) as client:
+            response = await client.post(endpoint, json=payload)
+
+            response.raise_for_status()
+            data = response.json()
+
+            # Check for error in response
+            if "error" in data:
+                raise RuntimeError(data.get("error", "Unknown error"))
+
+            return data
+
+    def _apply_markdown(self, page: Page, markdown_text: str | None):
+        """Apply markdown text to the page.
+
+        Args:
+            page: Page object to update
+            markdown_text: Markdown content from GLM-OCR
+        """
+        text = markdown_text or ""
+        text = clean_response(text)
+        text = html_to_md_keep_tables(text)
+        logger.debug(f"Converted markdown text: {text[:100]}...")
+        page.text = text
+
+    def _apply_items(self, page: Page, json_result: list[dict] | None):
+        """Apply structured items to the page from JSON result.
+
+        Args:
+            page: Page object to update
+            json_result: List of detected regions from GLM-OCR
+        """
+        if not json_result:
+            return
+
+        items: list[Item] = []
+
+        for block in json_result:
+            bbox = block.get("bbox_2d")
+            if not bbox or len(bbox) != 4:
+                # If no bbox, skip this item
+                continue
+
+            x1, y1, x2, y2 = bbox
+            text = block.get("content") or ""
+            label = block.get("label") or ""
+
+            items.append(
+                Item(
+                    text=text,
+                    box=BoundingBox(l=x1, t=y1, r=x2, b=y2),
+                    category=label,
+                )
+            )
+
+        page.items = items
+
+    async def async_call_inside_page(self, page: Page) -> Page:
+        """Process a single page through the GLM-OCR API.
+
+        Args:
+            page: Page object containing the image to process
+
+        Returns:
+            Updated Page object with OCR results
+        """
+        image = page.image
+
+        # Convert image to base64
+        file_content_b64 = await asyncio.to_thread(to_base64, image, "PNG")
+
+        # Build request payload
+        payload = self._build_parse_payload(file_content_b64)
+
+        # Call the GLM-OCR API
+        data = await self._post_json(self.config.endpoint_parse, payload)
+
+        # GLM-OCR returns results as a list (one per document)
+        # Since we send one image, we get one document result
+        result = data.get("markdown_result", None)
+
+        if result:
+            # Get markdown output if available
+            markdown_result = result
+            if markdown_result:
+                self._apply_markdown(page, markdown_result)
+
+        # Get JSON output if available and layout detection is enabled
+        json_result = data.get("json_result")
+        if json_result and isinstance(json_result, list) and len(json_result) > 0:
+            # json_result is a list of pages, take the first page
+            page_result = (
+                json_result[0] if isinstance(json_result[0], list) else json_result
+            )
+            self._apply_items(page, page_result)
+
+        # Store raw response
+        page.raw_response = orjson.dumps(result).decode("utf-8")
+
+        return page
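For reference, the HTTP contract that `GLMOCRConverter` relies on can be exercised directly. A minimal sketch, assuming the compose stack from `GLMOCRDockerServerConfig` is up and mapped to localhost:5002; the image path is illustrative:

```python
# Direct call against the GLM-OCR API, mirroring _build_parse_payload and
# async_call_inside_page above. Host/port and file name are assumptions.
import base64

import httpx

with open("page.png", "rb") as f:
    b64 = base64.b64encode(f.read()).decode()

# The endpoint expects a list of data URIs under "images".
payload = {"images": [f"data:image/png;base64,{b64}"]}
resp = httpx.post("http://localhost:5002/glmocr/parse", json=payload, timeout=600)
resp.raise_for_status()
data = resp.json()
if "error" in data:
    raise RuntimeError(data["error"])

markdown = data.get("markdown_result")  # becomes page.text after cleanup
regions = data.get("json_result")       # blocks with "bbox_2d", "content", "label"
```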
vlmparse/clients/paddleocrvl.py
CHANGED

@@ -1,56 +1,204 @@
+import asyncio
+import os
+from pathlib import Path
+from typing import Any
+
+import httpx
+import orjson
+from loguru import logger
 from pydantic import Field
 
-from vlmparse.clients.
-from vlmparse.
+from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
+from vlmparse.clients.pipe_utils.utils import clean_response
+from vlmparse.converter import BaseConverter, ConverterConfig
+from vlmparse.data_model.document import BoundingBox, Item, Page
+from vlmparse.servers.docker_compose_server import DockerComposeServerConfig
+from vlmparse.utils import to_base64
+
+DOCKER_PIPELINE_DIR = (
+    Path(__file__).parent.parent.parent / "docker_pipelines" / "paddleocrvl"
+)
 
 
-class PaddleOCRVLDockerServerConfig(
-    """
+class PaddleOCRVLDockerServerConfig(DockerComposeServerConfig):
+    """Docker Compose configuration for PaddleOCR-VL server."""
 
-    model_name: str = "
-
-        default_factory=lambda: [
-
-
-
-
-
-
-
+    model_name: str = "PaddleOCR-VL-1.5"
+    aliases: list[str] = Field(
+        default_factory=lambda: ["paddleocrvl1.5", "paddleocr-vl-1.5"]
+    )
+    compose_file: str = str(DOCKER_PIPELINE_DIR / "compose.yaml")
+    server_service: str = "paddleocr-vl-api"
+    compose_services: list[str] = Field(
+        default_factory=lambda: ["paddleocr-vl-api", "paddleocr-vlm-server"]
+    )
+    gpu_service_names: list[str] = Field(
+        default_factory=lambda: ["paddleocr-vl-api", "paddleocr-vlm-server"]
+    )
+    docker_port: int = 8080
+    container_port: int = 8080
+    environment: dict[str, str] = Field(
+        default_factory=lambda: {
+            "VLM_BACKEND": "vllm",
+        }
     )
-
+    environment_services: list[str] = Field(
+        default_factory=lambda: ["paddleocr-vl-api"]
+    )
+    server_ready_indicators: list[str] = Field(
+        default_factory=lambda: ["Application startup complete", "Uvicorn running"]
+    )
+
+    def model_post_init(self, __context):
+        if not self.compose_env:
+            compose_env = {}
+            for key in [
+                "API_IMAGE_TAG_SUFFIX",
+                "VLM_IMAGE_TAG_SUFFIX",
+                "VLM_BACKEND",
+            ]:
+                value = os.getenv(key)
+                if value:
+                    compose_env[key] = value
+            if compose_env:
+                self.compose_env = compose_env
 
     @property
     def client_config(self):
         return PaddleOCRVLConverterConfig(
-            **self._create_client_kwargs(
-                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
-            )
+            **self._create_client_kwargs(f"http://localhost:{self.docker_port}")
         )
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+class PaddleOCRVLConverterConfig(ConverterConfig):
+    """Configuration for PaddleOCR-VL API client."""
+
+    model_name: str = "PaddleOCR-VL-1.5"
+    aliases: list[str] = Field(
+        default_factory=lambda: ["paddleocrvl1.5", "paddleocr-vl-1.5"]
+    )
+    timeout: int = 600
+
+    endpoint_layout_parsing: str = "/layout-parsing"
+    endpoint_restructure_pages: str = "/restructure-pages"
+
+    # Dict of PaddleOCR-VL API args.
+    # Keys should match the PaddleOCR-VL API JSON fields (camelCase), e.g.
+    # {"useLayoutDetection": true, "promptLabel": "..."}.
+    paddleocr_args: dict[str, Any] = Field(
+        default_factory=lambda: {
+            # Preserve previous default behavior (these were always sent before).
+            "prettifyMarkdown": True,
+            "showFormulaNumber": False,
+            "restructurePages": False,
+        }
+    )
+
+    # Optional args for the /restructure-pages endpoint (if/when used).
+    restructure_args: dict[str, Any] = Field(default_factory=dict)
+
+    # Backward-compat escape hatch: if set, applied last to the payload.
+    request_overrides: dict[str, Any] = Field(default_factory=dict)
+
+    def get_client(self, **kwargs) -> "PaddleOCRVLConverter":
+        return PaddleOCRVLConverter(config=self, **kwargs)
+
+
+class PaddleOCRVLConverter(BaseConverter):
+    """PaddleOCR-VL HTTP API converter."""
+
+    config: PaddleOCRVLConverterConfig
+
+    def _build_layout_payload(self, file_content_b64: str, file_type: int | None):
+        payload: dict[str, Any] = {"file": file_content_b64}
+
+        if self.config.paddleocr_args:
+            payload.update(self.config.paddleocr_args)
+
+        if file_type is not None:
+            payload["fileType"] = file_type
+
+        if self.config.request_overrides:
+            payload.update(self.config.request_overrides)
+
+        return payload
+
+    def _build_restructure_payload(self, layout_results: list[dict]) -> dict:
+        pages = []
+        for page_result in layout_results:
+            pruned = page_result.get("prunedResult")
+            markdown = page_result.get("markdown") or {}
+            if pruned is None:
+                continue
+            pages.append(
+                {
+                    "prunedResult": pruned,
+                    "markdownImages": markdown.get("images"),
+                }
+            )
+
+        payload: dict[str, Any] = {"pages": pages}
+
+        if self.config.restructure_args:
+            payload.update(self.config.restructure_args)
+
+        return payload
+
+    async def _post_json(self, endpoint: str, payload: dict) -> dict:
+        async with httpx.AsyncClient(
+            base_url=self.config.base_url, timeout=self.config.timeout
+        ) as client:
+            response = await client.post(endpoint, json=payload)
+
+            response.raise_for_status()
+            data = response.json()
+            if data.get("errorCode", 0) != 0:
+                raise RuntimeError(data.get("errorMsg", "Unknown error"))
+            return data
+
+    def _apply_markdown(self, page: Page, markdown_text: str | None):
+        text = markdown_text or ""
+        text = clean_response(text)
+        text = html_to_md_keep_tables(text)
+        logger.debug(f"Converted markdown text: {text}...")
+        page.text = text
+
+    def _apply_items(self, page: Page, pruned_result: dict | None):
+        if not pruned_result:
+            return
+        parsing_res_list = pruned_result.get("parsing_res_list") or []
+        items: list[Item] = []
+        for block in parsing_res_list:
+            bbox = block.get("block_bbox")
+            if not bbox or len(bbox) != 4:
+                logger.warning(f"Invalid bbox in block: {block}")
+                continue
+            l, t, r, b = bbox
+            text = block.get("block_content") or ""
+            items.append(
+                Item(
+                    text=text,
+                    box=BoundingBox(l=l, t=t, r=r, b=b),
+                    category=block.get("block_label") or "",
+                )
+            )
+
+        page.items = items
+
+    async def async_call_inside_page(self, page: Page) -> Page:
+        image = page.image
+        file_content_b64 = await asyncio.to_thread(to_base64, image, "PNG")
+        payload = self._build_layout_payload(file_content_b64, 1)
+
+        data = await self._post_json(self.config.endpoint_layout_parsing, payload)
+        result = data.get("result", {})
+        layout_results = result.get("layoutParsingResults", [])
+        if layout_results:
+            first = layout_results[0]
+
+            markdown = first.get("markdown") or {}
+            self._apply_markdown(page, markdown.get("text"))
+            self._apply_items(page, first.get("prunedResult"))
+            page.raw_response = orjson.dumps(first).decode("utf-8")
+
+        return page
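The PaddleOCR-VL converter speaks to a compose-deployed HTTP API in the same style. A minimal sketch of the `/layout-parsing` call it makes, assuming the stack from `PaddleOCRVLDockerServerConfig` is up on localhost:8080 and using the default `paddleocr_args`; the image path is illustrative:

```python
# Direct call against the PaddleOCR-VL /layout-parsing endpoint, mirroring
# _build_layout_payload and _post_json above. Host/port and file name are
# assumptions.
import base64

import httpx

with open("page.png", "rb") as f:
    b64 = base64.b64encode(f.read()).decode()

payload = {
    "file": b64,
    "fileType": 1,              # 1 = image, as sent by async_call_inside_page
    "prettifyMarkdown": True,   # defaults from paddleocr_args
    "showFormulaNumber": False,
    "restructurePages": False,
}
resp = httpx.post("http://localhost:8080/layout-parsing", json=payload, timeout=600)
resp.raise_for_status()
data = resp.json()
if data.get("errorCode", 0) != 0:
    raise RuntimeError(data.get("errorMsg", "Unknown error"))

first = data["result"]["layoutParsingResults"][0]
markdown_text = (first.get("markdown") or {}).get("text")           # page.text source
blocks = (first.get("prunedResult") or {}).get("parsing_res_list")  # page.items source
```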