vlmparse 0.1.7-py3-none-any.whl → 0.1.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vlmparse/build_doc.py +20 -19
- vlmparse/cli.py +439 -270
- vlmparse/clients/chandra.py +176 -60
- vlmparse/clients/deepseekocr.py +193 -12
- vlmparse/clients/docling.py +0 -1
- vlmparse/clients/dotsocr.py +34 -31
- vlmparse/clients/glmocr.py +243 -0
- vlmparse/clients/granite_docling.py +9 -36
- vlmparse/clients/hunyuanocr.py +5 -1
- vlmparse/clients/lightonocr.py +23 -1
- vlmparse/clients/mineru.py +0 -1
- vlmparse/clients/mistral_converter.py +85 -0
- vlmparse/clients/nanonetocr.py +5 -1
- vlmparse/clients/olmocr.py +6 -2
- vlmparse/clients/openai_converter.py +95 -60
- vlmparse/clients/paddleocrvl.py +195 -40
- vlmparse/converter.py +51 -11
- vlmparse/converter_with_server.py +92 -19
- vlmparse/registries.py +107 -89
- vlmparse/servers/base_server.py +127 -0
- vlmparse/servers/docker_compose_deployment.py +489 -0
- vlmparse/servers/docker_compose_server.py +39 -0
- vlmparse/servers/docker_run_deployment.py +226 -0
- vlmparse/servers/docker_server.py +17 -109
- vlmparse/servers/model_identity.py +48 -0
- vlmparse/servers/server_registry.py +42 -0
- vlmparse/servers/utils.py +83 -219
- vlmparse/st_viewer/st_viewer.py +1 -1
- vlmparse/utils.py +15 -2
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/METADATA +13 -3
- vlmparse-0.1.9.dist-info/RECORD +44 -0
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/WHEEL +1 -1
- vlmparse-0.1.7.dist-info/RECORD +0 -36
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/entry_points.txt +0 -0
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/licenses/LICENSE +0 -0
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/top_level.txt +0 -0
vlmparse/clients/chandra.py
CHANGED
@@ -1,6 +1,9 @@
+import json
 import math
 import time
+from dataclasses import asdict, dataclass
 
+from bs4 import BeautifulSoup
 from loguru import logger
 from PIL import Image
 from pydantic import Field
@@ -11,7 +14,8 @@ from vlmparse.clients.openai_converter import (
 )
 from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
 from vlmparse.clients.pipe_utils.utils import clean_response
-from vlmparse.data_model.
+from vlmparse.data_model.box import BoundingBox
+from vlmparse.data_model.document import Item, Page
 from vlmparse.servers.docker_server import VLLMDockerServerConfig
 from vlmparse.utils import to_base64
 
@@ -110,11 +114,6 @@ OCR this image to HTML.
 {PROMPT_ENDING}
 """.strip()
 
-PROMPT_MAPPING = {
-    "ocr_layout": OCR_LAYOUT_PROMPT,
-    "ocr": OCR_PROMPT,
-}
-
 
 def scale_to_fit(
     img: Image.Image,
@@ -188,11 +187,135 @@ def detect_repeat_token(
     return False
 
 
+@dataclass
+class LayoutBlock:
+    """Represents a layout block with bounding box and content."""
+
+    bbox: list[int]
+    label: str
+    content: str
+
+
+def parse_layout(
+    html: str, image: Image.Image, bbox_scale: int = 1024
+) -> list[LayoutBlock]:
+    """
+    Parse HTML layout blocks with bounding boxes.
+
+    Args:
+        html: HTML string with layout blocks (divs with data-bbox and data-label attributes)
+        image: PIL Image to get dimensions for bbox scaling
+        bbox_scale: The scale used in the prompt for normalized bboxes
+
+    Returns:
+        List of LayoutBlock objects with scaled bounding boxes
+    """
+    soup = BeautifulSoup(html, "html.parser")
+    top_level_divs = soup.find_all("div", recursive=False)
+    width, height = image.size
+    width_scaler = width / bbox_scale
+    height_scaler = height / bbox_scale
+    layout_blocks = []
+
+    for div in top_level_divs:
+        bbox = div.get("data-bbox")
+
+        try:
+            bbox = json.loads(bbox)
+            assert len(bbox) == 4, "Invalid bbox length"
+        except Exception:
+            try:
+                bbox = bbox.split(" ")
+                assert len(bbox) == 4, "Invalid bbox length"
+            except Exception:
+                # Default bbox if parsing fails
+                bbox = [0, 0, bbox_scale, bbox_scale]
+
+        bbox = list(map(int, bbox))
+        # Scale bbox to image dimensions
+        bbox = [
+            max(0, int(bbox[0] * width_scaler)),
+            max(0, int(bbox[1] * height_scaler)),
+            min(int(bbox[2] * width_scaler), width),
+            min(int(bbox[3] * height_scaler), height),
+        ]
+
+        label = div.get("data-label", "block")
+        content = str(div.decode_contents())
+        layout_blocks.append(LayoutBlock(bbox=bbox, label=label, content=content))
+
+    return layout_blocks
+
+
+def parse_chunks(html: str, image: Image.Image, bbox_scale: int = 1024) -> list[dict]:
+    """
+    Parse HTML layout blocks into dictionaries.
+
+    Args:
+        html: HTML string with layout blocks
+        image: PIL Image to get dimensions for bbox scaling
+        bbox_scale: The scale used in the prompt for normalized bboxes
+
+    Returns:
+        List of dictionaries with bbox, label, and content keys
+    """
+    layout = parse_layout(html, image, bbox_scale=bbox_scale)
+    chunks = [asdict(block) for block in layout]
+    return chunks
+
+
+def layout_blocks_to_items(
+    layout_blocks: list[LayoutBlock],
+) -> list[Item]:
+    """
+    Convert layout blocks to Item objects for the Page model.
+
+    Args:
+        layout_blocks: List of LayoutBlock objects
+
+    Returns:
+        List of Item objects with category, box, and text
+    """
+    items = []
+    for block in layout_blocks:
+        # Convert content HTML to markdown
+        try:
+            text = html_to_md_keep_tables(block.content)
+        except Exception as e:
+            logger.warning(f"Error converting block content to markdown: {e}")
+            text = block.content
+
+        # Create bounding box from [x0, y0, x1, y1] format
+        bbox = BoundingBox(
+            l=block.bbox[0],
+            t=block.bbox[1],
+            r=block.bbox[2],
+            b=block.bbox[3],
+        )
+
+        items.append(
+            Item(
+                category=block.label,
+                box=bbox,
+                text=text.strip(),
+            )
+        )
+
+    return items
+
+
 class ChandraConverterConfig(OpenAIConverterConfig):
     """Chandra converter configuration."""
 
     model_name: str = "datalab-to/chandra"
-
+    postprompt: str | None = None
+    prompts: dict[str, str] = {
+        "ocr": OCR_PROMPT,
+        "ocr_layout": OCR_LAYOUT_PROMPT,
+    }
+    prompt_mode_map: dict[str, str] = {
+        "table": "ocr_layout",
+    }
     bbox_scale: int = 1024
     max_retries: int = 0
     max_failure_retries: int = None
@@ -216,8 +339,7 @@ class ChandraConverterClient(OpenAIConverterClient):
 
     async def async_call_inside_page(self, page: Page) -> Page:
         """Process a single page using Chandra logic."""
-
-        prompt = PROMPT_MAPPING.get(self.config.prompt_type, OCR_PROMPT)
+        prompt = self.get_prompt_for_mode() or OCR_PROMPT
         prompt = prompt.replace("{bbox_scale}", str(self.config.bbox_scale))
 
         image = scale_to_fit(page.image)
@@ -238,61 +360,34 @@ class ChandraConverterClient(OpenAIConverterClient):
 
         retries = 0
         max_retries = self.config.max_retries
-        max_failure_retries = self.config.max_failure_retries
 
         result_content = ""
-        error_occurred = False
 
         while True:
-            try:
-                # Adjust temperature if retrying
-                temperature = self.config.completion_kwargs.get("temperature", 0.0)
-                if retries > 0:
-                    temperature = 0.3  # As per vllm.py logic
-
-                completion_kwargs = self.config.completion_kwargs.copy()
-                completion_kwargs["temperature"] = temperature
-                if retries > 0:
-                    completion_kwargs["top_p"] = 0.95
-
-                result_content = await self._get_chat_completion(
-                    messages, completion_kwargs=completion_kwargs
-                )
-                error_occurred = False
-            except Exception as e:
-                logger.error(f"Error during VLLM generation: {e}")
-                error_occurred = True
-                result_content = ""
-
             should_retry = False
-
-            if
-
-
+            # Adjust temperature if retrying
+            temperature = self.config.completion_kwargs.get("temperature", 0.0)
+            if retries > 0:
+                temperature = 0.3  # As per vllm.py logic
+
+            completion_kwargs = self.config.completion_kwargs.copy()
+            completion_kwargs["temperature"] = temperature
+            if retries > 0:
+                completion_kwargs["top_p"] = 0.95
+
+            result_content, usage = await self._get_chat_completion(
+                messages, completion_kwargs=completion_kwargs
+            )
+
+            has_repeat = detect_repeat_token(result_content) or (
+                len(result_content) > 50
+                and detect_repeat_token(result_content, cut_from_end=50)
+            )
+            if has_repeat and retries < max_retries:
+                logger.warning(
+                    f"Detected repeat token, retrying generation (attempt {retries + 1})..."
                 )
-
-            logger.warning(
-                f"Detected repeat token, retrying generation (attempt {retries + 1})..."
-            )
-            should_retry = True
-
-            # Check for error
-            if error_occurred:
-                if max_failure_retries is not None:
-                    if retries < max_failure_retries:
-                        logger.warning(
-                            f"Detected vllm error, retrying generation (attempt {retries + 1})..."
-                        )
-                        should_retry = True
-                elif (
-                    retries < max_retries
-                ):  # Fallback to max_retries if max_failure_retries not set (vllm.py logic varies slightly but this is safe)
-                    logger.warning(
-                        f"Detected vllm error, retrying generation (attempt {retries + 1})..."
-                    )
-                    should_retry = True
+                should_retry = True
 
             if should_retry:
                 time.sleep(2 * (retries + 1))
@@ -305,10 +400,27 @@ class ChandraConverterClient(OpenAIConverterClient):
         page.raw_response = result_content
         text = clean_response(result_content)
 
+        # Check if we're in layout mode (ocr_layout prompt)
+        current_prompt_key = self.get_prompt_key()
+        is_layout_mode = current_prompt_key == "ocr_layout"
+
+        if is_layout_mode:
+            # Parse layout blocks and populate items
+            try:
+                layout_blocks = parse_layout(
+                    text, image, bbox_scale=self.config.bbox_scale
+                )
+                page.items = layout_blocks_to_items(layout_blocks)
+                logger.info(f"Parsed {len(page.items)} layout blocks")
+            except Exception as e:
+                logger.warning(f"Error parsing layout blocks: {e}")
+                page.items = []
+
         # Convert HTML to MD
         text = html_to_md_keep_tables(text)
         page.text = text
-
+        page.completion_tokens = usage.completion_tokens
+        page.prompt_tokens = usage.prompt_tokens
         return page
 
 
@@ -320,4 +432,8 @@ class ChandraDockerServerConfig(VLLMDockerServerConfig):
 
     @property
     def client_config(self):
-        return ChandraConverterConfig(
+        return ChandraConverterConfig(
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+            )
+        )
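The layout path added above is self-contained enough to exercise directly. A minimal usage sketch (not part of the package) of how the new parse_layout and layout_blocks_to_items helpers fit together, assuming only the imports and signatures shown in the chandra.py diff; the sample HTML, labels, and image size are invented here for illustration:

# Usage sketch: feed the model's HTML layout output through the new helpers.
from PIL import Image

from vlmparse.clients.chandra import layout_blocks_to_items, parse_layout

html = (
    '<div data-bbox="[32, 40, 980, 120]" data-label="Section-header">'
    "<h1>Quarterly report</h1></div>"
    '<div data-bbox="[32, 150, 980, 900]" data-label="Text"><p>Body text</p></div>'
)
page_image = Image.new("RGB", (2048, 2048))  # stand-in for the rendered page

# bbox_scale matches ChandraConverterConfig.bbox_scale; boxes are rescaled
# from the 0-1024 prompt space to the actual image dimensions (x2 here).
blocks = parse_layout(html, page_image, bbox_scale=1024)
items = layout_blocks_to_items(blocks)  # -> Item(category, box, text)
for item in items:
    print(item.category, item.box, item.text[:40])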
vlmparse/clients/deepseekocr.py
CHANGED
@@ -1,5 +1,4 @@
 import re
-from typing import ClassVar, Literal
 
 from loguru import logger
 from PIL import Image
@@ -14,6 +13,10 @@ from vlmparse.data_model.document import Item, Page
 from vlmparse.servers.docker_server import VLLMDockerServerConfig
 from vlmparse.utils import to_base64
 
+# ==============================================================================
+# DeepSeek-OCR (v1)
+# ==============================================================================
+
 
 class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
     """Configuration for DeepSeekOCR model."""
@@ -35,7 +38,11 @@ class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
 
     @property
     def client_config(self):
-        return DeepSeekOCRConverterConfig(
+        return DeepSeekOCRConverterConfig(
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+            )
+        )
 
 
 class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
@@ -43,8 +50,17 @@ class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
 
     model_name: str = "deepseek-ai/DeepSeek-OCR"
     aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
+    postprompt: str | None = None
+    prompts: dict[str, str] = {
+        "layout": "<|grounding|>Convert the document to markdown.",
+        "ocr": "Free OCR.",
+        "image_description": "Describe this image in detail.",
+    }
+    prompt_mode_map: dict[str, str] = {
+        "ocr_layout": "layout",
+        "table": "layout",
+    }
 
-    prompt_mode: Literal["layout", "ocr"] = "ocr"
     completion_kwargs: dict | None = {
         "temperature": 0.0,
         "max_tokens": 8181,
@@ -95,12 +111,6 @@ def extract_coordinates_and_label(ref_text):
 class DeepSeekOCRConverterClient(OpenAIConverterClient):
     """Client for DeepSeekOCR with specific post-processing."""
 
-    PROMPTS: ClassVar[dict] = {
-        "layout": "<|grounding|>Convert the document to markdown.",
-        "ocr": "Free OCR.",
-        "image_description": "Describe this image in detail.",
-    }
-
     def extract_items(self, image: Image.Image, matches: list) -> list[Item]:
         items = []
         width, height = image.size
@@ -153,6 +163,8 @@ class DeepSeekOCRConverterClient(OpenAIConverterClient):
         # Prepare messages as in parent class
         image = page.image
 
+        prompt_key = self.get_prompt_key() or "ocr"
+
         messages = [
             {
                 "role": "user",
@@ -163,17 +175,17 @@ class DeepSeekOCRConverterClient(OpenAIConverterClient):
                             "url": f"data:image/png;base64,{to_base64(image)}"
                         },
                     },
-                    {"type": "text", "text": self.
+                    {"type": "text", "text": self.config.prompts[prompt_key]},
                 ],
             },
         ]
 
         # Get raw response using parent's method
-        response = await self._get_chat_completion(messages)
+        response, usage = await self._get_chat_completion(messages)
         logger.info("Response length: " + str(len(response)))
         page.raw_response = response
 
-        if
+        if prompt_key == "layout":
             # Post-processing
             matches, matches_image, matches_other = re_match(response)
 
@@ -199,5 +211,174 @@ class DeepSeekOCRConverterClient(OpenAIConverterClient):
 
         page.text = outputs.strip()
         logger.debug(page.text)
+        if usage is not None:
+            page.prompt_tokens = usage.prompt_tokens
+            page.completion_tokens = usage.completion_tokens
+
+        return page
+
+
+# ==============================================================================
+# DeepSeek-OCR-2
+# ==============================================================================
+
+
+class DeepSeekOCR2DockerServerConfig(VLLMDockerServerConfig):
+    """Configuration for DeepSeek-OCR-2 model.
+
+    DeepSeek-OCR-2 uses a custom architecture that requires:
+    - Custom model registration via hf_overrides
+    - NoRepeatNGram logits processor with specific whitelist tokens
+    - Custom image processor (DeepseekOCR2Processor)
+    """
+
+    docker_image: str = "vllm/vllm-openai:nightly"
+    model_name: str = "deepseek-ai/DeepSeek-OCR-2"
+    command_args: list[str] = Field(
+        default_factory=lambda: [
+            "--limit-mm-per-prompt",
+            '{"image": 1}',
+            "--hf-overrides",
+            '{"architectures": ["DeepseekOCR2ForCausalLM"]}',
+            "--block-size",
+            "256",
+            "--trust-remote-code",
+            "--max-model-len",
+            "8192",
+            "--swap-space",
+            "0",
+            "--gpu-memory-utilization",
+            "0.9",
+            "--logits_processors",
+            "vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor",
+        ]
+    )
+    aliases: list[str] = Field(
+        default_factory=lambda: ["deepseekocr2", "DeepSeek-OCR-2"]
+    )
+
+    @property
+    def client_config(self):
+        return DeepSeekOCR2ConverterConfig(
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+            )
+        )
+
+
+class DeepSeekOCR2ConverterConfig(OpenAIConverterConfig):
+    """DeepSeek-OCR-2 converter configuration.
+
+    Key differences from DeepSeek-OCR v1:
+    - Uses DeepseekOCR2ForCausalLM architecture
+    - Different logits processor parameters (ngram_size=20, window_size=50)
+    - Supports cropping mode for image processing
+    """
+
+    model_name: str = "deepseek-ai/DeepSeek-OCR-2"
+    aliases: list[str] = Field(
+        default_factory=lambda: ["deepseekocr2", "DeepSeek-OCR-2"]
+    )
+    postprompt: str | None = None
+    prompts: dict[str, str] = {
+        "layout": "<|grounding|>Convert the document to markdown.",
+        "ocr": "Free OCR.",
+        "image_description": "Describe this image in detail.",
+    }
+    prompt_mode_map: dict[str, str] = {
+        "ocr_layout": "layout",
+        "table": "layout",
+    }
+
+    completion_kwargs: dict | None = {
+        "temperature": 0.0,
+        "max_tokens": 8180,
+        "extra_body": {
+            "skip_special_tokens": False,
+            # args used to control custom logits processor
+            "vllm_xargs": {
+                "ngram_size": 20,
+                "window_size": 50,
+                # whitelist: <td>, </td>
+                "whitelist_token_ids": [128821, 128822],
+            },
+        },
+    }
+    dpi: int = 144  # Default DPI used in reference implementation
+
+    def get_client(self, **kwargs) -> "DeepSeekOCR2ConverterClient":
+        return DeepSeekOCR2ConverterClient(config=self, **kwargs)
+
+
+class DeepSeekOCR2ConverterClient(DeepSeekOCRConverterClient):
+    """Client for DeepSeek-OCR-2 with specific post-processing.
+
+    Inherits from DeepSeekOCRConverterClient as the post-processing logic
+    for parsing grounding references and extracting items is the same.
+    The main differences are in the model configuration and logits processor.
+    """
+
+    async def async_call_inside_page(self, page: Page) -> Page:
+        # Prepare messages as in parent class
+        image = page.image
+
+        prompt_key = self.get_prompt_key() or "ocr"
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/png;base64,{to_base64(image)}"
+                        },
+                    },
+                    {"type": "text", "text": self.config.prompts[prompt_key]},
+                ],
+            },
+        ]
+
+        # Get raw response using parent's method
+        response, usage = await self._get_chat_completion(messages)
+        logger.info("Response length: " + str(len(response)))
+        page.raw_response = response
+
+        if prompt_key == "layout":
+            # Post-processing
+            matches, matches_image, matches_other = re_match(response)
+
+            # Extract items (bounding boxes)
+            page.items = self.extract_items(page.image, matches)
+
+            # Clean text
+            outputs = response
+
+            # Check for sentence end marker (indicates successful completion)
+            # If not present, it might be due to repetition detection
+            if "<|end▁of▁sentence|>" in outputs:
+                outputs = outputs.replace("<|end▁of▁sentence|>", "")
+
+            # Replace image references with a placeholder
+            for a_match_image in matches_image:
+                outputs = outputs.replace(a_match_image, "![image]")
+
+            # Replace other references (text grounding) and cleanup
+            for a_match_other in matches_other:
+                outputs = (
+                    outputs.replace(a_match_other, "")
+                    .replace("\\coloneqq", ":=")
+                    .replace("\\eqqcolon", "=:")
+                    .replace("\n\n\n\n", "\n\n")
+                    .replace("\n\n\n", "\n\n")
+                )
+        else:
+            outputs = response
+
+        page.text = outputs.strip()
+        logger.debug(page.text)
+        if usage is not None:
+            page.prompt_tokens = usage.prompt_tokens
+            page.completion_tokens = usage.completion_tokens
 
         return page
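Both DeepSeek configs above, like the Chandra and DotsOCR ones, replace the old per-client PROMPTS / prompt_mode fields with two declarative dicts: prompts maps a prompt key to its prompt text, and prompt_mode_map routes generic modes such as "ocr_layout" or "table" onto those keys. The clients then call get_prompt_key() / get_prompt_for_mode(), whose implementations live in openai_converter.py (changed in this release but not shown here), so the standalone sketch below only illustrates the lookup they appear to perform, inferred from how the clients use the result:

# Sketch of the assumed prompt resolution; not the package's actual helper.
prompts = {
    "layout": "<|grounding|>Convert the document to markdown.",
    "ocr": "Free OCR.",
    "image_description": "Describe this image in detail.",
}
prompt_mode_map = {
    "ocr_layout": "layout",
    "table": "layout",
}

def resolve_prompt(mode: str | None) -> tuple[str, str]:
    # Assumption: map the requested mode through prompt_mode_map, fall back to
    # the mode itself, and default to "ocr" when nothing is requested.
    key = prompt_mode_map.get(mode, mode) if mode else "ocr"
    return key, prompts[key]

print(resolve_prompt("table"))  # ('layout', '<|grounding|>Convert the document to markdown.')
print(resolve_prompt("ocr"))    # ('ocr', 'Free OCR.')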
vlmparse/clients/docling.py
CHANGED
@@ -62,7 +62,6 @@ class DoclingDockerServerConfig(DockerServerConfig):
 class DoclingConverterConfig(ConverterConfig):
     """Configuration for Docling converter client."""
 
-    base_url: str
     model_name: str = "docling"
     timeout: int = 300
     api_kwargs: dict = {"output_format": "markdown", "image_export_mode": "referenced"}
vlmparse/clients/dotsocr.py
CHANGED
@@ -1,14 +1,13 @@
 import json
 import math
 from pathlib import Path
-from typing import ClassVar
+from typing import ClassVar
 
 from loguru import logger
 from PIL import Image
 from pydantic import Field
 
 from vlmparse.clients.openai_converter import (
-    LLMParams,
     OpenAIConverterClient,
     OpenAIConverterConfig,
 )
@@ -48,12 +47,13 @@ class DotsOCRDockerServerConfig(DockerServerConfig):
     )
     add_model_key_to_server: bool = True
     aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
+    default_model_name: str = DEFAULT_MODEL_NAME
 
     @property
     def client_config(self):
         return DotsOCRConverterConfig(
-
-
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
             )
         )
 
@@ -65,29 +65,7 @@ class DotsOCRConverterConfig(OpenAIConverterConfig):
     model_name: str = "rednote-hilab/dots.ocr"
     preprompt: str | None = ""
     postprompt: str | None = None
-
-        "temperature": 0.1,
-        "top_p": 1.0,
-        "max_completion_tokens": 16384,
-    }
-    aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
-    dpi: int = 200
-    prompt_mode: Literal["prompt_layout_all_en", "prompt_ocr"] = "prompt_ocr"
-
-    def get_client(self, **kwargs) -> "DotsOCRConverter":
-        return DotsOCRConverter(config=self, **kwargs)
-
-
-class DotsOCRConverter(OpenAIConverterClient):
-    """DotsOCR VLLM converter."""
-
-    # Constants
-    MIN_PIXELS: ClassVar[int] = 3136
-    MAX_PIXELS: ClassVar[int] = 11289600
-    IMAGE_FACTOR: ClassVar[int] = 28
-
-    # Prompts
-    PROMPTS: ClassVar[dict] = {
+    prompts: dict[str, str] = {
         "prompt_layout_all_en": """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
 
 1. Bbox format: [x1, y1, x2, y2]
@@ -108,6 +86,30 @@ class DotsOCRConverter(OpenAIConverterClient):
 """,
         "prompt_ocr": """Extract the text content from this image.""",
     }
+    prompt_mode_map: dict[str, str] = {
+        "ocr": "prompt_ocr",
+        "ocr_layout": "prompt_layout_all_en",
+        "table": "prompt_layout_all_en",
+    }
+    completion_kwargs: dict | None = {
+        "temperature": 0.1,
+        "top_p": 1.0,
+        "max_completion_tokens": 16384,
+    }
+    aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
+    dpi: int = 200
+
+    def get_client(self, **kwargs) -> "DotsOCRConverter":
+        return DotsOCRConverter(config=self, **kwargs)
+
+
+class DotsOCRConverter(OpenAIConverterClient):
+    """DotsOCR VLLM converter."""
+
+    # Constants
+    MIN_PIXELS: ClassVar[int] = 3136
+    MAX_PIXELS: ClassVar[int] = 11289600
+    IMAGE_FACTOR: ClassVar[int] = 28
 
     @staticmethod
     def round_by_factor(number: int, factor: int) -> int:
@@ -235,7 +237,7 @@ class DotsOCRConverter(OpenAIConverterClient):
         image = self.fetch_image(
             origin_image, min_pixels=self.MIN_PIXELS, max_pixels=self.MAX_PIXELS
         )
-        prompt = self.
+        prompt = self.config.prompts[prompt_mode]
 
         response, usage = await self._async_inference_with_vllm(image, prompt)
 
@@ -258,13 +260,15 @@ class DotsOCRConverter(OpenAIConverterClient):
     async def async_call_inside_page(self, page: Page) -> Page:
         image = page.image
 
+        prompt_key = self.get_prompt_key() or "prompt_ocr"
+
         _, response, _, usage = await self._parse_image_vllm(
-            image, prompt_mode=
+            image, prompt_mode=prompt_key
         )
         logger.info("Response: " + str(response))
 
         items = None
-        if
+        if prompt_key == "prompt_layout_all_en":
             text = "\n\n".join([item.get("text", "") for item in response])
 
             items = []
@@ -286,5 +290,4 @@ class DotsOCRConverter(OpenAIConverterClient):
 
         page.completion_tokens = usage.completion_tokens
         page.prompt_tokens = usage.prompt_tokens
-        page.reasoning_tokens = usage.reasoning_tokens
         return page
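DotsOCRConverter keeps its MIN_PIXELS / MAX_PIXELS / IMAGE_FACTOR constants, but the bodies of fetch_image and round_by_factor are outside this diff. The snippet below is therefore only a generic sketch of the Qwen2-VL-style "smart resize" such constants usually drive: snap both sides to a multiple of IMAGE_FACTOR while keeping the total pixel count within [MIN_PIXELS, MAX_PIXELS]. It is not the package's implementation.

# Generic sketch of factor-aligned resizing; helpers re-implemented for illustration.
import math

IMAGE_FACTOR = 28          # constants as declared on DotsOCRConverter
MIN_PIXELS = 3136
MAX_PIXELS = 11289600

def round_by_factor(number: float, factor: int) -> int:
    return round(number / factor) * factor

def smart_resize(height: int, width: int) -> tuple[int, int]:
    # Snap to the nearest multiple of IMAGE_FACTOR, then rescale if the
    # resulting pixel count falls outside [MIN_PIXELS, MAX_PIXELS].
    h = max(round_by_factor(height, IMAGE_FACTOR), IMAGE_FACTOR)
    w = max(round_by_factor(width, IMAGE_FACTOR), IMAGE_FACTOR)
    if h * w > MAX_PIXELS:
        scale = math.sqrt((h * w) / MAX_PIXELS)
        h = max(IMAGE_FACTOR, math.floor(height / scale / IMAGE_FACTOR) * IMAGE_FACTOR)
        w = max(IMAGE_FACTOR, math.floor(width / scale / IMAGE_FACTOR) * IMAGE_FACTOR)
    elif h * w < MIN_PIXELS:
        scale = math.sqrt(MIN_PIXELS / (h * w))
        h = math.ceil(height * scale / IMAGE_FACTOR) * IMAGE_FACTOR
        w = math.ceil(width * scale / IMAGE_FACTOR) * IMAGE_FACTOR
    return h, w

print(smart_resize(3508, 2480))  # A4 page at 300 dpi, both sides multiples of 28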
|