vlmparse 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. vlmparse/build_doc.py +20 -19
  2. vlmparse/cli.py +439 -270
  3. vlmparse/clients/chandra.py +176 -60
  4. vlmparse/clients/deepseekocr.py +193 -12
  5. vlmparse/clients/docling.py +0 -1
  6. vlmparse/clients/dotsocr.py +34 -31
  7. vlmparse/clients/glmocr.py +243 -0
  8. vlmparse/clients/granite_docling.py +9 -36
  9. vlmparse/clients/hunyuanocr.py +5 -1
  10. vlmparse/clients/lightonocr.py +23 -1
  11. vlmparse/clients/mineru.py +0 -1
  12. vlmparse/clients/mistral_converter.py +85 -0
  13. vlmparse/clients/nanonetocr.py +5 -1
  14. vlmparse/clients/olmocr.py +6 -2
  15. vlmparse/clients/openai_converter.py +95 -60
  16. vlmparse/clients/paddleocrvl.py +195 -40
  17. vlmparse/converter.py +51 -11
  18. vlmparse/converter_with_server.py +92 -19
  19. vlmparse/registries.py +107 -89
  20. vlmparse/servers/base_server.py +127 -0
  21. vlmparse/servers/docker_compose_deployment.py +489 -0
  22. vlmparse/servers/docker_compose_server.py +39 -0
  23. vlmparse/servers/docker_run_deployment.py +226 -0
  24. vlmparse/servers/docker_server.py +17 -109
  25. vlmparse/servers/model_identity.py +48 -0
  26. vlmparse/servers/server_registry.py +42 -0
  27. vlmparse/servers/utils.py +83 -219
  28. vlmparse/st_viewer/st_viewer.py +1 -1
  29. vlmparse/utils.py +15 -2
  30. {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/METADATA +13 -3
  31. vlmparse-0.1.9.dist-info/RECORD +44 -0
  32. {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/WHEEL +1 -1
  33. vlmparse-0.1.7.dist-info/RECORD +0 -36
  34. {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/entry_points.txt +0 -0
  35. {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/licenses/LICENSE +0 -0
  36. {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,9 @@
+ import json
  import math
  import time
+ from dataclasses import asdict, dataclass

+ from bs4 import BeautifulSoup
  from loguru import logger
  from PIL import Image
  from pydantic import Field
@@ -11,7 +14,8 @@ from vlmparse.clients.openai_converter import (
  )
  from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
  from vlmparse.clients.pipe_utils.utils import clean_response
- from vlmparse.data_model.document import Page
+ from vlmparse.data_model.box import BoundingBox
+ from vlmparse.data_model.document import Item, Page
  from vlmparse.servers.docker_server import VLLMDockerServerConfig
  from vlmparse.utils import to_base64

@@ -110,11 +114,6 @@ OCR this image to HTML.
  {PROMPT_ENDING}
  """.strip()

- PROMPT_MAPPING = {
-     "ocr_layout": OCR_LAYOUT_PROMPT,
-     "ocr": OCR_PROMPT,
- }
-

  def scale_to_fit(
      img: Image.Image,
@@ -188,11 +187,135 @@ def detect_repeat_token(
      return False


+ @dataclass
+ class LayoutBlock:
+     """Represents a layout block with bounding box and content."""
+
+     bbox: list[int]
+     label: str
+     content: str
+
+
+ def parse_layout(
+     html: str, image: Image.Image, bbox_scale: int = 1024
+ ) -> list[LayoutBlock]:
+     """
+     Parse HTML layout blocks with bounding boxes.
+
+     Args:
+         html: HTML string with layout blocks (divs with data-bbox and data-label attributes)
+         image: PIL Image to get dimensions for bbox scaling
+         bbox_scale: The scale used in the prompt for normalized bboxes
+
+     Returns:
+         List of LayoutBlock objects with scaled bounding boxes
+     """
+     soup = BeautifulSoup(html, "html.parser")
+     top_level_divs = soup.find_all("div", recursive=False)
+     width, height = image.size
+     width_scaler = width / bbox_scale
+     height_scaler = height / bbox_scale
+     layout_blocks = []
+
+     for div in top_level_divs:
+         bbox = div.get("data-bbox")
+
+         try:
+             bbox = json.loads(bbox)
+             assert len(bbox) == 4, "Invalid bbox length"
+         except Exception:
+             try:
+                 bbox = bbox.split(" ")
+                 assert len(bbox) == 4, "Invalid bbox length"
+             except Exception:
+                 # Default bbox if parsing fails
+                 bbox = [0, 0, bbox_scale, bbox_scale]
+
+         bbox = list(map(int, bbox))
+         # Scale bbox to image dimensions
+         bbox = [
+             max(0, int(bbox[0] * width_scaler)),
+             max(0, int(bbox[1] * height_scaler)),
+             min(int(bbox[2] * width_scaler), width),
+             min(int(bbox[3] * height_scaler), height),
+         ]
+
+         label = div.get("data-label", "block")
+         content = str(div.decode_contents())
+         layout_blocks.append(LayoutBlock(bbox=bbox, label=label, content=content))
+
+     return layout_blocks
+
+
+ def parse_chunks(html: str, image: Image.Image, bbox_scale: int = 1024) -> list[dict]:
+     """
+     Parse HTML layout blocks into dictionaries.
+
+     Args:
+         html: HTML string with layout blocks
+         image: PIL Image to get dimensions for bbox scaling
+         bbox_scale: The scale used in the prompt for normalized bboxes
+
+     Returns:
+         List of dictionaries with bbox, label, and content keys
+     """
+     layout = parse_layout(html, image, bbox_scale=bbox_scale)
+     chunks = [asdict(block) for block in layout]
+     return chunks
+
+
+ def layout_blocks_to_items(
+     layout_blocks: list[LayoutBlock],
+ ) -> list[Item]:
+     """
+     Convert layout blocks to Item objects for the Page model.
+
+     Args:
+         layout_blocks: List of LayoutBlock objects
+
+     Returns:
+         List of Item objects with category, box, and text
+     """
+     items = []
+     for block in layout_blocks:
+         # Convert content HTML to markdown
+         try:
+             text = html_to_md_keep_tables(block.content)
+         except Exception as e:
+             logger.warning(f"Error converting block content to markdown: {e}")
+             text = block.content
+
+         # Create bounding box from [x0, y0, x1, y1] format
+         bbox = BoundingBox(
+             l=block.bbox[0],
+             t=block.bbox[1],
+             r=block.bbox[2],
+             b=block.bbox[3],
+         )
+
+         items.append(
+             Item(
+                 category=block.label,
+                 box=bbox,
+                 text=text.strip(),
+             )
+         )
+
+     return items
+
+
  class ChandraConverterConfig(OpenAIConverterConfig):
      """Chandra converter configuration."""

      model_name: str = "datalab-to/chandra"
-     prompt_type: str = "ocr"  # Default prompt type
+     postprompt: str | None = None
+     prompts: dict[str, str] = {
+         "ocr": OCR_PROMPT,
+         "ocr_layout": OCR_LAYOUT_PROMPT,
+     }
+     prompt_mode_map: dict[str, str] = {
+         "table": "ocr_layout",
+     }
      bbox_scale: int = 1024
      max_retries: int = 0
      max_failure_retries: int = None
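
The layout helpers added above are self-contained, so they can be exercised outside the converter. A minimal sketch (not part of the package), assuming the 0.1.9 wheel is installed; the sample HTML and the 2048x2048 image are made up for illustration:

    from PIL import Image

    from vlmparse.clients.chandra import layout_blocks_to_items, parse_layout

    # One top-level div carrying the data-bbox/data-label attributes the prompt
    # requests, expressed in the normalized 0-1024 coordinate space.
    html = '<div data-bbox="[0, 0, 512, 128]" data-label="Text"><p>Hello world</p></div>'
    page_image = Image.new("RGB", (2048, 2048))  # stand-in for a rendered page

    blocks = parse_layout(html, page_image, bbox_scale=1024)
    assert blocks[0].bbox == [0, 0, 1024, 256]  # rescaled to pixel coordinates
    assert blocks[0].label == "Text"

    # Item objects (category, box, markdown text) ready to attach to page.items.
    items = layout_blocks_to_items(blocks)
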
@@ -216,8 +339,7 @@ class ChandraConverterClient(OpenAIConverterClient):

      async def async_call_inside_page(self, page: Page) -> Page:
          """Process a single page using Chandra logic."""
-
-         prompt = PROMPT_MAPPING.get(self.config.prompt_type, OCR_PROMPT)
+         prompt = self.get_prompt_for_mode() or OCR_PROMPT
          prompt = prompt.replace("{bbox_scale}", str(self.config.bbox_scale))

          image = scale_to_fit(page.image)
@@ -238,61 +360,34 @@ class ChandraConverterClient(OpenAIConverterClient):

          retries = 0
          max_retries = self.config.max_retries
-         max_failure_retries = self.config.max_failure_retries

          result_content = ""
-         error_occurred = False

          while True:
-             try:
-                 # Adjust temperature if retrying
-                 temperature = self.config.completion_kwargs.get("temperature", 0.0)
-                 if retries > 0:
-                     temperature = 0.3  # As per vllm.py logic
-
-                 completion_kwargs = self.config.completion_kwargs.copy()
-                 completion_kwargs["temperature"] = temperature
-                 if retries > 0:
-                     completion_kwargs["top_p"] = 0.95
-
-                 result_content = await self._get_chat_completion(
-                     messages, completion_kwargs=completion_kwargs
-                 )
-                 error_occurred = False
-             except Exception as e:
-                 logger.error(f"Error during VLLM generation: {e}")
-                 error_occurred = True
-                 result_content = ""
-
              should_retry = False
-
-             # Check for repeat token
-             if not error_occurred:
-                 has_repeat = detect_repeat_token(result_content) or (
-                     len(result_content) > 50
-                     and detect_repeat_token(result_content, cut_from_end=50)
+             # Adjust temperature if retrying
+             temperature = self.config.completion_kwargs.get("temperature", 0.0)
+             if retries > 0:
+                 temperature = 0.3  # As per vllm.py logic
+
+             completion_kwargs = self.config.completion_kwargs.copy()
+             completion_kwargs["temperature"] = temperature
+             if retries > 0:
+                 completion_kwargs["top_p"] = 0.95
+
+             result_content, usage = await self._get_chat_completion(
+                 messages, completion_kwargs=completion_kwargs
+             )
+
+             has_repeat = detect_repeat_token(result_content) or (
+                 len(result_content) > 50
+                 and detect_repeat_token(result_content, cut_from_end=50)
+             )
+             if has_repeat and retries < max_retries:
+                 logger.warning(
+                     f"Detected repeat token, retrying generation (attempt {retries + 1})..."
                  )
-                 if has_repeat and retries < max_retries:
-                     logger.warning(
-                         f"Detected repeat token, retrying generation (attempt {retries + 1})..."
-                     )
-                     should_retry = True
-
-             # Check for error
-             if error_occurred:
-                 if max_failure_retries is not None:
-                     if retries < max_failure_retries:
-                         logger.warning(
-                             f"Detected vllm error, retrying generation (attempt {retries + 1})..."
-                         )
-                         should_retry = True
-                 elif (
-                     retries < max_retries
-                 ):  # Fallback to max_retries if max_failure_retries not set (vllm.py logic varies slightly but this is safe)
-                     logger.warning(
-                         f"Detected vllm error, retrying generation (attempt {retries + 1})..."
-                     )
-                     should_retry = True
+                 should_retry = True

              if should_retry:
                  time.sleep(2 * (retries + 1))
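
Stripped of the chat-completion call, the retry policy of the rewritten loop reduces to two small rules. An illustrative paraphrase (not the package's code): greedy decoding on the first attempt, a slightly warmer retry when a repeat token is detected, and a linear backoff between attempts.

    import time

    def retry_completion_kwargs(base: dict, retries: int) -> dict:
        # First attempt keeps the configured temperature (default 0.0);
        # retries switch to temperature=0.3 and top_p=0.95.
        kwargs = base.copy()
        kwargs["temperature"] = base.get("temperature", 0.0) if retries == 0 else 0.3
        if retries > 0:
            kwargs["top_p"] = 0.95
        return kwargs

    def backoff(retries: int) -> None:
        # Sleep 2 s, 4 s, 6 s, ... between attempts, as in the loop above.
        time.sleep(2 * (retries + 1))
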
@@ -305,10 +400,27 @@ class ChandraConverterClient(OpenAIConverterClient):
          page.raw_response = result_content
          text = clean_response(result_content)

+         # Check if we're in layout mode (ocr_layout prompt)
+         current_prompt_key = self.get_prompt_key()
+         is_layout_mode = current_prompt_key == "ocr_layout"
+
+         if is_layout_mode:
+             # Parse layout blocks and populate items
+             try:
+                 layout_blocks = parse_layout(
+                     text, image, bbox_scale=self.config.bbox_scale
+                 )
+                 page.items = layout_blocks_to_items(layout_blocks)
+                 logger.info(f"Parsed {len(page.items)} layout blocks")
+             except Exception as e:
+                 logger.warning(f"Error parsing layout blocks: {e}")
+                 page.items = []
+
          # Convert HTML to MD
          text = html_to_md_keep_tables(text)
          page.text = text
-
+         page.completion_tokens = usage.completion_tokens
+         page.prompt_tokens = usage.prompt_tokens
          return page


@@ -320,4 +432,8 @@ class ChandraDockerServerConfig(VLLMDockerServerConfig):

      @property
      def client_config(self):
-         return ChandraConverterConfig(llm_params=self.llm_params)
+         return ChandraConverterConfig(
+             **self._create_client_kwargs(
+                 f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+             )
+         )
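
ChandraConverterConfig above, and the DeepSeek-OCR and DotsOCR configs below, all replace their old prompt_type/prompt_mode fields with a prompts dict plus a prompt_mode_map. The resolution helpers the clients call (get_prompt_key, get_prompt_for_mode) live in the shared OpenAI converter base and are not shown in this diff, so the following is only a sketch of the presumed lookup; the helper name and fallback behavior are assumptions, and the prompt strings are shortened placeholders.

    def resolve_prompt_key(mode: str, prompts: dict[str, str], prompt_mode_map: dict[str, str]) -> str | None:
        # Hypothetical stand-in: a direct hit in `prompts` wins, otherwise the
        # requested mode is translated through `prompt_mode_map`.
        if mode in prompts:
            return mode
        return prompt_mode_map.get(mode)

    prompts = {"ocr": "OCR this image ...", "ocr_layout": "OCR this image to HTML ..."}
    prompt_mode_map = {"table": "ocr_layout"}

    assert resolve_prompt_key("ocr", prompts, prompt_mode_map) == "ocr"
    assert resolve_prompt_key("table", prompts, prompt_mode_map) == "ocr_layout"  # mapped mode
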
@@ -1,5 +1,4 @@
  import re
- from typing import ClassVar, Literal

  from loguru import logger
  from PIL import Image
@@ -14,6 +13,10 @@ from vlmparse.data_model.document import Item, Page
  from vlmparse.servers.docker_server import VLLMDockerServerConfig
  from vlmparse.utils import to_base64

+ # ==============================================================================
+ # DeepSeek-OCR (v1)
+ # ==============================================================================
+

  class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
      """Configuration for DeepSeekOCR model."""
@@ -35,7 +38,11 @@ class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):

      @property
      def client_config(self):
-         return DeepSeekOCRConverterConfig(llm_params=self.llm_params)
+         return DeepSeekOCRConverterConfig(
+             **self._create_client_kwargs(
+                 f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+             )
+         )


  class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
@@ -43,8 +50,17 @@ class DeepSeekOCRConverterConfig(OpenAIConverterConfig):

      model_name: str = "deepseek-ai/DeepSeek-OCR"
      aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
+     postprompt: str | None = None
+     prompts: dict[str, str] = {
+         "layout": "<|grounding|>Convert the document to markdown.",
+         "ocr": "Free OCR.",
+         "image_description": "Describe this image in detail.",
+     }
+     prompt_mode_map: dict[str, str] = {
+         "ocr_layout": "layout",
+         "table": "layout",
+     }

-     prompt_mode: Literal["layout", "ocr"] = "ocr"
      completion_kwargs: dict | None = {
          "temperature": 0.0,
          "max_tokens": 8181,
@@ -95,12 +111,6 @@ def extract_coordinates_and_label(ref_text):
  class DeepSeekOCRConverterClient(OpenAIConverterClient):
      """Client for DeepSeekOCR with specific post-processing."""

-     PROMPTS: ClassVar[dict] = {
-         "layout": "<|grounding|>Convert the document to markdown.",
-         "ocr": "Free OCR.",
-         "image_description": "Describe this image in detail.",
-     }
-
      def extract_items(self, image: Image.Image, matches: list) -> list[Item]:
          items = []
          width, height = image.size
@@ -153,6 +163,8 @@ class DeepSeekOCRConverterClient(OpenAIConverterClient):
          # Prepare messages as in parent class
          image = page.image

+         prompt_key = self.get_prompt_key() or "ocr"
+
          messages = [
              {
                  "role": "user",
@@ -163,17 +175,17 @@ class DeepSeekOCRConverterClient(OpenAIConverterClient):
                          "url": f"data:image/png;base64,{to_base64(image)}"
                      },
                  },
-                 {"type": "text", "text": self.PROMPTS[self.config.prompt_mode]},
+                 {"type": "text", "text": self.config.prompts[prompt_key]},
              ],
          },
      ]

          # Get raw response using parent's method
-         response = await self._get_chat_completion(messages)
+         response, usage = await self._get_chat_completion(messages)
          logger.info("Response length: " + str(len(response)))
          page.raw_response = response

-         if self.config.prompt_mode == "layout":
+         if prompt_key == "layout":
              # Post-processing
              matches, matches_image, matches_other = re_match(response)

@@ -199,5 +211,174 @@

          page.text = outputs.strip()
          logger.debug(page.text)
+         if usage is not None:
+             page.prompt_tokens = usage.prompt_tokens
+             page.completion_tokens = usage.completion_tokens
+
+         return page
+
+
+ # ==============================================================================
+ # DeepSeek-OCR-2
+ # ==============================================================================
+
+
+ class DeepSeekOCR2DockerServerConfig(VLLMDockerServerConfig):
+     """Configuration for DeepSeek-OCR-2 model.
+
+     DeepSeek-OCR-2 uses a custom architecture that requires:
+     - Custom model registration via hf_overrides
+     - NoRepeatNGram logits processor with specific whitelist tokens
+     - Custom image processor (DeepseekOCR2Processor)
+     """
+
+     docker_image: str = "vllm/vllm-openai:nightly"
+     model_name: str = "deepseek-ai/DeepSeek-OCR-2"
+     command_args: list[str] = Field(
+         default_factory=lambda: [
+             "--limit-mm-per-prompt",
+             '{"image": 1}',
+             "--hf-overrides",
+             '{"architectures": ["DeepseekOCR2ForCausalLM"]}',
+             "--block-size",
+             "256",
+             "--trust-remote-code",
+             "--max-model-len",
+             "8192",
+             "--swap-space",
+             "0",
+             "--gpu-memory-utilization",
+             "0.9",
+             "--logits_processors",
+             "vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor",
+         ]
+     )
+     aliases: list[str] = Field(
+         default_factory=lambda: ["deepseekocr2", "DeepSeek-OCR-2"]
+     )
+
+     @property
+     def client_config(self):
+         return DeepSeekOCR2ConverterConfig(
+             **self._create_client_kwargs(
+                 f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+             )
+         )
+
+
+ class DeepSeekOCR2ConverterConfig(OpenAIConverterConfig):
+     """DeepSeek-OCR-2 converter configuration.
+
+     Key differences from DeepSeek-OCR v1:
+     - Uses DeepseekOCR2ForCausalLM architecture
+     - Different logits processor parameters (ngram_size=20, window_size=50)
+     - Supports cropping mode for image processing
+     """
+
+     model_name: str = "deepseek-ai/DeepSeek-OCR-2"
+     aliases: list[str] = Field(
+         default_factory=lambda: ["deepseekocr2", "DeepSeek-OCR-2"]
+     )
+     postprompt: str | None = None
+     prompts: dict[str, str] = {
+         "layout": "<|grounding|>Convert the document to markdown.",
+         "ocr": "Free OCR.",
+         "image_description": "Describe this image in detail.",
+     }
+     prompt_mode_map: dict[str, str] = {
+         "ocr_layout": "layout",
+         "table": "layout",
+     }
+
+     completion_kwargs: dict | None = {
+         "temperature": 0.0,
+         "max_tokens": 8180,
+         "extra_body": {
+             "skip_special_tokens": False,
+             # args used to control custom logits processor
+             "vllm_xargs": {
+                 "ngram_size": 20,
+                 "window_size": 50,
+                 # whitelist: <td>, </td>
+                 "whitelist_token_ids": [128821, 128822],
+             },
+         },
+     }
+     dpi: int = 144  # Default DPI used in reference implementation
+
+     def get_client(self, **kwargs) -> "DeepSeekOCR2ConverterClient":
+         return DeepSeekOCR2ConverterClient(config=self, **kwargs)
+
+
+ class DeepSeekOCR2ConverterClient(DeepSeekOCRConverterClient):
+     """Client for DeepSeek-OCR-2 with specific post-processing.
+
+     Inherits from DeepSeekOCRConverterClient as the post-processing logic
+     for parsing grounding references and extracting items is the same.
+     The main differences are in the model configuration and logits processor.
+     """
+
+     async def async_call_inside_page(self, page: Page) -> Page:
+         # Prepare messages as in parent class
+         image = page.image
+
+         prompt_key = self.get_prompt_key() or "ocr"
+
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "image_url",
+                         "image_url": {
+                             "url": f"data:image/png;base64,{to_base64(image)}"
+                         },
+                     },
+                     {"type": "text", "text": self.config.prompts[prompt_key]},
+                 ],
+             },
+         ]
+
+         # Get raw response using parent's method
+         response, usage = await self._get_chat_completion(messages)
+         logger.info("Response length: " + str(len(response)))
+         page.raw_response = response
+
+         if prompt_key == "layout":
+             # Post-processing
+             matches, matches_image, matches_other = re_match(response)
+
+             # Extract items (bounding boxes)
+             page.items = self.extract_items(page.image, matches)
+
+             # Clean text
+             outputs = response
+
+             # Check for sentence end marker (indicates successful completion)
+             # If not present, it might be due to repetition detection
+             if "<|end▁of▁sentence|>" in outputs:
+                 outputs = outputs.replace("<|end▁of▁sentence|>", "")
+
+             # Replace image references with a placeholder
+             for a_match_image in matches_image:
+                 outputs = outputs.replace(a_match_image, "![image]")
+
+             # Replace other references (text grounding) and cleanup
+             for a_match_other in matches_other:
+                 outputs = (
+                     outputs.replace(a_match_other, "")
+                     .replace("\\coloneqq", ":=")
+                     .replace("\\eqqcolon", "=:")
+                     .replace("\n\n\n\n", "\n\n")
+                     .replace("\n\n\n", "\n\n")
+                 )
+         else:
+             outputs = response
+
+         page.text = outputs.strip()
+         logger.debug(page.text)
+         if usage is not None:
+             page.prompt_tokens = usage.prompt_tokens
+             page.completion_tokens = usage.completion_tokens

          return page
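
The extra_body block in DeepSeekOCR2ConverterConfig is what carries the per-request logits-processor arguments to vLLM. A minimal sketch of the same payload sent straight through the OpenAI-compatible API, outside vlmparse; the base URL, port, API key and image data URL are assumptions, and only the sampling parameters and the vllm_xargs payload come from the config above.

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    response = client.chat.completions.create(
        model="deepseek-ai/DeepSeek-OCR-2",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
                    {"type": "text", "text": "<|grounding|>Convert the document to markdown."},
                ],
            }
        ],
        temperature=0.0,
        max_tokens=8180,
        extra_body={
            "skip_special_tokens": False,
            # forwarded to the n-gram logits processor registered via --logits_processors
            "vllm_xargs": {
                "ngram_size": 20,
                "window_size": 50,
                "whitelist_token_ids": [128821, 128822],  # <td>, </td>
            },
        },
    )
    print(response.choices[0].message.content)
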
@@ -62,7 +62,6 @@ class DoclingDockerServerConfig(DockerServerConfig):
  class DoclingConverterConfig(ConverterConfig):
      """Configuration for Docling converter client."""

-     base_url: str
      model_name: str = "docling"
      timeout: int = 300
      api_kwargs: dict = {"output_format": "markdown", "image_export_mode": "referenced"}
@@ -1,14 +1,13 @@
  import json
  import math
  from pathlib import Path
- from typing import ClassVar, Literal
+ from typing import ClassVar

  from loguru import logger
  from PIL import Image
  from pydantic import Field

  from vlmparse.clients.openai_converter import (
-     LLMParams,
      OpenAIConverterClient,
      OpenAIConverterConfig,
  )
@@ -48,12 +47,13 @@ class DotsOCRDockerServerConfig(DockerServerConfig):
      )
      add_model_key_to_server: bool = True
      aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
+     default_model_name: str = DEFAULT_MODEL_NAME

      @property
      def client_config(self):
          return DotsOCRConverterConfig(
-             llm_params=LLMParams(
-                 base_url=f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}",
+             **self._create_client_kwargs(
+                 f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
              )
          )

@@ -65,29 +65,7 @@ class DotsOCRConverterConfig(OpenAIConverterConfig):
      model_name: str = "rednote-hilab/dots.ocr"
      preprompt: str | None = ""
      postprompt: str | None = None
-     completion_kwargs: dict | None = {
-         "temperature": 0.1,
-         "top_p": 1.0,
-         "max_completion_tokens": 16384,
-     }
-     aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
-     dpi: int = 200
-     prompt_mode: Literal["prompt_layout_all_en", "prompt_ocr"] = "prompt_ocr"
-
-     def get_client(self, **kwargs) -> "DotsOCRConverter":
-         return DotsOCRConverter(config=self, **kwargs)
-
-
- class DotsOCRConverter(OpenAIConverterClient):
-     """DotsOCR VLLM converter."""
-
-     # Constants
-     MIN_PIXELS: ClassVar[int] = 3136
-     MAX_PIXELS: ClassVar[int] = 11289600
-     IMAGE_FACTOR: ClassVar[int] = 28
-
-     # Prompts
-     PROMPTS: ClassVar[dict] = {
+     prompts: dict[str, str] = {
          "prompt_layout_all_en": """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.

  1. Bbox format: [x1, y1, x2, y2]
@@ -108,6 +86,30 @@ class DotsOCRConverter(OpenAIConverterClient):
  """,
          "prompt_ocr": """Extract the text content from this image.""",
      }
+     prompt_mode_map: dict[str, str] = {
+         "ocr": "prompt_ocr",
+         "ocr_layout": "prompt_layout_all_en",
+         "table": "prompt_layout_all_en",
+     }
+     completion_kwargs: dict | None = {
+         "temperature": 0.1,
+         "top_p": 1.0,
+         "max_completion_tokens": 16384,
+     }
+     aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
+     dpi: int = 200
+
+     def get_client(self, **kwargs) -> "DotsOCRConverter":
+         return DotsOCRConverter(config=self, **kwargs)
+
+
+ class DotsOCRConverter(OpenAIConverterClient):
+     """DotsOCR VLLM converter."""
+
+     # Constants
+     MIN_PIXELS: ClassVar[int] = 3136
+     MAX_PIXELS: ClassVar[int] = 11289600
+     IMAGE_FACTOR: ClassVar[int] = 28

      @staticmethod
      def round_by_factor(number: int, factor: int) -> int:
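
MIN_PIXELS, MAX_PIXELS and IMAGE_FACTOR bound how DotsOCRConverter.fetch_image resizes pages before inference; the method body is not shown in this diff, so the following is only a paraphrase of the constraint these constants imply (dimensions rounded to a multiple of IMAGE_FACTOR, total pixel count kept within [MIN_PIXELS, MAX_PIXELS]), not the package's implementation.

    import math

    MIN_PIXELS, MAX_PIXELS, IMAGE_FACTOR = 3136, 11289600, 28

    def constrain_size(height: int, width: int) -> tuple[int, int]:
        # Round both dimensions to the nearest multiple of IMAGE_FACTOR.
        h = round(height / IMAGE_FACTOR) * IMAGE_FACTOR
        w = round(width / IMAGE_FACTOR) * IMAGE_FACTOR
        if h * w > MAX_PIXELS:
            # Shrink proportionally until the pixel budget is respected.
            beta = math.sqrt((height * width) / MAX_PIXELS)
            h = math.floor(height / beta / IMAGE_FACTOR) * IMAGE_FACTOR
            w = math.floor(width / beta / IMAGE_FACTOR) * IMAGE_FACTOR
        elif h * w < MIN_PIXELS:
            # Grow proportionally to reach the minimum pixel count.
            beta = math.sqrt(MIN_PIXELS / (height * width))
            h = math.ceil(height * beta / IMAGE_FACTOR) * IMAGE_FACTOR
            w = math.ceil(width * beta / IMAGE_FACTOR) * IMAGE_FACTOR
        return h, w

    print(constrain_size(2200, 1700))  # e.g. a US-letter page rendered at ~200 dpi
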
@@ -235,7 +237,7 @@ class DotsOCRConverter(OpenAIConverterClient):
          image = self.fetch_image(
              origin_image, min_pixels=self.MIN_PIXELS, max_pixels=self.MAX_PIXELS
          )
-         prompt = self.PROMPTS[prompt_mode]
+         prompt = self.config.prompts[prompt_mode]

          response, usage = await self._async_inference_with_vllm(image, prompt)

@@ -258,13 +260,15 @@ class DotsOCRConverter(OpenAIConverterClient):
      async def async_call_inside_page(self, page: Page) -> Page:
          image = page.image

+         prompt_key = self.get_prompt_key() or "prompt_ocr"
+
          _, response, _, usage = await self._parse_image_vllm(
-             image, prompt_mode=self.config.prompt_mode
+             image, prompt_mode=prompt_key
          )
          logger.info("Response: " + str(response))

          items = None
-         if self.config.prompt_mode == "prompt_layout_all_en":
+         if prompt_key == "prompt_layout_all_en":
              text = "\n\n".join([item.get("text", "") for item in response])

              items = []
@@ -286,5 +290,4 @@ class DotsOCRConverter(OpenAIConverterClient):

          page.completion_tokens = usage.completion_tokens
          page.prompt_tokens = usage.prompt_tokens
-         page.reasoning_tokens = usage.reasoning_tokens
          return page