vlmparse 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vlmparse/build_doc.py CHANGED
@@ -10,12 +10,10 @@ from .constants import PDF_EXTENSION
 
 
 def convert_pdfium(file_path, dpi):
-    pdf = pdfium.PdfDocument(file_path)
     pil_images = []
-    for page in pdf:
-        pil_images.append(page.render(scale=dpi / 72).to_pil())
-
-    pdf.close()
+    with pdfium.PdfDocument(file_path) as pdf:
+        for page in pdf:
+            pil_images.append(page.render(scale=dpi / 72).to_pil())
 
     return pil_images
 
 
@@ -32,24 +30,29 @@ def convert_pdfium_to_images(file_path, dpi=175):
         ]
 
     except PIL.Image.DecompressionBombError as e:
-        logger.exception(f"Got problem size document with {file_path}")
+        logger.opt(exception=True).warning(
+            "Decompression bomb detected for {file_path}, reducing DPI",
+            file_path=str(file_path),
+        )
         cur_size, limit_size = map(int, re.findall(r"\d+", str(e)))
         factor = custom_ceil(cur_size / limit_size, precision=1)
-        logger.warning(
-            f"Try again by reducing DPI for doc {file_path} from {dpi} to {dpi//factor}"
+        new_dpi = dpi // factor
+        logger.info(
+            "Retrying {file_path} with reduced DPI: {old_dpi} -> {new_dpi}",
+            file_path=str(file_path),
+            old_dpi=dpi,
+            new_dpi=new_dpi,
         )
-        dpi = dpi // factor
-        images = convert_pdfium(file_path, dpi=dpi)
+        images = convert_pdfium(file_path, dpi=new_dpi)
 
     return images
 
 
 def convert_specific_page_to_image(file_path, page_number, dpi=175):
-    pdf = pdfium.PdfDocument(file_path)
-    page = pdf.get_page(page_number)
-    image = page.render(scale=dpi / 72).to_pil()
-    image = image.convert("L").convert("RGB") if image.mode != "RGB" else image
-    pdf.close()
+    with pdfium.PdfDocument(file_path) as pdf:
+        page = pdf.get_page(page_number)
+        image = page.render(scale=dpi / 72).to_pil()
+        image = image.convert("L").convert("RGB") if image.mode != "RGB" else image
     return image
 
 
@@ -68,9 +71,7 @@ def resize_image(image, max_image_size):
 
 def get_page_count(file_path):
     if Path(file_path).suffix.lower() == PDF_EXTENSION:
-        pdf = pdfium.PdfDocument(file_path)
-        count = len(pdf)
-        pdf.close()
-        return count
+        with pdfium.PdfDocument(file_path) as pdf:
+            return len(pdf)
     else:
         return 1
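
The three edits above share one pattern: pypdfium2's PdfDocument is now used as a context manager instead of being closed manually, so the handle is released even when rendering raises (for example the DecompressionBombError handled in convert_pdfium_to_images). A minimal sketch of the same pattern, assuming the conventional `import pypdfium2 as pdfium` alias; the input path is a made-up example.

import pypdfium2 as pdfium

def render_first_page(file_path, dpi=175):
    # The document is closed automatically when the with-block exits,
    # even if rendering raises partway through.
    with pdfium.PdfDocument(file_path) as pdf:
        page = pdf.get_page(0)
        # pdfium renders at 72 dpi by default; scale converts to the requested dpi
        return page.render(scale=dpi / 72).to_pil()

image = render_first_page("example.pdf")  # hypothetical input file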
vlmparse/cli.py CHANGED
@@ -49,10 +49,19 @@ class DParseCLI:
         uri: str | None = None,
         gpus: str | None = None,
         mode: Literal["document", "md", "md_page"] = "document",
+        conversion_mode: Literal[
+            "ocr",
+            "ocr_layout",
+            "table",
+            "image_description",
+            "formula",
+            "chart",
+        ] = "ocr",
         with_vllm_server: bool = False,
         concurrency: int = 10,
         dpi: int | None = None,
         debug: bool = False,
+        _return_documents: bool = False,
     ):
         """Parse PDF documents and save results.
 
@@ -64,6 +73,7 @@ class DParseCLI:
             uri: URI of the server, if not specified and the pipe is vllm, a local server will be deployed
             gpus: Comma-separated GPU device IDs (e.g., "0" or "0,1,2"). If not specified, all GPUs will be used.
             mode: Output mode - "document" (save as JSON zip), "md" (save as markdown file), "md_page" (save as folder of markdown pages)
+            conversion_mode: Conversion mode - "ocr" (plain), "ocr_layout" (OCR with layout), "table" (table-centric), "image_description" (describe the image), "formula" (formula extraction), "chart" (chart recognition)
             with_vllm_server: If True, a local VLLM server will be deployed if the model is not found in the registry. Note that if the model is in the registry and the uri is None, the server will be anyway deployed.
             dpi: DPI to use for the conversion. If not specified, the default DPI will be used.
             debug: If True, run in debug mode (single-threaded, no concurrency)
@@ -76,9 +86,15 @@ class DParseCLI:
             gpus=gpus,
             with_vllm_server=with_vllm_server,
             concurrency=concurrency,
+            return_documents=_return_documents,
         ) as converter_with_server:
             return converter_with_server.parse(
-                inputs=inputs, out_folder=out_folder, mode=mode, dpi=dpi, debug=debug
+                inputs=inputs,
+                out_folder=out_folder,
+                mode=mode,
+                conversion_mode=conversion_mode,
+                dpi=dpi,
+                debug=debug,
             )
 
     def list(self):
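
For reference, a hypothetical call into the updated CLI class; only the parameter names visible in this diff (inputs, out_folder, mode, conversion_mode, dpi, debug) are taken from the source, the input path is invented, and the remaining parameters are assumed to keep their defaults.

from vlmparse.cli import DParseCLI

DParseCLI().parse(
    inputs="reports/annual.pdf",      # hypothetical input document
    out_folder="parsed/",
    mode="md",                        # write the result as a markdown file
    conversion_mode="ocr_layout",     # new in 0.1.8: selects the prompt family
)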
@@ -1,6 +1,9 @@
+import json
 import math
 import time
+from dataclasses import asdict, dataclass
 
+from bs4 import BeautifulSoup
 from loguru import logger
 from PIL import Image
 from pydantic import Field
@@ -11,7 +14,8 @@ from vlmparse.clients.openai_converter import (
 )
 from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
 from vlmparse.clients.pipe_utils.utils import clean_response
-from vlmparse.data_model.document import Page
+from vlmparse.data_model.box import BoundingBox
+from vlmparse.data_model.document import Item, Page
 from vlmparse.servers.docker_server import VLLMDockerServerConfig
 from vlmparse.utils import to_base64
 
@@ -110,11 +114,6 @@ OCR this image to HTML.
 {PROMPT_ENDING}
 """.strip()
 
-PROMPT_MAPPING = {
-    "ocr_layout": OCR_LAYOUT_PROMPT,
-    "ocr": OCR_PROMPT,
-}
-
 
 def scale_to_fit(
     img: Image.Image,
@@ -188,11 +187,135 @@ def detect_repeat_token(
     return False
 
 
+@dataclass
+class LayoutBlock:
+    """Represents a layout block with bounding box and content."""
+
+    bbox: list[int]
+    label: str
+    content: str
+
+
+def parse_layout(
+    html: str, image: Image.Image, bbox_scale: int = 1024
+) -> list[LayoutBlock]:
+    """
+    Parse HTML layout blocks with bounding boxes.
+
+    Args:
+        html: HTML string with layout blocks (divs with data-bbox and data-label attributes)
+        image: PIL Image to get dimensions for bbox scaling
+        bbox_scale: The scale used in the prompt for normalized bboxes
+
+    Returns:
+        List of LayoutBlock objects with scaled bounding boxes
+    """
+    soup = BeautifulSoup(html, "html.parser")
+    top_level_divs = soup.find_all("div", recursive=False)
+    width, height = image.size
+    width_scaler = width / bbox_scale
+    height_scaler = height / bbox_scale
+    layout_blocks = []
+
+    for div in top_level_divs:
+        bbox = div.get("data-bbox")
+
+        try:
+            bbox = json.loads(bbox)
+            assert len(bbox) == 4, "Invalid bbox length"
+        except Exception:
+            try:
+                bbox = bbox.split(" ")
+                assert len(bbox) == 4, "Invalid bbox length"
+            except Exception:
+                # Default bbox if parsing fails
+                bbox = [0, 0, bbox_scale, bbox_scale]
+
+        bbox = list(map(int, bbox))
+        # Scale bbox to image dimensions
+        bbox = [
+            max(0, int(bbox[0] * width_scaler)),
+            max(0, int(bbox[1] * height_scaler)),
+            min(int(bbox[2] * width_scaler), width),
+            min(int(bbox[3] * height_scaler), height),
+        ]
+
+        label = div.get("data-label", "block")
+        content = str(div.decode_contents())
+        layout_blocks.append(LayoutBlock(bbox=bbox, label=label, content=content))
+
+    return layout_blocks
+
+
+def parse_chunks(html: str, image: Image.Image, bbox_scale: int = 1024) -> list[dict]:
+    """
+    Parse HTML layout blocks into dictionaries.
+
+    Args:
+        html: HTML string with layout blocks
+        image: PIL Image to get dimensions for bbox scaling
+        bbox_scale: The scale used in the prompt for normalized bboxes
+
+    Returns:
+        List of dictionaries with bbox, label, and content keys
+    """
+    layout = parse_layout(html, image, bbox_scale=bbox_scale)
+    chunks = [asdict(block) for block in layout]
+    return chunks
+
+
+def layout_blocks_to_items(
+    layout_blocks: list[LayoutBlock],
+) -> list[Item]:
+    """
+    Convert layout blocks to Item objects for the Page model.
+
+    Args:
+        layout_blocks: List of LayoutBlock objects
+
+    Returns:
+        List of Item objects with category, box, and text
+    """
+    items = []
+    for block in layout_blocks:
+        # Convert content HTML to markdown
+        try:
+            text = html_to_md_keep_tables(block.content)
+        except Exception as e:
+            logger.warning(f"Error converting block content to markdown: {e}")
+            text = block.content
+
+        # Create bounding box from [x0, y0, x1, y1] format
+        bbox = BoundingBox(
+            l=block.bbox[0],
+            t=block.bbox[1],
+            r=block.bbox[2],
+            b=block.bbox[3],
+        )
+
+        items.append(
+            Item(
+                category=block.label,
+                box=bbox,
+                text=text.strip(),
+            )
+        )
+
+    return items
+
+
 class ChandraConverterConfig(OpenAIConverterConfig):
     """Chandra converter configuration."""
 
     model_name: str = "datalab-to/chandra"
-    prompt_type: str = "ocr"  # Default prompt type
+    postprompt: str | None = None
+    prompts: dict[str, str] = {
+        "ocr": OCR_PROMPT,
+        "ocr_layout": OCR_LAYOUT_PROMPT,
+    }
+    prompt_mode_map: dict[str, str] = {
+        "table": "ocr_layout",
+    }
    bbox_scale: int = 1024
    max_retries: int = 0
    max_failure_retries: int = None
@@ -216,8 +339,7 @@ class ChandraConverterClient(OpenAIConverterClient):
 
     async def async_call_inside_page(self, page: Page) -> Page:
         """Process a single page using Chandra logic."""
-
-        prompt = PROMPT_MAPPING.get(self.config.prompt_type, OCR_PROMPT)
+        prompt = self.get_prompt_for_mode() or OCR_PROMPT
         prompt = prompt.replace("{bbox_scale}", str(self.config.bbox_scale))
 
         image = scale_to_fit(page.image)
@@ -238,61 +360,34 @@ class ChandraConverterClient(OpenAIConverterClient):
 
         retries = 0
         max_retries = self.config.max_retries
-        max_failure_retries = self.config.max_failure_retries
 
         result_content = ""
-        error_occurred = False
 
         while True:
-            try:
-                # Adjust temperature if retrying
-                temperature = self.config.completion_kwargs.get("temperature", 0.0)
-                if retries > 0:
-                    temperature = 0.3  # As per vllm.py logic
-
-                completion_kwargs = self.config.completion_kwargs.copy()
-                completion_kwargs["temperature"] = temperature
-                if retries > 0:
-                    completion_kwargs["top_p"] = 0.95
-
-                result_content = await self._get_chat_completion(
-                    messages, completion_kwargs=completion_kwargs
-                )
-                error_occurred = False
-            except Exception as e:
-                logger.error(f"Error during VLLM generation: {e}")
-                error_occurred = True
-                result_content = ""
-
             should_retry = False
-
-            # Check for repeat token
-            if not error_occurred:
-                has_repeat = detect_repeat_token(result_content) or (
-                    len(result_content) > 50
-                    and detect_repeat_token(result_content, cut_from_end=50)
+            # Adjust temperature if retrying
+            temperature = self.config.completion_kwargs.get("temperature", 0.0)
+            if retries > 0:
+                temperature = 0.3  # As per vllm.py logic
+
+            completion_kwargs = self.config.completion_kwargs.copy()
+            completion_kwargs["temperature"] = temperature
+            if retries > 0:
+                completion_kwargs["top_p"] = 0.95
+
+            result_content, usage = await self._get_chat_completion(
+                messages, completion_kwargs=completion_kwargs
+            )
+
+            has_repeat = detect_repeat_token(result_content) or (
+                len(result_content) > 50
+                and detect_repeat_token(result_content, cut_from_end=50)
+            )
+            if has_repeat and retries < max_retries:
+                logger.warning(
+                    f"Detected repeat token, retrying generation (attempt {retries + 1})..."
                )
-                if has_repeat and retries < max_retries:
-                    logger.warning(
-                        f"Detected repeat token, retrying generation (attempt {retries + 1})..."
-                    )
-                    should_retry = True
-
-            # Check for error
-            if error_occurred:
-                if max_failure_retries is not None:
-                    if retries < max_failure_retries:
-                        logger.warning(
-                            f"Detected vllm error, retrying generation (attempt {retries + 1})..."
-                        )
-                        should_retry = True
-                elif (
-                    retries < max_retries
-                ):  # Fallback to max_retries if max_failure_retries not set (vllm.py logic varies slightly but this is safe)
-                    logger.warning(
-                        f"Detected vllm error, retrying generation (attempt {retries + 1})..."
-                    )
-                    should_retry = True
+                should_retry = True
 
             if should_retry:
                 time.sleep(2 * (retries + 1))
@@ -305,10 +400,27 @@ class ChandraConverterClient(OpenAIConverterClient):
         page.raw_response = result_content
         text = clean_response(result_content)
 
+        # Check if we're in layout mode (ocr_layout prompt)
+        current_prompt_key = self.get_prompt_key()
+        is_layout_mode = current_prompt_key == "ocr_layout"
+
+        if is_layout_mode:
+            # Parse layout blocks and populate items
+            try:
+                layout_blocks = parse_layout(
+                    text, image, bbox_scale=self.config.bbox_scale
+                )
+                page.items = layout_blocks_to_items(layout_blocks)
+                logger.info(f"Parsed {len(page.items)} layout blocks")
+            except Exception as e:
+                logger.warning(f"Error parsing layout blocks: {e}")
+                page.items = []
+
         # Convert HTML to MD
         text = html_to_md_keep_tables(text)
         page.text = text
-
+        page.completion_tokens = usage.completion_tokens
+        page.prompt_tokens = usage.prompt_tokens
         return page
 
 
@@ -320,4 +432,8 @@ class ChandraDockerServerConfig(VLLMDockerServerConfig):
 
     @property
     def client_config(self):
-        return ChandraConverterConfig(llm_params=self.llm_params)
+        return ChandraConverterConfig(
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+            )
+        )
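
To illustrate the new layout parsing added in this release, here is a small, self-contained example of what parse_layout (the helper defined in the hunks above) does with a single block; the HTML snippet and image size are invented for the example.

from PIL import Image

html = '<div data-bbox="[102, 51, 512, 154]" data-label="Section-header"><h1>Intro</h1></div>'
page_image = Image.new("RGB", (2048, 1024))

blocks = parse_layout(html, page_image, bbox_scale=1024)
# width_scaler = 2048 / 1024 = 2.0 and height_scaler = 1024 / 1024 = 1.0, so:
# blocks[0].bbox == [204, 51, 1024, 154]
# blocks[0].label == "Section-header"
# blocks[0].content == "<h1>Intro</h1>"
# parse_chunks(...) returns the same data as plain dicts via dataclasses.asdict.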
@@ -1,5 +1,4 @@
 import re
-from typing import ClassVar, Literal
 
 from loguru import logger
 from PIL import Image
@@ -35,7 +34,11 @@ class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
 
     @property
     def client_config(self):
-        return DeepSeekOCRConverterConfig(llm_params=self.llm_params)
+        return DeepSeekOCRConverterConfig(
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+            )
+        )
 
 
 class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
@@ -43,8 +46,17 @@ class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
 
     model_name: str = "deepseek-ai/DeepSeek-OCR"
     aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
+    postprompt: str | None = None
+    prompts: dict[str, str] = {
+        "layout": "<|grounding|>Convert the document to markdown.",
+        "ocr": "Free OCR.",
+        "image_description": "Describe this image in detail.",
+    }
+    prompt_mode_map: dict[str, str] = {
+        "ocr_layout": "layout",
+        "table": "layout",
+    }
 
-    prompt_mode: Literal["layout", "ocr"] = "ocr"
     completion_kwargs: dict | None = {
         "temperature": 0.0,
         "max_tokens": 8181,
@@ -95,12 +107,6 @@ def extract_coordinates_and_label(ref_text):
 class DeepSeekOCRConverterClient(OpenAIConverterClient):
     """Client for DeepSeekOCR with specific post-processing."""
 
-    PROMPTS: ClassVar[dict] = {
-        "layout": "<|grounding|>Convert the document to markdown.",
-        "ocr": "Free OCR.",
-        "image_description": "Describe this image in detail.",
-    }
-
     def extract_items(self, image: Image.Image, matches: list) -> list[Item]:
         items = []
         width, height = image.size
@@ -153,6 +159,8 @@ class DeepSeekOCRConverterClient(OpenAIConverterClient):
         # Prepare messages as in parent class
         image = page.image
 
+        prompt_key = self.get_prompt_key() or "ocr"
+
         messages = [
             {
                 "role": "user",
@@ -163,17 +171,17 @@ class DeepSeekOCRConverterClient(OpenAIConverterClient):
                            "url": f"data:image/png;base64,{to_base64(image)}"
                        },
                    },
-                    {"type": "text", "text": self.PROMPTS[self.config.prompt_mode]},
+                    {"type": "text", "text": self.config.prompts[prompt_key]},
                ],
            },
        ]
 
        # Get raw response using parent's method
-        response = await self._get_chat_completion(messages)
+        response, usage = await self._get_chat_completion(messages)
        logger.info("Response length: " + str(len(response)))
        page.raw_response = response
 
-        if self.config.prompt_mode == "layout":
+        if prompt_key == "layout":
            # Post-processing
            matches, matches_image, matches_other = re_match(response)
 
@@ -199,5 +207,8 @@ class DeepSeekOCRConverterClient(OpenAIConverterClient):
 
         page.text = outputs.strip()
         logger.debug(page.text)
+        if usage is not None:
+            page.prompt_tokens = usage.prompt_tokens
+            page.completion_tokens = usage.completion_tokens
 
         return page
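
Both the Chandra and DeepSeek-OCR configs now carry a prompts dict plus a prompt_mode_map that routes the CLI's conversion_mode to one of those prompts (here "table" maps to "layout"; for Chandra it maps to "ocr_layout"). The actual resolution lives in get_prompt_key() / get_prompt_for_mode() on the shared OpenAI converter base, which is not part of this diff; the sketch below is only a plausible reading of that lookup, not the real implementation.

def resolve_prompt(config, conversion_mode: str) -> str | None:
    # Direct hit: the requested mode is itself a prompt key ("ocr", "layout", ...)
    if conversion_mode in config.prompts:
        return config.prompts[conversion_mode]
    # Otherwise follow the indirection table, e.g. "table" -> "layout"
    key = config.prompt_mode_map.get(conversion_mode)
    return config.prompts.get(key) if key is not None else None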
@@ -62,7 +62,6 @@ class DoclingDockerServerConfig(DockerServerConfig):
 class DoclingConverterConfig(ConverterConfig):
     """Configuration for Docling converter client."""
 
-    base_url: str
     model_name: str = "docling"
     timeout: int = 300
     api_kwargs: dict = {"output_format": "markdown", "image_export_mode": "referenced"}
@@ -1,14 +1,13 @@
 import json
 import math
 from pathlib import Path
-from typing import ClassVar, Literal
+from typing import ClassVar
 
 from loguru import logger
 from PIL import Image
 from pydantic import Field
 
 from vlmparse.clients.openai_converter import (
-    LLMParams,
     OpenAIConverterClient,
     OpenAIConverterConfig,
 )
@@ -48,12 +47,13 @@ class DotsOCRDockerServerConfig(DockerServerConfig):
     )
     add_model_key_to_server: bool = True
     aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
+    default_model_name: str = DEFAULT_MODEL_NAME
 
     @property
     def client_config(self):
         return DotsOCRConverterConfig(
-            llm_params=LLMParams(
-                base_url=f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}",
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
             )
         )
 
@@ -65,29 +65,7 @@ class DotsOCRConverterConfig(OpenAIConverterConfig):
     model_name: str = "rednote-hilab/dots.ocr"
     preprompt: str | None = ""
     postprompt: str | None = None
-    completion_kwargs: dict | None = {
-        "temperature": 0.1,
-        "top_p": 1.0,
-        "max_completion_tokens": 16384,
-    }
-    aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
-    dpi: int = 200
-    prompt_mode: Literal["prompt_layout_all_en", "prompt_ocr"] = "prompt_ocr"
-
-    def get_client(self, **kwargs) -> "DotsOCRConverter":
-        return DotsOCRConverter(config=self, **kwargs)
-
-
-class DotsOCRConverter(OpenAIConverterClient):
-    """DotsOCR VLLM converter."""
-
-    # Constants
-    MIN_PIXELS: ClassVar[int] = 3136
-    MAX_PIXELS: ClassVar[int] = 11289600
-    IMAGE_FACTOR: ClassVar[int] = 28
-
-    # Prompts
-    PROMPTS: ClassVar[dict] = {
+    prompts: dict[str, str] = {
        "prompt_layout_all_en": """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
 
 1. Bbox format: [x1, y1, x2, y2]
@@ -108,6 +86,30 @@ class DotsOCRConverter(OpenAIConverterClient):
 """,
        "prompt_ocr": """Extract the text content from this image.""",
    }
+    prompt_mode_map: dict[str, str] = {
+        "ocr": "prompt_ocr",
+        "ocr_layout": "prompt_layout_all_en",
+        "table": "prompt_layout_all_en",
+    }
+    completion_kwargs: dict | None = {
+        "temperature": 0.1,
+        "top_p": 1.0,
+        "max_completion_tokens": 16384,
+    }
+    aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
+    dpi: int = 200
+
+    def get_client(self, **kwargs) -> "DotsOCRConverter":
+        return DotsOCRConverter(config=self, **kwargs)
+
+
+class DotsOCRConverter(OpenAIConverterClient):
+    """DotsOCR VLLM converter."""
+
+    # Constants
+    MIN_PIXELS: ClassVar[int] = 3136
+    MAX_PIXELS: ClassVar[int] = 11289600
+    IMAGE_FACTOR: ClassVar[int] = 28
 
     @staticmethod
     def round_by_factor(number: int, factor: int) -> int:
@@ -235,7 +237,7 @@ class DotsOCRConverter(OpenAIConverterClient):
         image = self.fetch_image(
             origin_image, min_pixels=self.MIN_PIXELS, max_pixels=self.MAX_PIXELS
         )
-        prompt = self.PROMPTS[prompt_mode]
+        prompt = self.config.prompts[prompt_mode]
 
         response, usage = await self._async_inference_with_vllm(image, prompt)
 
@@ -258,13 +260,15 @@ class DotsOCRConverter(OpenAIConverterClient):
     async def async_call_inside_page(self, page: Page) -> Page:
         image = page.image
 
+        prompt_key = self.get_prompt_key() or "prompt_ocr"
+
         _, response, _, usage = await self._parse_image_vllm(
-            image, prompt_mode=self.config.prompt_mode
+            image, prompt_mode=prompt_key
         )
         logger.info("Response: " + str(response))
 
         items = None
-        if self.config.prompt_mode == "prompt_layout_all_en":
+        if prompt_key == "prompt_layout_all_en":
             text = "\n\n".join([item.get("text", "") for item in response])
 
             items = []
@@ -286,5 +290,4 @@ class DotsOCRConverter(OpenAIConverterClient):
 
         page.completion_tokens = usage.completion_tokens
         page.prompt_tokens = usage.prompt_tokens
-        page.reasoning_tokens = usage.reasoning_tokens
         return page
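
Across the converter clients touched in this release, _get_chat_completion / _parse_image_vllm now also return a usage object, and each client copies prompt_tokens and completion_tokens onto the Page (DotsOCR drops the reasoning_tokens field). A small, hypothetical helper for summing those fields over a list of parsed pages, assuming only the Page attributes shown in this diff:

def total_token_usage(pages):
    # pages: iterable of Page objects populated by any of the converter clients
    prompt = sum(p.prompt_tokens or 0 for p in pages)
    completion = sum(p.completion_tokens or 0 for p in pages)
    return {"prompt_tokens": prompt, "completion_tokens": completion}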