vlmparse 0.1.0__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +1763 -0
  2. vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
  3. vlmparse/benchpdf2md/create_dataset.py +60 -0
  4. vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +1 -0
  5. vlmparse/benchpdf2md/olmocrbench/katex/render.py +592 -0
  6. vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +175 -0
  7. vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +256 -0
  8. vlmparse/benchpdf2md/olmocrbench/tests.py +1334 -0
  9. vlmparse/benchpdf2md/run_benchmark.py +296 -0
  10. vlmparse/benchpdf2md/st_visu_benchmark/app.py +271 -0
  11. vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +117 -0
  12. vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +95 -0
  13. vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +20 -0
  14. vlmparse/benchpdf2md/st_visu_benchmark/utils.py +50 -0
  15. vlmparse/benchpdf2md/utils.py +56 -0
  16. vlmparse/clients/chandra.py +323 -0
  17. vlmparse/clients/deepseekocr.py +52 -0
  18. vlmparse/clients/docling.py +146 -0
  19. vlmparse/clients/dotsocr.py +277 -0
  20. vlmparse/clients/granite_docling.py +132 -0
  21. vlmparse/clients/hunyuanocr.py +45 -0
  22. vlmparse/clients/lightonocr.py +43 -0
  23. vlmparse/clients/mineru.py +119 -0
  24. vlmparse/clients/nanonetocr.py +29 -0
  25. vlmparse/clients/olmocr.py +46 -0
  26. vlmparse/clients/openai_converter.py +173 -0
  27. vlmparse/clients/paddleocrvl.py +48 -0
  28. vlmparse/clients/pipe_utils/cleaner.py +74 -0
  29. vlmparse/clients/pipe_utils/html_to_md_conversion.py +136 -0
  30. vlmparse/clients/pipe_utils/utils.py +12 -0
  31. vlmparse/clients/prompts.py +66 -0
  32. vlmparse/data_model/box.py +551 -0
  33. vlmparse/data_model/document.py +148 -0
  34. vlmparse/servers/docker_server.py +199 -0
  35. vlmparse/servers/utils.py +250 -0
  36. vlmparse/st_viewer/fs_nav.py +53 -0
  37. vlmparse/st_viewer/st_viewer.py +80 -0
  38. {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/METADATA +12 -1
  39. vlmparse-0.1.3.dist-info/RECORD +50 -0
  40. vlmparse-0.1.0.dist-info/RECORD +0 -13
  41. {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/WHEEL +0 -0
  42. {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/entry_points.txt +0 -0
  43. {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/licenses/LICENSE +0 -0
  44. {vlmparse-0.1.0.dist-info → vlmparse-0.1.3.dist-info}/top_level.txt +0 -0
vlmparse/benchpdf2md/st_visu_benchmark/test_form.py
@@ -0,0 +1,95 @@
+ import streamlit as st
+
+
+ def edit_test_form(test_obj, test_type):
+     st.markdown("### Edit Test Fields")
+     with st.form("edit_test_fields"):
+         type_fields = {}
+         type_fields["max_diffs"] = st.number_input(
+             "Max Diffs", value=test_obj.max_diffs, min_value=0, step=1
+         )
+         type_fields["unidecode"] = st.checkbox("Unidecode", value=test_obj.unidecode)
+         type_fields["alphanum"] = st.checkbox("Alphanum", value=test_obj.alphanum)
+         type_fields["ignore_str"] = st.text_input(
+             "Ignore strings (separated by spaces)",
+             value=" ".join(test_obj.ignore_str),
+         )
+         type_fields["ignore_space"] = st.checkbox(
+             "Ignore space", value=test_obj.ignore_space
+         )
+
+         type_fields["ignore_str"] = (
+             type_fields["ignore_str"].split(" ") if type_fields["ignore_str"] else []
+         )
+
+         if test_type == "present" or test_type == "absent":
+             type_fields["text"] = st.text_area(
+                 "Text", value=test_obj.text, height="content"
+             )
+             layout_cat_options = [
+                 "text",
+                 "footer",
+                 "header",
+                 "footnote",
+                 "image",
+                 "image_caption",
+             ]
+
+             type_fields["layout_cat"] = st.selectbox(
+                 "Layout Category",
+                 layout_cat_options,
+                 index=layout_cat_options.index(test_obj.layout_cat),
+             )
+             type_fields["case_sensitive"] = st.checkbox(
+                 "Case Sensitive", value=test_obj.case_sensitive
+             )
+             type_fields["first_n"] = st.number_input(
+                 "First N",
+                 value=test_obj.first_n if test_obj.first_n else 0,
+                 min_value=0,
+                 step=100,
+             )
+             type_fields["last_n"] = st.number_input(
+                 "Last N",
+                 value=test_obj.last_n if test_obj.last_n else 0,
+                 min_value=0,
+                 step=100,
+             )
+             if type_fields["first_n"] == 0:
+                 type_fields["first_n"] = None
+             if type_fields["last_n"] == 0:
+                 type_fields["last_n"] = None
+         elif test_type == "order":
+             type_fields["before"] = st.text_area(
+                 "Before", value=test_obj.before, height="content"
+             )
+             type_fields["after"] = st.text_area(
+                 "After", value=test_obj.after, height="content"
+             )
+         elif test_type == "table":
+             type_fields["cell"] = st.text_input("Cell", value=test_obj.cell)
+             type_fields["up"] = st.text_input(
+                 "Up", value=test_obj.up if test_obj.up else ""
+             )
+             type_fields["down"] = st.text_input(
+                 "Down", value=test_obj.down if test_obj.down else ""
+             )
+             type_fields["left"] = st.text_input(
+                 "Left", value=test_obj.left if test_obj.left else ""
+             )
+             type_fields["right"] = st.text_input(
+                 "Right", value=test_obj.right if test_obj.right else ""
+             )
+             type_fields["top_heading"] = st.text_input(
+                 "Top Heading",
+                 value=test_obj.top_heading if test_obj.top_heading else "",
+             )
+             type_fields["left_heading"] = st.text_input(
+                 "Left Heading",
+                 value=test_obj.left_heading if test_obj.left_heading else "",
+             )
+         if st.form_submit_button("Save Changes"):
+             for field, value in type_fields.items():
+                 setattr(test_obj, field, value)
+
+     return test_obj
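For orientation, edit_test_form only writes back to the test object once the "Save Changes" button is submitted. A minimal usage sketch, assuming a hypothetical app that keeps the currently selected benchmark test (an object exposing the attributes the form reads) in Streamlit session state:

import streamlit as st

from vlmparse.benchpdf2md.st_visu_benchmark.test_form import edit_test_form

# Hypothetical app state; the real test classes live in
# vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py.
test = st.session_state["current_test"]
edited = edit_test_form(test, test_type="present")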
vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py
@@ -0,0 +1,20 @@
+ from pathlib import Path
+ from typing import Optional
+
+ import streamlit as st
+
+ from vlmparse.benchpdf2md.st_visu_benchmark.utils import get_pdf_bytes
+
+
+ def download_pdf_page(
+     pdf_path: Path, page_no: int = 0, file_name: Optional[str] = None
+ ):
+     pdf_bytes = get_pdf_bytes(pdf_path, page_no)
+     if pdf_bytes:
+         st.download_button(
+             label="📄 Download PDF Page",
+             data=pdf_bytes,
+             file_name=file_name if file_name else f"{pdf_path.stem}.pdf",
+             mime="application/pdf",
+             use_container_width=True,
+         )
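A short sketch of wiring this button into a Streamlit page; the PDF path and file name here are made up:

from pathlib import Path

from vlmparse.benchpdf2md.st_visu_benchmark.ui_elements import download_pdf_page

# Renders a download button for page 3 of the PDF (page_no is 0-indexed).
download_pdf_page(Path("docs/report.pdf"), page_no=2, file_name="report_p3.pdf")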
vlmparse/benchpdf2md/st_visu_benchmark/utils.py
@@ -0,0 +1,50 @@
+ import io
+ from pathlib import Path
+
+ import pypdfium2 as pdfium
+ import streamlit as st
+
+ from vlmparse.data_model.document import Document
+
+
+ @st.cache_data
+ def get_pdf_bytes(pdf_path, page_no=0):
+     pdf_reader = pdfium.PdfDocument(pdf_path)
+     if page_no >= len(pdf_reader):
+         pdf_reader.close()
+         return None
+
+     # Create a new PDF
+     new_pdf = pdfium.PdfDocument.new()
+
+     # Import the chosen page into the new PDF
+     new_pdf.import_pages(pdf_reader, pages=[page_no])
+
+     bytes_io = io.BytesIO()
+     # Get bytes
+     new_pdf.save(bytes_io)
+
+     pdf_bytes = bytes_io.getvalue()
+
+     # Clean up
+     new_pdf.close()
+     pdf_reader.close()
+
+     return pdf_bytes
+
+
+ @st.cache_data
+ def get_doc(doc_path: Path):
+     return Document.from_zip(doc_path)
+
+
+ def save_new_test(tests, test_obj_edited, test_path):
+     from vlmparse.benchpdf2md.bench_tests.benchmark_tsts import save_tests
+
+     # Swap in the edited test by id; rebinding the loop variable, as the
+     # previous version did, would leave the list unchanged.
+     tests = [
+         test_obj_edited if test.id == test_obj_edited.id else test for test in tests
+     ]
+     save_tests(tests, test_path)
+     st.success("Test updated successfully!")
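get_pdf_bytes returns a standalone one-page PDF as bytes, or None when the page index is out of range. A sketch with a hypothetical input path:

from pathlib import Path

from vlmparse.benchpdf2md.st_visu_benchmark.utils import get_pdf_bytes

page_bytes = get_pdf_bytes(Path("bench_data/sample.pdf"), page_no=0)
if page_bytes is None:
    print("page out of range")
else:
    # Write the extracted page out as its own PDF file.
    Path("page0.pdf").write_bytes(page_bytes)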
vlmparse/benchpdf2md/utils.py
@@ -0,0 +1,56 @@
+ import base64
+ from io import BytesIO
+
+ import numpy as np
+ import pandas as pd
+ from PIL import Image
+
+
+ def vectorized_bootstrap_grouped_std(df, group_col, value_col, n_bootstrap=1000):
+     group_col = [group_col] if isinstance(group_col, str) else group_col
+     grouped = df.groupby(group_col)[value_col]
+
+     def bootstrap_group(group):
+         values = group.values
+         n = len(values)
+         bootstrap_samples = np.random.choice(
+             values, size=(n_bootstrap, n), replace=True
+         )
+         bootstrap_means = np.mean(bootstrap_samples, axis=1)
+         return pd.Series(
+             {"mean": np.mean(values), "bootstrap_std": np.std(bootstrap_means)}
+         )
+
+     result = grouped.apply(bootstrap_group)
+     return result.unstack(-1)
+
+
+ def format_results_vectorized(result_df, precision=2):
+     means = result_df["mean"].values
+     margins = 2 * result_df["bootstrap_std"].values
+
+     formatted = np.char.add(
+         np.char.add(np.round(means, precision).astype(str), " ± "),
+         np.round(margins, precision).astype(str),
+     )
+
+     return pd.DataFrame({"formatted_result": formatted}, index=result_df.index)
+
+
+ def bootstrap_and_format_results(
+     df, group_col, value_col, n_bootstrap=1000, precision=2
+ ):
+     result_df = vectorized_bootstrap_grouped_std(df, group_col, value_col, n_bootstrap)
+     return format_results_vectorized(result_df, precision)
+
+
+ def to_base64(image: Image.Image, extension="PNG"):
+     img_byte_arr = BytesIO()
+     image.save(img_byte_arr, format=extension)
+     img_byte_arr = img_byte_arr.getvalue()
+     return base64.b64encode(img_byte_arr).decode("utf-8")
+
+
+ def from_base64(base64_str: str):
+     image_data = base64.b64decode(base64_str)
+     return Image.open(BytesIO(image_data))
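To make the bootstrap helpers concrete, a minimal sketch with made-up scores; since the resampling is random, the ± margins vary from run to run:

import pandas as pd

from vlmparse.benchpdf2md.utils import bootstrap_and_format_results

df = pd.DataFrame(
    {
        "model": ["a", "a", "a", "b", "b", "b"],
        "score": [0.90, 0.85, 0.95, 0.70, 0.75, 0.65],
    }
)
# One "mean ± 2*bootstrap_std" string per model.
print(bootstrap_and_format_results(df, "model", "score", n_bootstrap=200))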
vlmparse/clients/chandra.py
@@ -0,0 +1,323 @@
+ import asyncio
+ import math
+
+ from loguru import logger
+ from PIL import Image
+ from pydantic import Field
+
+ from vlmparse.clients.openai_converter import (
+     OpenAIConverterClient,
+     OpenAIConverterConfig,
+ )
+ from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
+ from vlmparse.clients.pipe_utils.utils import clean_response
+ from vlmparse.data_model.document import Page
+ from vlmparse.servers.docker_server import VLLMDockerServerConfig
+ from vlmparse.utils import to_base64
+
+ ALLOWED_TAGS = [
+     "math",
+     "br",
+     "i",
+     "b",
+     "u",
+     "del",
+     "sup",
+     "sub",
+     "table",
+     "tr",
+     "td",
+     "p",
+     "th",
+     "div",
+     "pre",
+     "h1",
+     "h2",
+     "h3",
+     "h4",
+     "h5",
+     "ul",
+     "ol",
+     "li",
+     "input",
+     "a",
+     "span",
+     "img",
+     "hr",
+     "tbody",
+     "small",
+     "caption",
+     "strong",
+     "thead",
+     "big",
+     "code",
+ ]
+ ALLOWED_ATTRIBUTES = [
+     "class",
+     "colspan",
+     "rowspan",
+     "display",
+     "checked",
+     "type",
+     "border",
+     "value",
+     "style",
+     "href",
+     "alt",
+     "align",
+ ]
+
+ PROMPT_ENDING = f"""
+ Only use these tags {ALLOWED_TAGS}, and these attributes {ALLOWED_ATTRIBUTES}.
+
+ Guidelines:
+ * Inline math: Surround math with <math>...</math> tags. Math expressions should be rendered in KaTeX-compatible LaTeX. Use display for block math.
+ * Tables: Use colspan and rowspan attributes to match table structure.
+ * Formatting: Maintain consistent formatting with the image, including spacing, indentation, subscripts/superscripts, and special characters.
+ * Images: Include a description of any images in the alt attribute of an <img> tag. Do not fill out the src property.
+ * Forms: Mark checkboxes and radio buttons properly.
+ * Text: Join lines together properly into paragraphs using <p>...</p> tags. Use <br> tags for line breaks within paragraphs, but only when absolutely necessary to maintain meaning.
+ * Use the simplest possible HTML structure that accurately represents the content of the block.
+ * Make sure the text is accurate and easy for a human to read and interpret. Reading order should be correct and natural.
+ """.strip()
+
+ OCR_LAYOUT_PROMPT = f"""
+ OCR this image to HTML, arranged as layout blocks. Each layout block should be a div with the data-bbox attribute representing the bounding box of the block in [x0, y0, x1, y1] format. Bboxes are normalized 0-{{bbox_scale}}. The data-label attribute is the label for the block.
+
+ Use the following labels:
+ - Caption
+ - Footnote
+ - Equation-Block
+ - List-Group
+ - Page-Header
+ - Page-Footer
+ - Image
+ - Section-Header
+ - Table
+ - Text
+ - Complex-Block
+ - Code-Block
+ - Form
+ - Table-Of-Contents
+ - Figure
+
+ {PROMPT_ENDING}
+ """.strip()
+
+ OCR_PROMPT = f"""
+ OCR this image to HTML.
+
+ {PROMPT_ENDING}
+ """.strip()
+
+ PROMPT_MAPPING = {
+     "ocr_layout": OCR_LAYOUT_PROMPT,
+     "ocr": OCR_PROMPT,
+ }
+
+
+ def scale_to_fit(
+     img: Image.Image,
+     max_size: tuple[int, int] = (3072, 2048),
+     min_size: tuple[int, int] = (28, 28),
+ ):
+     resample_method = Image.Resampling.LANCZOS
+     width, height = img.size
+     if width == 0 or height == 0:
+         return img
+     max_width, max_height = max_size
+     min_width, min_height = min_size
+     current_pixels = width * height
+     max_pixels = max_width * max_height
+     min_pixels = min_width * min_height
+
+     if current_pixels > max_pixels:
+         scale_factor = (max_pixels / current_pixels) ** 0.5
+         new_width = math.floor(width * scale_factor)
+         new_height = math.floor(height * scale_factor)
+     elif current_pixels < min_pixels:
+         scale_factor = (min_pixels / current_pixels) ** 0.5
+         new_width = math.ceil(width * scale_factor)
+         new_height = math.ceil(height * scale_factor)
+     else:
+         return img
+
+     return img.resize((new_width, new_height), resample=resample_method)
+
+
+ def detect_repeat_token(
+     predicted_tokens: str,
+     base_max_repeats: int = 4,
+     window_size: int = 500,
+     cut_from_end: int = 0,
+     scaling_factor: float = 3.0,
+ ):
+     try:
+         # Use existing html_to_md_keep_tables from vlmparse
+         predicted_tokens = html_to_md_keep_tables(predicted_tokens)
+     except Exception as e:
+         logger.error(f"Error parsing markdown: {e}")
+         return True
+
+     if cut_from_end > 0:
+         predicted_tokens = predicted_tokens[:-cut_from_end]
+
+     for seq_len in range(1, window_size // 2 + 1):
+         # Extract the potential repeating sequence from the end
+         candidate_seq = predicted_tokens[-seq_len:]
+
+         # Inverse scaling: shorter sequences need more repeats
+         max_repeats = int(base_max_repeats * (1 + scaling_factor / seq_len))
+
+         # Count how many times this sequence appears consecutively at the end
+         repeat_count = 0
+         pos = len(predicted_tokens) - seq_len
+         if pos < 0:
+             continue
+
+         while pos >= 0:
+             if predicted_tokens[pos : pos + seq_len] == candidate_seq:
+                 repeat_count += 1
+                 pos -= seq_len
+             else:
+                 break
+
+         if repeat_count > max_repeats:
+             return True
+
+     return False
+
+
+ class ChandraConverterConfig(OpenAIConverterConfig):
+     """Chandra converter configuration."""
+
+     model_name: str = "datalab-to/chandra"
+     prompt_type: str = "ocr"  # Default prompt type
+     bbox_scale: int = 1024
+     max_retries: int = 6
+     max_failure_retries: int | None = None
+     completion_kwargs: dict = Field(
+         default_factory=lambda: {
+             "temperature": 0.0,
+             "max_tokens": 12384,
+             "top_p": 0.1,
+         }
+     )
+     aliases: list[str] = Field(default_factory=lambda: ["chandra"])
+
+     def get_client(self, **kwargs) -> "ChandraConverterClient":
+         return ChandraConverterClient(config=self, **kwargs)
+
+
+ class ChandraConverterClient(OpenAIConverterClient):
+     """Client for the Chandra model."""
+
+     config: ChandraConverterConfig
+
+     async def async_call_inside_page(self, page: Page) -> Page:
+         """Process a single page using Chandra logic."""
+
+         prompt = PROMPT_MAPPING.get(self.config.prompt_type, OCR_PROMPT)
+         prompt = prompt.replace("{bbox_scale}", str(self.config.bbox_scale))
+
+         image = scale_to_fit(page.image)
+         image_b64 = to_base64(image)  # vlmparse.utils.to_base64
+
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "image_url",
+                         "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+                     },
+                     {"type": "text", "text": prompt},
+                 ],
+             }
+         ]
+
+         retries = 0
+         max_retries = self.config.max_retries
+         max_failure_retries = self.config.max_failure_retries
+
+         result_content = ""
+         error_occurred = False
+
+         while True:
+             try:
+                 # Adjust temperature if retrying
+                 temperature = self.config.completion_kwargs.get("temperature", 0.0)
+                 if retries > 0:
+                     temperature = 0.3  # As per vllm.py logic
+
+                 completion_kwargs = self.config.completion_kwargs.copy()
+                 completion_kwargs["temperature"] = temperature
+                 if retries > 0:
+                     completion_kwargs["top_p"] = 0.95
+
+                 result_content = await self._get_chat_completion(
+                     messages, completion_kwargs=completion_kwargs
+                 )
+                 error_occurred = False
+             except Exception as e:
+                 logger.error(f"Error during VLLM generation: {e}")
+                 error_occurred = True
+                 result_content = ""
+
+             should_retry = False
+
+             # Check for repeat token
+             if not error_occurred:
+                 has_repeat = detect_repeat_token(result_content) or (
+                     len(result_content) > 50
+                     and detect_repeat_token(result_content, cut_from_end=50)
+                 )
+                 if has_repeat and retries < max_retries:
+                     logger.warning(
+                         f"Detected repeat token, retrying generation (attempt {retries + 1})..."
+                     )
+                     should_retry = True
+
+             # Check for error
+             if error_occurred:
+                 if max_failure_retries is not None:
+                     if retries < max_failure_retries:
+                         logger.warning(
+                             f"Detected vllm error, retrying generation (attempt {retries + 1})..."
+                         )
+                         should_retry = True
+                 elif retries < max_retries:
+                     # Fall back to max_retries when max_failure_retries is not set
+                     logger.warning(
+                         f"Detected vllm error, retrying generation (attempt {retries + 1})..."
+                     )
+                     should_retry = True
+
+             if should_retry:
+                 # Non-blocking backoff; time.sleep here would block the event loop
+                 await asyncio.sleep(2 * (retries + 1))
+                 retries += 1
+                 continue
+             else:
+                 break
+
+         logger.info(f"Response length: {len(result_content)}")
+         page.raw_response = result_content
+         text = clean_response(result_content)
+
+         # Convert HTML to MD
+         text = html_to_md_keep_tables(text)
+         page.text = text
+
+         return page
+
+
+ class ChandraDockerServerConfig(VLLMDockerServerConfig):
+     """Configuration for the Chandra Docker server."""
+
+     model_name: str = "datalab-to/chandra"
+     aliases: list[str] = Field(default_factory=lambda: ["chandra"])
+
+     @property
+     def client_config(self):
+         return ChandraConverterConfig(llm_params=self.llm_params)
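Two behaviors in the helpers above are worth making concrete: detect_repeat_token flags a string whose tail is one short sequence repeated many times, and scale_to_fit caps the total pixel count at max_size while preserving aspect ratio. A sketch (the repeat check first runs the text through html_to_md_keep_tables, which should leave plain text essentially intact):

from PIL import Image

from vlmparse.clients.chandra import detect_repeat_token, scale_to_fit

degenerate = "Intro text. " + "lorem " * 200
print(detect_repeat_token(degenerate))  # expected: True

img = Image.new("RGB", (6000, 4000))  # 24 MP, above the 3072*2048 cap
print(scale_to_fit(img).size)  # ~ (3072, 2048); floor rounding may shave a pixel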
vlmparse/clients/deepseekocr.py
@@ -0,0 +1,52 @@
+ from pydantic import Field
+
+ from vlmparse.clients.openai_converter import OpenAIConverterConfig
+ from vlmparse.servers.docker_server import VLLMDockerServerConfig
+
+
+ class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
+     """Configuration for DeepSeekOCR model."""
+
+     model_name: str = "deepseek-ai/DeepSeek-OCR"
+     command_args: list[str] = Field(
+         default_factory=lambda: [
+             "--limit-mm-per-prompt",
+             '{"image": 1}',
+             "--async-scheduling",
+             "--logits_processors",
+             "vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor",
+             "--no-enable-prefix-caching",
+             "--mm-processor-cache-gb",
+             "0",
+         ]
+     )
+     aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
+
+     @property
+     def client_config(self):
+         return DeepSeekOCRConverterConfig(llm_params=self.llm_params)
+
+
+ class DeepSeekOCRConverterConfig(OpenAIConverterConfig):
+     """DeepSeekOCR converter - backward compatibility alias."""
+
+     model_name: str = "deepseek-ai/DeepSeek-OCR"
+     aliases: list[str] = Field(default_factory=lambda: ["deepseekocr"])
+     preprompt: str | None = None
+     postprompt: str | None = "<|grounding|>Convert the document to markdown."
+     completion_kwargs: dict | None = {
+         "temperature": 0.0,
+         "extra_body": {
+             "skip_special_tokens": False,
+             # args used to control the custom logits processor
+             "vllm_xargs": {
+                 "ngram_size": 30,
+                 "window_size": 90,
+                 # whitelist: <td>, </td>
+                 "whitelist_token_ids": [128821, 128822],
+             },
+         },
+     }
+     max_image_size: int | None = 1540
+     dpi: int = 200
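As a closing usage sketch for these config classes, assuming the surrounding vlmparse plumbing (in particular that VLLMDockerServerConfig can be constructed with its defaults and supplies llm_params):

from vlmparse.clients.deepseekocr import DeepSeekOCRDockerServerConfig

# Build the server config, then derive the matching client config from it.
server_cfg = DeepSeekOCRDockerServerConfig()
client_cfg = server_cfg.client_config
print(client_cfg.model_name)  # deepseek-ai/DeepSeek-OCR
print(client_cfg.postprompt)  # <|grounding|>Convert the document to markdown.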