vlmparse 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. vlmparse/benchpdf2md/bench_tests/benchmark_tsts.py +1763 -0
  2. vlmparse/benchpdf2md/bench_tests/utils.py +0 -0
  3. vlmparse/benchpdf2md/create_dataset.py +60 -0
  4. vlmparse/benchpdf2md/olmocrbench/katex/__init__.py +1 -0
  5. vlmparse/benchpdf2md/olmocrbench/katex/render.py +592 -0
  6. vlmparse/benchpdf2md/olmocrbench/repeatdetect.py +175 -0
  7. vlmparse/benchpdf2md/olmocrbench/run_olmocr_bench.py +256 -0
  8. vlmparse/benchpdf2md/olmocrbench/tests.py +1334 -0
  9. vlmparse/benchpdf2md/run_benchmark.py +296 -0
  10. vlmparse/benchpdf2md/st_visu_benchmark/app.py +271 -0
  11. vlmparse/benchpdf2md/st_visu_benchmark/highligh_text.py +117 -0
  12. vlmparse/benchpdf2md/st_visu_benchmark/test_form.py +95 -0
  13. vlmparse/benchpdf2md/st_visu_benchmark/ui_elements.py +20 -0
  14. vlmparse/benchpdf2md/st_visu_benchmark/utils.py +50 -0
  15. vlmparse/benchpdf2md/utils.py +56 -0
  16. vlmparse/clients/chandra.py +323 -0
  17. vlmparse/clients/deepseekocr.py +52 -0
  18. vlmparse/clients/docling.py +146 -0
  19. vlmparse/clients/dotsocr.py +277 -0
  20. vlmparse/clients/granite_docling.py +132 -0
  21. vlmparse/clients/hunyuanocr.py +45 -0
  22. vlmparse/clients/lightonocr.py +43 -0
  23. vlmparse/clients/mineru.py +119 -0
  24. vlmparse/clients/nanonetocr.py +29 -0
  25. vlmparse/clients/olmocr.py +46 -0
  26. vlmparse/clients/openai_converter.py +173 -0
  27. vlmparse/clients/paddleocrvl.py +48 -0
  28. vlmparse/clients/pipe_utils/cleaner.py +74 -0
  29. vlmparse/clients/pipe_utils/html_to_md_conversion.py +136 -0
  30. vlmparse/clients/pipe_utils/utils.py +12 -0
  31. vlmparse/clients/prompts.py +66 -0
  32. vlmparse/data_model/box.py +551 -0
  33. vlmparse/data_model/document.py +148 -0
  34. vlmparse/servers/docker_server.py +199 -0
  35. vlmparse/servers/utils.py +250 -0
  36. vlmparse/st_viewer/fs_nav.py +53 -0
  37. vlmparse/st_viewer/st_viewer.py +80 -0
  38. {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/METADATA +11 -1
  39. vlmparse-0.1.2.dist-info/RECORD +50 -0
  40. vlmparse-0.1.0.dist-info/RECORD +0 -13
  41. {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/WHEEL +0 -0
  42. {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/entry_points.txt +0 -0
  43. {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/licenses/LICENSE +0 -0
  44. {vlmparse-0.1.0.dist-info → vlmparse-0.1.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,146 @@
1
+ import asyncio
2
+ from io import BytesIO
3
+ from typing import Literal
4
+
5
+ import httpx
6
+ from loguru import logger
7
+ from PIL import Image
8
+ from pydantic import Field
9
+
10
+ from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
11
+ from vlmparse.clients.pipe_utils.utils import clean_response
12
+ from vlmparse.converter import BaseConverter, ConverterConfig
13
+ from vlmparse.data_model.document import Page
14
+ from vlmparse.servers.docker_server import DockerServerConfig
15
+
16
+
17
class DoclingDockerServerConfig(DockerServerConfig):
    """Docker server configuration for the official Docling Serve images."""

    model_name: str = "docling"
    docker_image: str = Field(default="")
    cpu_only: bool = False
    command_args: list[str] = Field(default_factory=list)
    server_ready_indicators: list[str] = Field(
        default_factory=lambda: ["Application startup complete", "Uvicorn running"]
    )
    enable_ui: bool = False
    docker_port: int = 5001
    container_port: int = 5001
    environment: dict[str, str] = Field(
        default_factory=lambda: {
            "DOCLING_SERVE_HOST": "0.0.0.0",
            "DOCLING_SERVE_PORT": "5001",
            "LOG_LEVEL": "DEBUG",  # verbose server-side logging
            # Performance tuning knobs (defaults per docling-serve docs):
            # "UVICORN_WORKERS": "4",  # web server workers (default: 1)
            # "DOCLING_SERVE_ENG_LOC_NUM_WORKERS": "4",  # processing workers (default: 2)
            "DOCLING_NUM_THREADS": "32",  # torch threads (default: 4)
        }
    )

    def model_post_init(self, __context):
        """Derive docker_image, gpu_device_ids and CLI flags from the other fields."""
        # Choose the official image matching the CPU/GPU mode unless one was given.
        if not self.docker_image:
            self.docker_image = (
                "quay.io/docling-project/docling-serve-cpu:latest"
                if self.cpu_only
                else "quay.io/docling-project/docling-serve:latest"
            )

        # CPU-only mode: an explicit empty device list disables GPU access.
        if self.cpu_only and self.gpu_device_ids is None:
            self.gpu_device_ids = []

        if self.enable_ui:
            self.command_args.append("--enable-ui")

    @property
    def client_config(self):
        """Converter config pointing at this server's published port."""
        return DoclingConverterConfig(base_url=f"http://localhost:{self.docker_port}")
60
+
61
+
62
class DoclingConverterConfig(ConverterConfig):
    """Configuration for the Docling Serve converter client.

    `api_kwargs` is forwarded verbatim as form data to the
    `/v1/convert/file` endpoint; `output_format` also controls how the
    converter extracts text from the JSON response.
    """

    model_name: str = "docling"
    base_url: str = "http://localhost:5001"
    timeout: int = 300
    # Use Field(default_factory=...) for the mutable default, consistent with
    # the other configs in this package, instead of a shared dict literal.
    api_kwargs: dict = Field(
        default_factory=lambda: {
            "output_format": "markdown",
            "image_export_mode": "referenced",
        }
    )

    def get_client(self, **kwargs) -> "DoclingConverter":
        """Instantiate a DoclingConverter bound to this configuration."""
        return DoclingConverter(config=self, **kwargs)
72
+
73
+
74
def image_to_bytes(image: Image.Image) -> bytes:
    """Serialize a PIL image to PNG-encoded bytes for multipart file upload."""
    buffer = BytesIO()
    image.save(buffer, format="PNG")
    return buffer.getvalue()
80
+
81
+
82
class DoclingConverter(BaseConverter):
    """Client for the Docling Serve HTTP API (httpx-based).

    Each page image is POSTed to ``/v1/convert/file`` and the returned
    document content is cleaned and normalized to markdown.
    """

    def __init__(
        self,
        config: DoclingConverterConfig,
        num_concurrent_files: int = 10,
        num_concurrent_pages: int = 10,
        save_folder: str | None = None,
        save_mode: Literal["document", "md", "md_page"] = "document",
        debug: bool = False,
        return_documents_in_batch_mode: bool = False,
    ):
        super().__init__(
            config=config,
            num_concurrent_files=num_concurrent_files,
            num_concurrent_pages=num_concurrent_pages,
            save_folder=save_folder,
            save_mode=save_mode,
            debug=debug,
            return_documents_in_batch_mode=return_documents_in_batch_mode,
        )

    async def async_call_inside_page(self, page: Page) -> Page:
        """Process a single page through the Docling Serve API.

        On any failure the error message is written into ``page.text``
        (best-effort) instead of failing the whole batch.
        """
        # PNG encoding is CPU-bound; keep it off the event loop.
        img_bytes = await asyncio.to_thread(image_to_bytes, page.image)

        data = self.config.api_kwargs
        url = f"{self.config.base_url}/v1/convert/file"
        logger.debug(f"Calling Docling API at: {url}")
        files = {"files": ("image.png", img_bytes, "image/png")}

        try:
            async with httpx.AsyncClient(timeout=self.config.timeout) as client:
                response = await client.post(
                    url, files=files, data=data, headers={"Accept": "application/json"}
                )
                response.raise_for_status()

            result = response.json()
            logger.info(f"Docling API response status: {response.status_code}")

            # Extract text from the response. The two text-like formats were
            # previously duplicated branches: both deliver their content in
            # result["document"]["md_content"], so they are merged here.
            if self.config.api_kwargs["output_format"] in ("markdown", "text"):
                text = result["document"]["md_content"]
            else:  # json or other formats
                text = str(result)

            logger.info(f"Extracted text length: {len(text)}")

            # Clean and convert the response
            text = clean_response(text)
            text = html_to_md_keep_tables(text)
            page.text = text

        except Exception as e:
            logger.error(f"Error processing page with Docling: {e}")
            page.text = f"Error: {str(e)}"

        return page
@@ -0,0 +1,277 @@
1
+ import json
2
+ import math
3
+ from pathlib import Path
4
+ from typing import ClassVar, Literal
5
+
6
+ from loguru import logger
7
+ from PIL import Image
8
+ from pydantic import Field
9
+
10
+ from vlmparse.clients.openai_converter import (
11
+ OpenAIConverterClient,
12
+ OpenAIConverterConfig,
13
+ )
14
+ from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
15
+ from vlmparse.clients.pipe_utils.utils import clean_response
16
+ from vlmparse.data_model.document import BoundingBox, Item, Page
17
+ from vlmparse.servers.docker_server import DEFAULT_MODEL_NAME, DockerServerConfig
18
+ from vlmparse.utils import to_base64
19
+
20
+ DOCKERFILE_DIR = Path(__file__).parent.parent.parent / "docker_pipelines"
21
+
22
+
23
class DotsOCRDockerServerConfig(DockerServerConfig):
    """Configuration for DotsOCR model.

    Serves rednote-hilab/dots.ocr behind a vLLM-style OpenAI endpoint
    (a local image tag plus dockerfile_dir is set — presumably built from
    the bundled Dockerfile by DockerServerConfig; confirm there).
    """

    # Hugging Face identifier of the served model.
    model_name: str = "rednote-hilab/dots.ocr"
    docker_image: str = "dotsocr:latest"
    dockerfile_dir: str = str(DOCKERFILE_DIR / "dotsocr")
    # Extra server launch flags.
    command_args: list[str] = Field(
        default_factory=lambda: [
            "--tensor-parallel-size",
            "1",
            "--gpu-memory-utilization",
            "0.8",
            "--chat-template-content-format",
            "string",
            "--served-model-name",
            DEFAULT_MODEL_NAME,
            "--trust-remote-code",
            # "--limit-mm-per-prompt",
            # '{"image": 1}',
            # "--no-enable-prefix-caching",
            # "--max-model-len",
            # "16384",
        ]
    )
    add_model_key_to_server: bool = False
    # Short names this server config can be looked up under.
    aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])

    @property
    def client_config(self):
        # Matching client config, reusing this server's LLM connection params.
        return DotsOCRConverterConfig(llm_params=self.llm_params)
53
+
54
+
55
class DotsOCRConverterConfig(OpenAIConverterConfig):
    """Converter configuration for the DotsOCR model."""

    model_name: str = "rednote-hilab/dots.ocr"
    # Empty string (not None): no leading text prompt is configured.
    preprompt: str | None = ""
    postprompt: str | None = None
    completion_kwargs: dict | None = {
        "temperature": 0.1,
        "top_p": 1.0,
        "max_completion_tokens": 16384,
    }
    aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
    # NOTE(review): presumably the rasterization resolution for PDF pages —
    # confirm against OpenAIConverterConfig.
    dpi: int = 200
    # "prompt_layout_all_en" asks for JSON layout cells; "prompt_ocr" for plain
    # text (see DotsOCRConverter.PROMPTS).
    prompt_mode: Literal["prompt_layout_all_en", "prompt_ocr"] = "prompt_ocr"

    def get_client(self, **kwargs) -> "DotsOCRConverter":
        """Instantiate a DotsOCRConverter bound to this configuration."""
        return DotsOCRConverter(config=self, **kwargs)
70
+
71
+
72
class DotsOCRConverter(OpenAIConverterClient):
    """DotsOCR vLLM converter.

    Sends page images to a server hosting rednote-hilab/dots.ocr. Depending
    on ``config.prompt_mode`` the model returns plain OCR text
    ("prompt_ocr") or a JSON list of layout cells with bbox, category and
    text ("prompt_layout_all_en").
    """

    # Image preprocessing constants for smart_resize.
    MIN_PIXELS: ClassVar[int] = 3136
    MAX_PIXELS: ClassVar[int] = 11289600
    IMAGE_FACTOR: ClassVar[int] = 28  # both sides are snapped to multiples of this

    # Prompts
    PROMPTS: ClassVar[dict] = {
        "prompt_layout_all_en": """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.

1. Bbox format: [x1, y1, x2, y2]

2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].

3. Text Extraction & Formatting Rules:
    - Picture: For the 'Picture' category, the text field should be omitted.
    - Formula: Format its text as LaTeX.
    - Table: Format its text as HTML.
    - All Others (Text, Title, etc.): Format their text as Markdown.

4. Constraints:
    - The output text must be the original text from the image, with no translation.
    - All layout elements must be sorted according to human reading order.

5. Final Output: The entire output must be a single JSON object.
""",
        "prompt_ocr": """Extract the text content from this image.""",
    }

    @staticmethod
    def round_by_factor(number: int, factor: int) -> int:
        """Returns the closest integer to 'number' that is divisible by 'factor'."""
        return round(number / factor) * factor

    @staticmethod
    def ceil_by_factor(number: int, factor: int) -> int:
        """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
        return math.ceil(number / factor) * factor

    @staticmethod
    def floor_by_factor(number: int, factor: int) -> int:
        """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
        return math.floor(number / factor) * factor

    def smart_resize(
        self,
        height: int,
        width: int,
        factor: int = 28,
        min_pixels: int = 3136,
        max_pixels: int = 11289600,
    ):
        """Rescale (height, width) so both are multiples of `factor` and the
        total area lies in [min_pixels, max_pixels], keeping aspect ratio.

        Raises:
            ValueError: if the aspect ratio exceeds 200:1.
        """
        if max(height, width) / min(height, width) > 200:
            raise ValueError(
                f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
            )
        h_bar = max(factor, self.round_by_factor(height, factor))
        w_bar = max(factor, self.round_by_factor(width, factor))
        if h_bar * w_bar > max_pixels:
            # Shrink proportionally, rounding down to stay under the cap.
            beta = math.sqrt((height * width) / max_pixels)
            h_bar = max(factor, self.floor_by_factor(height / beta, factor))
            w_bar = max(factor, self.floor_by_factor(width / beta, factor))
        elif h_bar * w_bar < min_pixels:
            # Grow proportionally, rounding up to reach the minimum area.
            beta = math.sqrt(min_pixels / (height * width))
            h_bar = self.ceil_by_factor(height * beta, factor)
            w_bar = self.ceil_by_factor(width * beta, factor)
        # Rounding up above may overshoot max_pixels; shrink once more if so.
        if h_bar * w_bar > max_pixels:
            beta = math.sqrt((h_bar * w_bar) / max_pixels)
            h_bar = max(factor, self.floor_by_factor(h_bar / beta, factor))
            w_bar = max(factor, self.floor_by_factor(w_bar / beta, factor))
        return h_bar, w_bar

    def fetch_image(
        self,
        image,
        min_pixels=None,
        max_pixels=None,
    ) -> Image.Image:
        """Resize `image` via smart_resize when pixel bounds are requested.

        If both bounds are falsy the image is returned unchanged; a missing
        bound defaults to the corresponding class constant.
        """
        # Resize if needed
        if min_pixels or max_pixels:
            width, height = image.size
            if not min_pixels:
                min_pixels = self.MIN_PIXELS
            if not max_pixels:
                max_pixels = self.MAX_PIXELS
            resized_height, resized_width = self.smart_resize(
                height,
                width,
                factor=self.IMAGE_FACTOR,
                min_pixels=min_pixels,
                max_pixels=max_pixels,
            )
            assert resized_height > 0 and resized_width > 0
            image = image.resize((resized_width, resized_height))

        return image

    def post_process_cells(
        self,
        origin_image: Image.Image,
        cells: list,
        input_width: int,
        input_height: int,
    ) -> list:
        """Map cell bboxes from resized-input coordinates back to the original image."""
        if not cells or not isinstance(cells, list):
            return cells

        original_width, original_height = origin_image.size

        scale_x = input_width / original_width
        scale_y = input_height / original_height

        cells_out = []
        for cell in cells:
            bbox = cell["bbox"]
            bbox_resized = [
                int(float(bbox[0]) / scale_x),
                int(float(bbox[1]) / scale_y),
                int(float(bbox[2]) / scale_x),
                int(float(bbox[3]) / scale_y),
            ]
            # Shallow-copy so the caller's cells are not mutated.
            cell_copy = cell.copy()
            cell_copy["bbox"] = bbox_resized
            cells_out.append(cell_copy)

        return cells_out

    async def _async_inference_with_vllm(self, image, prompt):
        """Send one image plus prompt to the chat-completions endpoint."""
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{to_base64(image)}"
                        },
                    },
                    {"type": "text", "text": f"<|img|><|imgpad|><|endofimg|>{prompt}"},
                ],
            }
        ]

        return await self._get_chat_completion(messages)

    async def _parse_image_vllm(self, origin_image, prompt_mode="prompt_layout_all_en"):
        """Run inference and post-process the response.

        Returns a (metadata, payload, error_flag) triple. In layout mode,
        payload is the list of rescaled cells on success (flag False) or the
        raw response string on JSON-parse failure (flag True); in OCR mode it
        is the raw text (flag None).
        """
        image = self.fetch_image(
            origin_image, min_pixels=self.MIN_PIXELS, max_pixels=self.MAX_PIXELS
        )
        prompt = self.PROMPTS[prompt_mode]

        response = await self._async_inference_with_vllm(image, prompt)

        if prompt_mode in ["prompt_layout_all_en"]:
            try:
                cells = json.loads(response)
                cells = self.post_process_cells(
                    origin_image,
                    cells,
                    image.width,
                    image.height,
                )
                return {}, cells, False
            except Exception as e:
                logger.warning(f"cells post process error: {e}, returning raw response")
                return {}, response, True
        else:
            return {}, response, None

    async def async_call_inside_page(self, page: Page) -> Page:
        """Fill page.text (and page.items in layout mode) from the model output."""
        image = page.image

        _, response, _ = await self._parse_image_vllm(
            image, prompt_mode=self.config.prompt_mode
        )
        logger.info("Response: " + str(response))

        items = None
        # BUGFIX: on a JSON-parse failure in layout mode the payload is the raw
        # response *string*; iterating it would yield characters and crash on
        # `.get`. Only build items when we actually received a list of cells;
        # otherwise fall through and treat the payload as plain text.
        if self.config.prompt_mode == "prompt_layout_all_en" and isinstance(
            response, list
        ):
            text = "\n\n".join([item.get("text", "") for item in response])

            items = []
            for item in response:
                left, top, right, bottom = item["bbox"]
                items.append(
                    Item(
                        text=item.get("text", ""),
                        box=BoundingBox(l=left, t=top, r=right, b=bottom),
                        category=item["category"],
                    )
                )
            response = text
            page.items = items

        text = clean_response(response)
        text = html_to_md_keep_tables(text)
        page.text = text
        return page
@@ -0,0 +1,132 @@
1
+ from pydantic import Field
2
+
3
+ from vlmparse.clients.openai_converter import (
4
+ OpenAIConverterClient,
5
+ OpenAIConverterConfig,
6
+ )
7
+ from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
8
+ from vlmparse.clients.pipe_utils.utils import clean_response
9
+ from vlmparse.data_model.document import Page
10
+ from vlmparse.servers.docker_server import VLLMDockerServerConfig
11
+ from vlmparse.utils import to_base64
12
+
13
+
14
class GraniteDoclingDockerServerConfig(VLLMDockerServerConfig):
    """Configuration for Granite Docling model."""

    model_name: str = "ibm-granite/granite-docling-258M"
    # Extra vLLM launch flags: pin the "untied" model revision, cap prompts
    # to a single image, and allow the model's custom code to load.
    command_args: list[str] = Field(
        default_factory=lambda: [
            "--revision",
            "untied",
            "--limit-mm-per-prompt",
            '{"image": 1}',
            "--trust-remote-code",
        ]
    )
    # Short names this server config can be looked up under.
    aliases: list[str] = Field(default_factory=lambda: ["granite-docling"])

    @property
    def client_config(self):
        # Matching client config, reusing this server's LLM connection params.
        return GraniteDoclingConverterConfig(llm_params=self.llm_params)
32
+
33
+
34
class GraniteDoclingConverterConfig(OpenAIConverterConfig):
    """Granite Docling converter configuration."""

    preprompt: str | None = None
    # Fixed instruction sent after the page image (see GraniteDoclingConverter).
    postprompt: str | None = "Convert this page to docling."
    completion_kwargs: dict | None = {
        "temperature": 0.0,
        "max_tokens": 8000,
        "extra_body": {
            # NOTE(review): presumably kept so DocTags special tokens survive
            # in the output — confirm against the model's tokenizer.
            "skip_special_tokens": False,
        },
    }
    aliases: list[str] = Field(default_factory=lambda: ["granite-docling"])

    def get_client(self, **kwargs) -> "GraniteDoclingConverter":
        """Instantiate a GraniteDoclingConverter bound to this configuration."""
        return GraniteDoclingConverter(config=self, **kwargs)
50
+
51
+
52
class GraniteDoclingConverter(OpenAIConverterClient):
    """Client for Granite Docling model."""

    async def async_call_inside_page(self, page: Page) -> Page:
        """Run the model on one page image and convert its DocTags to markdown."""
        rgb_image = page.image.convert("RGB")
        image_part = {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{to_base64(rgb_image)}"},
        }
        text_part = {"type": "text", "text": self.config.postprompt}
        messages = [{"role": "user", "content": [image_part, text_part]}]

        doctags = await self._get_chat_completion_adaptive(
            messages, completion_kwargs=self.config.completion_kwargs
        )
        doctags = clean_response(doctags)

        page.raw_response = doctags
        page.text = _doctags_to_markdown(doctags, rgb_image)
        return page

    async def _get_chat_completion_adaptive(
        self, messages: list[dict], completion_kwargs: dict | None
    ) -> str:
        """
        vLLM enforces input+output <= model context length. If `max_tokens` is
        too high (especially for multimodal prompts), retry with progressively
        smaller `max_tokens`.
        """
        kwargs = dict(completion_kwargs or {})
        max_tokens = kwargs.get("max_tokens") or kwargs.get("max_completion_tokens")

        for _ in range(6):
            try:
                return await self._get_chat_completion(
                    messages, completion_kwargs=kwargs
                )
            except Exception as e:
                message = str(e)
                context_overflow = (
                    "max_tokens" in message
                    and "maximum context length" in message
                    and "is too large" in message
                )
                # Only shrink on the specific overflow error, and only when a
                # concrete integer budget was configured.
                if not context_overflow or not isinstance(max_tokens, int):
                    raise

                max_tokens = max(256, int(max_tokens * 0.75))
                for key in ("max_tokens", "max_completion_tokens"):
                    if key in kwargs:
                        kwargs[key] = max_tokens

        # Final attempt with the smallest budget; any error propagates.
        return await self._get_chat_completion(messages, completion_kwargs=kwargs)
114
+
115
+
116
def _doctags_to_markdown(doctags: str, image):
    """Convert a DocTags string plus its page image to markdown via docling-core."""
    try:
        from docling_core.types.doc import DoclingDocument
        from docling_core.types.doc.document import DocTagsDocument
    except Exception as e:  # pragma: no cover
        raise RuntimeError(
            "Missing optional dependency 'docling-core'. "
            "Install it with: pip install 'vlmparse[docling_core]'"
        ) from e

    tags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
    document = DoclingDocument.load_from_doctags(tags_doc, document_name="Document")

    cleaned_html = clean_response(document.export_to_html())
    return html_to_md_keep_tables(cleaned_html, remove_head=True)
@@ -0,0 +1,45 @@
1
+ from pydantic import Field
2
+
3
+ from vlmparse.clients.openai_converter import OpenAIConverterConfig
4
+ from vlmparse.servers.docker_server import VLLMDockerServerConfig
5
+
6
+
7
class HunyuanOCRDockerServerConfig(VLLMDockerServerConfig):
    """Configuration for HunyuanOCR model."""

    model_name: str = "tencent/HunyuanOCR"
    # Extra vLLM launch flags.
    command_args: list[str] = Field(
        default_factory=lambda: [
            "--limit-mm-per-prompt",
            '{"image": 1}',
            "--async-scheduling",
            "--no-enable-prefix-caching",
            "--mm-processor-cache-gb",
            "0",
            # Default argument in the hunyuan model; unclear why it is set
            # this low — TODO confirm before raising it.
            "--gpu-memory-utilization",
            "0.2",
        ]
    )
    # Short names this server config can be looked up under.
    aliases: list[str] = Field(default_factory=lambda: ["hunyuanocr"])

    @property
    def client_config(self):
        # Matching client config, reusing this server's LLM connection params.
        return HunyuanOCRConverterConfig(llm_params=self.llm_params)
29
+
30
+
31
class HunyuanOCRConverterConfig(OpenAIConverterConfig):
    """HunyuanOCR converter configuration."""

    model_name: str = "tencent/HunyuanOCR"
    # Empty string (not None): no leading text prompt is configured.
    preprompt: str | None = ""
    # Instruction sent with each page image.
    postprompt: str | None = (
        "Extract all information from the main body of the document image and represent it in markdown format, ignoring headers and footers. Tables should be expressed in HTML format, formulas in the document should be represented using LaTeX format, and the parsing should be organized according to the reading order."
    )
    completion_kwargs: dict | None = {
        "temperature": 0.0,
        "extra_body": {"top_k": 1, "repetition_penalty": 1.0},
    }
    # NOTE(review): presumably caps the longest image side in pixels —
    # confirm against OpenAIConverterConfig.
    max_image_size: int | None = 1540
    dpi: int = 200
    aliases: list[str] = Field(default_factory=lambda: ["hunyuanocr"])
@@ -0,0 +1,43 @@
1
+ from pathlib import Path
2
+
3
+ from pydantic import Field
4
+
5
+ from vlmparse.clients.openai_converter import OpenAIConverterConfig
6
+ from vlmparse.servers.docker_server import VLLMDockerServerConfig
7
+
8
+ DOCKERFILE_DIR = Path(__file__).parent.parent.parent / "docker_pipelines"
9
+
10
+
11
class LightOnOCRDockerServerConfig(VLLMDockerServerConfig):
    """Configuration for LightOnOCR model."""

    model_name: str = "lightonai/LightOnOCR-1B-1025"
    # Extra vLLM launch flags: one image per prompt, no multimodal processor
    # cache, prefix caching disabled.
    command_args: list[str] = Field(
        default_factory=lambda: [
            "--limit-mm-per-prompt",
            '{"image": 1}',
            "--mm-processor-cache-gb",
            "0",
            "--no-enable-prefix-caching",
        ]
    )
    # Short names this server config can be looked up under.
    aliases: list[str] = Field(default_factory=lambda: ["lightonocr"])

    @property
    def client_config(self):
        # Matching client config, reusing this server's LLM connection params.
        return LightOnOCRConverterConfig(llm_params=self.llm_params)
29
+
30
+
31
class LightOnOCRConverterConfig(OpenAIConverterConfig):
    """Converter configuration for the LightOnOCR model.

    (The previous docstring called this a "backward compatibility alias",
    which it is not — it is the model's regular converter config.)
    """

    model_name: str = "lightonai/LightOnOCR-1B-1025"
    # No pre/post prompt text is configured; only the page image is sent.
    preprompt: str | None = None
    postprompt: str | None = None
    completion_kwargs: dict | None = {
        "temperature": 0.2,
        "max_tokens": 4096,
        "top_p": 0.9,
    }
    # NOTE(review): presumably the rasterization resolution for PDF pages —
    # confirm against OpenAIConverterConfig.
    dpi: int = 200
    aliases: list[str] = Field(default_factory=lambda: ["lightonocr"])