vlmparse 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,6 +13,10 @@ from vlmparse.data_model.document import Item, Page
  from vlmparse.servers.docker_server import VLLMDockerServerConfig
  from vlmparse.utils import to_base64

+ # ==============================================================================
+ # DeepSeek-OCR (v1)
+ # ==============================================================================
+

  class DeepSeekOCRDockerServerConfig(VLLMDockerServerConfig):
      """Configuration for DeepSeekOCR model."""
@@ -212,3 +216,169 @@ class DeepSeekOCRConverterClient(OpenAIConverterClient):
              page.completion_tokens = usage.completion_tokens

          return page
+
+
+ # ==============================================================================
+ # DeepSeek-OCR-2
+ # ==============================================================================
+
+
+ class DeepSeekOCR2DockerServerConfig(VLLMDockerServerConfig):
+     """Configuration for the DeepSeek-OCR-2 model.
+
+     DeepSeek-OCR-2 uses a custom architecture that requires:
+     - Custom model registration via hf_overrides
+     - NoRepeatNGram logits processor with specific whitelist tokens
+     - Custom image processor (DeepseekOCR2Processor)
+     """
+
+     docker_image: str = "vllm/vllm-openai:nightly"
+     model_name: str = "deepseek-ai/DeepSeek-OCR-2"
+     command_args: list[str] = Field(
+         default_factory=lambda: [
+             "--limit-mm-per-prompt",
+             '{"image": 1}',
+             "--hf-overrides",
+             '{"architectures": ["DeepseekOCR2ForCausalLM"]}',
+             "--block-size",
+             "256",
+             "--trust-remote-code",
+             "--max-model-len",
+             "8192",
+             "--swap-space",
+             "0",
+             "--gpu-memory-utilization",
+             "0.9",
+             "--logits_processors",
+             "vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor",
+         ]
+     )
+     aliases: list[str] = Field(
+         default_factory=lambda: ["deepseekocr2", "DeepSeek-OCR-2"]
+     )
+
+     @property
+     def client_config(self):
+         return DeepSeekOCR2ConverterConfig(
+             **self._create_client_kwargs(
+                 f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+             )
+         )
+
+
+ class DeepSeekOCR2ConverterConfig(OpenAIConverterConfig):
+     """DeepSeek-OCR-2 converter configuration.
+
+     Key differences from DeepSeek-OCR v1:
+     - Uses the DeepseekOCR2ForCausalLM architecture
+     - Different logits processor parameters (ngram_size=20, window_size=50)
+     - Supports cropping mode for image processing
+     """
+
+     model_name: str = "deepseek-ai/DeepSeek-OCR-2"
+     aliases: list[str] = Field(
+         default_factory=lambda: ["deepseekocr2", "DeepSeek-OCR-2"]
+     )
+     postprompt: str | None = None
+     prompts: dict[str, str] = {
+         "layout": "<|grounding|>Convert the document to markdown.",
+         "ocr": "Free OCR.",
+         "image_description": "Describe this image in detail.",
+     }
+     prompt_mode_map: dict[str, str] = {
+         "ocr_layout": "layout",
+         "table": "layout",
+     }
+
+     completion_kwargs: dict | None = {
+         "temperature": 0.0,
+         "max_tokens": 8180,
+         "extra_body": {
+             "skip_special_tokens": False,
+             # args used to control the custom logits processor
+             "vllm_xargs": {
+                 "ngram_size": 20,
+                 "window_size": 50,
+                 # whitelist: <td>, </td>
+                 "whitelist_token_ids": [128821, 128822],
+             },
+         },
+     }
+     dpi: int = 144  # Default DPI used in the reference implementation
+
+     def get_client(self, **kwargs) -> "DeepSeekOCR2ConverterClient":
+         return DeepSeekOCR2ConverterClient(config=self, **kwargs)
+
+
+ class DeepSeekOCR2ConverterClient(DeepSeekOCRConverterClient):
+     """Client for DeepSeek-OCR-2 with specific post-processing.
+
+     Inherits from DeepSeekOCRConverterClient because the post-processing
+     logic for parsing grounding references and extracting items is the same.
+     The main differences are in the model configuration and logits processor.
+     """
+
+     async def async_call_inside_page(self, page: Page) -> Page:
+         # Prepare messages as in the parent class
+         image = page.image
+
+         prompt_key = self.get_prompt_key() or "ocr"
+
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "image_url",
+                         "image_url": {
+                             "url": f"data:image/png;base64,{to_base64(image)}"
+                         },
+                     },
+                     {"type": "text", "text": self.config.prompts[prompt_key]},
+                 ],
+             },
+         ]
+
+         # Get the raw response using the parent's method
+         response, usage = await self._get_chat_completion(messages)
+         logger.info(f"Response length: {len(response)}")
+         page.raw_response = response
+
+         if prompt_key == "layout":
+             # Post-processing
+             matches, matches_image, matches_other = re_match(response)
+
+             # Extract items (bounding boxes)
+             page.items = self.extract_items(page.image, matches)
+
+             # Clean text
+             outputs = response
+
+             # Check for the sentence-end marker (indicates successful completion).
+             # If it is absent, generation may have been cut off by repetition detection.
+             if "<|end▁of▁sentence|>" in outputs:
+                 outputs = outputs.replace("<|end▁of▁sentence|>", "")
+
+             # Replace image references with a placeholder
+             for a_match_image in matches_image:
+                 outputs = outputs.replace(a_match_image, "![image]")
+
+             # Replace other references (text grounding) and clean up
+             for a_match_other in matches_other:
+                 outputs = (
+                     outputs.replace(a_match_other, "")
+                     .replace("\\coloneqq", ":=")
+                     .replace("\\eqqcolon", "=:")
+                     .replace("\n\n\n\n", "\n\n")
+                     .replace("\n\n\n", "\n\n")
+                 )
+         else:
+             outputs = response
+
+         page.text = outputs.strip()
+         logger.debug(page.text)
+         if usage is not None:
+             page.prompt_tokens = usage.prompt_tokens
+             page.completion_tokens = usage.completion_tokens
+
+         return page
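
For orientation, a minimal usage sketch for the new DeepSeek-OCR-2 entry points. The config class, get_client(), and async_call_inside_page() come from the diff above; the base_url keyword on the config and the Page(image=...) constructor are assumptions about the surrounding vlmparse API, as is the use of PIL images (suggested by to_base64).

import asyncio

from PIL import Image

from vlmparse.data_model.document import Page

# Assumed: the converter config accepts a base_url pointing at the vLLM server.
config = DeepSeekOCR2ConverterConfig(base_url="http://localhost:8000/v1")
client = config.get_client()

async def ocr_one(image: Image.Image) -> str:
    # Page(image=...) is an assumed constructor; async_call_inside_page
    # is the method defined in the diff above.
    page = await client.async_call_inside_page(Page(image=image))
    return page.text

# asyncio.run(ocr_one(Image.open("page.png")))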
@@ -0,0 +1,243 @@
+ import asyncio
+ import os
+ from pathlib import Path
+ from typing import Any
+
+ import httpx
+ import orjson
+ from loguru import logger
+ from pydantic import Field
+
+ from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
+ from vlmparse.clients.pipe_utils.utils import clean_response
+ from vlmparse.converter import BaseConverter, ConverterConfig
+ from vlmparse.data_model.document import BoundingBox, Item, Page
+ from vlmparse.servers.docker_compose_server import DockerComposeServerConfig
+ from vlmparse.utils import to_base64
+
+ DOCKER_PIPELINE_DIR = (
+     Path(__file__).parent.parent.parent / "docker_pipelines" / "glmocr"
+ )
+
+
+ class GLMOCRDockerServerConfig(DockerComposeServerConfig):
+     """Docker Compose configuration for the GLM-OCR server."""
+
+     model_name: str = "GLM-OCR"
+     aliases: list[str] = Field(default_factory=lambda: ["glmocr", "glm-ocr"])
+     compose_file: str = str(DOCKER_PIPELINE_DIR / "compose.yaml")
+     server_service: str = "glmocr-api"
+     compose_services: list[str] = Field(
+         default_factory=lambda: ["glmocr-api", "glmocr-vllm-server"]
+     )
+     gpu_service_names: list[str] = Field(default_factory=lambda: ["glmocr-vllm-server"])
+     docker_port: int = 5002
+     container_port: int = 5002
+     environment: dict[str, str] = Field(
+         default_factory=lambda: {
+             "VLM_BACKEND": "vllm",
+             "API_PORT": "8080",
+         }
+     )
+     environment_services: list[str] = Field(default_factory=lambda: ["glmocr-api"])
+     server_ready_indicators: list[str] = Field(
+         default_factory=lambda: ["Running on", "Application startup complete"]
+     )
+
+     def model_post_init(self, __context):
+         if not self.compose_env:
+             compose_env = {}
+             for key in [
+                 "API_IMAGE_TAG_SUFFIX",
+                 "VLM_IMAGE_TAG_SUFFIX",
+                 "VLM_BACKEND",
+             ]:
+                 value = os.getenv(key)
+                 if value:
+                     compose_env[key] = value
+             if compose_env:
+                 self.compose_env = compose_env
+
+     @property
+     def client_config(self):
+         return GLMOCRConverterConfig(
+             **self._create_client_kwargs(f"http://localhost:{self.docker_port}")
+         )
+
+
+ class GLMOCRConverterConfig(ConverterConfig):
+     """Configuration for the GLM-OCR API client."""
+
+     model_name: str = "GLM-OCR"
+     aliases: list[str] = Field(default_factory=lambda: ["glmocr", "glm-ocr"])
+     timeout: int = 600
+
+     endpoint_parse: str = "/glmocr/parse"
+
+     # GLM-OCR specific configuration
+
+     # Output format: "json", "markdown", or "both"
+     output_format: str = "both"
+
+     # Enable layout detection (PP-DocLayout)
+     enable_layout: bool = True
+
+     # GLM-OCR model parameters
+     max_tokens: int = 16384
+     temperature: float = 0.01
+     image_format: str = "JPEG"
+     min_pixels: int = 12544
+     max_pixels: int = 71372800
+
+     # Backward-compat escape hatch: if set, applied last to the payload.
+     request_overrides: dict[str, Any] = Field(default_factory=dict)
+
+     def get_client(self, **kwargs) -> "GLMOCRConverter":
+         return GLMOCRConverter(config=self, **kwargs)
+
+
+ class GLMOCRConverter(BaseConverter):
+     """GLM-OCR HTTP API converter."""
+
+     config: GLMOCRConverterConfig
+
+     def _build_parse_payload(self, file_content_b64: str) -> dict:
+         """Build the request payload for the GLM-OCR parse endpoint.
+
+         Args:
+             file_content_b64: Base64-encoded image content
+
+         Returns:
+             Dictionary payload for the API request
+         """
+         # Wrap the base64 data in a data URI, as expected by GLM-OCR.
+         # Format: data:image/png;base64,<base64_data>
+         data_uri = f"data:image/png;base64,{file_content_b64}"
+
+         payload: dict[str, Any] = {
+             "images": [data_uri]  # GLM-OCR expects a list
+         }
+
+         # Apply any request overrides
+         if self.config.request_overrides:
+             payload.update(self.config.request_overrides)
+
+         return payload
+
+     async def _post_json(self, endpoint: str, payload: dict) -> dict:
+         """Make a POST request to the GLM-OCR API.
+
+         Args:
+             endpoint: API endpoint path
+             payload: Request payload
+
+         Returns:
+             Parsed JSON response
+
+         Raises:
+             RuntimeError: If the API returns an error
+         """
+         headers = {}
+
+         async with httpx.AsyncClient(
+             base_url=self.config.base_url, timeout=self.config.timeout, headers=headers
+         ) as client:
+             response = await client.post(endpoint, json=payload)
+
+             response.raise_for_status()
+             data = response.json()
+
+             # Check for an error in the response
+             if "error" in data:
+                 raise RuntimeError(data.get("error", "Unknown error"))
+
+             return data
+
+     def _apply_markdown(self, page: Page, markdown_text: str | None):
+         """Apply markdown text to the page.
+
+         Args:
+             page: Page object to update
+             markdown_text: Markdown content from GLM-OCR
+         """
+         text = markdown_text or ""
+         text = clean_response(text)
+         text = html_to_md_keep_tables(text)
+         logger.debug(f"Converted markdown text: {text[:100]}...")
+         page.text = text
+
+     def _apply_items(self, page: Page, json_result: list[dict] | None):
+         """Apply structured items to the page from the JSON result.
+
+         Args:
+             page: Page object to update
+             json_result: List of detected regions from GLM-OCR
+         """
+         if not json_result:
+             return
+
+         items: list[Item] = []
+
+         for block in json_result:
+             bbox = block.get("bbox_2d")
+             if not bbox or len(bbox) != 4:
+                 # Skip items without a valid bounding box
+                 continue
+
+             x1, y1, x2, y2 = bbox
+             text = block.get("content") or ""
+             label = block.get("label") or ""
+
+             items.append(
+                 Item(
+                     text=text,
+                     box=BoundingBox(l=x1, t=y1, r=x2, b=y2),
+                     category=label,
+                 )
+             )
+
+         page.items = items
+
+     async def async_call_inside_page(self, page: Page) -> Page:
+         """Process a single page through the GLM-OCR API.
+
+         Args:
+             page: Page object containing the image to process
+
+         Returns:
+             Updated Page object with OCR results
+         """
+         image = page.image
+
+         # Convert the image to base64
+         file_content_b64 = await asyncio.to_thread(to_base64, image, "PNG")
+
+         # Build the request payload
+         payload = self._build_parse_payload(file_content_b64)
+
+         # Call the GLM-OCR API
+         data = await self._post_json(self.config.endpoint_parse, payload)
+
+         # GLM-OCR returns one result per document; we send a single image,
+         # so we get a single document result back.
+         markdown_result = data.get("markdown_result")
+
+         # Apply the markdown output if available
+         if markdown_result:
+             self._apply_markdown(page, markdown_result)
+
+         # Apply the JSON output if present (produced when layout detection is enabled)
+         json_result = data.get("json_result")
+         if json_result and isinstance(json_result, list) and len(json_result) > 0:
+             # json_result is a list of pages; take the first page
+             page_result = (
+                 json_result[0] if isinstance(json_result[0], list) else json_result
+             )
+             self._apply_items(page, page_result)
+
+         # Store the raw response
+         page.raw_response = orjson.dumps(markdown_result).decode("utf-8")
+
+         return page
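
To make the wire format concrete, here is a sketch of the equivalent raw request that GLMOCRConverter sends. The endpoint, port, payload shape, and response keys are taken from the code above; treat this as illustrative, not a supported API.

import asyncio
import base64

import httpx

async def glmocr_parse(image_path: str) -> dict:
    image_b64 = base64.b64encode(open(image_path, "rb").read()).decode()
    # Same payload shape as GLMOCRConverter._build_parse_payload:
    # a data URI wrapped in a single-element "images" list.
    payload = {"images": [f"data:image/png;base64,{image_b64}"]}
    async with httpx.AsyncClient(base_url="http://localhost:5002", timeout=600) as client:
        response = await client.post("/glmocr/parse", json=payload)
        response.raise_for_status()
        data = response.json()
    if "error" in data:
        raise RuntimeError(data["error"])
    # Per async_call_inside_page above, results arrive under
    # "markdown_result" and "json_result".
    return data

# asyncio.run(glmocr_parse("page.png"))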
@@ -1,56 +1,204 @@
+ import asyncio
+ import os
+ from pathlib import Path
+ from typing import Any
+
+ import httpx
+ import orjson
+ from loguru import logger
  from pydantic import Field

- from vlmparse.clients.openai_converter import OpenAIConverterConfig
- from vlmparse.servers.docker_server import VLLMDockerServerConfig
+ from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
+ from vlmparse.clients.pipe_utils.utils import clean_response
+ from vlmparse.converter import BaseConverter, ConverterConfig
+ from vlmparse.data_model.document import BoundingBox, Item, Page
+ from vlmparse.servers.docker_compose_server import DockerComposeServerConfig
+ from vlmparse.utils import to_base64
+
+ DOCKER_PIPELINE_DIR = (
+     Path(__file__).parent.parent.parent / "docker_pipelines" / "paddleocrvl"
+ )


- class PaddleOCRVLDockerServerConfig(VLLMDockerServerConfig):
-     """Configuration for PaddleOCRVL model."""
+ class PaddleOCRVLDockerServerConfig(DockerComposeServerConfig):
+     """Docker Compose configuration for the PaddleOCR-VL server."""

-     model_name: str = "PaddlePaddle/PaddleOCR-VL"
-     command_args: list[str] = Field(
-         default_factory=lambda: [
-             "--limit-mm-per-prompt",
-             '{"image": 1}',
-             "--async-scheduling",
-             "--trust-remote-code",
-             "--mm-processor-cache-gb",
-             "0",
-         ]
+     model_name: str = "PaddleOCR-VL-1.5"
+     aliases: list[str] = Field(
+         default_factory=lambda: ["paddleocrvl1.5", "paddleocr-vl-1.5"]
+     )
+     compose_file: str = str(DOCKER_PIPELINE_DIR / "compose.yaml")
+     server_service: str = "paddleocr-vl-api"
+     compose_services: list[str] = Field(
+         default_factory=lambda: ["paddleocr-vl-api", "paddleocr-vlm-server"]
+     )
+     gpu_service_names: list[str] = Field(
+         default_factory=lambda: ["paddleocr-vl-api", "paddleocr-vlm-server"]
+     )
+     docker_port: int = 8080
+     container_port: int = 8080
+     environment: dict[str, str] = Field(
+         default_factory=lambda: {
+             "VLM_BACKEND": "vllm",
+         }
      )
-     aliases: list[str] = Field(default_factory=lambda: ["paddleocrvl"])
+     environment_services: list[str] = Field(
+         default_factory=lambda: ["paddleocr-vl-api"]
+     )
+     server_ready_indicators: list[str] = Field(
+         default_factory=lambda: ["Application startup complete", "Uvicorn running"]
+     )
+
+     def model_post_init(self, __context):
+         if not self.compose_env:
+             compose_env = {}
+             for key in [
+                 "API_IMAGE_TAG_SUFFIX",
+                 "VLM_IMAGE_TAG_SUFFIX",
+                 "VLM_BACKEND",
+             ]:
+                 value = os.getenv(key)
+                 if value:
+                     compose_env[key] = value
+             if compose_env:
+                 self.compose_env = compose_env

      @property
      def client_config(self):
          return PaddleOCRVLConverterConfig(
-             **self._create_client_kwargs(
-                 f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
-             )
+             **self._create_client_kwargs(f"http://localhost:{self.docker_port}")
          )


- # Task-specific base prompts
- TASKS = {
-     "ocr": "OCR:",
-     "table": "Table Recognition:",
-     "formula": "Formula Recognition:",
-     "chart": "Chart Recognition:",
- }
-
-
- class PaddleOCRVLConverterConfig(OpenAIConverterConfig):
-     """PaddleOCRVL converter"""
-
-     model_name: str = "PaddlePaddle/PaddleOCR-VL"
-     preprompt: str | None = None
-     postprompt: dict[str, str] = TASKS
-     prompt_mode_map: dict[str, str] = {
-         "ocr_layout": "ocr",
-     }
-     completion_kwargs: dict | None = {
-         "temperature": 0.0,
-         "max_completion_tokens": 16384,
-     }
-     dpi: int = 200
-     aliases: list[str] = Field(default_factory=lambda: ["paddleocrvl"])
-     stream: bool = True
+ class PaddleOCRVLConverterConfig(ConverterConfig):
+     """Configuration for the PaddleOCR-VL API client."""
+
+     model_name: str = "PaddleOCR-VL-1.5"
+     aliases: list[str] = Field(
+         default_factory=lambda: ["paddleocrvl1.5", "paddleocr-vl-1.5"]
+     )
+     timeout: int = 600
+
+     endpoint_layout_parsing: str = "/layout-parsing"
+     endpoint_restructure_pages: str = "/restructure-pages"
+
+     # Dict of PaddleOCR-VL API args.
+     # Keys should match the PaddleOCR-VL API JSON fields (camelCase), e.g.
+     # {"useLayoutDetection": true, "promptLabel": "..."}.
+     paddleocr_args: dict[str, Any] = Field(
+         default_factory=lambda: {
+             # Preserve previous default behavior (these were always sent before).
+             "prettifyMarkdown": True,
+             "showFormulaNumber": False,
+             "restructurePages": False,
+         }
+     )
+
+     # Optional args for the /restructure-pages endpoint (if/when used).
+     restructure_args: dict[str, Any] = Field(default_factory=dict)
+
+     # Backward-compat escape hatch: if set, applied last to the payload.
+     request_overrides: dict[str, Any] = Field(default_factory=dict)
+
+     def get_client(self, **kwargs) -> "PaddleOCRVLConverter":
+         return PaddleOCRVLConverter(config=self, **kwargs)
+
+
+ class PaddleOCRVLConverter(BaseConverter):
+     """PaddleOCR-VL HTTP API converter."""
+
+     config: PaddleOCRVLConverterConfig
+
+     def _build_layout_payload(self, file_content_b64: str, file_type: int | None) -> dict:
+         payload: dict[str, Any] = {"file": file_content_b64}
+
+         if self.config.paddleocr_args:
+             payload.update(self.config.paddleocr_args)
+
+         if file_type is not None:
+             payload["fileType"] = file_type
+
+         if self.config.request_overrides:
+             payload.update(self.config.request_overrides)
+
+         return payload
+
+     def _build_restructure_payload(self, layout_results: list[dict]) -> dict:
+         pages = []
+         for page_result in layout_results:
+             pruned = page_result.get("prunedResult")
+             markdown = page_result.get("markdown") or {}
+             if pruned is None:
+                 continue
+             pages.append(
+                 {
+                     "prunedResult": pruned,
+                     "markdownImages": markdown.get("images"),
+                 }
+             )
+
+         payload: dict[str, Any] = {"pages": pages}
+
+         if self.config.restructure_args:
+             payload.update(self.config.restructure_args)
+
+         return payload
+
+     async def _post_json(self, endpoint: str, payload: dict) -> dict:
+         async with httpx.AsyncClient(
+             base_url=self.config.base_url, timeout=self.config.timeout
+         ) as client:
+             response = await client.post(endpoint, json=payload)
+
+             response.raise_for_status()
+             data = response.json()
+             if data.get("errorCode", 0) != 0:
+                 raise RuntimeError(data.get("errorMsg", "Unknown error"))
+             return data
+
+     def _apply_markdown(self, page: Page, markdown_text: str | None):
+         text = markdown_text or ""
+         text = clean_response(text)
+         text = html_to_md_keep_tables(text)
+         logger.debug(f"Converted markdown text: {text[:100]}...")
+         page.text = text
+
+     def _apply_items(self, page: Page, pruned_result: dict | None):
+         if not pruned_result:
+             return
+         parsing_res_list = pruned_result.get("parsing_res_list") or []
+         items: list[Item] = []
+         for block in parsing_res_list:
+             bbox = block.get("block_bbox")
+             if not bbox or len(bbox) != 4:
+                 logger.warning(f"Invalid bbox in block: {block}")
+                 continue
+             l, t, r, b = bbox
+             text = block.get("block_content") or ""
+             items.append(
+                 Item(
+                     text=text,
+                     box=BoundingBox(l=l, t=t, r=r, b=b),
+                     category=block.get("block_label") or "",
+                 )
+             )
+
+         page.items = items
+
+     async def async_call_inside_page(self, page: Page) -> Page:
+         image = page.image
+         file_content_b64 = await asyncio.to_thread(to_base64, image, "PNG")
+         payload = self._build_layout_payload(file_content_b64, 1)
+
+         data = await self._post_json(self.config.endpoint_layout_parsing, payload)
+         result = data.get("result", {})
+         layout_results = result.get("layoutParsingResults", [])
+         if layout_results:
+             first = layout_results[0]
+
+             markdown = first.get("markdown") or {}
+             self._apply_markdown(page, markdown.get("text"))
+             self._apply_items(page, first.get("prunedResult"))
+             page.raw_response = orjson.dumps(first).decode("utf-8")
+
+         return page
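
Likewise for PaddleOCR-VL, a sketch of the raw /layout-parsing call that PaddleOCRVLConverter issues, with the default paddleocr_args inlined. The port, endpoint, payload fields, and response structure come from the code above; the fileType value 1 mirrors what async_call_inside_page passes for a page image.

import asyncio
import base64

import httpx

async def paddleocr_layout_parse(image_path: str) -> str:
    file_b64 = base64.b64encode(open(image_path, "rb").read()).decode()
    payload = {
        "file": file_b64,
        "fileType": 1,  # value passed by async_call_inside_page for a page image
        # Defaults from PaddleOCRVLConverterConfig.paddleocr_args:
        "prettifyMarkdown": True,
        "showFormulaNumber": False,
        "restructurePages": False,
    }
    async with httpx.AsyncClient(base_url="http://localhost:8080", timeout=600) as client:
        response = await client.post("/layout-parsing", json=payload)
        response.raise_for_status()
        data = response.json()
    if data.get("errorCode", 0) != 0:
        raise RuntimeError(data.get("errorMsg", "Unknown error"))
    # Markdown text of the first page, per async_call_inside_page above.
    return data["result"]["layoutParsingResults"][0]["markdown"]["text"]

# asyncio.run(paddleocr_layout_parse("page.png"))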