vlmparse 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vlmparse/build_doc.py +20 -19
- vlmparse/cli.py +439 -270
- vlmparse/clients/chandra.py +176 -60
- vlmparse/clients/deepseekocr.py +193 -12
- vlmparse/clients/docling.py +0 -1
- vlmparse/clients/dotsocr.py +34 -31
- vlmparse/clients/glmocr.py +243 -0
- vlmparse/clients/granite_docling.py +9 -36
- vlmparse/clients/hunyuanocr.py +5 -1
- vlmparse/clients/lightonocr.py +23 -1
- vlmparse/clients/mineru.py +0 -1
- vlmparse/clients/mistral_converter.py +85 -0
- vlmparse/clients/nanonetocr.py +5 -1
- vlmparse/clients/olmocr.py +6 -2
- vlmparse/clients/openai_converter.py +95 -60
- vlmparse/clients/paddleocrvl.py +195 -40
- vlmparse/converter.py +51 -11
- vlmparse/converter_with_server.py +92 -19
- vlmparse/registries.py +107 -89
- vlmparse/servers/base_server.py +127 -0
- vlmparse/servers/docker_compose_deployment.py +489 -0
- vlmparse/servers/docker_compose_server.py +39 -0
- vlmparse/servers/docker_run_deployment.py +226 -0
- vlmparse/servers/docker_server.py +17 -109
- vlmparse/servers/model_identity.py +48 -0
- vlmparse/servers/server_registry.py +42 -0
- vlmparse/servers/utils.py +83 -219
- vlmparse/st_viewer/st_viewer.py +1 -1
- vlmparse/utils.py +15 -2
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/METADATA +13 -3
- vlmparse-0.1.9.dist-info/RECORD +44 -0
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/WHEEL +1 -1
- vlmparse-0.1.7.dist-info/RECORD +0 -36
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/entry_points.txt +0 -0
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/licenses/LICENSE +0 -0
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/top_level.txt +0 -0
vlmparse/clients/openai_converter.py
CHANGED

@@ -1,15 +1,13 @@
-import
+import asyncio
 from typing import Literal, Optional

 from loguru import logger
 from pydantic import Field

-from vlmparse.base_model import VLMParseBaseModel
 from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
 from vlmparse.clients.pipe_utils.utils import clean_response
 from vlmparse.converter import BaseConverter, ConverterConfig
 from vlmparse.data_model.document import Page
-from vlmparse.servers.docker_server import DEFAULT_MODEL_NAME
 from vlmparse.utils import to_base64

 from .prompts import PDF2MD_PROMPT
@@ -17,50 +15,14 @@ from .prompts import PDF2MD_PROMPT
 GOOGLE_API_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"


-class
+class OpenAIConverterConfig(ConverterConfig):
     api_key: str = ""
-    base_url: str | None = None
-    model_name: str = DEFAULT_MODEL_NAME
     timeout: int | None = 500
     max_retries: int = 1
-
-
-def get_llm_params(model_name: str, uri: str | None = None):
-    if uri is not None:
-        return LLMParams(base_url=uri, model_name="vllm-model", api_key="")
-    if model_name in [
-        "gpt-4o",
-        "gpt-4o-mini",
-        "gpt-4.1",
-        "gpt-4.1-mini",
-        "gpt-4.1-nano",
-        "gpt-5",
-        "gpt-5-mini",
-        "gpt-5-nano",
-    ]:
-        base_url = None
-        api_key = os.getenv("OPENAI_API_KEY")
-        if api_key is None:
-            raise ValueError("OPENAI_API_KEY environment variable not set")
-    else:
-        if model_name in [
-            "gemini-2.5-flash-lite",
-            "gemini-2.5-flash",
-            "gemini-2.5-pro",
-        ]:
-            base_url = GOOGLE_API_BASE_URL
-            api_key = os.getenv("GOOGLE_API_KEY")
-            if api_key is None:
-                raise ValueError("GOOGLE_API_KEY environment variable not set")
-        else:
-            return None
-    return LLMParams(base_url=base_url, model_name=model_name, api_key=api_key)
-
-
-class OpenAIConverterConfig(ConverterConfig):
-    llm_params: LLMParams
     preprompt: str | None = None
-    postprompt: str | None = PDF2MD_PROMPT
+    postprompt: str | dict[str, str] | None = PDF2MD_PROMPT
+    prompts: dict[str, str] = Field(default_factory=dict)
+    prompt_mode_map: dict[str, str] = Field(default_factory=dict)
     completion_kwargs: dict = Field(default_factory=dict)
     stream: bool = False

@@ -71,6 +33,33 @@ class OpenAIConverterConfig(ConverterConfig):
 class OpenAIConverterClient(BaseConverter):
     """Client for OpenAI-compatible API servers."""

+    def get_prompt_key(self) -> str | None:
+        """Resolve a prompt key from conversion_mode using class mappings."""
+        mode = getattr(self.config, "conversion_mode", None) or "ocr"
+        prompts = self._get_prompts()
+        if mode in prompts:
+            return mode
+        mapped = self._get_prompt_mode_map().get(mode)
+        if mapped in prompts:
+            return mapped
+        return None
+
+    def get_prompt_for_mode(self) -> str | None:
+        key = self.get_prompt_key()
+        if key is None:
+            return None
+        return self._get_prompts().get(key)
+
+    def _get_prompts(self) -> dict[str, str]:
+        if self.config.prompts:
+            return self.config.prompts
+        if isinstance(self.config.postprompt, dict):
+            return self.config.postprompt
+        return {}
+
+    def _get_prompt_mode_map(self) -> dict[str, str]:
+        return self.config.prompt_mode_map or {}
+
     def __init__(
         self,
         config: OpenAIConverterConfig,
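The prompt-selection helpers above resolve `conversion_mode` in two steps: first as a direct key into the prompt dict, then through the `prompt_mode_map` alias table, with `None` signalling a fallback to `postprompt`. A minimal self-contained sketch of that lookup order (`DemoConfig` and `resolve_prompt` are illustrative stand-ins, not vlmparse APIs):

```python
# Sketch of the two-step prompt lookup: direct mode key first, then alias map.
# DemoConfig and resolve_prompt are illustrative names, not vlmparse APIs.
from dataclasses import dataclass, field


@dataclass
class DemoConfig:
    conversion_mode: str = "ocr"
    prompts: dict[str, str] = field(default_factory=dict)
    prompt_mode_map: dict[str, str] = field(default_factory=dict)


def resolve_prompt(config: DemoConfig) -> str | None:
    mode = config.conversion_mode or "ocr"
    if mode in config.prompts:  # direct hit on the mode
        return config.prompts[mode]
    mapped = config.prompt_mode_map.get(mode)
    if mapped in config.prompts:  # hit through the mode -> key alias map
        return config.prompts[mapped]
    return None  # caller falls back to postprompt


config = DemoConfig(
    conversion_mode="table",
    prompts={"ocr": "Transcribe this page.", "table_md": "Extract tables as Markdown."},
    prompt_mode_map={"table": "table_md"},
)
assert resolve_prompt(config) == "Extract tables as Markdown."
```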
@@ -90,14 +79,54 @@ class OpenAIConverterClient(BaseConverter):
             debug=debug,
             return_documents_in_batch_mode=return_documents_in_batch_mode,
         )
-
-
-
-
-
-
-
-
+        self._model = None
+        self._model_loop = None
+
+    async def _get_async_model(self):
+        loop = asyncio.get_running_loop()
+        if self._model is None or self._model_loop is not loop:
+            await self._close_model()
+            from openai import AsyncOpenAI
+
+            self._model = AsyncOpenAI(
+                base_url=self.config.base_url,
+                api_key=self.config.api_key,
+                timeout=self.config.timeout,
+                max_retries=self.config.max_retries,
+            )
+            self._model_loop = loop
+        return self._model
+
+    async def _close_model(self):
+        """Close the async OpenAI client if it exists."""
+        if self._model is not None:
+            try:
+                await self._model.close()
+            except RuntimeError:
+                # Event loop may already be closed
+                pass
+            finally:
+                self._model = None
+                self._model_loop = None
+
+    async def aclose(self):
+        """Close the converter and release resources."""
+        await self._close_model()
+
+    def close(self):
+        """Synchronously close the converter if possible."""
+        if self._model is not None:
+            try:
+                loop = asyncio.get_running_loop()
+                loop.create_task(self._close_model())
+            except RuntimeError:
+                # No running loop, try to close synchronously
+                try:
+                    asyncio.run(self._close_model())
+                except RuntimeError:
+                    # Event loop already closed, force cleanup
+                    self._model = None
+                    self._model_loop = None

     async def _get_chat_completion(
         self, messages: list[dict], completion_kwargs: dict | None = None
@@ -106,9 +135,11 @@ class OpenAIConverterClient(BaseConverter):
         if completion_kwargs is None:
             completion_kwargs = self.config.completion_kwargs

+        model = await self._get_async_model()
+
         if self.config.stream:
-            response_stream = await
-                model=self.config.
+            response_stream = await model.chat.completions.create(
+                model=self.config.default_model_name,
                 messages=messages,
                 stream=True,
                 **completion_kwargs,
@@ -120,8 +151,8 @@ class OpenAIConverterClient(BaseConverter):

             return "".join(response_parts), None
         else:
-            response_obj = await
-                model=self.config.
+            response_obj = await model.chat.completions.create(
+                model=self.config.default_model_name,
                 messages=messages,
                 **completion_kwargs,
             )
@@ -147,11 +178,15 @@ class OpenAIConverterClient(BaseConverter):
         else:
             preprompt = []

-
-
-
-
-
+        selected_prompt = self.get_prompt_for_mode()
+        if selected_prompt is not None:
+            postprompt = [{"type": "text", "text": selected_prompt}]
+        else:
+            postprompt = (
+                [{"type": "text", "text": self.config.postprompt}]
+                if isinstance(self.config.postprompt, str) and self.config.postprompt
+                else []
+            )

         messages = [
             *preprompt,
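The new `_get_async_model` caches one `AsyncOpenAI` client per running event loop and rebuilds it (closing the old one first) whenever the loop changes, which avoids reusing a client bound to a dead loop across repeated `asyncio.run` calls. A dependency-free sketch of the same pattern, with `FakeClient` and `LoopBoundClient` as illustrative stand-ins:

```python
# Sketch of the per-event-loop client cache: reuse one async client while the
# same loop is running, close and rebuild it if the loop changed.
# FakeClient stands in for openai.AsyncOpenAI so the sketch runs without deps.
import asyncio


class FakeClient:
    async def close(self) -> None:
        pass


class LoopBoundClient:
    def __init__(self) -> None:
        self._client: FakeClient | None = None
        self._loop: asyncio.AbstractEventLoop | None = None

    async def get(self) -> FakeClient:
        loop = asyncio.get_running_loop()
        if self._client is None or self._loop is not loop:
            if self._client is not None:
                await self._client.close()  # never leak a client from an old loop
            self._client = FakeClient()
            self._loop = loop
        return self._client


holder = LoopBoundClient()


async def main() -> FakeClient:
    return await holder.get()


first = asyncio.run(main())   # creates a client bound to this loop
second = asyncio.run(main())  # new loop: old client closed, new one built
assert first is not second
```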
vlmparse/clients/paddleocrvl.py
CHANGED

@@ -1,49 +1,204 @@
+import asyncio
+import os
+from pathlib import Path
+from typing import Any
+
+import httpx
+import orjson
+from loguru import logger
 from pydantic import Field

-from vlmparse.clients.
-from vlmparse.
+from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
+from vlmparse.clients.pipe_utils.utils import clean_response
+from vlmparse.converter import BaseConverter, ConverterConfig
+from vlmparse.data_model.document import BoundingBox, Item, Page
+from vlmparse.servers.docker_compose_server import DockerComposeServerConfig
+from vlmparse.utils import to_base64
+
+DOCKER_PIPELINE_DIR = (
+    Path(__file__).parent.parent.parent / "docker_pipelines" / "paddleocrvl"
+)


-class PaddleOCRVLDockerServerConfig(
-    """
+class PaddleOCRVLDockerServerConfig(DockerComposeServerConfig):
+    """Docker Compose configuration for PaddleOCR-VL server."""

-    model_name: str = "
-
-        default_factory=lambda: [
-
-
-
-
-
-
-
+    model_name: str = "PaddleOCR-VL-1.5"
+    aliases: list[str] = Field(
+        default_factory=lambda: ["paddleocrvl1.5", "paddleocr-vl-1.5"]
+    )
+    compose_file: str = str(DOCKER_PIPELINE_DIR / "compose.yaml")
+    server_service: str = "paddleocr-vl-api"
+    compose_services: list[str] = Field(
+        default_factory=lambda: ["paddleocr-vl-api", "paddleocr-vlm-server"]
+    )
+    gpu_service_names: list[str] = Field(
+        default_factory=lambda: ["paddleocr-vl-api", "paddleocr-vlm-server"]
+    )
+    docker_port: int = 8080
+    container_port: int = 8080
+    environment: dict[str, str] = Field(
+        default_factory=lambda: {
+            "VLM_BACKEND": "vllm",
+        }
     )
-
+    environment_services: list[str] = Field(
+        default_factory=lambda: ["paddleocr-vl-api"]
+    )
+    server_ready_indicators: list[str] = Field(
+        default_factory=lambda: ["Application startup complete", "Uvicorn running"]
+    )
+
+    def model_post_init(self, __context):
+        if not self.compose_env:
+            compose_env = {}
+            for key in [
+                "API_IMAGE_TAG_SUFFIX",
+                "VLM_IMAGE_TAG_SUFFIX",
+                "VLM_BACKEND",
+            ]:
+                value = os.getenv(key)
+                if value:
+                    compose_env[key] = value
+            if compose_env:
+                self.compose_env = compose_env

     @property
     def client_config(self):
-        return PaddleOCRVLConverterConfig(
-
-
-
-
-
-        "
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        return PaddleOCRVLConverterConfig(
+            **self._create_client_kwargs(f"http://localhost:{self.docker_port}")
+        )
+
+
+class PaddleOCRVLConverterConfig(ConverterConfig):
+    """Configuration for PaddleOCR-VL API client."""
+
+    model_name: str = "PaddleOCR-VL-1.5"
+    aliases: list[str] = Field(
+        default_factory=lambda: ["paddleocrvl1.5", "paddleocr-vl-1.5"]
+    )
+    timeout: int = 600
+
+    endpoint_layout_parsing: str = "/layout-parsing"
+    endpoint_restructure_pages: str = "/restructure-pages"
+
+    # Dict of PaddleOCR-VL API args.
+    # Keys should match the PaddleOCR-VL API JSON fields (camelCase), e.g.
+    # {"useLayoutDetection": true, "promptLabel": "..."}.
+    paddleocr_args: dict[str, Any] = Field(
+        default_factory=lambda: {
+            # Preserve previous default behavior (these were always sent before).
+            "prettifyMarkdown": True,
+            "showFormulaNumber": False,
+            "restructurePages": False,
+        }
+    )
+
+    # Optional args for the /restructure-pages endpoint (if/when used).
+    restructure_args: dict[str, Any] = Field(default_factory=dict)
+
+    # Backward-compat escape hatch: if set, applied last to the payload.
+    request_overrides: dict[str, Any] = Field(default_factory=dict)
+
+    def get_client(self, **kwargs) -> "PaddleOCRVLConverter":
+        return PaddleOCRVLConverter(config=self, **kwargs)
+
+
+class PaddleOCRVLConverter(BaseConverter):
+    """PaddleOCR-VL HTTP API converter."""
+
+    config: PaddleOCRVLConverterConfig
+
+    def _build_layout_payload(self, file_content_b64: str, file_type: int | None):
+        payload: dict[str, Any] = {"file": file_content_b64}
+
+        if self.config.paddleocr_args:
+            payload.update(self.config.paddleocr_args)
+
+        if file_type is not None:
+            payload["fileType"] = file_type
+
+        if self.config.request_overrides:
+            payload.update(self.config.request_overrides)
+
+        return payload
+
+    def _build_restructure_payload(self, layout_results: list[dict]) -> dict:
+        pages = []
+        for page_result in layout_results:
+            pruned = page_result.get("prunedResult")
+            markdown = page_result.get("markdown") or {}
+            if pruned is None:
+                continue
+            pages.append(
+                {
+                    "prunedResult": pruned,
+                    "markdownImages": markdown.get("images"),
+                }
+            )
+
+        payload: dict[str, Any] = {"pages": pages}
+
+        if self.config.restructure_args:
+            payload.update(self.config.restructure_args)
+
+        return payload
+
+    async def _post_json(self, endpoint: str, payload: dict) -> dict:
+        async with httpx.AsyncClient(
+            base_url=self.config.base_url, timeout=self.config.timeout
+        ) as client:
+            response = await client.post(endpoint, json=payload)
+
+        response.raise_for_status()
+        data = response.json()
+        if data.get("errorCode", 0) != 0:
+            raise RuntimeError(data.get("errorMsg", "Unknown error"))
+        return data
+
+    def _apply_markdown(self, page: Page, markdown_text: str | None):
+        text = markdown_text or ""
+        text = clean_response(text)
+        text = html_to_md_keep_tables(text)
+        logger.debug(f"Converted markdown text: {text}...")
+        page.text = text
+
+    def _apply_items(self, page: Page, pruned_result: dict | None):
+        if not pruned_result:
+            return
+        parsing_res_list = pruned_result.get("parsing_res_list") or []
+        items: list[Item] = []
+        for block in parsing_res_list:
+            bbox = block.get("block_bbox")
+            if not bbox or len(bbox) != 4:
+                logger.warning(f"Invalid bbox in block: {block}")
+                continue
+            l, t, r, b = bbox
+            text = block.get("block_content") or ""
+            items.append(
+                Item(
+                    text=text,
+                    box=BoundingBox(l=l, t=t, r=r, b=b),
+                    category=block.get("block_label") or "",
+                )
+            )
+
+        page.items = items
+
+    async def async_call_inside_page(self, page: Page) -> Page:
+        image = page.image
+        file_content_b64 = await asyncio.to_thread(to_base64, image, "PNG")
+        payload = self._build_layout_payload(file_content_b64, 1)
+
+        data = await self._post_json(self.config.endpoint_layout_parsing, payload)
+        result = data.get("result", {})
+        layout_results = result.get("layoutParsingResults", [])
+        if layout_results:
+            first = layout_results[0]
+
+            markdown = first.get("markdown") or {}
+            self._apply_markdown(page, markdown.get("text"))
+            self._apply_items(page, first.get("prunedResult"))
+            page.raw_response = orjson.dumps(first).decode("utf-8")
+
+        return page
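For reference, the request/response shape the new converter relies on: POST a base64-encoded file to `/layout-parsing`, check `errorCode`/`errorMsg`, then read `result.layoutParsingResults[0].markdown.text`. A hedged standalone sketch; the base URL and image path are placeholders, and the field names mirror the diff above:

```python
# Hedged sketch of calling the PaddleOCR-VL HTTP API the way the converter does.
# Placeholder base_url/image_path; JSON fields taken from the diff above.
import base64

import httpx


async def parse_page(base_url: str, image_path: str) -> str:
    with open(image_path, "rb") as f:
        file_b64 = base64.b64encode(f.read()).decode("ascii")
    payload = {
        "file": file_b64,
        "fileType": 1,  # 1 = image, as sent by async_call_inside_page
        "prettifyMarkdown": True,
        "showFormulaNumber": False,
        "restructurePages": False,
    }
    async with httpx.AsyncClient(base_url=base_url, timeout=600) as client:
        response = await client.post("/layout-parsing", json=payload)
    response.raise_for_status()
    data = response.json()
    if data.get("errorCode", 0) != 0:
        raise RuntimeError(data.get("errorMsg", "Unknown error"))
    results = data["result"]["layoutParsingResults"]
    return results[0]["markdown"]["text"] if results else ""


# e.g. asyncio.run(parse_page("http://localhost:8080", "page.png"))
```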
vlmparse/converter.py
CHANGED

@@ -1,7 +1,6 @@
 import asyncio
 import threading
 import time
-import traceback
 from pathlib import Path
 from typing import Literal

@@ -9,6 +8,8 @@ from loguru import logger
 from PIL import Image
 from pydantic import Field

+from vlmparse.servers.docker_server import DEFAULT_MODEL_NAME
+
 from .base_model import VLMParseBaseModel
 from .build_doc import convert_specific_page_to_image, get_page_count, resize_image
 from .constants import IMAGE_EXTENSIONS, PDF_EXTENSION
@@ -19,9 +20,20 @@ PDFIUM_LOCK = threading.Lock()


 class ConverterConfig(VLMParseBaseModel):
+    model_name: str
     aliases: list[str] = Field(default_factory=list)
-    dpi: int = 175
-    max_image_size: int | None = 4000
+    dpi: int = Field(default=175, ge=30, le=600)
+    max_image_size: int | None = Field(default=4000, ge=50)
+    base_url: str | None = None
+    default_model_name: str = DEFAULT_MODEL_NAME
+    conversion_mode: Literal[
+        "ocr",
+        "ocr_layout",
+        "table",
+        "image_description",
+        "formula",
+        "chart",
+    ] = "ocr"

     def get_client(self, **kwargs) -> "BaseConverter":
         return BaseConverter(config=self, **kwargs)
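`ConverterConfig` now validates `dpi` and `max_image_size` at construction time via pydantic `Field` bounds, so out-of-range values raise instead of silently propagating. A quick sketch with a stand-in model (not the real `ConverterConfig`):

```python
from pydantic import BaseModel, Field, ValidationError


class DemoConfig(BaseModel):
    # Same bounds as the diff above: dpi in [30, 600], max_image_size >= 50.
    dpi: int = Field(default=175, ge=30, le=600)
    max_image_size: int | None = Field(default=4000, ge=50)


DemoConfig(dpi=300)  # accepted

try:
    DemoConfig(dpi=10)  # rejected: below the ge=30 bound
except ValidationError as exc:
    print(exc)
```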
@@ -94,14 +106,22 @@
                 page = await self.async_call_inside_page(page)
                 toc = time.perf_counter()
                 page.latency = toc - tic
-                logger.debug(
+                logger.debug(
+                    "Page {page_idx} processed in {latency:.2f}s",
+                    page_idx=page_idx,
+                    latency=page.latency,
+                )
             except KeyboardInterrupt:
                 raise
             except Exception:
                 if self.debug:
                     raise
                 else:
-                    logger.exception
+                    logger.opt(exception=True).error(
+                        "Error processing page {page_idx} of {file_path}",
+                        page_idx=page_idx,
+                        file_path=str(file_path),
+                    )
                 page.error = ProcessingError.from_class(self)
             if not self.save_page_images:
                 page.buffer_image = dict(
@@ -122,12 +142,19 @@ class BaseConverter:
             if self.debug:
                 raise
             else:
-                logger.exception
+                logger.opt(exception=True).error(
+                    "Error processing document {file_path}",
+                    file_path=str(file_path),
+                )
             document.error = ProcessingError.from_class(self)
             return document
         toc = time.perf_counter()
         document.latency = toc - tic
-        logger.debug(
+        logger.debug(
+            "Document {file_path} processed in {latency:.2f}s",
+            file_path=str(file_path),
+            latency=document.latency,
+        )
         if self.save_folder is not None:
             self._save_document(document)

@@ -169,8 +196,16 @@ class BaseConverter:
         else:
             logger.warning(f"Unknown save_mode: {self.save_mode}, skipping save")

+    async def _async_call_with_cleanup(self, file_path: str | Path):
+        """Call async_call and ensure cleanup."""
+        try:
+            return await self.async_call(file_path)
+        finally:
+            if hasattr(self, "aclose"):
+                await self.aclose()
+
     def __call__(self, file_path: str | Path):
-        return asyncio.run(self.
+        return asyncio.run(self._async_call_with_cleanup(file_path))

     async def async_batch(self, file_paths: list[str | Path]) -> list[Document] | None:
         """Process multiple files concurrently with semaphore limit."""
@@ -184,9 +219,14 @@ class BaseConverter:
                 await self.async_call(file_path)

         tasks = [asyncio.create_task(worker(file_path)) for file_path in file_paths]
-
-
-
+        try:
+            documents = await asyncio.gather(*tasks)
+            if self.return_documents_in_batch_mode:
+                return documents
+        finally:
+            # Close async resources before the event loop ends
+            if hasattr(self, "aclose"):
+                await self.aclose()

     def batch(self, file_paths: list[str | Path]) -> list[Document] | None:
         """Synchronous wrapper for async_batch."""
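`__call__` and `async_batch` now release async resources in a `finally` block before their event loop exits, so a cached client never outlives the loop it was created on. A condensed sketch of the pattern (`Converter` is a stand-in, not the real `BaseConverter`):

```python
# Sketch of the cleanup pattern added to __call__/async_batch: run the work,
# then close async resources in a finally block before the event loop exits.
# Converter here is an illustrative stand-in, not the real BaseConverter.
import asyncio


class Converter:
    def __init__(self) -> None:
        self.closed = False

    async def process(self, path: str) -> str:
        return f"parsed:{path}"

    async def aclose(self) -> None:
        self.closed = True

    async def _call_with_cleanup(self, path: str) -> str:
        try:
            return await self.process(path)
        finally:
            # Runs even on error, so resources are freed inside the live loop.
            if hasattr(self, "aclose"):
                await self.aclose()

    def __call__(self, path: str) -> str:
        return asyncio.run(self._call_with_cleanup(path))


converter = Converter()
assert converter("doc.pdf") == "parsed:doc.pdf"
assert converter.closed
```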