vlmparse 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vlmparse/build_doc.py +20 -19
- vlmparse/cli.py +439 -270
- vlmparse/clients/chandra.py +176 -60
- vlmparse/clients/deepseekocr.py +193 -12
- vlmparse/clients/docling.py +0 -1
- vlmparse/clients/dotsocr.py +34 -31
- vlmparse/clients/glmocr.py +243 -0
- vlmparse/clients/granite_docling.py +9 -36
- vlmparse/clients/hunyuanocr.py +5 -1
- vlmparse/clients/lightonocr.py +23 -1
- vlmparse/clients/mineru.py +0 -1
- vlmparse/clients/mistral_converter.py +85 -0
- vlmparse/clients/nanonetocr.py +5 -1
- vlmparse/clients/olmocr.py +6 -2
- vlmparse/clients/openai_converter.py +95 -60
- vlmparse/clients/paddleocrvl.py +195 -40
- vlmparse/converter.py +51 -11
- vlmparse/converter_with_server.py +92 -19
- vlmparse/registries.py +107 -89
- vlmparse/servers/base_server.py +127 -0
- vlmparse/servers/docker_compose_deployment.py +489 -0
- vlmparse/servers/docker_compose_server.py +39 -0
- vlmparse/servers/docker_run_deployment.py +226 -0
- vlmparse/servers/docker_server.py +17 -109
- vlmparse/servers/model_identity.py +48 -0
- vlmparse/servers/server_registry.py +42 -0
- vlmparse/servers/utils.py +83 -219
- vlmparse/st_viewer/st_viewer.py +1 -1
- vlmparse/utils.py +15 -2
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/METADATA +13 -3
- vlmparse-0.1.9.dist-info/RECORD +44 -0
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/WHEEL +1 -1
- vlmparse-0.1.7.dist-info/RECORD +0 -36
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/entry_points.txt +0 -0
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/licenses/LICENSE +0 -0
- {vlmparse-0.1.7.dist-info → vlmparse-0.1.9.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
import orjson
|
|
8
|
+
from loguru import logger
|
|
9
|
+
from pydantic import Field
|
|
10
|
+
|
|
11
|
+
from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
|
|
12
|
+
from vlmparse.clients.pipe_utils.utils import clean_response
|
|
13
|
+
from vlmparse.converter import BaseConverter, ConverterConfig
|
|
14
|
+
from vlmparse.data_model.document import BoundingBox, Item, Page
|
|
15
|
+
from vlmparse.servers.docker_compose_server import DockerComposeServerConfig
|
|
16
|
+
from vlmparse.utils import to_base64
|
|
17
|
+
|
|
18
|
+
# Location of the bundled docker-compose pipeline for GLM-OCR
# (<repo-root>/docker_pipelines/glmocr, resolved relative to this file).
DOCKER_PIPELINE_DIR = (
    Path(__file__).parent.parent.parent / "docker_pipelines" / "glmocr"
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class GLMOCRDockerServerConfig(DockerComposeServerConfig):
    """Docker Compose deployment settings for the GLM-OCR service pair.

    Declares the compose file, the two services (HTTP API front-end plus the
    vLLM backend), the GPU-bearing service, the host/container port mapping,
    and the environment injected into the API container.
    """

    model_name: str = "GLM-OCR"
    aliases: list[str] = Field(default_factory=lambda: ["glmocr", "glm-ocr"])
    compose_file: str = str(DOCKER_PIPELINE_DIR / "compose.yaml")
    server_service: str = "glmocr-api"
    compose_services: list[str] = Field(
        default_factory=lambda: ["glmocr-api", "glmocr-vllm-server"]
    )
    gpu_service_names: list[str] = Field(default_factory=lambda: ["glmocr-vllm-server"])
    docker_port: int = 5002
    container_port: int = 5002
    environment: dict[str, str] = Field(
        default_factory=lambda: {
            "VLM_BACKEND": "vllm",
            "API_PORT": "8080",
        }
    )
    environment_services: list[str] = Field(default_factory=lambda: ["glmocr-api"])
    server_ready_indicators: list[str] = Field(
        default_factory=lambda: ["Running on", "Application startup complete"]
    )

    def model_post_init(self, __context):
        """Seed ``compose_env`` from the process environment when unset.

        Only a fixed allow-list of variables is forwarded, and only those
        that are actually present (and non-empty) in the environment.
        """
        if self.compose_env:
            # An explicit compose_env always wins over ambient env vars.
            return
        overrides = {
            key: value
            for key in ("API_IMAGE_TAG_SUFFIX", "VLM_IMAGE_TAG_SUFFIX", "VLM_BACKEND")
            if (value := os.getenv(key))
        }
        if overrides:
            self.compose_env = overrides

    @property
    def client_config(self):
        """Client configuration pointing at the locally exposed API port."""
        server_url = f"http://localhost:{self.docker_port}"
        return GLMOCRConverterConfig(**self._create_client_kwargs(server_url))
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class GLMOCRConverterConfig(ConverterConfig):
    """Configuration for GLM-OCR API client.

    Connection fields such as ``base_url`` are inherited from
    ``ConverterConfig``.
    """

    model_name: str = "GLM-OCR"
    # Names accepted by the model registry for this converter.
    aliases: list[str] = Field(default_factory=lambda: ["glmocr", "glm-ocr"])
    # Per-request timeout in seconds.
    timeout: int = 600

    # Path of the parse endpoint, relative to the server base URL.
    endpoint_parse: str = "/glmocr/parse"

    # GLM-OCR specific configuration

    # Output format: "json", "markdown", or "both"
    output_format: str = "both"

    # Enable layout detection (PP-DocLayout)
    enable_layout: bool = True

    # GLM-OCR model parameters
    # NOTE(review): these knobs (and output_format/enable_layout above) are
    # not added to the request payload by GLMOCRConverter._build_parse_payload
    # — only "images" plus request_overrides are sent. Confirm whether the
    # server applies its own defaults or whether these should be forwarded.
    max_tokens: int = 16384
    temperature: float = 0.01
    image_format: str = "JPEG"
    min_pixels: int = 12544
    max_pixels: int = 71372800

    # Backward-compat escape hatch: if set, applied last to the payload.
    request_overrides: dict[str, Any] = Field(default_factory=dict)

    def get_client(self, **kwargs) -> "GLMOCRConverter":
        """Instantiate a converter bound to this configuration."""
        return GLMOCRConverter(config=self, **kwargs)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class GLMOCRConverter(BaseConverter):
    """GLM-OCR HTTP API converter.

    Sends one page image per request to the GLM-OCR service and fills the
    page with the returned markdown text and, when available, the detected
    layout items.
    """

    config: GLMOCRConverterConfig

    def _build_parse_payload(self, file_content_b64: str) -> dict:
        """Build the request payload for the GLM-OCR parse endpoint.

        Args:
            file_content_b64: Base64 encoded image content.

        Returns:
            Dictionary payload for the API request.
        """
        # Wrap base64 in data URI format as expected by GLM-OCR:
        # data:image/png;base64,<base64_data>
        data_uri = f"data:image/png;base64,{file_content_b64}"

        # GLM-OCR expects a list of images.
        payload: dict[str, Any] = {"images": [data_uri]}

        # Backward-compat escape hatch: overrides are applied last so they win.
        if self.config.request_overrides:
            payload.update(self.config.request_overrides)

        return payload

    async def _post_json(self, endpoint: str, payload: dict) -> dict:
        """POST ``payload`` to ``endpoint`` and return the parsed JSON body.

        Args:
            endpoint: API endpoint path, relative to ``config.base_url``.
            payload: JSON-serializable request payload.

        Returns:
            Parsed JSON response.

        Raises:
            RuntimeError: If the response body contains an ``error`` field.
            httpx.HTTPStatusError: If the server returns a non-2xx status.
        """
        async with httpx.AsyncClient(
            base_url=self.config.base_url, timeout=self.config.timeout
        ) as client:
            response = await client.post(endpoint, json=payload)
            response.raise_for_status()
            data = response.json()

        # The service signals application-level failures inside the JSON body.
        if "error" in data:
            raise RuntimeError(data.get("error", "Unknown error"))

        return data

    def _apply_markdown(self, page: Page, markdown_text: str | None):
        """Clean the returned markdown and store it on ``page.text``.

        Args:
            page: Page object to update.
            markdown_text: Markdown content from GLM-OCR (may be None/empty).
        """
        text = clean_response(markdown_text or "")
        text = html_to_md_keep_tables(text)
        logger.debug(f"Converted markdown text: {text[:100]}...")
        page.text = text

    def _apply_items(self, page: Page, json_result: list[dict] | None):
        """Convert GLM-OCR layout blocks into ``Item``s on ``page.items``.

        Args:
            page: Page object to update.
            json_result: List of detected regions from GLM-OCR. Each block is
                expected to carry ``bbox_2d`` ([x1, y1, x2, y2]), ``content``
                and ``label`` keys.
        """
        if not json_result:
            return

        items: list[Item] = []
        for block in json_result:
            bbox = block.get("bbox_2d")
            # Skip blocks without a usable 4-value bounding box.
            if not bbox or len(bbox) != 4:
                continue

            x1, y1, x2, y2 = bbox
            items.append(
                Item(
                    text=block.get("content") or "",
                    box=BoundingBox(l=x1, t=y1, r=x2, b=y2),
                    category=block.get("label") or "",
                )
            )

        page.items = items

    async def async_call_inside_page(self, page: Page) -> Page:
        """Process a single page through the GLM-OCR API.

        Args:
            page: Page object containing the image to process.

        Returns:
            Updated Page object with OCR results.
        """
        # Encode off the event loop; base64 conversion can be CPU-heavy.
        file_content_b64 = await asyncio.to_thread(to_base64, page.image, "PNG")

        payload = self._build_parse_payload(file_content_b64)
        data = await self._post_json(self.config.endpoint_parse, payload)

        # GLM-OCR returns one result per submitted document; we send exactly
        # one image, so a single markdown result is expected.
        markdown_result = data.get("markdown_result")
        if markdown_result:
            self._apply_markdown(page, markdown_result)

        # Structured layout output, present when layout detection ran.
        json_result = data.get("json_result")
        if isinstance(json_result, list) and json_result:
            # json_result may be a list of pages; take the first page.
            page_result = (
                json_result[0] if isinstance(json_result[0], list) else json_result
            )
            self._apply_items(page, page_result)

        # NOTE(review): only the markdown portion is persisted here, not the
        # full API response ("json_result" is dropped) — confirm intentional.
        page.raw_response = orjson.dumps(markdown_result).decode("utf-8")

        return page
|
|
@@ -28,7 +28,11 @@ class GraniteDoclingDockerServerConfig(VLLMDockerServerConfig):
|
|
|
28
28
|
|
|
29
29
|
@property
|
|
30
30
|
def client_config(self):
|
|
31
|
-
return GraniteDoclingConverterConfig(
|
|
31
|
+
return GraniteDoclingConverterConfig(
|
|
32
|
+
**self._create_client_kwargs(
|
|
33
|
+
f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
|
|
34
|
+
)
|
|
35
|
+
)
|
|
32
36
|
|
|
33
37
|
|
|
34
38
|
class GraniteDoclingConverterConfig(OpenAIConverterConfig):
|
|
@@ -70,49 +74,18 @@ class GraniteDoclingConverter(OpenAIConverterClient):
|
|
|
70
74
|
}
|
|
71
75
|
]
|
|
72
76
|
|
|
73
|
-
doctags = await self.
|
|
77
|
+
doctags, usage = await self._get_chat_completion(
|
|
74
78
|
messages, completion_kwargs=self.config.completion_kwargs
|
|
75
79
|
)
|
|
76
80
|
doctags = clean_response(doctags)
|
|
77
81
|
|
|
78
82
|
page.raw_response = doctags
|
|
79
83
|
page.text = _doctags_to_markdown(doctags, image)
|
|
84
|
+
if usage is not None:
|
|
85
|
+
page.prompt_tokens = usage.prompt_tokens
|
|
86
|
+
page.completion_tokens = usage.completion_tokens
|
|
80
87
|
return page
|
|
81
88
|
|
|
82
|
-
async def _get_chat_completion_adaptive(
|
|
83
|
-
self, messages: list[dict], completion_kwargs: dict | None
|
|
84
|
-
) -> str:
|
|
85
|
-
"""
|
|
86
|
-
vLLM enforces input+output <= model context length. If `max_tokens` is too
|
|
87
|
-
high (especially for multimodal prompts), retry with progressively smaller
|
|
88
|
-
`max_tokens`.
|
|
89
|
-
"""
|
|
90
|
-
kwargs = (completion_kwargs or {}).copy()
|
|
91
|
-
max_tokens = kwargs.get("max_tokens") or kwargs.get("max_completion_tokens")
|
|
92
|
-
|
|
93
|
-
for _ in range(6):
|
|
94
|
-
try:
|
|
95
|
-
return await self._get_chat_completion(
|
|
96
|
-
messages, completion_kwargs=kwargs
|
|
97
|
-
)
|
|
98
|
-
except Exception as e:
|
|
99
|
-
msg = str(e)
|
|
100
|
-
too_large = (
|
|
101
|
-
"max_tokens" in msg
|
|
102
|
-
and "maximum context length" in msg
|
|
103
|
-
and "is too large" in msg
|
|
104
|
-
)
|
|
105
|
-
if not too_large or not isinstance(max_tokens, int):
|
|
106
|
-
raise
|
|
107
|
-
|
|
108
|
-
max_tokens = max(256, int(max_tokens * 0.75))
|
|
109
|
-
if "max_tokens" in kwargs:
|
|
110
|
-
kwargs["max_tokens"] = max_tokens
|
|
111
|
-
if "max_completion_tokens" in kwargs:
|
|
112
|
-
kwargs["max_completion_tokens"] = max_tokens
|
|
113
|
-
|
|
114
|
-
return await self._get_chat_completion(messages, completion_kwargs=kwargs)
|
|
115
|
-
|
|
116
89
|
|
|
117
90
|
def _doctags_to_markdown(doctags: str, image):
|
|
118
91
|
try:
|
vlmparse/clients/hunyuanocr.py
CHANGED
|
@@ -25,7 +25,11 @@ class HunyuanOCRDockerServerConfig(VLLMDockerServerConfig):
|
|
|
25
25
|
|
|
26
26
|
@property
|
|
27
27
|
def client_config(self):
|
|
28
|
-
return HunyuanOCRConverterConfig(
|
|
28
|
+
return HunyuanOCRConverterConfig(
|
|
29
|
+
**self._create_client_kwargs(
|
|
30
|
+
f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
|
|
31
|
+
)
|
|
32
|
+
)
|
|
29
33
|
|
|
30
34
|
|
|
31
35
|
class HunyuanOCRConverterConfig(OpenAIConverterConfig):
|
vlmparse/clients/lightonocr.py
CHANGED
|
@@ -25,7 +25,11 @@ class LightOnOCRDockerServerConfig(VLLMDockerServerConfig):
|
|
|
25
25
|
|
|
26
26
|
@property
|
|
27
27
|
def client_config(self):
|
|
28
|
-
return LightOnOCRConverterConfig(
|
|
28
|
+
return LightOnOCRConverterConfig(
|
|
29
|
+
**self._create_client_kwargs(
|
|
30
|
+
f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
|
|
31
|
+
)
|
|
32
|
+
)
|
|
29
33
|
|
|
30
34
|
|
|
31
35
|
class LightOnOCRConverterConfig(OpenAIConverterConfig):
|
|
@@ -41,3 +45,21 @@ class LightOnOCRConverterConfig(OpenAIConverterConfig):
|
|
|
41
45
|
}
|
|
42
46
|
dpi: int = 200
|
|
43
47
|
aliases: list[str] = Field(default_factory=lambda: ["lightonocr"])
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class LightonOCR21BServerConfig(LightOnOCRDockerServerConfig):
    """Docker server configuration for the LightOnOCR-2-1B model variant."""

    model_name: str = "lightonai/LightOnOCR-2-1B"
    # Registry alias distinguishing this variant from the base "lightonocr".
    aliases: list[str] = Field(default_factory=lambda: ["lightonocr2"])

    @property
    def client_config(self):
        """Converter config targeting the locally published server port."""
        return LightonOCR21BConverterConfig(
            **self._create_client_kwargs(
                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
            )
        )
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class LightonOCR21BConverterConfig(LightOnOCRConverterConfig):
    """Converter configuration for the LightOnOCR-2-1B model variant."""

    model_name: str = "lightonai/LightOnOCR-2-1B"
    # Registry alias distinguishing this variant from the base "lightonocr".
    aliases: list[str] = Field(default_factory=lambda: ["lightonocr2"])
|
vlmparse/clients/mineru.py
CHANGED
|
@@ -31,7 +31,6 @@ class MinerUDockerServerConfig(DockerServerConfig):
|
|
|
31
31
|
class MinerUConverterConfig(ConverterConfig):
|
|
32
32
|
"""Configuration for MinerU API converter."""
|
|
33
33
|
|
|
34
|
-
base_url: str
|
|
35
34
|
model_name: str = "opendatalab/MinerU2.5-2509-1.2B"
|
|
36
35
|
aliases: list[str] = Field(default_factory=lambda: ["mineru25"])
|
|
37
36
|
timeout: int = 600
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
import httpx
|
|
4
|
+
import orjson
|
|
5
|
+
from loguru import logger
|
|
6
|
+
from pydantic import Field
|
|
7
|
+
|
|
8
|
+
from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
|
|
9
|
+
from vlmparse.clients.pipe_utils.utils import clean_response
|
|
10
|
+
from vlmparse.converter import BaseConverter, ConverterConfig
|
|
11
|
+
from vlmparse.data_model.document import Page
|
|
12
|
+
from vlmparse.utils import to_base64
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class MistralOCRConverterConfig(ConverterConfig):
    """Configuration for Mistral OCR converter."""

    # Hosted Mistral API root; overridable for proxies or gateways.
    base_url: str = "https://api.mistral.ai/v1"
    model_name: str = "mistral-ocr-latest"
    # Falls back to the MISTRAL_API_KEY environment variable when None
    # (resolved in MistralOCRConverter.__init__).
    api_key: str | None = None
    # Per-request timeout in seconds.
    timeout: int = 300
    aliases: list[str] = Field(
        default_factory=lambda: ["mistral-ocr-latest", "mistral-ocr"]
    )

    def get_client(self, **kwargs) -> "MistralOCRConverter":
        """Instantiate a converter bound to this configuration."""
        return MistralOCRConverter(config=self, **kwargs)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class MistralOCRConverter(BaseConverter):
    """Converter that sends page images to the Mistral OCR HTTP API."""

    config: MistralOCRConverterConfig

    def __init__(self, config: MistralOCRConverterConfig, **kwargs):
        """Resolve the API key and normalize the base URL.

        Raises:
            ValueError: If no API key is configured and MISTRAL_API_KEY is
                not set in the environment.
        """
        super().__init__(config=config, **kwargs)
        # Fall back to the ambient environment when no key was configured.
        if not self.config.api_key:
            self.config.api_key = os.getenv("MISTRAL_API_KEY")
        if not self.config.api_key:
            raise ValueError("MISTRAL_API_KEY environment variable not set")
        self._base_url = self.config.base_url.rstrip("/")

    async def _async_ocr(self, image) -> httpx.Response:
        """POST one image to the ``/ocr`` endpoint and return the response.

        Raises httpx.HTTPStatusError on a non-2xx status.
        """
        request_body = {
            "model": self.config.model_name,
            "document": {
                "type": "image_url",
                "image_url": f"data:image/png;base64,{to_base64(image)}",
            },
        }
        auth_headers = {"Authorization": f"Bearer {self.config.api_key}"}

        async with httpx.AsyncClient(timeout=self.config.timeout) as http:
            resp = await http.post(
                f"{self._base_url}/ocr",
                json=request_body,
                headers=auth_headers,
            )
            resp.raise_for_status()
            return resp

    async def async_call_inside_page(self, page: Page) -> Page:
        """OCR ``page.image`` and fill ``page.text`` / ``page.raw_response``."""
        response = await self._async_ocr(page.image)
        page.raw_response = response.text

        try:
            data = response.json()
        except ValueError:
            # Best effort: keep the cleaned raw body when JSON parsing fails.
            logger.warning("Mistral OCR returned non-JSON response")
            page.text = clean_response(response.text)
            return page

        # Prefer per-page results; fall back to top-level fields, then to the
        # serialized payload so the caller always gets something.
        page_entries = data.get("pages") or []
        if page_entries:
            first_entry = page_entries[0]
            text = first_entry.get("markdown") or first_entry.get("text") or ""
        else:
            text = (
                data.get("markdown") or data.get("text") or orjson.dumps(data).decode()
            )

        page.text = html_to_md_keep_tables(clean_response(text))
        return page
|
vlmparse/clients/nanonetocr.py
CHANGED
|
@@ -12,7 +12,11 @@ class NanonetOCR2DockerServerConfig(VLLMDockerServerConfig):
|
|
|
12
12
|
|
|
13
13
|
@property
|
|
14
14
|
def client_config(self):
|
|
15
|
-
return NanonetOCR2ConverterConfig(
|
|
15
|
+
return NanonetOCR2ConverterConfig(
|
|
16
|
+
**self._create_client_kwargs(
|
|
17
|
+
f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
|
|
18
|
+
)
|
|
19
|
+
)
|
|
16
20
|
|
|
17
21
|
|
|
18
22
|
class NanonetOCR2ConverterConfig(OpenAIConverterConfig):
|
vlmparse/clients/olmocr.py
CHANGED
|
@@ -23,7 +23,11 @@ class OlmOCRDockerServerConfig(VLLMDockerServerConfig):
|
|
|
23
23
|
|
|
24
24
|
@property
|
|
25
25
|
def client_config(self):
|
|
26
|
-
return OlmOCRConverterConfig(
|
|
26
|
+
return OlmOCRConverterConfig(
|
|
27
|
+
**self._create_client_kwargs(
|
|
28
|
+
f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
|
|
29
|
+
)
|
|
30
|
+
)
|
|
27
31
|
|
|
28
32
|
|
|
29
33
|
class OlmOCRConverterConfig(OpenAIConverterConfig):
|
|
@@ -37,7 +41,7 @@ class OlmOCRConverterConfig(OpenAIConverterConfig):
|
|
|
37
41
|
"Return your output as markdown, with a front matter section on top specifying values for the primary_language, is_rotation_valid, rotation_correction, is_table, and is_diagram parameters."
|
|
38
42
|
)
|
|
39
43
|
postprompt: str | None = None
|
|
40
|
-
completion_kwargs: dict
|
|
44
|
+
completion_kwargs: dict = {
|
|
41
45
|
"temperature": 0.1,
|
|
42
46
|
"max_tokens": 8000,
|
|
43
47
|
}
|