vlmparse 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vlmparse/build_doc.py +20 -19
- vlmparse/cli.py +33 -37
- vlmparse/clients/chandra.py +176 -60
- vlmparse/clients/deepseekocr.py +23 -12
- vlmparse/clients/docling.py +0 -1
- vlmparse/clients/dotsocr.py +34 -31
- vlmparse/clients/granite_docling.py +10 -36
- vlmparse/clients/hunyuanocr.py +5 -1
- vlmparse/clients/lightonocr.py +23 -1
- vlmparse/clients/mineru.py +0 -1
- vlmparse/clients/mistral_converter.py +85 -0
- vlmparse/clients/nanonetocr.py +5 -1
- vlmparse/clients/olmocr.py +6 -2
- vlmparse/clients/openai_converter.py +104 -67
- vlmparse/clients/paddleocrvl.py +9 -2
- vlmparse/constants.py +3 -0
- vlmparse/converter.py +51 -11
- vlmparse/converter_with_server.py +104 -29
- vlmparse/registries.py +97 -89
- vlmparse/servers/docker_server.py +65 -42
- vlmparse/servers/model_identity.py +48 -0
- vlmparse/servers/utils.py +39 -11
- vlmparse/utils.py +15 -2
- {vlmparse-0.1.6.dist-info → vlmparse-0.1.8.dist-info}/METADATA +11 -1
- vlmparse-0.1.8.dist-info/RECORD +38 -0
- {vlmparse-0.1.6.dist-info → vlmparse-0.1.8.dist-info}/WHEEL +1 -1
- vlmparse-0.1.6.dist-info/RECORD +0 -36
- {vlmparse-0.1.6.dist-info → vlmparse-0.1.8.dist-info}/entry_points.txt +0 -0
- {vlmparse-0.1.6.dist-info → vlmparse-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {vlmparse-0.1.6.dist-info → vlmparse-0.1.8.dist-info}/top_level.txt +0 -0
vlmparse/clients/dotsocr.py
CHANGED
@@ -1,14 +1,13 @@
 import json
 import math
 from pathlib import Path
-from typing import ClassVar, Literal
+from typing import ClassVar
 
 from loguru import logger
 from PIL import Image
 from pydantic import Field
 
 from vlmparse.clients.openai_converter import (
-    LLMParams,
     OpenAIConverterClient,
     OpenAIConverterConfig,
 )
@@ -48,12 +47,13 @@ class DotsOCRDockerServerConfig(DockerServerConfig):
     )
     add_model_key_to_server: bool = True
     aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
+    default_model_name: str = DEFAULT_MODEL_NAME
 
     @property
     def client_config(self):
         return DotsOCRConverterConfig(
-
-
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
             )
         )
@@ -65,29 +65,7 @@ class DotsOCRConverterConfig(OpenAIConverterConfig):
     model_name: str = "rednote-hilab/dots.ocr"
     preprompt: str | None = ""
     postprompt: str | None = None
-    completion_kwargs: dict | None = {
-        "temperature": 0.1,
-        "top_p": 1.0,
-        "max_completion_tokens": 16384,
-    }
-    aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
-    dpi: int = 200
-    prompt_mode: Literal["prompt_layout_all_en", "prompt_ocr"] = "prompt_ocr"
-
-    def get_client(self, **kwargs) -> "DotsOCRConverter":
-        return DotsOCRConverter(config=self, **kwargs)
-
-
-class DotsOCRConverter(OpenAIConverterClient):
-    """DotsOCR VLLM converter."""
-
-    # Constants
-    MIN_PIXELS: ClassVar[int] = 3136
-    MAX_PIXELS: ClassVar[int] = 11289600
-    IMAGE_FACTOR: ClassVar[int] = 28
-
-    # Prompts
-    PROMPTS: ClassVar[dict] = {
+    prompts: dict[str, str] = {
         "prompt_layout_all_en": """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
 
 1. Bbox format: [x1, y1, x2, y2]
@@ -108,6 +86,30 @@ class DotsOCRConverter(OpenAIConverterClient):
 """,
         "prompt_ocr": """Extract the text content from this image.""",
     }
+    prompt_mode_map: dict[str, str] = {
+        "ocr": "prompt_ocr",
+        "ocr_layout": "prompt_layout_all_en",
+        "table": "prompt_layout_all_en",
+    }
+    completion_kwargs: dict | None = {
+        "temperature": 0.1,
+        "top_p": 1.0,
+        "max_completion_tokens": 16384,
+    }
+    aliases: list[str] = Field(default_factory=lambda: ["dotsocr"])
+    dpi: int = 200
+
+    def get_client(self, **kwargs) -> "DotsOCRConverter":
+        return DotsOCRConverter(config=self, **kwargs)
+
+
+class DotsOCRConverter(OpenAIConverterClient):
+    """DotsOCR VLLM converter."""
+
+    # Constants
+    MIN_PIXELS: ClassVar[int] = 3136
+    MAX_PIXELS: ClassVar[int] = 11289600
+    IMAGE_FACTOR: ClassVar[int] = 28
 
     @staticmethod
     def round_by_factor(number: int, factor: int) -> int:
@@ -235,7 +237,7 @@ class DotsOCRConverter(OpenAIConverterClient):
         image = self.fetch_image(
             origin_image, min_pixels=self.MIN_PIXELS, max_pixels=self.MAX_PIXELS
         )
-        prompt = self.PROMPTS[prompt_mode]
+        prompt = self.config.prompts[prompt_mode]
 
         response, usage = await self._async_inference_with_vllm(image, prompt)
@@ -258,13 +260,15 @@ class DotsOCRConverter(OpenAIConverterClient):
     async def async_call_inside_page(self, page: Page) -> Page:
         image = page.image
 
+        prompt_key = self.get_prompt_key() or "prompt_ocr"
+
         _, response, _, usage = await self._parse_image_vllm(
-            image, prompt_mode=self.config.prompt_mode
+            image, prompt_mode=prompt_key
        )
         logger.info("Response: " + str(response))
 
         items = None
-        if self.config.prompt_mode == "prompt_layout_all_en":
+        if prompt_key == "prompt_layout_all_en":
             text = "\n\n".join([item.get("text", "") for item in response])
 
             items = []
@@ -286,5 +290,4 @@ class DotsOCRConverter(OpenAIConverterClient):
 
         page.completion_tokens = usage.completion_tokens
         page.prompt_tokens = usage.prompt_tokens
-        page.reasoning_tokens = usage.reasoning_tokens
         return page
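Note on the pattern: the prompt table and the mode mapping move from class attributes on the converter (PROMPTS, prompt_mode) to config fields (prompts, prompt_mode_map). A standalone sketch of how a generic conversion mode now resolves to a model-specific prompt, mirroring the get_prompt_key / get_prompt_for_mode helpers added in openai_converter.py further down this diff (resolve_prompt is illustrative, not vlmparse code; prompt strings abbreviated):

# Illustrative only: mirrors the resolution order of get_prompt_key /
# get_prompt_for_mode (exact key first, then prompt_mode_map, else None).
def resolve_prompt(
    conversion_mode: str | None,
    prompts: dict[str, str],
    prompt_mode_map: dict[str, str],
) -> str | None:
    mode = conversion_mode or "ocr"
    if mode in prompts:  # already a model-specific key, e.g. "prompt_ocr"
        return prompts[mode]
    mapped = prompt_mode_map.get(mode)  # generic mode -> model-specific key
    return prompts.get(mapped) if mapped else None


prompts = {
    "prompt_ocr": "Extract the text content from this image.",
    "prompt_layout_all_en": "Please output the layout information from the PDF image, ...",
}
prompt_mode_map = {
    "ocr": "prompt_ocr",
    "ocr_layout": "prompt_layout_all_en",
    "table": "prompt_layout_all_en",
}

assert resolve_prompt("table", prompts, prompt_mode_map).startswith("Please output")
assert resolve_prompt("ocr", prompts, prompt_mode_map) == prompts["prompt_ocr"]
# Unknown modes resolve to None; async_call_inside_page then falls back to "prompt_ocr".
assert resolve_prompt("handwriting", prompts, prompt_mode_map) is None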
vlmparse/clients/granite_docling.py
CHANGED
@@ -28,12 +28,17 @@ class GraniteDoclingDockerServerConfig(VLLMDockerServerConfig):
 
     @property
     def client_config(self):
-        return GraniteDoclingConverterConfig(
+        return GraniteDoclingConverterConfig(
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+            )
+        )
 
 
 class GraniteDoclingConverterConfig(OpenAIConverterConfig):
     """Granite Docling converter configuration."""
 
+    model_name: str = "ibm-granite/granite-docling-258M"
     preprompt: str | None = None
     postprompt: str | None = "Convert this page to docling."
     completion_kwargs: dict | None = {
@@ -69,49 +74,18 @@ class GraniteDoclingConverter(OpenAIConverterClient):
             }
         ]
 
-        doctags = await self._get_chat_completion_adaptive(
+        doctags, usage = await self._get_chat_completion(
             messages, completion_kwargs=self.config.completion_kwargs
         )
         doctags = clean_response(doctags)
 
         page.raw_response = doctags
         page.text = _doctags_to_markdown(doctags, image)
+        if usage is not None:
+            page.prompt_tokens = usage.prompt_tokens
+            page.completion_tokens = usage.completion_tokens
         return page
 
-    async def _get_chat_completion_adaptive(
-        self, messages: list[dict], completion_kwargs: dict | None
-    ) -> str:
-        """
-        vLLM enforces input+output <= model context length. If `max_tokens` is too
-        high (especially for multimodal prompts), retry with progressively smaller
-        `max_tokens`.
-        """
-        kwargs = (completion_kwargs or {}).copy()
-        max_tokens = kwargs.get("max_tokens") or kwargs.get("max_completion_tokens")
-
-        for _ in range(6):
-            try:
-                return await self._get_chat_completion(
-                    messages, completion_kwargs=kwargs
-                )
-            except Exception as e:
-                msg = str(e)
-                too_large = (
-                    "max_tokens" in msg
-                    and "maximum context length" in msg
-                    and "is too large" in msg
-                )
-                if not too_large or not isinstance(max_tokens, int):
-                    raise
-
-                max_tokens = max(256, int(max_tokens * 0.75))
-                if "max_tokens" in kwargs:
-                    kwargs["max_tokens"] = max_tokens
-                if "max_completion_tokens" in kwargs:
-                    kwargs["max_completion_tokens"] = max_tokens
-
-        return await self._get_chat_completion(messages, completion_kwargs=kwargs)
-
 
 def _doctags_to_markdown(doctags: str, image):
     try:
vlmparse/clients/hunyuanocr.py
CHANGED
@@ -25,7 +25,11 @@ class HunyuanOCRDockerServerConfig(VLLMDockerServerConfig):
 
     @property
     def client_config(self):
-        return HunyuanOCRConverterConfig(
+        return HunyuanOCRConverterConfig(
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+            )
+        )
 
 
 class HunyuanOCRConverterConfig(OpenAIConverterConfig):
vlmparse/clients/lightonocr.py
CHANGED
@@ -25,7 +25,11 @@ class LightOnOCRDockerServerConfig(VLLMDockerServerConfig):
 
     @property
     def client_config(self):
-        return LightOnOCRConverterConfig(
+        return LightOnOCRConverterConfig(
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+            )
+        )
 
 
 class LightOnOCRConverterConfig(OpenAIConverterConfig):
@@ -41,3 +45,21 @@ class LightOnOCRConverterConfig(OpenAIConverterConfig):
     }
     dpi: int = 200
     aliases: list[str] = Field(default_factory=lambda: ["lightonocr"])
+
+
+class LightonOCR21BServerConfig(LightOnOCRDockerServerConfig):
+    model_name: str = "lightonai/LightOnOCR-2-1B"
+    aliases: list[str] = Field(default_factory=lambda: ["lightonocr2"])
+
+    @property
+    def client_config(self):
+        return LightonOCR21BConverterConfig(
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+            )
+        )
+
+
+class LightonOCR21BConverterConfig(LightOnOCRConverterConfig):
+    model_name: str = "lightonai/LightOnOCR-2-1B"
+    aliases: list[str] = Field(default_factory=lambda: ["lightonocr2"])
vlmparse/clients/mineru.py
CHANGED
@@ -31,7 +31,6 @@ class MinerUDockerServerConfig(DockerServerConfig):
 class MinerUConverterConfig(ConverterConfig):
     """Configuration for MinerU API converter."""
 
-    base_url: str
     model_name: str = "opendatalab/MinerU2.5-2509-1.2B"
     aliases: list[str] = Field(default_factory=lambda: ["mineru25"])
     timeout: int = 600
vlmparse/clients/mistral_converter.py
ADDED
@@ -0,0 +1,85 @@
+import os
+
+import httpx
+import orjson
+from loguru import logger
+from pydantic import Field
+
+from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
+from vlmparse.clients.pipe_utils.utils import clean_response
+from vlmparse.converter import BaseConverter, ConverterConfig
+from vlmparse.data_model.document import Page
+from vlmparse.utils import to_base64
+
+
+class MistralOCRConverterConfig(ConverterConfig):
+    """Configuration for Mistral OCR converter."""
+
+    base_url: str = "https://api.mistral.ai/v1"
+    model_name: str = "mistral-ocr-latest"
+    api_key: str | None = None
+    timeout: int = 300
+    aliases: list[str] = Field(
+        default_factory=lambda: ["mistral-ocr-latest", "mistral-ocr"]
+    )
+
+    def get_client(self, **kwargs) -> "MistralOCRConverter":
+        return MistralOCRConverter(config=self, **kwargs)
+
+
+class MistralOCRConverter(BaseConverter):
+    """Client for Mistral OCR API."""
+
+    config: MistralOCRConverterConfig
+
+    def __init__(self, config: MistralOCRConverterConfig, **kwargs):
+        super().__init__(config=config, **kwargs)
+        if not self.config.api_key:
+            self.config.api_key = os.getenv("MISTRAL_API_KEY")
+        if not self.config.api_key:
+            raise ValueError("MISTRAL_API_KEY environment variable not set")
+        self._base_url = self.config.base_url.rstrip("/")
+
+    async def _async_ocr(self, image) -> httpx.Response:
+        payload = {
+            "model": self.config.model_name,
+            "document": {
+                "type": "image_url",
+                "image_url": f"data:image/png;base64,{to_base64(image)}",
+            },
+        }
+        headers = {"Authorization": f"Bearer {self.config.api_key}"}
+
+        async with httpx.AsyncClient(timeout=self.config.timeout) as client:
+            response = await client.post(
+                f"{self._base_url}/ocr",
+                json=payload,
+                headers=headers,
+            )
+            response.raise_for_status()
+            return response
+
+    async def async_call_inside_page(self, page: Page) -> Page:
+        response = await self._async_ocr(page.image)
+        page.raw_response = response.text
+
+        try:
+            data = response.json()
+        except ValueError:
+            logger.warning("Mistral OCR returned non-JSON response")
+            page.text = clean_response(response.text)
+            return page
+
+        pages = data.get("pages") or []
+        if pages:
+            page_data = pages[0]
+            text = page_data.get("markdown") or page_data.get("text") or ""
+        else:
+            text = (
+                data.get("markdown") or data.get("text") or orjson.dumps(data).decode()
+            )
+
+        text = clean_response(text)
+        text = html_to_md_keep_tables(text)
+        page.text = text
+        return page
vlmparse/clients/nanonetocr.py
CHANGED
@@ -12,7 +12,11 @@ class NanonetOCR2DockerServerConfig(VLLMDockerServerConfig):
 
     @property
     def client_config(self):
-        return NanonetOCR2ConverterConfig(
+        return NanonetOCR2ConverterConfig(
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+            )
+        )
 
 
 class NanonetOCR2ConverterConfig(OpenAIConverterConfig):
vlmparse/clients/olmocr.py
CHANGED
@@ -23,7 +23,11 @@ class OlmOCRDockerServerConfig(VLLMDockerServerConfig):
 
     @property
     def client_config(self):
-        return OlmOCRConverterConfig(
+        return OlmOCRConverterConfig(
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+            )
+        )
 
 
 class OlmOCRConverterConfig(OpenAIConverterConfig):
@@ -37,7 +41,7 @@ class OlmOCRConverterConfig(OpenAIConverterConfig):
         "Return your output as markdown, with a front matter section on top specifying values for the primary_language, is_rotation_valid, rotation_correction, is_table, and is_diagram parameters."
     )
     postprompt: str | None = None
-    completion_kwargs: dict | None = {
+    completion_kwargs: dict = {
         "temperature": 0.1,
         "max_tokens": 8000,
     }
vlmparse/clients/openai_converter.py
CHANGED
@@ -1,15 +1,13 @@
-import os
-from typing import Literal
+import asyncio
+from typing import Literal, Optional
 
 from loguru import logger
 from pydantic import Field
 
-from vlmparse.base_model import VLMParseBaseModel
 from vlmparse.clients.pipe_utils.html_to_md_conversion import html_to_md_keep_tables
 from vlmparse.clients.pipe_utils.utils import clean_response
 from vlmparse.converter import BaseConverter, ConverterConfig
 from vlmparse.data_model.document import Page
-from vlmparse.servers.docker_server import DEFAULT_MODEL_NAME
 from vlmparse.utils import to_base64
 
 from .prompts import PDF2MD_PROMPT
@@ -17,50 +15,14 @@ from .prompts import PDF2MD_PROMPT
 GOOGLE_API_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
 
 
-class LLMParams(VLMParseBaseModel):
+class OpenAIConverterConfig(ConverterConfig):
     api_key: str = ""
-    base_url: str | None = None
-    model_name: str = DEFAULT_MODEL_NAME
     timeout: int | None = 500
     max_retries: int = 1
-
-
-def get_llm_params(model_name: str, uri: str | None = None):
-    if uri is not None:
-        return LLMParams(base_url=uri, model_name="vllm-model", api_key="")
-    if model_name in [
-        "gpt-4o",
-        "gpt-4o-mini",
-        "gpt-4.1",
-        "gpt-4.1-mini",
-        "gpt-4.1-nano",
-        "gpt-5",
-        "gpt-5-mini",
-        "gpt-5-nano",
-    ]:
-        base_url = None
-        api_key = os.getenv("OPENAI_API_KEY")
-        if api_key is None:
-            raise ValueError("OPENAI_API_KEY environment variable not set")
-    else:
-        if model_name in [
-            "gemini-2.5-flash-lite",
-            "gemini-2.5-flash",
-            "gemini-2.5-pro",
-        ]:
-            base_url = GOOGLE_API_BASE_URL
-            api_key = os.getenv("GOOGLE_API_KEY")
-            if api_key is None:
-                raise ValueError("GOOGLE_API_KEY environment variable not set")
-        else:
-            return None
-    return LLMParams(base_url=base_url, model_name=model_name, api_key=api_key)
-
-
-class OpenAIConverterConfig(ConverterConfig):
-    llm_params: LLMParams
     preprompt: str | None = None
-    postprompt: str | None = PDF2MD_PROMPT
+    postprompt: str | dict[str, str] | None = PDF2MD_PROMPT
+    prompts: dict[str, str] = Field(default_factory=dict)
+    prompt_mode_map: dict[str, str] = Field(default_factory=dict)
     completion_kwargs: dict = Field(default_factory=dict)
     stream: bool = False
 
@@ -71,6 +33,33 @@ class OpenAIConverterConfig(ConverterConfig):
 class OpenAIConverterClient(BaseConverter):
     """Client for OpenAI-compatible API servers."""
 
+    def get_prompt_key(self) -> str | None:
+        """Resolve a prompt key from conversion_mode using class mappings."""
+        mode = getattr(self.config, "conversion_mode", None) or "ocr"
+        prompts = self._get_prompts()
+        if mode in prompts:
+            return mode
+        mapped = self._get_prompt_mode_map().get(mode)
+        if mapped in prompts:
+            return mapped
+        return None
+
+    def get_prompt_for_mode(self) -> str | None:
+        key = self.get_prompt_key()
+        if key is None:
+            return None
+        return self._get_prompts().get(key)
+
+    def _get_prompts(self) -> dict[str, str]:
+        if self.config.prompts:
+            return self.config.prompts
+        if isinstance(self.config.postprompt, dict):
+            return self.config.postprompt
+        return {}
+
+    def _get_prompt_mode_map(self) -> dict[str, str]:
+        return self.config.prompt_mode_map or {}
+
     def __init__(
         self,
         config: OpenAIConverterConfig,
@@ -90,25 +79,67 @@ class OpenAIConverterClient(BaseConverter):
             debug=debug,
             return_documents_in_batch_mode=return_documents_in_batch_mode,
         )
-
-
-
-
-
-
-
-
+        self._model = None
+        self._model_loop = None
+
+    async def _get_async_model(self):
+        loop = asyncio.get_running_loop()
+        if self._model is None or self._model_loop is not loop:
+            await self._close_model()
+            from openai import AsyncOpenAI
+
+            self._model = AsyncOpenAI(
+                base_url=self.config.base_url,
+                api_key=self.config.api_key,
+                timeout=self.config.timeout,
+                max_retries=self.config.max_retries,
+            )
+            self._model_loop = loop
+        return self._model
+
+    async def _close_model(self):
+        """Close the async OpenAI client if it exists."""
+        if self._model is not None:
+            try:
+                await self._model.close()
+            except RuntimeError:
+                # Event loop may already be closed
+                pass
+            finally:
+                self._model = None
+                self._model_loop = None
+
+    async def aclose(self):
+        """Close the converter and release resources."""
+        await self._close_model()
+
+    def close(self):
+        """Synchronously close the converter if possible."""
+        if self._model is not None:
+            try:
+                loop = asyncio.get_running_loop()
+                loop.create_task(self._close_model())
+            except RuntimeError:
+                # No running loop, try to close synchronously
+                try:
+                    asyncio.run(self._close_model())
+                except RuntimeError:
+                    # Event loop already closed, force cleanup
+                    self._model = None
+                    self._model_loop = None
 
     async def _get_chat_completion(
         self, messages: list[dict], completion_kwargs: dict | None = None
-    ) -> tuple[str, "CompletionUsage"]:  # noqa: F821
+    ) -> tuple[str, Optional["CompletionUsage"]]:  # noqa: F821
         """Helper to handle chat completion with optional streaming."""
         if completion_kwargs is None:
             completion_kwargs = self.config.completion_kwargs
 
+        model = await self._get_async_model()
+
         if self.config.stream:
-            response_stream = await
-                model=self.config.
+            response_stream = await model.chat.completions.create(
+                model=self.config.default_model_name,
                 messages=messages,
                 stream=True,
                 **completion_kwargs,
@@ -117,10 +148,11 @@ class OpenAIConverterClient(BaseConverter):
             async for chunk in response_stream:
                 if chunk.choices and chunk.choices[0].delta.content:
                     response_parts.append(chunk.choices[0].delta.content)
-
+
+            return "".join(response_parts), None
         else:
-            response_obj = await
-                model=self.config.
+            response_obj = await model.chat.completions.create(
+                model=self.config.default_model_name,
                 messages=messages,
                 **completion_kwargs,
             )
@@ -146,11 +178,15 @@ class OpenAIConverterClient(BaseConverter):
         else:
             preprompt = []
 
-
-
-
-
-
+        selected_prompt = self.get_prompt_for_mode()
+        if selected_prompt is not None:
+            postprompt = [{"type": "text", "text": selected_prompt}]
+        else:
+            postprompt = (
+                [{"type": "text", "text": self.config.postprompt}]
+                if isinstance(self.config.postprompt, str) and self.config.postprompt
+                else []
+            )
 
         messages = [
             *preprompt,
@@ -175,9 +211,10 @@ class OpenAIConverterClient(BaseConverter):
 
         text = html_to_md_keep_tables(text)
         page.text = text
-
-
-
-
+        if usage is not None:
+            page.prompt_tokens = usage.prompt_tokens
+            page.completion_tokens = usage.completion_tokens
+            if hasattr(usage, "reasoning_tokens"):
+                page.reasoning_tokens = usage.reasoning_tokens
 
         return page
vlmparse/clients/paddleocrvl.py
CHANGED
@@ -22,7 +22,11 @@ class PaddleOCRVLDockerServerConfig(VLLMDockerServerConfig):
 
     @property
     def client_config(self):
-        return PaddleOCRVLConverterConfig(
+        return PaddleOCRVLConverterConfig(
+            **self._create_client_kwargs(
+                f"http://localhost:{self.docker_port}{self.get_base_url_suffix()}"
+            )
+        )
 
 
 # Task-specific base prompts
@@ -39,7 +43,10 @@ class PaddleOCRVLConverterConfig(OpenAIConverterConfig):
 
     model_name: str = "PaddlePaddle/PaddleOCR-VL"
     preprompt: str | None = None
-    postprompt: str
+    postprompt: dict[str, str] = TASKS
+    prompt_mode_map: dict[str, str] = {
+        "ocr_layout": "ocr",
+    }
     completion_kwargs: dict | None = {
         "temperature": 0.0,
         "max_completion_tokens": 16384,