abstractcore 2.9.1__py3-none-any.whl → 2.11.2__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- abstractcore/__init__.py +7 -27
- abstractcore/apps/extractor.py +33 -100
- abstractcore/apps/intent.py +19 -0
- abstractcore/apps/judge.py +20 -1
- abstractcore/apps/summarizer.py +20 -1
- abstractcore/architectures/detection.py +34 -1
- abstractcore/architectures/response_postprocessing.py +313 -0
- abstractcore/assets/architecture_formats.json +38 -8
- abstractcore/assets/model_capabilities.json +781 -160
- abstractcore/compression/__init__.py +1 -2
- abstractcore/compression/glyph_processor.py +6 -4
- abstractcore/config/main.py +31 -19
- abstractcore/config/manager.py +389 -11
- abstractcore/config/vision_config.py +5 -5
- abstractcore/core/interface.py +151 -3
- abstractcore/core/session.py +16 -10
- abstractcore/download.py +1 -1
- abstractcore/embeddings/manager.py +20 -6
- abstractcore/endpoint/__init__.py +2 -0
- abstractcore/endpoint/app.py +458 -0
- abstractcore/mcp/client.py +3 -1
- abstractcore/media/__init__.py +52 -17
- abstractcore/media/auto_handler.py +42 -22
- abstractcore/media/base.py +44 -1
- abstractcore/media/capabilities.py +12 -33
- abstractcore/media/enrichment.py +105 -0
- abstractcore/media/handlers/anthropic_handler.py +19 -28
- abstractcore/media/handlers/local_handler.py +124 -70
- abstractcore/media/handlers/openai_handler.py +19 -31
- abstractcore/media/processors/__init__.py +4 -2
- abstractcore/media/processors/audio_processor.py +57 -0
- abstractcore/media/processors/office_processor.py +8 -3
- abstractcore/media/processors/pdf_processor.py +46 -3
- abstractcore/media/processors/text_processor.py +22 -24
- abstractcore/media/processors/video_processor.py +58 -0
- abstractcore/media/types.py +97 -4
- abstractcore/media/utils/image_scaler.py +20 -2
- abstractcore/media/utils/video_frames.py +219 -0
- abstractcore/media/vision_fallback.py +136 -22
- abstractcore/processing/__init__.py +32 -3
- abstractcore/processing/basic_deepsearch.py +15 -10
- abstractcore/processing/basic_intent.py +3 -2
- abstractcore/processing/basic_judge.py +3 -2
- abstractcore/processing/basic_summarizer.py +1 -1
- abstractcore/providers/__init__.py +3 -1
- abstractcore/providers/anthropic_provider.py +95 -8
- abstractcore/providers/base.py +1516 -81
- abstractcore/providers/huggingface_provider.py +546 -69
- abstractcore/providers/lmstudio_provider.py +35 -923
- abstractcore/providers/mlx_provider.py +382 -35
- abstractcore/providers/model_capabilities.py +5 -1
- abstractcore/providers/ollama_provider.py +99 -15
- abstractcore/providers/openai_compatible_provider.py +406 -180
- abstractcore/providers/openai_provider.py +188 -44
- abstractcore/providers/openrouter_provider.py +76 -0
- abstractcore/providers/registry.py +61 -5
- abstractcore/providers/streaming.py +138 -33
- abstractcore/providers/vllm_provider.py +92 -817
- abstractcore/server/app.py +461 -13
- abstractcore/server/audio_endpoints.py +139 -0
- abstractcore/server/vision_endpoints.py +1319 -0
- abstractcore/structured/handler.py +316 -41
- abstractcore/tools/common_tools.py +5501 -2012
- abstractcore/tools/comms_tools.py +1641 -0
- abstractcore/tools/core.py +37 -7
- abstractcore/tools/handler.py +4 -9
- abstractcore/tools/parser.py +49 -2
- abstractcore/tools/tag_rewriter.py +2 -1
- abstractcore/tools/telegram_tdlib.py +407 -0
- abstractcore/tools/telegram_tools.py +261 -0
- abstractcore/utils/cli.py +1085 -72
- abstractcore/utils/token_utils.py +2 -0
- abstractcore/utils/truncation.py +29 -0
- abstractcore/utils/version.py +3 -4
- abstractcore/utils/vlm_token_calculator.py +12 -2
- abstractcore-2.11.2.dist-info/METADATA +562 -0
- abstractcore-2.11.2.dist-info/RECORD +133 -0
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/WHEEL +1 -1
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/entry_points.txt +1 -0
- abstractcore-2.9.1.dist-info/METADATA +0 -1190
- abstractcore-2.9.1.dist-info/RECORD +0 -119
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/top_level.txt +0 -0
abstractcore/providers/vllm_provider.py
@@ -1,666 +1,100 @@
 """
 vLLM provider implementation with advanced features.

-vLLM-
+vLLM exposes an OpenAI-compatible API (chat completions, models, embeddings) plus
+additional management endpoints and request extensions:
 - Guided Decoding: guided_regex, guided_json, guided_grammar
-- Multi-LoRA: load_adapter, unload_adapter, list_adapters
 - Beam Search: best_of, use_beam_search
-
-
-import os
-import httpx
-import json
-import time
-from typing import List, Dict, Any, Optional, Union, Iterator, AsyncIterator, Type
-
-try:
-    from pydantic import BaseModel
-    PYDANTIC_AVAILABLE = True
-except ImportError:
-    PYDANTIC_AVAILABLE = False
-    BaseModel = None
-
-from .base import BaseProvider
-from ..core.types import GenerateResponse
-from ..exceptions import ProviderAPIError, ModelNotFoundError, format_model_error, format_provider_error
-from ..tools import UniversalToolHandler, execute_tools
-from ..events import EventType
-
-
-class VLLMProvider(BaseProvider):
-    """vLLM provider for high-throughput GPU inference with advanced features."""
-
-    def __init__(self, model: str = "Qwen/Qwen3-Coder-30B-A3B-Instruct",
-                 base_url: Optional[str] = None,
-                 api_key: Optional[str] = None,
-                 **kwargs):
-        super().__init__(model, **kwargs)
-        self.provider = "vllm"
-
-        # Initialize tool handler
-        self.tool_handler = UniversalToolHandler(model)
-
-        # Base URL: parameter > VLLM_BASE_URL > default
-        self.base_url = (
-            base_url or
-            os.getenv("VLLM_BASE_URL") or
-            "http://localhost:8000/v1"
-        ).rstrip('/')
-
-        # API key: parameter > VLLM_API_KEY > "EMPTY"
-        self.api_key = api_key or os.getenv("VLLM_API_KEY") or "EMPTY"
-
-        # Get timeout value - None means unlimited timeout
-        timeout_value = getattr(self, '_timeout', None)
-        if timeout_value is not None and timeout_value <= 0:
-            timeout_value = None  # Invalid timeout becomes unlimited
-
-        try:
-            self.client = httpx.Client(timeout=timeout_value)
-        except Exception as e:
-            try:
-                fallback_timeout = None
-                try:
-                    from ..config.manager import get_config_manager
-
-                    fallback_timeout = float(get_config_manager().get_default_timeout())
-                except Exception:
-                    fallback_timeout = 7200.0
-                if isinstance(fallback_timeout, (int, float)) and float(fallback_timeout) <= 0:
-                    fallback_timeout = None
-                self.client = httpx.Client(timeout=fallback_timeout)
-            except Exception:
-                raise RuntimeError(f"Failed to create HTTP client for vLLM: {e}")
-
-        self._async_client = None  # Lazy-loaded async client
-
-        # Validate model exists in vLLM
-        self._validate_model()
-
-    @property
-    def async_client(self):
-        """Lazy-load async HTTP client for native async operations."""
-        if self._async_client is None:
-            timeout_value = getattr(self, '_timeout', None)
-            if timeout_value is not None and timeout_value <= 0:
-                timeout_value = None
-            self._async_client = httpx.AsyncClient(timeout=timeout_value)
-        return self._async_client
-
-    def _get_headers(self) -> Dict[str, str]:
-        """Get HTTP headers including API key if configured."""
-        headers = {"Content-Type": "application/json"}
-        if self.api_key and self.api_key != "EMPTY":
-            headers["Authorization"] = f"Bearer {self.api_key}"
-        return headers
+- Multi-LoRA management: load_adapter, unload_adapter, list_adapters

-
-
-        try:
-            available_models = self.list_available_models(base_url=self.base_url)
-            if available_models and self.model not in available_models:
-                error_message = format_model_error("vLLM", self.model, available_models)
-                raise ModelNotFoundError(error_message)
-        except httpx.ConnectError:
-            if hasattr(self, 'logger'):
-                self.logger.debug(f"vLLM server not accessible at {self.base_url} - model validation skipped")
-            pass
-        except ModelNotFoundError:
-            raise
-        except Exception as e:
-            if hasattr(self, 'logger'):
-                self.logger.debug(f"Model validation failed with error: {e} - continuing anyway")
-            pass
+This provider subclasses `OpenAICompatibleProvider` and injects vLLM-specific request
+extensions via `payload["extra_body"]`.

-
-
-
-
-        Note: vLLM manages model memory automatically.
-        This method only closes the HTTP client connection for cleanup.
-        """
-        try:
-            if hasattr(self, 'client') and self.client is not None:
-                self.client.close()
-
-            if self._async_client is not None:
-                import asyncio
-                try:
-                    loop = asyncio.get_running_loop()
-                    loop.create_task(self._async_client.aclose())
-                except RuntimeError:
-                    import asyncio
-                    asyncio.run(self._async_client.aclose())
-
-        except Exception as e:
-            if hasattr(self, 'logger'):
-                self.logger.warning(f"Error during unload: {e}")
-
-    def generate(self, *args, **kwargs):
-        """Public generate method that includes telemetry."""
-        return self.generate_with_telemetry(*args, **kwargs)
-
-    def _generate_internal(self,
-                           prompt: str,
-                           messages: Optional[List[Dict[str, str]]] = None,
-                           system_prompt: Optional[str] = None,
-                           tools: Optional[List[Dict[str, Any]]] = None,
-                           media: Optional[List['MediaContent']] = None,
-                           stream: bool = False,
-                           response_model: Optional[Type[BaseModel]] = None,
-                           execute_tools: Optional[bool] = None,
-                           tool_call_tags: Optional[str] = None,
-                           # vLLM-specific parameters:
-                           guided_regex: Optional[str] = None,
-                           guided_json: Optional[Dict] = None,
-                           guided_grammar: Optional[str] = None,
-                           best_of: Optional[int] = None,
-                           use_beam_search: bool = False,
-                           **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
-        """Generate response using vLLM with advanced features."""
-
-        # Build messages for chat completions with tool support
-        chat_messages = []
-
-        # Add tools to system prompt if provided
-        final_system_prompt = system_prompt
-        # Prefer native tools when the model supports them. Only inject a prompted tool list
-        # when native tool calling is not available.
-        if tools and self.tool_handler.supports_prompted and not self.tool_handler.supports_native:
-            include_tool_list = True
-            if final_system_prompt and "## Tools (session)" in final_system_prompt:
-                include_tool_list = False
-            tool_prompt = self.tool_handler.format_tools_prompt(tools, include_tool_list=include_tool_list)
-            if final_system_prompt:
-                final_system_prompt += f"\n\n{tool_prompt}"
-            else:
-                final_system_prompt = tool_prompt
-
-        # Add system message if provided
-        if final_system_prompt:
-            chat_messages.append({
-                "role": "system",
-                "content": final_system_prompt
-            })
-
-        # Add conversation history
-        if messages:
-            chat_messages.extend(messages)
-
-        # Handle media content if provided
-        if media:
-            user_message_text = prompt.strip() if prompt else ""
-            if not user_message_text and chat_messages:
-                for msg in reversed(chat_messages):
-                    if msg.get("role") == "user" and msg.get("content"):
-                        user_message_text = msg["content"]
-                        break
-            try:
-                processed_media = self._process_media_content(media)
-                media_handler = self._get_media_handler_for_model(self.model)
-                multimodal_message = media_handler.create_multimodal_message(user_message_text, processed_media)
-
-                if isinstance(multimodal_message, str):
-                    if chat_messages and chat_messages[-1].get("role") == "user":
-                        chat_messages[-1]["content"] = multimodal_message
-                    else:
-                        chat_messages.append({"role": "user", "content": multimodal_message})
-                else:
-                    if chat_messages and chat_messages[-1].get("role") == "user":
-                        chat_messages[-1] = multimodal_message
-                    else:
-                        chat_messages.append(multimodal_message)
-            except ImportError:
-                self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
-                if user_message_text:
-                    chat_messages.append({"role": "user", "content": user_message_text})
-            except Exception as e:
-                self.logger.warning(f"Failed to process media content: {e}")
-                if user_message_text:
-                    chat_messages.append({"role": "user", "content": user_message_text})
-
-        # Add prompt as separate message if provided
-        elif prompt and prompt.strip():
-            chat_messages.append({"role": "user", "content": prompt})
-
-        # Build request payload using unified system
-        generation_kwargs = self._prepare_generation_kwargs(**kwargs)
-        max_output_tokens = self._get_provider_max_tokens_param(generation_kwargs)
-
-        payload = {
-            "model": self.model,
-            "messages": chat_messages,
-            "stream": stream,
-            "temperature": kwargs.get("temperature", self.temperature),
-            "max_tokens": max_output_tokens,
-            "top_p": kwargs.get("top_p", 0.9),
-        }
+TODO(vllm): Add an opt-in integration test that exercises streaming + LoRA management endpoints
+once a reachable vLLM server is available in the test environment.
+"""

-
-        if tools and self.tool_handler.supports_native:
-            payload["tools"] = self.tool_handler.prepare_tools_for_native(tools)
-            payload["tool_choice"] = kwargs.get("tool_choice", "auto")
+from typing import Any, Dict, List, Optional

-
-        if "frequency_penalty" in kwargs:
-            payload["frequency_penalty"] = kwargs["frequency_penalty"]
-        if "presence_penalty" in kwargs:
-            payload["presence_penalty"] = kwargs["presence_penalty"]
+from .openai_compatible_provider import OpenAICompatibleProvider

-        # Add seed if provided
-        seed_value = kwargs.get("seed", self.seed)
-        if seed_value is not None:
-            payload["seed"] = seed_value

-
-
+class VLLMProvider(OpenAICompatibleProvider):
+    """vLLM provider for high-throughput GPU inference with advanced features."""

-
+    PROVIDER_ID = "vllm"
+    PROVIDER_DISPLAY_NAME = "vLLM"
+    BASE_URL_ENV_VAR = "VLLM_BASE_URL"
+    API_KEY_ENV_VAR = "VLLM_API_KEY"  # Optional; some deployments sit behind auth
+    DEFAULT_BASE_URL = "http://localhost:8000/v1"
+
+    def __init__(
+        self,
+        model: str = "Qwen/Qwen3-Coder-30B-A3B-Instruct",
+        base_url: Optional[str] = None,
+        api_key: Optional[str] = None,
+        **kwargs,
+    ):
+        super().__init__(model=model, base_url=base_url, api_key=api_key, **kwargs)
+
+    def _mutate_payload(self, payload: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+        extra_body_updates: Dict[str, Any] = {}
+
+        guided_regex = kwargs.get("guided_regex")
         if guided_regex:
-
-        if guided_json:
-            extra_body["guided_json"] = guided_json
-        if guided_grammar:
-            extra_body["guided_grammar"] = guided_grammar
-
-        # Beam search
-        if use_beam_search or best_of:
-            extra_body["use_beam_search"] = use_beam_search
-            if best_of:
-                extra_body["best_of"] = best_of
-
-        # Add structured output support (standard OpenAI-compatible format)
-        if response_model and PYDANTIC_AVAILABLE:
-            json_schema = response_model.model_json_schema()
-            payload["response_format"] = {
-                "type": "json_schema",
-                "json_schema": {
-                    "name": response_model.__name__,
-                    "schema": json_schema
-                }
-            }
-
-        # Add extra_body if we have vLLM-specific parameters
-        if extra_body:
-            payload["extra_body"] = extra_body
-
-        if stream:
-            return self._stream_generate(payload)
-        else:
-            response = self._single_generate(payload)
-
-            # Execute tools if enabled and tools are present
-            if self.execute_tools and tools and self.tool_handler.supports_prompted and response.content:
-                response = self._handle_prompted_tool_execution(response, tools, execute_tools)
-
-            return response
-
-    def _single_generate(self, payload: Dict[str, Any]) -> GenerateResponse:
-        """Generate single response."""
-        try:
-            if not hasattr(self, 'client') or self.client is None:
-                raise ProviderAPIError("HTTP client not initialized")
-
-            start_time = time.time()
-            request_url = f"{self.base_url}/chat/completions"
-            response = self.client.post(
-                request_url,
-                json=payload,
-                headers=self._get_headers()
-            )
-            response.raise_for_status()
-            gen_time = round((time.time() - start_time) * 1000, 1)
-
-            result = response.json()
-
-            # Extract response from OpenAI format
-            if "choices" in result and len(result["choices"]) > 0:
-                choice = result["choices"][0]
-                message = choice.get("message") or {}
-                if not isinstance(message, dict):
-                    message = {}
-
-                content = message.get("content", "")
-                tool_calls = message.get("tool_calls")
-                if tool_calls is None:
-                    tool_calls = choice.get("tool_calls")
-                finish_reason = choice.get("finish_reason", "stop")
-            else:
-                content = "No response generated"
-                tool_calls = None
-                finish_reason = "error"
-
-            # Extract usage info
-            usage = result.get("usage", {})
-
-            return GenerateResponse(
-                content=content,
-                model=self.model,
-                finish_reason=finish_reason,
-                raw_response=result,
-                tool_calls=tool_calls if isinstance(tool_calls, list) else None,
-                metadata={
-                    "_provider_request": {
-                        "url": request_url,
-                        "payload": payload,
-                    }
-                },
-                usage={
-                    "input_tokens": usage.get("prompt_tokens", 0),
-                    "output_tokens": usage.get("completion_tokens", 0),
-                    "total_tokens": usage.get("total_tokens", 0),
-                    "prompt_tokens": usage.get("prompt_tokens", 0),
-                    "completion_tokens": usage.get("completion_tokens", 0)
-                },
-                gen_time=gen_time
-            )
-
-        except AttributeError as e:
-            if "'NoneType'" in str(e):
-                raise ProviderAPIError(f"vLLM provider not properly initialized: {str(e)}")
-            else:
-                raise ProviderAPIError(f"vLLM configuration error: {str(e)}")
-        except Exception as e:
-            error_str = str(e).lower()
-            if ('404' in error_str or 'not found' in error_str or 'model' in error_str) and ('not found' in error_str):
-                try:
-                    available_models = self.list_available_models(base_url=self.base_url)
-                    error_message = format_model_error("vLLM", self.model, available_models)
-                    raise ModelNotFoundError(error_message)
-                except Exception:
-                    raise ModelNotFoundError(f"Model '{self.model}' not found in vLLM and could not fetch available models")
-            else:
-                raise
-
-    def _stream_generate(self, payload: Dict[str, Any]) -> Iterator[GenerateResponse]:
-        """Generate streaming response."""
-        try:
-            with self.client.stream(
-                "POST",
-                f"{self.base_url}/chat/completions",
-                json=payload,
-                headers=self._get_headers()
-            ) as response:
-                response.raise_for_status()
-
-                for line in response.iter_lines():
-                    if line:
-                        if isinstance(line, bytes):
-                            line = line.decode('utf-8')
-                        line = line.strip()
-
-                        if line.startswith("data: "):
-                            data = line[6:]  # Remove "data: " prefix
-
-                            if data == "[DONE]":
-                                break
-
-                            try:
-                                chunk = json.loads(data)
-
-                                if "choices" in chunk and len(chunk["choices"]) > 0:
-                                    choice = chunk["choices"][0]
-                                    delta = choice.get("delta", {})
-                                    if not isinstance(delta, dict):
-                                        delta = {}
-                                    content = delta.get("content", "")
-                                    tool_calls = delta.get("tool_calls") or choice.get("tool_calls")
-                                    finish_reason = choice.get("finish_reason")
-
-                                    yield GenerateResponse(
-                                        content=content,
-                                        model=self.model,
-                                        finish_reason=finish_reason,
-                                        tool_calls=tool_calls if isinstance(tool_calls, list) else None,
-                                        raw_response=chunk
-                                    )
-
-                            except json.JSONDecodeError:
-                                continue
-
-        except Exception as e:
-            yield GenerateResponse(
-                content=f"Error: {str(e)}",
-                model=self.model,
-                finish_reason="error"
-            )
-
-    async def _agenerate_internal(self,
-                                  prompt: str,
-                                  messages: Optional[List[Dict[str, str]]] = None,
-                                  system_prompt: Optional[str] = None,
-                                  tools: Optional[List[Dict[str, Any]]] = None,
-                                  media: Optional[List['MediaContent']] = None,
-                                  stream: bool = False,
-                                  response_model: Optional[Type[BaseModel]] = None,
-                                  execute_tools: Optional[bool] = None,
-                                  tool_call_tags: Optional[str] = None,
-                                  # vLLM-specific parameters:
-                                  guided_regex: Optional[str] = None,
-                                  guided_json: Optional[Dict] = None,
-                                  guided_grammar: Optional[str] = None,
-                                  best_of: Optional[int] = None,
-                                  use_beam_search: bool = False,
-                                  **kwargs) -> Union[GenerateResponse, AsyncIterator[GenerateResponse]]:
-        """Native async implementation with vLLM features."""
-
-        # Build messages (same logic as sync)
-        chat_messages = []
-
-        final_system_prompt = system_prompt
-        # Prefer native tools when available; only inject prompted tool syntax as fallback.
-        if tools and self.tool_handler.supports_prompted and not self.tool_handler.supports_native:
-            include_tool_list = True
-            if final_system_prompt and "## Tools (session)" in final_system_prompt:
-                include_tool_list = False
-            tool_prompt = self.tool_handler.format_tools_prompt(tools, include_tool_list=include_tool_list)
-            if final_system_prompt:
-                final_system_prompt += f"\n\n{tool_prompt}"
-            else:
-                final_system_prompt = tool_prompt
-
-        if final_system_prompt:
-            chat_messages.append({"role": "system", "content": final_system_prompt})
+            extra_body_updates["guided_regex"] = guided_regex

-
-            chat_messages.extend(messages)
-
-        if media:
-            user_message_text = prompt.strip() if prompt else ""
-            if not user_message_text and chat_messages:
-                for msg in reversed(chat_messages):
-                    if msg.get("role") == "user" and msg.get("content"):
-                        user_message_text = msg["content"]
-                        break
-            try:
-                processed_media = self._process_media_content(media)
-                media_handler = self._get_media_handler_for_model(self.model)
-                multimodal_message = media_handler.create_multimodal_message(user_message_text, processed_media)
-
-                if isinstance(multimodal_message, str):
-                    if chat_messages and chat_messages[-1].get("role") == "user":
-                        chat_messages[-1]["content"] = multimodal_message
-                    else:
-                        chat_messages.append({"role": "user", "content": multimodal_message})
-                else:
-                    if chat_messages and chat_messages[-1].get("role") == "user":
-                        chat_messages[-1] = multimodal_message
-                    else:
-                        chat_messages.append(multimodal_message)
-            except ImportError:
-                self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
-                if user_message_text:
-                    chat_messages.append({"role": "user", "content": user_message_text})
-            except Exception as e:
-                self.logger.warning(f"Failed to process media content: {e}")
-                if user_message_text:
-                    chat_messages.append({"role": "user", "content": user_message_text})
-
-        elif prompt and prompt.strip():
-            chat_messages.append({"role": "user", "content": prompt})
-
-        # Build request payload
-        generation_kwargs = self._prepare_generation_kwargs(**kwargs)
-        max_output_tokens = self._get_provider_max_tokens_param(generation_kwargs)
-
-        payload = {
-            "model": self.model,
-            "messages": chat_messages,
-            "stream": stream,
-            "temperature": kwargs.get("temperature", self.temperature),
-            "max_tokens": max_output_tokens,
-            "top_p": kwargs.get("top_p", 0.9),
-        }
-
-        # Native tools (OpenAI-compatible): send structured tools/tool_choice when supported.
-        if tools and self.tool_handler.supports_native:
-            payload["tools"] = self.tool_handler.prepare_tools_for_native(tools)
-            payload["tool_choice"] = kwargs.get("tool_choice", "auto")
-
-        if "frequency_penalty" in kwargs:
-            payload["frequency_penalty"] = kwargs["frequency_penalty"]
-        if "presence_penalty" in kwargs:
-            payload["presence_penalty"] = kwargs["presence_penalty"]
-
-        seed_value = kwargs.get("seed", self.seed)
-        if seed_value is not None:
-            payload["seed"] = seed_value
-
-        # vLLM-specific features
-        extra_body = {}
-
-        if guided_regex:
-            extra_body["guided_regex"] = guided_regex
+        guided_json = kwargs.get("guided_json")
         if guided_json:
-
+            extra_body_updates["guided_json"] = guided_json
+
+        guided_grammar = kwargs.get("guided_grammar")
         if guided_grammar:
-
+            extra_body_updates["guided_grammar"] = guided_grammar

+        best_of = kwargs.get("best_of")
+        use_beam_search = kwargs.get("use_beam_search", False)
         if use_beam_search or best_of:
-
-            if best_of:
-
-
-
-
-
-
-
-
-
-
-
-
-        if extra_body:
-            payload["extra_body"] = extra_body
-
-        if stream:
-            return self._async_stream_generate(payload)
-        else:
-            response = await self._async_single_generate(payload)
-
-            if self.execute_tools and tools and self.tool_handler.supports_prompted and response.content:
-                response = self._handle_prompted_tool_execution(response, tools, execute_tools)
-
-            return response
-
-    async def _async_single_generate(self, payload: Dict[str, Any]) -> GenerateResponse:
-        """Native async single response generation."""
-        try:
-            start_time = time.time()
-            response = await self.async_client.post(
-                f"{self.base_url}/chat/completions",
-                json=payload,
-                headers=self._get_headers()
-            )
-            response.raise_for_status()
-            gen_time = round((time.time() - start_time) * 1000, 1)
-
-            result = response.json()
-
-            if "choices" in result and len(result["choices"]) > 0:
-                choice = result["choices"][0]
-                content = choice.get("message", {}).get("content", "")
-                finish_reason = choice.get("finish_reason", "stop")
+            extra_body_updates["use_beam_search"] = bool(use_beam_search)
+            if best_of is not None:
+                extra_body_updates["best_of"] = best_of
+
+        # Allow callers to pass raw extra_body (merge with our computed updates).
+        caller_extra_body = kwargs.get("extra_body")
+        if isinstance(caller_extra_body, dict) and caller_extra_body:
+            extra_body_updates = {**caller_extra_body, **extra_body_updates}
+
+        if extra_body_updates:
+            existing = payload.get("extra_body")
+            if isinstance(existing, dict) and existing:
+                payload["extra_body"] = {**existing, **extra_body_updates}
             else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    raise ModelNotFoundError(error_message)
-                except Exception:
-                    raise ModelNotFoundError(f"Model '{self.model}' not found in vLLM")
-            else:
-                raise ProviderAPIError(f"vLLM API error: {str(e)}")
-
-    async def _async_stream_generate(self, payload: Dict[str, Any]) -> AsyncIterator[GenerateResponse]:
-        """Native async streaming response generation."""
-        try:
-            async with self.async_client.stream(
-                "POST",
-                f"{self.base_url}/chat/completions",
-                json=payload,
-                headers=self._get_headers()
-            ) as response:
-                response.raise_for_status()
-
-                async for line in response.aiter_lines():
-                    if line:
-                        line = line.strip()
-
-                        if line.startswith("data: "):
-                            data = line[6:]  # Remove "data: " prefix
-
-                            if data == "[DONE]":
-                                break
-
-                            try:
-                                chunk = json.loads(data)
-
-                                if "choices" in chunk and len(chunk["choices"]) > 0:
-                                    choice = chunk["choices"][0]
-                                    delta = choice.get("delta", {})
-                                    content = delta.get("content", "")
-                                    finish_reason = choice.get("finish_reason")
-
-                                    yield GenerateResponse(
-                                        content=content,
-                                        model=self.model,
-                                        finish_reason=finish_reason,
-                                        raw_response=chunk
-                                    )
-
-                            except json.JSONDecodeError:
-                                continue
-
-        except Exception as e:
-            yield GenerateResponse(
-                content=f"Error: {str(e)}",
-                model=self.model,
-                finish_reason="error"
-            )
+                payload["extra_body"] = extra_body_updates
+
+        return payload
+
+    def _apply_provider_thinking_kwargs(
+        self,
+        *,
+        enabled: Optional[bool],
+        level: Optional[str],
+        kwargs: Dict[str, Any],
+    ) -> tuple[Dict[str, Any], bool]:
+        # vLLM exposes reasoning controls via `extra_body.chat_template_kwargs`.
+        # For Qwen3 templates specifically, the variable is commonly named `enable_thinking`.
+        _ = level
+        if enabled is None:
+            return kwargs, False
+
+        new_kwargs = dict(kwargs)
+        extra_body = new_kwargs.get("extra_body")
+        extra_body_dict: Dict[str, Any] = dict(extra_body) if isinstance(extra_body, dict) else {}
+        ctk = extra_body_dict.get("chat_template_kwargs")
+        ctk_dict: Dict[str, Any] = dict(ctk) if isinstance(ctk, dict) else {}
+        ctk_dict["enable_thinking"] = bool(enabled)
+        extra_body_dict["chat_template_kwargs"] = ctk_dict
+        new_kwargs["extra_body"] = extra_body_dict
+        return new_kwargs, True

     # vLLM-specific methods

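Note on the new `_mutate_payload` hunk above: the precedence of its dict merges is easiest to see with plain dictionaries. The sketch below is illustrative only, with made-up values rather than an abstractcore test; it mirrors the merge order in the diff, where computed guided-decoding and beam-search updates override a raw caller-supplied `extra_body`, and the merged updates override keys already present in `payload["extra_body"]`. The new `_apply_provider_thinking_kwargs` nests its flag one level deeper, under `extra_body["chat_template_kwargs"]["enable_thinking"]`.

    # Plain-dict sketch of the merge order in _mutate_payload (all values are hypothetical).
    payload = {
        "model": "Qwen/Qwen3-Coder-30B-A3B-Instruct",
        "extra_body": {"guided_grammar": "root ::= ..."},   # already present on the payload
    }
    kwargs = {
        "guided_json": {"type": "object"},                  # vLLM guided decoding
        "use_beam_search": True,
        "best_of": 4,
        "extra_body": {"min_tokens": 16, "best_of": 1},     # raw caller-supplied extra_body
    }

    # Computed updates win over the caller's raw extra_body ...
    updates = {"guided_json": kwargs["guided_json"], "use_beam_search": True, "best_of": 4}
    updates = {**kwargs["extra_body"], **updates}

    # ... and the merged updates win over keys already in payload["extra_body"].
    payload["extra_body"] = {**payload["extra_body"], **updates}

    assert payload["extra_body"]["best_of"] == 4                       # computed value beats the caller's 1
    assert payload["extra_body"]["guided_grammar"] == "root ::= ..."   # non-colliding keys survive

    # Thinking controls end up one level deeper, e.g.:
    # payload["extra_body"]["chat_template_kwargs"] = {"enable_thinking": False}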
@@ -674,207 +108,48 @@ class VLLMProvider(BaseProvider):

         Returns:
             Success message
-
-        Usage:
-            llm.load_adapter("sql-expert", "/models/adapters/sql-lora")
-            response = llm.generate("Query...", model="sql-expert")
         """
-        management_url = self.base_url.rstrip(
+        management_url = self.base_url.rstrip("/").replace("/v1", "")

         response = self.client.post(
             f"{management_url}/v1/load_lora_adapter",
             json={"lora_name": adapter_name, "lora_path": adapter_path},
-            headers=self._get_headers()
+            headers=self._get_headers(),
         )
-
+        self._raise_for_status(response, request_url=f"{management_url}/v1/load_lora_adapter")
         return f"Adapter '{adapter_name}' loaded successfully"

     def unload_adapter(self, adapter_name: str) -> str:
         """Unload a LoRA adapter from memory."""
-        management_url = self.base_url.rstrip(
+        management_url = self.base_url.rstrip("/").replace("/v1", "")

         response = self.client.post(
             f"{management_url}/v1/unload_lora_adapter",
             json={"lora_name": adapter_name},
-            headers=self._get_headers()
+            headers=self._get_headers(),
         )
-
+        self._raise_for_status(response, request_url=f"{management_url}/v1/unload_lora_adapter")
         return f"Adapter '{adapter_name}' unloaded successfully"

     def list_adapters(self) -> List[str]:
         """List currently loaded LoRA adapters."""
-        management_url = self.base_url.rstrip(
+        management_url = self.base_url.rstrip("/").replace("/v1", "")

         response = self.client.get(
             f"{management_url}/v1/lora_adapters",
-            headers=self._get_headers()
+            headers=self._get_headers(),
         )
-
-
+        self._raise_for_status(response, request_url=f"{management_url}/v1/lora_adapters")
+        data = response.json()
+        if isinstance(data, dict):
+            adapters = data.get("adapters", [])
+            return adapters if isinstance(adapters, list) else []
+        return []

     # Standard AbstractCore methods

     def get_capabilities(self) -> List[str]:
         """Get vLLM capabilities."""
         capabilities = ["streaming", "chat", "tools", "structured_output"]
-        # vLLM-specific capabilities
         capabilities.extend(["guided_decoding", "multi_lora", "beam_search"])
         return capabilities
-
-    def validate_config(self) -> bool:
-        """Validate vLLM connection."""
-        try:
-            response = self.client.get(f"{self.base_url}/models", headers=self._get_headers())
-            return response.status_code == 200
-        except:
-            return False
-
-    def _get_provider_max_tokens_param(self, kwargs: Dict[str, Any]) -> int:
-        """Get max tokens parameter for vLLM API."""
-        return kwargs.get("max_output_tokens", self.max_output_tokens)
-
-    def _update_http_client_timeout(self) -> None:
-        """Update HTTP client timeout when timeout is changed."""
-        if hasattr(self, 'client') and self.client is not None:
-            try:
-                self.client.close()
-
-                timeout_value = getattr(self, '_timeout', None)
-                if timeout_value is not None and timeout_value <= 0:
-                    timeout_value = None
-
-                self.client = httpx.Client(timeout=timeout_value)
-            except Exception as e:
-                if hasattr(self, 'logger'):
-                    self.logger.warning(f"Failed to update HTTP client timeout: {e}")
-                try:
-                    fallback_timeout = None
-                    try:
-                        from ..config.manager import get_config_manager
-
-                        fallback_timeout = float(get_config_manager().get_default_timeout())
-                    except Exception:
-                        fallback_timeout = 7200.0
-                    if isinstance(fallback_timeout, (int, float)) and float(fallback_timeout) <= 0:
-                        fallback_timeout = None
-                    self.client = httpx.Client(timeout=fallback_timeout)
-                except Exception:
-                    pass
-
-    def _normalize_model_name(self, model_name: str) -> str:
-        """Remove common provider prefixes from model name."""
-        for prefix in ["vllm/", "qwen/", "ollama/", "huggingface/"]:
-            if model_name.startswith(prefix):
-                model_name = model_name[len(prefix):]
-        return model_name
-
-    def _get_media_handler_for_model(self, model_name: str):
-        """Get appropriate media handler based on model vision capabilities."""
-        from ..media.handlers import OpenAIMediaHandler, LocalMediaHandler
-
-        clean_model_name = self._normalize_model_name(model_name)
-
-        try:
-            from ..architectures.detection import supports_vision
-            use_vision_handler = supports_vision(clean_model_name)
-        except Exception as e:
-            self.logger.debug(f"Vision detection failed: {e}, defaulting to LocalMediaHandler")
-            use_vision_handler = False
-
-        if use_vision_handler:
-            handler = OpenAIMediaHandler(self.model_capabilities, model_name=model_name)
-            self.logger.debug(f"Using OpenAIMediaHandler for vision model: {clean_model_name}")
-        else:
-            handler = LocalMediaHandler("vllm", self.model_capabilities, model_name=model_name)
-            self.logger.debug(f"Using LocalMediaHandler for model: {clean_model_name}")
-
-        return handler
-
-    def list_available_models(self, **kwargs) -> List[str]:
-        """
-        List available models from vLLM server.
-
-        Args:
-            **kwargs: Optional parameters including:
-                - base_url: vLLM server URL
-                - input_capabilities: List of ModelInputCapability enums to filter by input capability
-                - output_capabilities: List of ModelOutputCapability enums to filter by output capability
-
-        Returns:
-            List of model names, optionally filtered by capabilities
-        """
-        try:
-            from .model_capabilities import filter_models_by_capabilities
-
-            base_url = kwargs.get('base_url', self.base_url)
-
-            response = self.client.get(f"{base_url}/models", headers=self._get_headers(), timeout=5.0)
-            if response.status_code == 200:
-                data = response.json()
-                models = [model["id"] for model in data.get("data", [])]
-                models = sorted(models)
-
-                # Apply capability filtering if provided
-                input_capabilities = kwargs.get('input_capabilities')
-                output_capabilities = kwargs.get('output_capabilities')
-
-                if input_capabilities or output_capabilities:
-                    models = filter_models_by_capabilities(
-                        models,
-                        input_capabilities=input_capabilities,
-                        output_capabilities=output_capabilities
-                    )
-
-                return models
-            else:
-                self.logger.warning(f"vLLM API returned status {response.status_code}")
-                return []
-        except Exception as e:
-            self.logger.warning(f"Failed to list vLLM models: {e}")
-            return []
-
-    def embed(self, input_text: Union[str, List[str]], **kwargs) -> Dict[str, Any]:
-        """
-        Generate embeddings using vLLM's OpenAI-compatible embedding API.
-
-        Args:
-            input_text: Single string or list of strings to embed
-            **kwargs: Additional parameters (encoding_format, dimensions, user, etc.)
-
-        Returns:
-            Dict with embeddings in OpenAI-compatible format:
-            {
-                "object": "list",
-                "data": [{"object": "embedding", "embedding": [...], "index": 0}, ...],
-                "model": "model-name",
-                "usage": {"prompt_tokens": N, "total_tokens": N}
-            }
-        """
-        try:
-            payload = {
-                "input": input_text,
-                "model": self.model
-            }
-
-            if "encoding_format" in kwargs:
-                payload["encoding_format"] = kwargs["encoding_format"]
-            if "dimensions" in kwargs and kwargs["dimensions"]:
-                payload["dimensions"] = kwargs["dimensions"]
-            if "user" in kwargs:
-                payload["user"] = kwargs["user"]
-
-            response = self.client.post(
-                f"{self.base_url}/embeddings",
-                json=payload,
-                headers=self._get_headers()
-            )
-            response.raise_for_status()
-
-            result = response.json()
-            result["model"] = self.model
-
-            return result
-
-        except Exception as e:
-            self.logger.error(f"Failed to generate embeddings: {e}")
-            raise ProviderAPIError(f"vLLM embedding error: {str(e)}")