abstractcore-2.9.1-py3-none-any.whl → abstractcore-2.11.2-py3-none-any.whl

This diff compares two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (83)
  1. abstractcore/__init__.py +7 -27
  2. abstractcore/apps/extractor.py +33 -100
  3. abstractcore/apps/intent.py +19 -0
  4. abstractcore/apps/judge.py +20 -1
  5. abstractcore/apps/summarizer.py +20 -1
  6. abstractcore/architectures/detection.py +34 -1
  7. abstractcore/architectures/response_postprocessing.py +313 -0
  8. abstractcore/assets/architecture_formats.json +38 -8
  9. abstractcore/assets/model_capabilities.json +781 -160
  10. abstractcore/compression/__init__.py +1 -2
  11. abstractcore/compression/glyph_processor.py +6 -4
  12. abstractcore/config/main.py +31 -19
  13. abstractcore/config/manager.py +389 -11
  14. abstractcore/config/vision_config.py +5 -5
  15. abstractcore/core/interface.py +151 -3
  16. abstractcore/core/session.py +16 -10
  17. abstractcore/download.py +1 -1
  18. abstractcore/embeddings/manager.py +20 -6
  19. abstractcore/endpoint/__init__.py +2 -0
  20. abstractcore/endpoint/app.py +458 -0
  21. abstractcore/mcp/client.py +3 -1
  22. abstractcore/media/__init__.py +52 -17
  23. abstractcore/media/auto_handler.py +42 -22
  24. abstractcore/media/base.py +44 -1
  25. abstractcore/media/capabilities.py +12 -33
  26. abstractcore/media/enrichment.py +105 -0
  27. abstractcore/media/handlers/anthropic_handler.py +19 -28
  28. abstractcore/media/handlers/local_handler.py +124 -70
  29. abstractcore/media/handlers/openai_handler.py +19 -31
  30. abstractcore/media/processors/__init__.py +4 -2
  31. abstractcore/media/processors/audio_processor.py +57 -0
  32. abstractcore/media/processors/office_processor.py +8 -3
  33. abstractcore/media/processors/pdf_processor.py +46 -3
  34. abstractcore/media/processors/text_processor.py +22 -24
  35. abstractcore/media/processors/video_processor.py +58 -0
  36. abstractcore/media/types.py +97 -4
  37. abstractcore/media/utils/image_scaler.py +20 -2
  38. abstractcore/media/utils/video_frames.py +219 -0
  39. abstractcore/media/vision_fallback.py +136 -22
  40. abstractcore/processing/__init__.py +32 -3
  41. abstractcore/processing/basic_deepsearch.py +15 -10
  42. abstractcore/processing/basic_intent.py +3 -2
  43. abstractcore/processing/basic_judge.py +3 -2
  44. abstractcore/processing/basic_summarizer.py +1 -1
  45. abstractcore/providers/__init__.py +3 -1
  46. abstractcore/providers/anthropic_provider.py +95 -8
  47. abstractcore/providers/base.py +1516 -81
  48. abstractcore/providers/huggingface_provider.py +546 -69
  49. abstractcore/providers/lmstudio_provider.py +35 -923
  50. abstractcore/providers/mlx_provider.py +382 -35
  51. abstractcore/providers/model_capabilities.py +5 -1
  52. abstractcore/providers/ollama_provider.py +99 -15
  53. abstractcore/providers/openai_compatible_provider.py +406 -180
  54. abstractcore/providers/openai_provider.py +188 -44
  55. abstractcore/providers/openrouter_provider.py +76 -0
  56. abstractcore/providers/registry.py +61 -5
  57. abstractcore/providers/streaming.py +138 -33
  58. abstractcore/providers/vllm_provider.py +92 -817
  59. abstractcore/server/app.py +461 -13
  60. abstractcore/server/audio_endpoints.py +139 -0
  61. abstractcore/server/vision_endpoints.py +1319 -0
  62. abstractcore/structured/handler.py +316 -41
  63. abstractcore/tools/common_tools.py +5501 -2012
  64. abstractcore/tools/comms_tools.py +1641 -0
  65. abstractcore/tools/core.py +37 -7
  66. abstractcore/tools/handler.py +4 -9
  67. abstractcore/tools/parser.py +49 -2
  68. abstractcore/tools/tag_rewriter.py +2 -1
  69. abstractcore/tools/telegram_tdlib.py +407 -0
  70. abstractcore/tools/telegram_tools.py +261 -0
  71. abstractcore/utils/cli.py +1085 -72
  72. abstractcore/utils/token_utils.py +2 -0
  73. abstractcore/utils/truncation.py +29 -0
  74. abstractcore/utils/version.py +3 -4
  75. abstractcore/utils/vlm_token_calculator.py +12 -2
  76. abstractcore-2.11.2.dist-info/METADATA +562 -0
  77. abstractcore-2.11.2.dist-info/RECORD +133 -0
  78. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/WHEEL +1 -1
  79. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/entry_points.txt +1 -0
  80. abstractcore-2.9.1.dist-info/METADATA +0 -1190
  81. abstractcore-2.9.1.dist-info/RECORD +0 -119
  82. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/licenses/LICENSE +0 -0
  83. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/top_level.txt +0 -0
abstractcore/providers/vllm_provider.py
@@ -1,666 +1,100 @@
 """
 vLLM provider implementation with advanced features.

-vLLM-specific features:
+vLLM exposes an OpenAI-compatible API (chat completions, models, embeddings) plus
+additional management endpoints and request extensions:
 - Guided Decoding: guided_regex, guided_json, guided_grammar
-- Multi-LoRA: load_adapter, unload_adapter, list_adapters
 - Beam Search: best_of, use_beam_search
-"""
-
-import os
-import httpx
-import json
-import time
-from typing import List, Dict, Any, Optional, Union, Iterator, AsyncIterator, Type
-
-try:
-    from pydantic import BaseModel
-    PYDANTIC_AVAILABLE = True
-except ImportError:
-    PYDANTIC_AVAILABLE = False
-    BaseModel = None
-
-from .base import BaseProvider
-from ..core.types import GenerateResponse
-from ..exceptions import ProviderAPIError, ModelNotFoundError, format_model_error, format_provider_error
-from ..tools import UniversalToolHandler, execute_tools
-from ..events import EventType
-
-
-class VLLMProvider(BaseProvider):
-    """vLLM provider for high-throughput GPU inference with advanced features."""
-
-    def __init__(self, model: str = "Qwen/Qwen3-Coder-30B-A3B-Instruct",
-                 base_url: Optional[str] = None,
-                 api_key: Optional[str] = None,
-                 **kwargs):
-        super().__init__(model, **kwargs)
-        self.provider = "vllm"
-
-        # Initialize tool handler
-        self.tool_handler = UniversalToolHandler(model)
-
-        # Base URL: parameter > VLLM_BASE_URL > default
-        self.base_url = (
-            base_url or
-            os.getenv("VLLM_BASE_URL") or
-            "http://localhost:8000/v1"
-        ).rstrip('/')
-
-        # API key: parameter > VLLM_API_KEY > "EMPTY"
-        self.api_key = api_key or os.getenv("VLLM_API_KEY") or "EMPTY"
-
-        # Get timeout value - None means unlimited timeout
-        timeout_value = getattr(self, '_timeout', None)
-        if timeout_value is not None and timeout_value <= 0:
-            timeout_value = None  # Invalid timeout becomes unlimited
-
-        try:
-            self.client = httpx.Client(timeout=timeout_value)
-        except Exception as e:
-            try:
-                fallback_timeout = None
-                try:
-                    from ..config.manager import get_config_manager
-
-                    fallback_timeout = float(get_config_manager().get_default_timeout())
-                except Exception:
-                    fallback_timeout = 7200.0
-                if isinstance(fallback_timeout, (int, float)) and float(fallback_timeout) <= 0:
-                    fallback_timeout = None
-                self.client = httpx.Client(timeout=fallback_timeout)
-            except Exception:
-                raise RuntimeError(f"Failed to create HTTP client for vLLM: {e}")
-
-        self._async_client = None  # Lazy-loaded async client
-
-        # Validate model exists in vLLM
-        self._validate_model()
-
-    @property
-    def async_client(self):
-        """Lazy-load async HTTP client for native async operations."""
-        if self._async_client is None:
-            timeout_value = getattr(self, '_timeout', None)
-            if timeout_value is not None and timeout_value <= 0:
-                timeout_value = None
-            self._async_client = httpx.AsyncClient(timeout=timeout_value)
-        return self._async_client
-
-    def _get_headers(self) -> Dict[str, str]:
-        """Get HTTP headers including API key if configured."""
-        headers = {"Content-Type": "application/json"}
-        if self.api_key and self.api_key != "EMPTY":
-            headers["Authorization"] = f"Bearer {self.api_key}"
-        return headers
+- Multi-LoRA management: load_adapter, unload_adapter, list_adapters

-    def _validate_model(self):
-        """Validate that the model exists in vLLM."""
-        try:
-            available_models = self.list_available_models(base_url=self.base_url)
-            if available_models and self.model not in available_models:
-                error_message = format_model_error("vLLM", self.model, available_models)
-                raise ModelNotFoundError(error_message)
-        except httpx.ConnectError:
-            if hasattr(self, 'logger'):
-                self.logger.debug(f"vLLM server not accessible at {self.base_url} - model validation skipped")
-            pass
-        except ModelNotFoundError:
-            raise
-        except Exception as e:
-            if hasattr(self, 'logger'):
-                self.logger.debug(f"Model validation failed with error: {e} - continuing anyway")
-            pass
+This provider subclasses `OpenAICompatibleProvider` and injects vLLM-specific request
+extensions via `payload["extra_body"]`.

-    def unload(self) -> None:
-        """
-        Close HTTP client connection.
-
-        Note: vLLM manages model memory automatically.
-        This method only closes the HTTP client connection for cleanup.
-        """
-        try:
-            if hasattr(self, 'client') and self.client is not None:
-                self.client.close()
-
-            if self._async_client is not None:
-                import asyncio
-                try:
-                    loop = asyncio.get_running_loop()
-                    loop.create_task(self._async_client.aclose())
-                except RuntimeError:
-                    import asyncio
-                    asyncio.run(self._async_client.aclose())
-
-        except Exception as e:
-            if hasattr(self, 'logger'):
-                self.logger.warning(f"Error during unload: {e}")
-
-    def generate(self, *args, **kwargs):
-        """Public generate method that includes telemetry."""
-        return self.generate_with_telemetry(*args, **kwargs)
-
-    def _generate_internal(self,
-                           prompt: str,
-                           messages: Optional[List[Dict[str, str]]] = None,
-                           system_prompt: Optional[str] = None,
-                           tools: Optional[List[Dict[str, Any]]] = None,
-                           media: Optional[List['MediaContent']] = None,
-                           stream: bool = False,
-                           response_model: Optional[Type[BaseModel]] = None,
-                           execute_tools: Optional[bool] = None,
-                           tool_call_tags: Optional[str] = None,
-                           # vLLM-specific parameters:
-                           guided_regex: Optional[str] = None,
-                           guided_json: Optional[Dict] = None,
-                           guided_grammar: Optional[str] = None,
-                           best_of: Optional[int] = None,
-                           use_beam_search: bool = False,
-                           **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
-        """Generate response using vLLM with advanced features."""
-
-        # Build messages for chat completions with tool support
-        chat_messages = []
-
-        # Add tools to system prompt if provided
-        final_system_prompt = system_prompt
-        # Prefer native tools when the model supports them. Only inject a prompted tool list
-        # when native tool calling is not available.
-        if tools and self.tool_handler.supports_prompted and not self.tool_handler.supports_native:
-            include_tool_list = True
-            if final_system_prompt and "## Tools (session)" in final_system_prompt:
-                include_tool_list = False
-            tool_prompt = self.tool_handler.format_tools_prompt(tools, include_tool_list=include_tool_list)
-            if final_system_prompt:
-                final_system_prompt += f"\n\n{tool_prompt}"
-            else:
-                final_system_prompt = tool_prompt
-
-        # Add system message if provided
-        if final_system_prompt:
-            chat_messages.append({
-                "role": "system",
-                "content": final_system_prompt
-            })
-
-        # Add conversation history
-        if messages:
-            chat_messages.extend(messages)
-
-        # Handle media content if provided
-        if media:
-            user_message_text = prompt.strip() if prompt else ""
-            if not user_message_text and chat_messages:
-                for msg in reversed(chat_messages):
-                    if msg.get("role") == "user" and msg.get("content"):
-                        user_message_text = msg["content"]
-                        break
-            try:
-                processed_media = self._process_media_content(media)
-                media_handler = self._get_media_handler_for_model(self.model)
-                multimodal_message = media_handler.create_multimodal_message(user_message_text, processed_media)
-
-                if isinstance(multimodal_message, str):
-                    if chat_messages and chat_messages[-1].get("role") == "user":
-                        chat_messages[-1]["content"] = multimodal_message
-                    else:
-                        chat_messages.append({"role": "user", "content": multimodal_message})
-                else:
-                    if chat_messages and chat_messages[-1].get("role") == "user":
-                        chat_messages[-1] = multimodal_message
-                    else:
-                        chat_messages.append(multimodal_message)
-            except ImportError:
-                self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
-                if user_message_text:
-                    chat_messages.append({"role": "user", "content": user_message_text})
-            except Exception as e:
-                self.logger.warning(f"Failed to process media content: {e}")
-                if user_message_text:
-                    chat_messages.append({"role": "user", "content": user_message_text})
-
-        # Add prompt as separate message if provided
-        elif prompt and prompt.strip():
-            chat_messages.append({"role": "user", "content": prompt})
-
-        # Build request payload using unified system
-        generation_kwargs = self._prepare_generation_kwargs(**kwargs)
-        max_output_tokens = self._get_provider_max_tokens_param(generation_kwargs)
-
-        payload = {
-            "model": self.model,
-            "messages": chat_messages,
-            "stream": stream,
-            "temperature": kwargs.get("temperature", self.temperature),
-            "max_tokens": max_output_tokens,
-            "top_p": kwargs.get("top_p", 0.9),
-        }
+TODO(vllm): Add an opt-in integration test that exercises streaming + LoRA management endpoints
+once a reachable vLLM server is available in the test environment.
+"""

-        # Native tools (OpenAI-compatible): send structured tools/tool_choice when supported.
-        if tools and self.tool_handler.supports_native:
-            payload["tools"] = self.tool_handler.prepare_tools_for_native(tools)
-            payload["tool_choice"] = kwargs.get("tool_choice", "auto")
+from typing import Any, Dict, List, Optional

-        # Add additional generation parameters if provided
-        if "frequency_penalty" in kwargs:
-            payload["frequency_penalty"] = kwargs["frequency_penalty"]
-        if "presence_penalty" in kwargs:
-            payload["presence_penalty"] = kwargs["presence_penalty"]
+from .openai_compatible_provider import OpenAICompatibleProvider

-        # Add seed if provided
-        seed_value = kwargs.get("seed", self.seed)
-        if seed_value is not None:
-            payload["seed"] = seed_value

-        # Build extra_body for vLLM-specific features
-        extra_body = {}
+class VLLMProvider(OpenAICompatibleProvider):
+    """vLLM provider for high-throughput GPU inference with advanced features."""

-        # Guided decoding
+    PROVIDER_ID = "vllm"
+    PROVIDER_DISPLAY_NAME = "vLLM"
+    BASE_URL_ENV_VAR = "VLLM_BASE_URL"
+    API_KEY_ENV_VAR = "VLLM_API_KEY"  # Optional; some deployments sit behind auth
+    DEFAULT_BASE_URL = "http://localhost:8000/v1"
+
+    def __init__(
+        self,
+        model: str = "Qwen/Qwen3-Coder-30B-A3B-Instruct",
+        base_url: Optional[str] = None,
+        api_key: Optional[str] = None,
+        **kwargs,
+    ):
+        super().__init__(model=model, base_url=base_url, api_key=api_key, **kwargs)
+
+    def _mutate_payload(self, payload: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+        extra_body_updates: Dict[str, Any] = {}
+
+        guided_regex = kwargs.get("guided_regex")
         if guided_regex:
-            extra_body["guided_regex"] = guided_regex
-        if guided_json:
-            extra_body["guided_json"] = guided_json
-        if guided_grammar:
-            extra_body["guided_grammar"] = guided_grammar
-
-        # Beam search
-        if use_beam_search or best_of:
-            extra_body["use_beam_search"] = use_beam_search
-            if best_of:
-                extra_body["best_of"] = best_of
-
-        # Add structured output support (standard OpenAI-compatible format)
-        if response_model and PYDANTIC_AVAILABLE:
-            json_schema = response_model.model_json_schema()
-            payload["response_format"] = {
-                "type": "json_schema",
-                "json_schema": {
-                    "name": response_model.__name__,
-                    "schema": json_schema
-                }
-            }
-
-        # Add extra_body if we have vLLM-specific parameters
-        if extra_body:
-            payload["extra_body"] = extra_body
-
-        if stream:
-            return self._stream_generate(payload)
-        else:
-            response = self._single_generate(payload)
-
-            # Execute tools if enabled and tools are present
-            if self.execute_tools and tools and self.tool_handler.supports_prompted and response.content:
-                response = self._handle_prompted_tool_execution(response, tools, execute_tools)
-
-            return response
-
-    def _single_generate(self, payload: Dict[str, Any]) -> GenerateResponse:
-        """Generate single response."""
-        try:
-            if not hasattr(self, 'client') or self.client is None:
-                raise ProviderAPIError("HTTP client not initialized")
-
-            start_time = time.time()
-            request_url = f"{self.base_url}/chat/completions"
-            response = self.client.post(
-                request_url,
-                json=payload,
-                headers=self._get_headers()
-            )
-            response.raise_for_status()
-            gen_time = round((time.time() - start_time) * 1000, 1)
-
-            result = response.json()
-
-            # Extract response from OpenAI format
-            if "choices" in result and len(result["choices"]) > 0:
-                choice = result["choices"][0]
-                message = choice.get("message") or {}
-                if not isinstance(message, dict):
-                    message = {}
-
-                content = message.get("content", "")
-                tool_calls = message.get("tool_calls")
-                if tool_calls is None:
-                    tool_calls = choice.get("tool_calls")
-                finish_reason = choice.get("finish_reason", "stop")
-            else:
-                content = "No response generated"
-                tool_calls = None
-                finish_reason = "error"
-
-            # Extract usage info
-            usage = result.get("usage", {})
-
-            return GenerateResponse(
-                content=content,
-                model=self.model,
-                finish_reason=finish_reason,
-                raw_response=result,
-                tool_calls=tool_calls if isinstance(tool_calls, list) else None,
-                metadata={
-                    "_provider_request": {
-                        "url": request_url,
-                        "payload": payload,
-                    }
-                },
-                usage={
-                    "input_tokens": usage.get("prompt_tokens", 0),
-                    "output_tokens": usage.get("completion_tokens", 0),
-                    "total_tokens": usage.get("total_tokens", 0),
-                    "prompt_tokens": usage.get("prompt_tokens", 0),
-                    "completion_tokens": usage.get("completion_tokens", 0)
-                },
-                gen_time=gen_time
-            )
-
-        except AttributeError as e:
-            if "'NoneType'" in str(e):
-                raise ProviderAPIError(f"vLLM provider not properly initialized: {str(e)}")
-            else:
-                raise ProviderAPIError(f"vLLM configuration error: {str(e)}")
-        except Exception as e:
-            error_str = str(e).lower()
-            if ('404' in error_str or 'not found' in error_str or 'model' in error_str) and ('not found' in error_str):
-                try:
-                    available_models = self.list_available_models(base_url=self.base_url)
-                    error_message = format_model_error("vLLM", self.model, available_models)
-                    raise ModelNotFoundError(error_message)
-                except Exception:
-                    raise ModelNotFoundError(f"Model '{self.model}' not found in vLLM and could not fetch available models")
-            else:
-                raise
-
-    def _stream_generate(self, payload: Dict[str, Any]) -> Iterator[GenerateResponse]:
-        """Generate streaming response."""
-        try:
-            with self.client.stream(
-                "POST",
-                f"{self.base_url}/chat/completions",
-                json=payload,
-                headers=self._get_headers()
-            ) as response:
-                response.raise_for_status()
-
-                for line in response.iter_lines():
-                    if line:
-                        if isinstance(line, bytes):
-                            line = line.decode('utf-8')
-                        line = line.strip()
-
-                        if line.startswith("data: "):
-                            data = line[6:]  # Remove "data: " prefix
-
-                            if data == "[DONE]":
-                                break
-
-                            try:
-                                chunk = json.loads(data)
-
-                                if "choices" in chunk and len(chunk["choices"]) > 0:
-                                    choice = chunk["choices"][0]
-                                    delta = choice.get("delta", {})
-                                    if not isinstance(delta, dict):
-                                        delta = {}
-                                    content = delta.get("content", "")
-                                    tool_calls = delta.get("tool_calls") or choice.get("tool_calls")
-                                    finish_reason = choice.get("finish_reason")
-
-                                    yield GenerateResponse(
-                                        content=content,
-                                        model=self.model,
-                                        finish_reason=finish_reason,
-                                        tool_calls=tool_calls if isinstance(tool_calls, list) else None,
-                                        raw_response=chunk
-                                    )
-
-                            except json.JSONDecodeError:
-                                continue
-
-        except Exception as e:
-            yield GenerateResponse(
-                content=f"Error: {str(e)}",
-                model=self.model,
-                finish_reason="error"
-            )
-
-    async def _agenerate_internal(self,
-                                  prompt: str,
-                                  messages: Optional[List[Dict[str, str]]] = None,
-                                  system_prompt: Optional[str] = None,
-                                  tools: Optional[List[Dict[str, Any]]] = None,
-                                  media: Optional[List['MediaContent']] = None,
-                                  stream: bool = False,
-                                  response_model: Optional[Type[BaseModel]] = None,
-                                  execute_tools: Optional[bool] = None,
-                                  tool_call_tags: Optional[str] = None,
-                                  # vLLM-specific parameters:
-                                  guided_regex: Optional[str] = None,
-                                  guided_json: Optional[Dict] = None,
-                                  guided_grammar: Optional[str] = None,
-                                  best_of: Optional[int] = None,
-                                  use_beam_search: bool = False,
-                                  **kwargs) -> Union[GenerateResponse, AsyncIterator[GenerateResponse]]:
-        """Native async implementation with vLLM features."""
-
-        # Build messages (same logic as sync)
-        chat_messages = []
-
-        final_system_prompt = system_prompt
-        # Prefer native tools when available; only inject prompted tool syntax as fallback.
-        if tools and self.tool_handler.supports_prompted and not self.tool_handler.supports_native:
-            include_tool_list = True
-            if final_system_prompt and "## Tools (session)" in final_system_prompt:
-                include_tool_list = False
-            tool_prompt = self.tool_handler.format_tools_prompt(tools, include_tool_list=include_tool_list)
-            if final_system_prompt:
-                final_system_prompt += f"\n\n{tool_prompt}"
-            else:
-                final_system_prompt = tool_prompt
-
-        if final_system_prompt:
-            chat_messages.append({"role": "system", "content": final_system_prompt})
+            extra_body_updates["guided_regex"] = guided_regex

-        if messages:
-            chat_messages.extend(messages)
-
-        if media:
-            user_message_text = prompt.strip() if prompt else ""
-            if not user_message_text and chat_messages:
-                for msg in reversed(chat_messages):
-                    if msg.get("role") == "user" and msg.get("content"):
-                        user_message_text = msg["content"]
-                        break
-            try:
-                processed_media = self._process_media_content(media)
-                media_handler = self._get_media_handler_for_model(self.model)
-                multimodal_message = media_handler.create_multimodal_message(user_message_text, processed_media)
-
-                if isinstance(multimodal_message, str):
-                    if chat_messages and chat_messages[-1].get("role") == "user":
-                        chat_messages[-1]["content"] = multimodal_message
-                    else:
-                        chat_messages.append({"role": "user", "content": multimodal_message})
-                else:
-                    if chat_messages and chat_messages[-1].get("role") == "user":
-                        chat_messages[-1] = multimodal_message
-                    else:
-                        chat_messages.append(multimodal_message)
-            except ImportError:
-                self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
-                if user_message_text:
-                    chat_messages.append({"role": "user", "content": user_message_text})
-            except Exception as e:
-                self.logger.warning(f"Failed to process media content: {e}")
-                if user_message_text:
-                    chat_messages.append({"role": "user", "content": user_message_text})
-
-        elif prompt and prompt.strip():
-            chat_messages.append({"role": "user", "content": prompt})
-
-        # Build request payload
-        generation_kwargs = self._prepare_generation_kwargs(**kwargs)
-        max_output_tokens = self._get_provider_max_tokens_param(generation_kwargs)
-
-        payload = {
-            "model": self.model,
-            "messages": chat_messages,
-            "stream": stream,
-            "temperature": kwargs.get("temperature", self.temperature),
-            "max_tokens": max_output_tokens,
-            "top_p": kwargs.get("top_p", 0.9),
-        }
-
-        # Native tools (OpenAI-compatible): send structured tools/tool_choice when supported.
-        if tools and self.tool_handler.supports_native:
-            payload["tools"] = self.tool_handler.prepare_tools_for_native(tools)
-            payload["tool_choice"] = kwargs.get("tool_choice", "auto")
-
-        if "frequency_penalty" in kwargs:
-            payload["frequency_penalty"] = kwargs["frequency_penalty"]
-        if "presence_penalty" in kwargs:
-            payload["presence_penalty"] = kwargs["presence_penalty"]
-
-        seed_value = kwargs.get("seed", self.seed)
-        if seed_value is not None:
-            payload["seed"] = seed_value
-
-        # vLLM-specific features
-        extra_body = {}
-
-        if guided_regex:
-            extra_body["guided_regex"] = guided_regex
+        guided_json = kwargs.get("guided_json")
         if guided_json:
-            extra_body["guided_json"] = guided_json
+            extra_body_updates["guided_json"] = guided_json
+
+        guided_grammar = kwargs.get("guided_grammar")
         if guided_grammar:
-            extra_body["guided_grammar"] = guided_grammar
+            extra_body_updates["guided_grammar"] = guided_grammar

+        best_of = kwargs.get("best_of")
+        use_beam_search = kwargs.get("use_beam_search", False)
         if use_beam_search or best_of:
-            extra_body["use_beam_search"] = use_beam_search
-            if best_of:
-                extra_body["best_of"] = best_of
-
-        if response_model and PYDANTIC_AVAILABLE:
-            json_schema = response_model.model_json_schema()
-            payload["response_format"] = {
-                "type": "json_schema",
-                "json_schema": {
-                    "name": response_model.__name__,
-                    "schema": json_schema
-                }
-            }
-
-        if extra_body:
-            payload["extra_body"] = extra_body
-
-        if stream:
-            return self._async_stream_generate(payload)
-        else:
-            response = await self._async_single_generate(payload)
-
-            if self.execute_tools and tools and self.tool_handler.supports_prompted and response.content:
-                response = self._handle_prompted_tool_execution(response, tools, execute_tools)
-
-            return response
-
-    async def _async_single_generate(self, payload: Dict[str, Any]) -> GenerateResponse:
-        """Native async single response generation."""
-        try:
-            start_time = time.time()
-            response = await self.async_client.post(
-                f"{self.base_url}/chat/completions",
-                json=payload,
-                headers=self._get_headers()
-            )
-            response.raise_for_status()
-            gen_time = round((time.time() - start_time) * 1000, 1)
-
-            result = response.json()
-
-            if "choices" in result and len(result["choices"]) > 0:
-                choice = result["choices"][0]
-                content = choice.get("message", {}).get("content", "")
-                finish_reason = choice.get("finish_reason", "stop")
+            extra_body_updates["use_beam_search"] = bool(use_beam_search)
+            if best_of is not None:
+                extra_body_updates["best_of"] = best_of
+
+        # Allow callers to pass raw extra_body (merge with our computed updates).
+        caller_extra_body = kwargs.get("extra_body")
+        if isinstance(caller_extra_body, dict) and caller_extra_body:
+            extra_body_updates = {**caller_extra_body, **extra_body_updates}
+
+        if extra_body_updates:
+            existing = payload.get("extra_body")
+            if isinstance(existing, dict) and existing:
+                payload["extra_body"] = {**existing, **extra_body_updates}
             else:
-                content = "No response generated"
-                finish_reason = "error"
-
-            usage = result.get("usage", {})
-
-            return GenerateResponse(
-                content=content,
-                model=self.model,
-                finish_reason=finish_reason,
-                raw_response=result,
-                usage={
-                    "input_tokens": usage.get("prompt_tokens", 0),
-                    "output_tokens": usage.get("completion_tokens", 0),
-                    "total_tokens": usage.get("total_tokens", 0),
-                    "prompt_tokens": usage.get("prompt_tokens", 0),
-                    "completion_tokens": usage.get("completion_tokens", 0)
-                },
-                gen_time=gen_time
-            )
-
-        except Exception as e:
-            error_str = str(e).lower()
-            if ('404' in error_str or 'not found' in error_str or 'model' in error_str) and ('not found' in error_str):
-                try:
-                    available_models = self.list_available_models(base_url=self.base_url)
-                    error_message = format_model_error("vLLM", self.model, available_models)
-                    raise ModelNotFoundError(error_message)
-                except Exception:
-                    raise ModelNotFoundError(f"Model '{self.model}' not found in vLLM")
-            else:
-                raise ProviderAPIError(f"vLLM API error: {str(e)}")
-
-    async def _async_stream_generate(self, payload: Dict[str, Any]) -> AsyncIterator[GenerateResponse]:
-        """Native async streaming response generation."""
-        try:
-            async with self.async_client.stream(
-                "POST",
-                f"{self.base_url}/chat/completions",
-                json=payload,
-                headers=self._get_headers()
-            ) as response:
-                response.raise_for_status()
-
-                async for line in response.aiter_lines():
-                    if line:
-                        line = line.strip()
-
-                        if line.startswith("data: "):
-                            data = line[6:]  # Remove "data: " prefix
-
-                            if data == "[DONE]":
-                                break
-
-                            try:
-                                chunk = json.loads(data)
-
-                                if "choices" in chunk and len(chunk["choices"]) > 0:
-                                    choice = chunk["choices"][0]
-                                    delta = choice.get("delta", {})
-                                    content = delta.get("content", "")
-                                    finish_reason = choice.get("finish_reason")
-
-                                    yield GenerateResponse(
-                                        content=content,
-                                        model=self.model,
-                                        finish_reason=finish_reason,
-                                        raw_response=chunk
-                                    )
-
-                            except json.JSONDecodeError:
-                                continue
-
-        except Exception as e:
-            yield GenerateResponse(
-                content=f"Error: {str(e)}",
-                model=self.model,
-                finish_reason="error"
-            )
+                payload["extra_body"] = extra_body_updates
+
+        return payload
+
+    def _apply_provider_thinking_kwargs(
+        self,
+        *,
+        enabled: Optional[bool],
+        level: Optional[str],
+        kwargs: Dict[str, Any],
+    ) -> tuple[Dict[str, Any], bool]:
+        # vLLM exposes reasoning controls via `extra_body.chat_template_kwargs`.
+        # For Qwen3 templates specifically, the variable is commonly named `enable_thinking`.
+        _ = level
+        if enabled is None:
+            return kwargs, False
+
+        new_kwargs = dict(kwargs)
+        extra_body = new_kwargs.get("extra_body")
+        extra_body_dict: Dict[str, Any] = dict(extra_body) if isinstance(extra_body, dict) else {}
+        ctk = extra_body_dict.get("chat_template_kwargs")
+        ctk_dict: Dict[str, Any] = dict(ctk) if isinstance(ctk, dict) else {}
+        ctk_dict["enable_thinking"] = bool(enabled)
+        extra_body_dict["chat_template_kwargs"] = ctk_dict
+        new_kwargs["extra_body"] = extra_body_dict
+        return new_kwargs, True

     # vLLM-specific methods

@@ -674,207 +108,48 @@ class VLLMProvider(BaseProvider):

         Returns:
             Success message
-
-        Usage:
-            llm.load_adapter("sql-expert", "/models/adapters/sql-lora")
-            response = llm.generate("Query...", model="sql-expert")
         """
-        management_url = self.base_url.rstrip('/').replace('/v1', '')
+        management_url = self.base_url.rstrip("/").replace("/v1", "")

         response = self.client.post(
             f"{management_url}/v1/load_lora_adapter",
             json={"lora_name": adapter_name, "lora_path": adapter_path},
-            headers=self._get_headers()
+            headers=self._get_headers(),
         )
-        response.raise_for_status()
+        self._raise_for_status(response, request_url=f"{management_url}/v1/load_lora_adapter")
         return f"Adapter '{adapter_name}' loaded successfully"

     def unload_adapter(self, adapter_name: str) -> str:
         """Unload a LoRA adapter from memory."""
-        management_url = self.base_url.rstrip('/').replace('/v1', '')
+        management_url = self.base_url.rstrip("/").replace("/v1", "")

         response = self.client.post(
             f"{management_url}/v1/unload_lora_adapter",
             json={"lora_name": adapter_name},
-            headers=self._get_headers()
+            headers=self._get_headers(),
         )
-        response.raise_for_status()
+        self._raise_for_status(response, request_url=f"{management_url}/v1/unload_lora_adapter")
         return f"Adapter '{adapter_name}' unloaded successfully"

     def list_adapters(self) -> List[str]:
         """List currently loaded LoRA adapters."""
-        management_url = self.base_url.rstrip('/').replace('/v1', '')
+        management_url = self.base_url.rstrip("/").replace("/v1", "")

         response = self.client.get(
             f"{management_url}/v1/lora_adapters",
-            headers=self._get_headers()
+            headers=self._get_headers(),
         )
-        response.raise_for_status()
-        return response.json().get("adapters", [])
+        self._raise_for_status(response, request_url=f"{management_url}/v1/lora_adapters")
+        data = response.json()
+        if isinstance(data, dict):
+            adapters = data.get("adapters", [])
+            return adapters if isinstance(adapters, list) else []
+        return []

     # Standard AbstractCore methods

     def get_capabilities(self) -> List[str]:
         """Get vLLM capabilities."""
         capabilities = ["streaming", "chat", "tools", "structured_output"]
-        # vLLM-specific capabilities
         capabilities.extend(["guided_decoding", "multi_lora", "beam_search"])
         return capabilities
-
-    def validate_config(self) -> bool:
-        """Validate vLLM connection."""
-        try:
-            response = self.client.get(f"{self.base_url}/models", headers=self._get_headers())
-            return response.status_code == 200
-        except:
-            return False
-
-    def _get_provider_max_tokens_param(self, kwargs: Dict[str, Any]) -> int:
-        """Get max tokens parameter for vLLM API."""
-        return kwargs.get("max_output_tokens", self.max_output_tokens)
-
-    def _update_http_client_timeout(self) -> None:
-        """Update HTTP client timeout when timeout is changed."""
-        if hasattr(self, 'client') and self.client is not None:
-            try:
-                self.client.close()
-
-                timeout_value = getattr(self, '_timeout', None)
-                if timeout_value is not None and timeout_value <= 0:
-                    timeout_value = None
-
-                self.client = httpx.Client(timeout=timeout_value)
-            except Exception as e:
-                if hasattr(self, 'logger'):
-                    self.logger.warning(f"Failed to update HTTP client timeout: {e}")
-                try:
-                    fallback_timeout = None
-                    try:
-                        from ..config.manager import get_config_manager
-
-                        fallback_timeout = float(get_config_manager().get_default_timeout())
-                    except Exception:
-                        fallback_timeout = 7200.0
-                    if isinstance(fallback_timeout, (int, float)) and float(fallback_timeout) <= 0:
-                        fallback_timeout = None
-                    self.client = httpx.Client(timeout=fallback_timeout)
-                except Exception:
-                    pass
-
-    def _normalize_model_name(self, model_name: str) -> str:
-        """Remove common provider prefixes from model name."""
-        for prefix in ["vllm/", "qwen/", "ollama/", "huggingface/"]:
-            if model_name.startswith(prefix):
-                model_name = model_name[len(prefix):]
-        return model_name
-
-    def _get_media_handler_for_model(self, model_name: str):
-        """Get appropriate media handler based on model vision capabilities."""
-        from ..media.handlers import OpenAIMediaHandler, LocalMediaHandler
-
-        clean_model_name = self._normalize_model_name(model_name)
-
-        try:
-            from ..architectures.detection import supports_vision
-            use_vision_handler = supports_vision(clean_model_name)
-        except Exception as e:
-            self.logger.debug(f"Vision detection failed: {e}, defaulting to LocalMediaHandler")
-            use_vision_handler = False
-
-        if use_vision_handler:
-            handler = OpenAIMediaHandler(self.model_capabilities, model_name=model_name)
-            self.logger.debug(f"Using OpenAIMediaHandler for vision model: {clean_model_name}")
-        else:
-            handler = LocalMediaHandler("vllm", self.model_capabilities, model_name=model_name)
-            self.logger.debug(f"Using LocalMediaHandler for model: {clean_model_name}")
-
-        return handler
-
-    def list_available_models(self, **kwargs) -> List[str]:
-        """
-        List available models from vLLM server.
-
-        Args:
-            **kwargs: Optional parameters including:
-                - base_url: vLLM server URL
-                - input_capabilities: List of ModelInputCapability enums to filter by input capability
-                - output_capabilities: List of ModelOutputCapability enums to filter by output capability
-
-        Returns:
-            List of model names, optionally filtered by capabilities
-        """
-        try:
-            from .model_capabilities import filter_models_by_capabilities
-
-            base_url = kwargs.get('base_url', self.base_url)
-
-            response = self.client.get(f"{base_url}/models", headers=self._get_headers(), timeout=5.0)
-            if response.status_code == 200:
-                data = response.json()
-                models = [model["id"] for model in data.get("data", [])]
-                models = sorted(models)
-
-                # Apply capability filtering if provided
-                input_capabilities = kwargs.get('input_capabilities')
-                output_capabilities = kwargs.get('output_capabilities')
-
-                if input_capabilities or output_capabilities:
-                    models = filter_models_by_capabilities(
-                        models,
-                        input_capabilities=input_capabilities,
-                        output_capabilities=output_capabilities
-                    )
-
-                return models
-            else:
-                self.logger.warning(f"vLLM API returned status {response.status_code}")
-                return []
-        except Exception as e:
-            self.logger.warning(f"Failed to list vLLM models: {e}")
-            return []
-
-    def embed(self, input_text: Union[str, List[str]], **kwargs) -> Dict[str, Any]:
-        """
-        Generate embeddings using vLLM's OpenAI-compatible embedding API.
-
-        Args:
-            input_text: Single string or list of strings to embed
-            **kwargs: Additional parameters (encoding_format, dimensions, user, etc.)
-
-        Returns:
-            Dict with embeddings in OpenAI-compatible format:
-            {
-                "object": "list",
-                "data": [{"object": "embedding", "embedding": [...], "index": 0}, ...],
-                "model": "model-name",
-                "usage": {"prompt_tokens": N, "total_tokens": N}
-            }
-        """
-        try:
-            payload = {
-                "input": input_text,
-                "model": self.model
-            }
-
-            if "encoding_format" in kwargs:
-                payload["encoding_format"] = kwargs["encoding_format"]
-            if "dimensions" in kwargs and kwargs["dimensions"]:
-                payload["dimensions"] = kwargs["dimensions"]
-            if "user" in kwargs:
-                payload["user"] = kwargs["user"]
-
-            response = self.client.post(
-                f"{self.base_url}/embeddings",
-                json=payload,
-                headers=self._get_headers()
-            )
-            response.raise_for_status()
-
-            result = response.json()
-            result["model"] = self.model
-
-            return result
-
-        except Exception as e:
-            self.logger.error(f"Failed to generate embeddings: {e}")
-            raise ProviderAPIError(f"vLLM embedding error: {str(e)}")
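The rewritten provider no longer builds the chat-completions request by hand; `_mutate_payload` folds the vLLM request extensions into `payload["extra_body"]`. A minimal usage sketch, assuming the `generate()` inherited from `OpenAICompatibleProvider` forwards these keyword arguments to that hook and returns a response object with a `content` attribute (both consistent with the removed implementation, neither shown in this diff):

```python
# Sketch only: kwargs forwarding and response.content are assumptions noted above.
from abstractcore.providers.vllm_provider import VLLMProvider

llm = VLLMProvider(
    model="Qwen/Qwen3-Coder-30B-A3B-Instruct",  # default model from the diff
    base_url="http://localhost:8000/v1",        # DEFAULT_BASE_URL from the diff
)

# Guided decoding: vLLM constrains the completion to this JSON schema.
response = llm.generate(
    "Extract the city and country from: 'Paris is the capital of France.'",
    guided_json={
        "type": "object",
        "properties": {"city": {"type": "string"}, "country": {"type": "string"}},
        "required": ["city", "country"],
    },
)
print(response.content)

# Beam search: both keys end up in extra_body via _mutate_payload().
response = llm.generate("Summarize vLLM in one sentence.", use_beam_search=True, best_of=4)
print(response.content)
```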
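`_mutate_payload` also defines a precedence order when several sources of `extra_body` collide: explicit keyword arguments win over a caller-supplied `extra_body`, and both are merged on top of anything already present in the payload. A standalone re-statement of that merge (hypothetical `mutate_payload` helper mirroring the method in the diff) makes the precedence easy to check:

```python
from typing import Any, Dict


def mutate_payload(payload: Dict[str, Any], **kwargs: Any) -> Dict[str, Any]:
    """Re-statement of VLLMProvider._mutate_payload from the diff (not the shipped code)."""
    updates: Dict[str, Any] = {}
    for key in ("guided_regex", "guided_json", "guided_grammar"):
        if kwargs.get(key):
            updates[key] = kwargs[key]

    best_of = kwargs.get("best_of")
    use_beam_search = kwargs.get("use_beam_search", False)
    if use_beam_search or best_of:
        updates["use_beam_search"] = bool(use_beam_search)
        if best_of is not None:
            updates["best_of"] = best_of

    caller = kwargs.get("extra_body")
    if isinstance(caller, dict) and caller:
        updates = {**caller, **updates}  # explicit kwargs override raw extra_body keys

    if updates:
        existing = payload.get("extra_body")
        if isinstance(existing, dict) and existing:
            payload["extra_body"] = {**existing, **updates}
        else:
            payload["extra_body"] = updates
    return payload


payload = mutate_payload(
    {"model": "m", "messages": []},
    guided_regex=r"[A-Z]{3}-\d{4}",
    extra_body={"guided_regex": "overridden", "top_k": 20},
)
assert payload["extra_body"] == {"guided_regex": r"[A-Z]{3}-\d{4}", "top_k": 20}
```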
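`_apply_provider_thinking_kwargs` shows where vLLM's reasoning toggle lives: `extra_body.chat_template_kwargs.enable_thinking`, the variable name used by Qwen3-style chat templates. The free-standing re-statement below (hypothetical `apply_thinking`) demonstrates that a caller's existing `extra_body` keys survive the merge:

```python
from typing import Any, Dict, Optional, Tuple


def apply_thinking(enabled: Optional[bool], kwargs: Dict[str, Any]) -> Tuple[Dict[str, Any], bool]:
    """Re-statement of VLLMProvider._apply_provider_thinking_kwargs from the diff."""
    if enabled is None:
        return kwargs, False
    new_kwargs = dict(kwargs)
    extra_body = new_kwargs.get("extra_body")
    extra_body_dict = dict(extra_body) if isinstance(extra_body, dict) else {}
    ctk = extra_body_dict.get("chat_template_kwargs")
    ctk_dict = dict(ctk) if isinstance(ctk, dict) else {}
    ctk_dict["enable_thinking"] = bool(enabled)
    extra_body_dict["chat_template_kwargs"] = ctk_dict
    new_kwargs["extra_body"] = extra_body_dict
    return new_kwargs, True


merged, handled = apply_thinking(False, {"extra_body": {"guided_regex": r"\d+"}})
assert handled is True
assert merged["extra_body"] == {
    "guided_regex": r"\d+",
    "chat_template_kwargs": {"enable_thinking": False},
}
```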
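The multi-LoRA management methods keep targeting vLLM's `/v1/load_lora_adapter`, `/v1/unload_lora_adapter`, and `/v1/lora_adapters` endpoints on the management URL; only the quoting and error handling changed. A usage sketch with the adapter name and path taken from the removed docstring (the vLLM server must be launched with runtime LoRA updating enabled for these endpoints to respond):

```python
# Sketch only: "sql-expert" and the adapter path are illustrative values from the
# removed docstring; a reachable vLLM server with LoRA support is assumed.
from abstractcore.providers.vllm_provider import VLLMProvider

llm = VLLMProvider(model="Qwen/Qwen3-Coder-30B-A3B-Instruct")

print(llm.load_adapter("sql-expert", "/models/adapters/sql-lora"))
print(llm.list_adapters())

# Per the removed docstring, a loaded adapter was then addressed by name at generation time:
# response = llm.generate("Query...", model="sql-expert")

print(llm.unload_adapter("sql-expert"))
```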