abstractcore 2.6.3__py3-none-any.whl → 2.6.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,823 @@
+ """
+ vLLM provider implementation with advanced features.
+
+ vLLM-specific features:
+ - Guided Decoding: guided_regex, guided_json, guided_grammar
+ - Multi-LoRA: load_adapter, unload_adapter, list_adapters
+ - Beam Search: best_of, use_beam_search
+ """
+
+ import os
+ import httpx
+ import json
+ import time
+ from typing import List, Dict, Any, Optional, Union, Iterator, AsyncIterator, Type
+
+ try:
+     from pydantic import BaseModel
+     PYDANTIC_AVAILABLE = True
+ except ImportError:
+     PYDANTIC_AVAILABLE = False
+     BaseModel = None
+
+ from .base import BaseProvider
+ from ..core.types import GenerateResponse
+ from ..exceptions import ProviderAPIError, ModelNotFoundError, format_model_error, format_provider_error
+ from ..tools import UniversalToolHandler, execute_tools
+ from ..events import EventType
+
+
+ class VLLMProvider(BaseProvider):
+     """vLLM provider for high-throughput GPU inference with advanced features."""
+
+     def __init__(self, model: str = "Qwen/Qwen3-Coder-30B-A3B-Instruct",
+                  base_url: Optional[str] = None,
+                  api_key: Optional[str] = None,
+                  **kwargs):
+         super().__init__(model, **kwargs)
+         self.provider = "vllm"
+
+         # Initialize tool handler
+         self.tool_handler = UniversalToolHandler(model)
+
+         # Base URL: parameter > VLLM_BASE_URL > default
+         self.base_url = (
+             base_url or
+             os.getenv("VLLM_BASE_URL") or
+             "http://localhost:8000/v1"
+         ).rstrip('/')
+
+         # API key: parameter > VLLM_API_KEY > "EMPTY"
+         self.api_key = api_key or os.getenv("VLLM_API_KEY") or "EMPTY"
+
+         # Get timeout value - None means unlimited timeout
+         timeout_value = getattr(self, '_timeout', None)
+         if timeout_value is not None and timeout_value <= 0:
+             timeout_value = None  # Invalid timeout becomes unlimited
+
+         try:
+             self.client = httpx.Client(timeout=timeout_value)
+         except Exception as e:
+             try:
+                 self.client = httpx.Client(timeout=300.0)
+             except Exception:
+                 raise RuntimeError(f"Failed to create HTTP client for vLLM: {e}")
+
+         self._async_client = None  # Lazy-loaded async client
+
+         # Validate model exists in vLLM
+         self._validate_model()
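+         # Resolution example (illustrative): with VLLM_BASE_URL=http://gpu-box:8000/v1
+         # exported and no base_url argument, requests go to that host; an explicit
+         # base_url argument always takes precedence over the environment variable:
+         #
+         #     llm = VLLMProvider(base_url="http://other-host:8000/v1")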
+
+     @property
+     def async_client(self):
+         """Lazy-load async HTTP client for native async operations."""
+         if self._async_client is None:
+             timeout_value = getattr(self, '_timeout', None)
+             if timeout_value is not None and timeout_value <= 0:
+                 timeout_value = None
+             self._async_client = httpx.AsyncClient(timeout=timeout_value)
+         return self._async_client
+
+     def _get_headers(self) -> Dict[str, str]:
+         """Get HTTP headers including API key if configured."""
+         headers = {"Content-Type": "application/json"}
+         if self.api_key and self.api_key != "EMPTY":
+             headers["Authorization"] = f"Bearer {self.api_key}"
+         return headers
+
+     def _validate_model(self):
+         """Validate that the model exists in vLLM."""
+         try:
+             available_models = self.list_available_models(base_url=self.base_url)
+             if available_models and self.model not in available_models:
+                 error_message = format_model_error("vLLM", self.model, available_models)
+                 raise ModelNotFoundError(error_message)
+         except httpx.ConnectError:
+             if hasattr(self, 'logger'):
+                 self.logger.debug(f"vLLM server not accessible at {self.base_url} - model validation skipped")
+             pass
+         except ModelNotFoundError:
+             raise
+         except Exception as e:
+             if hasattr(self, 'logger'):
+                 self.logger.debug(f"Model validation failed with error: {e} - continuing anyway")
+             pass
+
+     def unload(self) -> None:
+         """
+         Close HTTP client connection.
+
+         Note: vLLM manages model memory automatically.
+         This method only closes the HTTP client connection for cleanup.
+         """
+         try:
+             if hasattr(self, 'client') and self.client is not None:
+                 self.client.close()
+
+             if self._async_client is not None:
+                 import asyncio
+                 try:
+                     loop = asyncio.get_running_loop()
+                     loop.create_task(self._async_client.aclose())
+                 except RuntimeError:
+                     import asyncio
+                     asyncio.run(self._async_client.aclose())
+
+         except Exception as e:
+             if hasattr(self, 'logger'):
+                 self.logger.warning(f"Error during unload: {e}")
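+     # Illustrative cleanup pattern for short-lived providers (assumes a reachable
+     # server):
+     #
+     #     llm = VLLMProvider()
+     #     try:
+     #         print(llm.generate("Hello").content)
+     #     finally:
+     #         llm.unload()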
+
+     def generate(self, *args, **kwargs):
+         """Public generate method that includes telemetry."""
+         return self.generate_with_telemetry(*args, **kwargs)
+
+     def _generate_internal(self,
+                            prompt: str,
+                            messages: Optional[List[Dict[str, str]]] = None,
+                            system_prompt: Optional[str] = None,
+                            tools: Optional[List[Dict[str, Any]]] = None,
+                            media: Optional[List['MediaContent']] = None,
+                            stream: bool = False,
+                            response_model: Optional[Type[BaseModel]] = None,
+                            execute_tools: Optional[bool] = None,
+                            tool_call_tags: Optional[str] = None,
+                            # vLLM-specific parameters:
+                            guided_regex: Optional[str] = None,
+                            guided_json: Optional[Dict] = None,
+                            guided_grammar: Optional[str] = None,
+                            best_of: Optional[int] = None,
+                            use_beam_search: bool = False,
+                            **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
+         """Generate response using vLLM with advanced features."""
+
+         # Build messages for chat completions with tool support
+         chat_messages = []
+
+         # Add tools to system prompt if provided
+         enhanced_system_prompt = system_prompt
+         if tools and self.tool_handler.supports_prompted:
+             tool_prompt = self.tool_handler.format_tools_prompt(tools)
+             if enhanced_system_prompt:
+                 enhanced_system_prompt += f"\n\n{tool_prompt}"
+             else:
+                 enhanced_system_prompt = tool_prompt
+
+         # Add system message if provided
+         if enhanced_system_prompt:
+             chat_messages.append({
+                 "role": "system",
+                 "content": enhanced_system_prompt
+             })
+
+         # Add conversation history
+         if messages:
+             chat_messages.extend(messages)
+
+         # Handle media content if provided
+         if media:
+             user_message_text = prompt.strip() if prompt else ""
+             if not user_message_text and chat_messages:
+                 for msg in reversed(chat_messages):
+                     if msg.get("role") == "user" and msg.get("content"):
+                         user_message_text = msg["content"]
+                         break
+             try:
+                 processed_media = self._process_media_content(media)
+                 media_handler = self._get_media_handler_for_model(self.model)
+                 multimodal_message = media_handler.create_multimodal_message(user_message_text, processed_media)
+
+                 if isinstance(multimodal_message, str):
+                     if chat_messages and chat_messages[-1].get("role") == "user":
+                         chat_messages[-1]["content"] = multimodal_message
+                     else:
+                         chat_messages.append({"role": "user", "content": multimodal_message})
+                 else:
+                     if chat_messages and chat_messages[-1].get("role") == "user":
+                         chat_messages[-1] = multimodal_message
+                     else:
+                         chat_messages.append(multimodal_message)
+             except ImportError:
+                 self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
+                 if user_message_text:
+                     chat_messages.append({"role": "user", "content": user_message_text})
+             except Exception as e:
+                 self.logger.warning(f"Failed to process media content: {e}")
+                 if user_message_text:
+                     chat_messages.append({"role": "user", "content": user_message_text})
+
+         # Add prompt as separate message if provided
+         elif prompt and prompt.strip():
+             chat_messages.append({"role": "user", "content": prompt})
+
+         # Build request payload using unified system
+         generation_kwargs = self._prepare_generation_kwargs(**kwargs)
+         max_output_tokens = self._get_provider_max_tokens_param(generation_kwargs)
+
+         payload = {
+             "model": self.model,
+             "messages": chat_messages,
+             "stream": stream,
+             "temperature": kwargs.get("temperature", self.temperature),
+             "max_tokens": max_output_tokens,
+             "top_p": kwargs.get("top_p", 0.9),
+         }
+
+         # Add additional generation parameters if provided
+         if "frequency_penalty" in kwargs:
+             payload["frequency_penalty"] = kwargs["frequency_penalty"]
+         if "presence_penalty" in kwargs:
+             payload["presence_penalty"] = kwargs["presence_penalty"]
+
+         # Add seed if provided
+         seed_value = kwargs.get("seed", self.seed)
+         if seed_value is not None:
+             payload["seed"] = seed_value
+
+         # Build extra_body for vLLM-specific features
+         extra_body = {}
+
+         # Guided decoding
+         if guided_regex:
+             extra_body["guided_regex"] = guided_regex
+         if guided_json:
+             extra_body["guided_json"] = guided_json
+         if guided_grammar:
+             extra_body["guided_grammar"] = guided_grammar
+
+         # Beam search
+         if use_beam_search or best_of:
+             extra_body["use_beam_search"] = use_beam_search
+             if best_of:
+                 extra_body["best_of"] = best_of
+
+         # Add structured output support (standard OpenAI-compatible format)
+         if response_model and PYDANTIC_AVAILABLE:
+             json_schema = response_model.model_json_schema()
+             payload["response_format"] = {
+                 "type": "json_schema",
+                 "json_schema": {
+                     "name": response_model.__name__,
+                     "schema": json_schema
+                 }
+             }
+
+         # Add extra_body if we have vLLM-specific parameters
+         if extra_body:
+             payload["extra_body"] = extra_body
+
+         if stream:
+             return self._stream_generate(payload)
+         else:
+             response = self._single_generate(payload)
+
+             # Execute tools if enabled and tools are present
+             if self.execute_tools and tools and self.tool_handler.supports_prompted and response.content:
+                 response = self._handle_prompted_tool_execution(response, tools, execute_tools)
+
+             return response
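+     # Illustrative calls for the vLLM-specific arguments handled above (assumes a
+     # reachable server; the prompts and patterns are examples only):
+     #
+     #     llm = VLLMProvider()
+     #     ip = llm.generate("IP address of localhost?",
+     #                       guided_regex=r"\d{1,3}(\.\d{1,3}){3}")
+     #
+     #     from pydantic import BaseModel
+     #     class City(BaseModel):
+     #         name: str
+     #         population: int
+     #     city = llm.generate("Describe Paris as JSON.", response_model=City)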
+
+     def _single_generate(self, payload: Dict[str, Any]) -> GenerateResponse:
+         """Generate single response."""
+         try:
+             if not hasattr(self, 'client') or self.client is None:
+                 raise ProviderAPIError("HTTP client not initialized")
+
+             start_time = time.time()
+             response = self.client.post(
+                 f"{self.base_url}/chat/completions",
+                 json=payload,
+                 headers=self._get_headers()
+             )
+             response.raise_for_status()
+             gen_time = round((time.time() - start_time) * 1000, 1)
+
+             result = response.json()
+
+             # Extract response from OpenAI format
+             if "choices" in result and len(result["choices"]) > 0:
+                 choice = result["choices"][0]
+                 content = choice.get("message", {}).get("content", "")
+                 finish_reason = choice.get("finish_reason", "stop")
+             else:
+                 content = "No response generated"
+                 finish_reason = "error"
+
+             # Extract usage info
+             usage = result.get("usage", {})
+
+             return GenerateResponse(
+                 content=content,
+                 model=self.model,
+                 finish_reason=finish_reason,
+                 raw_response=result,
+                 usage={
+                     "input_tokens": usage.get("prompt_tokens", 0),
+                     "output_tokens": usage.get("completion_tokens", 0),
+                     "total_tokens": usage.get("total_tokens", 0),
+                     "prompt_tokens": usage.get("prompt_tokens", 0),
+                     "completion_tokens": usage.get("completion_tokens", 0)
+                 },
+                 gen_time=gen_time
+             )
+
+         except AttributeError as e:
+             if "'NoneType'" in str(e):
+                 raise ProviderAPIError(f"vLLM provider not properly initialized: {str(e)}")
+             else:
+                 raise ProviderAPIError(f"vLLM configuration error: {str(e)}")
+         except Exception as e:
+             error_str = str(e).lower()
+             if ('404' in error_str or 'not found' in error_str or 'model' in error_str) and ('not found' in error_str):
+                 try:
+                     available_models = self.list_available_models(base_url=self.base_url)
+                     error_message = format_model_error("vLLM", self.model, available_models)
+                     raise ModelNotFoundError(error_message)
+                 except ModelNotFoundError:
+                     # Propagate the detailed error instead of masking it below.
+                     raise
+                 except Exception:
+                     raise ModelNotFoundError(f"Model '{self.model}' not found in vLLM and could not fetch available models")
+             else:
+                 raise ProviderAPIError(f"vLLM API error: {str(e)}")
+
+     def _stream_generate(self, payload: Dict[str, Any]) -> Iterator[GenerateResponse]:
+         """Generate streaming response."""
+         try:
+             with self.client.stream(
+                 "POST",
+                 f"{self.base_url}/chat/completions",
+                 json=payload,
+                 headers=self._get_headers()
+             ) as response:
+                 response.raise_for_status()
+
+                 for line in response.iter_lines():
+                     if line:
+                         if isinstance(line, bytes):
+                             line = line.decode('utf-8')
+                         line = line.strip()
+
+                         if line.startswith("data: "):
+                             data = line[6:]  # Remove "data: " prefix
+
+                             if data == "[DONE]":
+                                 break
+
+                             try:
+                                 chunk = json.loads(data)
+
+                                 if "choices" in chunk and len(chunk["choices"]) > 0:
+                                     choice = chunk["choices"][0]
+                                     delta = choice.get("delta", {})
+                                     content = delta.get("content", "")
+                                     finish_reason = choice.get("finish_reason")
+
+                                     yield GenerateResponse(
+                                         content=content,
+                                         model=self.model,
+                                         finish_reason=finish_reason,
+                                         raw_response=chunk
+                                     )
+
+                             except json.JSONDecodeError:
+                                 continue
+
+         except Exception as e:
+             yield GenerateResponse(
+                 content=f"Error: {str(e)}",
+                 model=self.model,
+                 finish_reason="error"
+             )
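+     # Streaming is consumed as an iterator of partial GenerateResponse chunks
+     # (illustrative, given a reachable server):
+     #
+     #     llm = VLLMProvider()
+     #     for chunk in llm.generate("Tell me a short story", stream=True):
+     #         print(chunk.content, end="", flush=True)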
+
+     async def _agenerate_internal(self,
+                                   prompt: str,
+                                   messages: Optional[List[Dict[str, str]]] = None,
+                                   system_prompt: Optional[str] = None,
+                                   tools: Optional[List[Dict[str, Any]]] = None,
+                                   media: Optional[List['MediaContent']] = None,
+                                   stream: bool = False,
+                                   response_model: Optional[Type[BaseModel]] = None,
+                                   execute_tools: Optional[bool] = None,
+                                   tool_call_tags: Optional[str] = None,
+                                   # vLLM-specific parameters:
+                                   guided_regex: Optional[str] = None,
+                                   guided_json: Optional[Dict] = None,
+                                   guided_grammar: Optional[str] = None,
+                                   best_of: Optional[int] = None,
+                                   use_beam_search: bool = False,
+                                   **kwargs) -> Union[GenerateResponse, AsyncIterator[GenerateResponse]]:
+         """Native async implementation with vLLM features."""
+
+         # Build messages (same logic as sync)
+         chat_messages = []
+
+         enhanced_system_prompt = system_prompt
+         if tools and self.tool_handler.supports_prompted:
+             tool_prompt = self.tool_handler.format_tools_prompt(tools)
+             if enhanced_system_prompt:
+                 enhanced_system_prompt += f"\n\n{tool_prompt}"
+             else:
+                 enhanced_system_prompt = tool_prompt
+
+         if enhanced_system_prompt:
+             chat_messages.append({"role": "system", "content": enhanced_system_prompt})
+
+         if messages:
+             chat_messages.extend(messages)
+
+         if media:
+             user_message_text = prompt.strip() if prompt else ""
+             if not user_message_text and chat_messages:
+                 for msg in reversed(chat_messages):
+                     if msg.get("role") == "user" and msg.get("content"):
+                         user_message_text = msg["content"]
+                         break
+             try:
+                 processed_media = self._process_media_content(media)
+                 media_handler = self._get_media_handler_for_model(self.model)
+                 multimodal_message = media_handler.create_multimodal_message(user_message_text, processed_media)
+
+                 if isinstance(multimodal_message, str):
+                     if chat_messages and chat_messages[-1].get("role") == "user":
+                         chat_messages[-1]["content"] = multimodal_message
+                     else:
+                         chat_messages.append({"role": "user", "content": multimodal_message})
+                 else:
+                     if chat_messages and chat_messages[-1].get("role") == "user":
+                         chat_messages[-1] = multimodal_message
+                     else:
+                         chat_messages.append(multimodal_message)
+             except ImportError:
+                 self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
+                 if user_message_text:
+                     chat_messages.append({"role": "user", "content": user_message_text})
+             except Exception as e:
+                 self.logger.warning(f"Failed to process media content: {e}")
+                 if user_message_text:
+                     chat_messages.append({"role": "user", "content": user_message_text})
+
+         elif prompt and prompt.strip():
+             chat_messages.append({"role": "user", "content": prompt})
+
+         # Build request payload
+         generation_kwargs = self._prepare_generation_kwargs(**kwargs)
+         max_output_tokens = self._get_provider_max_tokens_param(generation_kwargs)
+
+         payload = {
+             "model": self.model,
+             "messages": chat_messages,
+             "stream": stream,
+             "temperature": kwargs.get("temperature", self.temperature),
+             "max_tokens": max_output_tokens,
+             "top_p": kwargs.get("top_p", 0.9),
+         }
+
+         if "frequency_penalty" in kwargs:
+             payload["frequency_penalty"] = kwargs["frequency_penalty"]
+         if "presence_penalty" in kwargs:
+             payload["presence_penalty"] = kwargs["presence_penalty"]
+
+         seed_value = kwargs.get("seed", self.seed)
+         if seed_value is not None:
+             payload["seed"] = seed_value
+
+         # vLLM-specific features
+         extra_body = {}
+
+         if guided_regex:
+             extra_body["guided_regex"] = guided_regex
+         if guided_json:
+             extra_body["guided_json"] = guided_json
+         if guided_grammar:
+             extra_body["guided_grammar"] = guided_grammar
+
+         if use_beam_search or best_of:
+             extra_body["use_beam_search"] = use_beam_search
+             if best_of:
+                 extra_body["best_of"] = best_of
+
+         if response_model and PYDANTIC_AVAILABLE:
+             json_schema = response_model.model_json_schema()
+             payload["response_format"] = {
+                 "type": "json_schema",
+                 "json_schema": {
+                     "name": response_model.__name__,
+                     "schema": json_schema
+                 }
+             }
+
+         if extra_body:
+             payload["extra_body"] = extra_body
+
+         if stream:
+             return self._async_stream_generate(payload)
+         else:
+             response = await self._async_single_generate(payload)
+
+             if self.execute_tools and tools and self.tool_handler.supports_prompted and response.content:
+                 response = self._handle_prompted_tool_execution(response, tools, execute_tools)
+
+             return response
+
+     async def _async_single_generate(self, payload: Dict[str, Any]) -> GenerateResponse:
+         """Native async single response generation."""
+         try:
+             start_time = time.time()
+             response = await self.async_client.post(
+                 f"{self.base_url}/chat/completions",
+                 json=payload,
+                 headers=self._get_headers()
+             )
+             response.raise_for_status()
+             gen_time = round((time.time() - start_time) * 1000, 1)
+
+             result = response.json()
+
+             if "choices" in result and len(result["choices"]) > 0:
+                 choice = result["choices"][0]
+                 content = choice.get("message", {}).get("content", "")
+                 finish_reason = choice.get("finish_reason", "stop")
+             else:
+                 content = "No response generated"
+                 finish_reason = "error"
+
+             usage = result.get("usage", {})
+
+             return GenerateResponse(
+                 content=content,
+                 model=self.model,
+                 finish_reason=finish_reason,
+                 raw_response=result,
+                 usage={
+                     "input_tokens": usage.get("prompt_tokens", 0),
+                     "output_tokens": usage.get("completion_tokens", 0),
+                     "total_tokens": usage.get("total_tokens", 0),
+                     "prompt_tokens": usage.get("prompt_tokens", 0),
+                     "completion_tokens": usage.get("completion_tokens", 0)
+                 },
+                 gen_time=gen_time
+             )
+
+         except Exception as e:
+             error_str = str(e).lower()
+             if ('404' in error_str or 'not found' in error_str or 'model' in error_str) and ('not found' in error_str):
+                 try:
+                     available_models = self.list_available_models(base_url=self.base_url)
+                     error_message = format_model_error("vLLM", self.model, available_models)
+                     raise ModelNotFoundError(error_message)
+                 except ModelNotFoundError:
+                     # Propagate the detailed error instead of masking it below.
+                     raise
+                 except Exception:
+                     raise ModelNotFoundError(f"Model '{self.model}' not found in vLLM")
+             else:
+                 raise ProviderAPIError(f"vLLM API error: {str(e)}")
+
+     async def _async_stream_generate(self, payload: Dict[str, Any]) -> AsyncIterator[GenerateResponse]:
+         """Native async streaming response generation."""
+         try:
+             async with self.async_client.stream(
+                 "POST",
+                 f"{self.base_url}/chat/completions",
+                 json=payload,
+                 headers=self._get_headers()
+             ) as response:
+                 response.raise_for_status()
+
+                 async for line in response.aiter_lines():
+                     if line:
+                         line = line.strip()
+
+                         if line.startswith("data: "):
+                             data = line[6:]  # Remove "data: " prefix
+
+                             if data == "[DONE]":
+                                 break
+
+                             try:
+                                 chunk = json.loads(data)
+
+                                 if "choices" in chunk and len(chunk["choices"]) > 0:
+                                     choice = chunk["choices"][0]
+                                     delta = choice.get("delta", {})
+                                     content = delta.get("content", "")
+                                     finish_reason = choice.get("finish_reason")
+
+                                     yield GenerateResponse(
+                                         content=content,
+                                         model=self.model,
+                                         finish_reason=finish_reason,
+                                         raw_response=chunk
+                                     )
+
+                             except json.JSONDecodeError:
+                                 continue
+
+         except Exception as e:
+             yield GenerateResponse(
+                 content=f"Error: {str(e)}",
+                 model=self.model,
+                 finish_reason="error"
+             )
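+     # Async usage sketch (illustrative; the public async entry point is assumed to
+     # come from BaseProvider and may be named differently):
+     #
+     #     import asyncio
+     #
+     #     async def main():
+     #         llm = VLLMProvider()
+     #         stream = await llm.generate_async("Hi there", stream=True)
+     #         async for chunk in stream:
+     #             print(chunk.content, end="")
+     #
+     #     asyncio.run(main())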
+
+     # vLLM-specific methods
+
+     def load_adapter(self, adapter_name: str, adapter_path: str) -> str:
+         """
+         Load a LoRA adapter dynamically without restarting the server.
+
+         Args:
+             adapter_name: Name to identify the adapter (e.g., "sql-expert")
+             adapter_path: Path to the LoRA adapter weights
+
+         Returns:
+             Success message
+
+         Usage:
+             llm.load_adapter("sql-expert", "/models/adapters/sql-lora")
+             response = llm.generate("Query...", model="sql-expert")
+         """
+         management_url = self.base_url.rstrip('/').replace('/v1', '')
+
+         response = self.client.post(
+             f"{management_url}/v1/load_lora_adapter",
+             json={"lora_name": adapter_name, "lora_path": adapter_path},
+             headers=self._get_headers()
+         )
+         response.raise_for_status()
+         return f"Adapter '{adapter_name}' loaded successfully"
+
+     def unload_adapter(self, adapter_name: str) -> str:
+         """Unload a LoRA adapter from memory."""
+         management_url = self.base_url.rstrip('/').replace('/v1', '')
+
+         response = self.client.post(
+             f"{management_url}/v1/unload_lora_adapter",
+             json={"lora_name": adapter_name},
+             headers=self._get_headers()
+         )
+         response.raise_for_status()
+         return f"Adapter '{adapter_name}' unloaded successfully"
+
+     def list_adapters(self) -> List[str]:
+         """List currently loaded LoRA adapters."""
+         management_url = self.base_url.rstrip('/').replace('/v1', '')
+
+         response = self.client.get(
+             f"{management_url}/v1/lora_adapters",
+             headers=self._get_headers()
+         )
+         response.raise_for_status()
+         return response.json().get("adapters", [])
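+     # Typical multi-LoRA flow (illustrative, mirroring the load_adapter docstring):
+     #
+     #     llm.load_adapter("sql-expert", "/models/adapters/sql-lora")
+     #     print(llm.list_adapters())
+     #     response = llm.generate("Write the query...", model="sql-expert")
+     #     llm.unload_adapter("sql-expert")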
+
+     # Standard AbstractCore methods
+
+     def get_capabilities(self) -> List[str]:
+         """Get vLLM capabilities."""
+         capabilities = ["streaming", "chat", "tools", "structured_output"]
+         # vLLM-specific capabilities
+         capabilities.extend(["guided_decoding", "multi_lora", "beam_search"])
+         return capabilities
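+     # Callers can feature-detect before relying on vLLM-only arguments
+     # (illustrative, given llm = VLLMProvider()):
+     #
+     #     if "guided_decoding" in llm.get_capabilities():
+     #         resp = llm.generate("Answer yes or no", guided_regex=r"yes|no")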
+
+     def validate_config(self) -> bool:
+         """Validate vLLM connection."""
+         try:
+             response = self.client.get(f"{self.base_url}/models", headers=self._get_headers())
+             return response.status_code == 200
+         except Exception:
+             return False
+
+     def _get_provider_max_tokens_param(self, kwargs: Dict[str, Any]) -> int:
+         """Get max tokens parameter for vLLM API."""
+         return kwargs.get("max_output_tokens", self.max_output_tokens)
+
+     def _update_http_client_timeout(self) -> None:
+         """Update HTTP client timeout when timeout is changed."""
+         if hasattr(self, 'client') and self.client is not None:
+             try:
+                 self.client.close()
+
+                 timeout_value = getattr(self, '_timeout', None)
+                 if timeout_value is not None and timeout_value <= 0:
+                     timeout_value = None
+
+                 self.client = httpx.Client(timeout=timeout_value)
+             except Exception as e:
+                 if hasattr(self, 'logger'):
+                     self.logger.warning(f"Failed to update HTTP client timeout: {e}")
+                 try:
+                     self.client = httpx.Client(timeout=300.0)
+                 except Exception:
+                     pass
+
+     def _normalize_model_name(self, model_name: str) -> str:
+         """Remove common provider prefixes from model name."""
+         for prefix in ["vllm/", "qwen/", "ollama/", "huggingface/"]:
+             if model_name.startswith(prefix):
+                 model_name = model_name[len(prefix):]
+         return model_name
+
+     def _get_media_handler_for_model(self, model_name: str):
+         """Get appropriate media handler based on model vision capabilities."""
+         from ..media.handlers import OpenAIMediaHandler, LocalMediaHandler
+
+         clean_model_name = self._normalize_model_name(model_name)
+
+         try:
+             from ..architectures.detection import supports_vision
+             use_vision_handler = supports_vision(clean_model_name)
+         except Exception as e:
+             self.logger.debug(f"Vision detection failed: {e}, defaulting to LocalMediaHandler")
+             use_vision_handler = False
+
+         if use_vision_handler:
+             handler = OpenAIMediaHandler(self.model_capabilities, model_name=model_name)
+             self.logger.debug(f"Using OpenAIMediaHandler for vision model: {clean_model_name}")
+         else:
+             handler = LocalMediaHandler("vllm", self.model_capabilities, model_name=model_name)
+             self.logger.debug(f"Using LocalMediaHandler for model: {clean_model_name}")
+
+         return handler
+
+     def list_available_models(self, **kwargs) -> List[str]:
+         """
+         List available models from vLLM server.
+
+         Args:
+             **kwargs: Optional parameters including:
+                 - base_url: vLLM server URL
+                 - input_capabilities: List of ModelInputCapability enums to filter by input capability
+                 - output_capabilities: List of ModelOutputCapability enums to filter by output capability
+
+         Returns:
+             List of model names, optionally filtered by capabilities
+         """
+         try:
+             from .model_capabilities import filter_models_by_capabilities
+
+             base_url = kwargs.get('base_url', self.base_url)
+
+             response = self.client.get(f"{base_url}/models", headers=self._get_headers(), timeout=5.0)
+             if response.status_code == 200:
+                 data = response.json()
+                 models = [model["id"] for model in data.get("data", [])]
+                 models = sorted(models)
+
+                 # Apply capability filtering if provided
+                 input_capabilities = kwargs.get('input_capabilities')
+                 output_capabilities = kwargs.get('output_capabilities')
+
+                 if input_capabilities or output_capabilities:
+                     models = filter_models_by_capabilities(
+                         models,
+                         input_capabilities=input_capabilities,
+                         output_capabilities=output_capabilities
+                     )
+
+                 return models
+             else:
+                 self.logger.warning(f"vLLM API returned status {response.status_code}")
+                 return []
+         except Exception as e:
+             self.logger.warning(f"Failed to list vLLM models: {e}")
+             return []
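+     # Illustrative capability filter; ModelInputCapability comes from the
+     # model_capabilities module referenced above and the IMAGE member name here
+     # is an assumption:
+     #
+     #     vision_models = llm.list_available_models(
+     #         input_capabilities=[ModelInputCapability.IMAGE],
+     #     )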
+
+     def embed(self, input_text: Union[str, List[str]], **kwargs) -> Dict[str, Any]:
+         """
+         Generate embeddings using vLLM's OpenAI-compatible embedding API.
+
+         Args:
+             input_text: Single string or list of strings to embed
+             **kwargs: Additional parameters (encoding_format, dimensions, user, etc.)
+
+         Returns:
+             Dict with embeddings in OpenAI-compatible format:
+             {
+                 "object": "list",
+                 "data": [{"object": "embedding", "embedding": [...], "index": 0}, ...],
+                 "model": "model-name",
+                 "usage": {"prompt_tokens": N, "total_tokens": N}
+             }
+         """
+         try:
+             payload = {
+                 "input": input_text,
+                 "model": self.model
+             }
+
+             if "encoding_format" in kwargs:
+                 payload["encoding_format"] = kwargs["encoding_format"]
+             if "dimensions" in kwargs and kwargs["dimensions"]:
+                 payload["dimensions"] = kwargs["dimensions"]
+             if "user" in kwargs:
+                 payload["user"] = kwargs["user"]
+
+             response = self.client.post(
+                 f"{self.base_url}/embeddings",
+                 json=payload,
+                 headers=self._get_headers()
+             )
+             response.raise_for_status()
+
+             result = response.json()
+             result["model"] = self.model
+
+             return result
+
+         except Exception as e:
+             self.logger.error(f"Failed to generate embeddings: {e}")
+             raise ProviderAPIError(f"vLLM embedding error: {str(e)}")