abstractcore 2.6.2__py3-none-any.whl → 2.6.5__py3-none-any.whl

This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
@@ -0,0 +1,829 @@
+ """
+ Generic OpenAI-compatible provider for any OpenAI-compatible API endpoint.
+
+ Supports any server implementing the OpenAI API format:
+ - llama.cpp server
+ - text-generation-webui (with OpenAI extension)
+ - LocalAI
+ - FastChat
+ - Aphrodite
+ - SGLang
+ - Custom deployments and proxies
+ """
+
+ import os
+ import httpx
+ import json
+ import time
+ from typing import List, Dict, Any, Optional, Union, Iterator, AsyncIterator, Type
+
+ try:
+     from pydantic import BaseModel
+     PYDANTIC_AVAILABLE = True
+ except ImportError:
+     PYDANTIC_AVAILABLE = False
+     BaseModel = None
+ from .base import BaseProvider
+ from ..core.types import GenerateResponse
+ from ..exceptions import ProviderAPIError, ModelNotFoundError, format_model_error, format_provider_error
+ from ..tools import UniversalToolHandler, execute_tools
+ from ..events import EventType
+
+
+ class OpenAICompatibleProvider(BaseProvider):
+     """
+     Generic provider for any OpenAI-compatible API endpoint.
+
+     Works with any server implementing the OpenAI API format:
+     - llama.cpp server
+     - text-generation-webui (OpenAI extension)
+     - LocalAI
+     - FastChat
+     - Aphrodite
+     - SGLang
+     - Custom deployments and proxies
+
+     Usage:
+         # Basic usage
+         llm = create_llm("openai-compatible",
+                          base_url="http://localhost:8080/v1",
+                          model="llama-3.1-8b")
+
+         # With API key (optional for many local servers)
+         llm = create_llm("openai-compatible",
+                          base_url="http://localhost:8080/v1",
+                          model="my-model",
+                          api_key="your-key")
+
+         # Environment variable configuration
+         export OPENAI_COMPATIBLE_BASE_URL="http://localhost:8080/v1"
+         export OPENAI_COMPATIBLE_API_KEY="your-key" # Optional
+         llm = create_llm("openai-compatible", model="my-model")
+     """
+
+     def __init__(self, model: str = "default", base_url: Optional[str] = None,
+                  api_key: Optional[str] = None, **kwargs):
+         super().__init__(model, **kwargs)
+         self.provider = "openai-compatible"
+
+         # Initialize tool handler
+         self.tool_handler = UniversalToolHandler(model)
+
+         # Base URL priority: parameter > OPENAI_COMPATIBLE_BASE_URL > default
+         self.base_url = (
+             base_url or
+             os.getenv("OPENAI_COMPATIBLE_BASE_URL") or
+             "http://localhost:8080/v1"
+         ).rstrip('/')
+
+         # API key: OPTIONAL (many local servers don't require authentication)
+         # Priority: parameter > OPENAI_COMPATIBLE_API_KEY > None
+         self.api_key = api_key or os.getenv("OPENAI_COMPATIBLE_API_KEY")
+
+         # Get timeout value - None means unlimited timeout
+         timeout_value = getattr(self, '_timeout', None)
+         # Validate timeout if provided (None is allowed for unlimited)
+         if timeout_value is not None and timeout_value <= 0:
+             timeout_value = None # Invalid timeout becomes unlimited
+
+         try:
+             self.client = httpx.Client(timeout=timeout_value)
+         except Exception as e:
+             # Fallback with default timeout if client creation fails
+             try:
+                 self.client = httpx.Client(timeout=300.0)
+             except Exception:
+                 raise RuntimeError(f"Failed to create HTTP client for OpenAI-compatible provider: {e}")
+
+         self._async_client = None # Lazy-loaded async client
+
+         # Validate model exists on server
+         self._validate_model()
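
Taken together, the constructor resolves its configuration as: explicit arguments first, then the OPENAI_COMPATIBLE_* environment variables, then the localhost default, with the API key simply left unset when neither source provides one. A minimal usage sketch, assuming the create_llm factory shown in the class docstring and an abstractcore import path (both are assumptions, not part of this diff):

# Illustrative values only; any local OpenAI-compatible server will do.
from abstractcore import create_llm  # assumed import path

llm = create_llm(
    "openai-compatible",
    base_url="http://localhost:8080/v1",  # falls back to OPENAI_COMPATIBLE_BASE_URL, then the default
    model="llama-3.1-8b",
    # api_key omitted: most local servers accept unauthenticated requests
)
print(llm.generate("Say hello in one sentence.").content)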
+
+     @property
+     def async_client(self):
+         """Lazy-load async HTTP client for native async operations."""
+         if self._async_client is None:
+             timeout_value = getattr(self, '_timeout', None)
+             if timeout_value is not None and timeout_value <= 0:
+                 timeout_value = None
+             self._async_client = httpx.AsyncClient(timeout=timeout_value)
+         return self._async_client
+
+     def _get_headers(self) -> Dict[str, str]:
+         """Get HTTP headers with optional API key authentication."""
+         headers = {"Content-Type": "application/json"}
+         # Only add Authorization header if api_key is provided and truthy
+         if self.api_key:
+             headers["Authorization"] = f"Bearer {self.api_key}"
+         return headers
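
In other words, every request is sent as JSON and a Bearer token is attached only when an API key was resolved. A self-contained sketch of the same header logic (build_headers is a stand-in name, not part of this module):

def build_headers(api_key=None):
    # Mirrors _get_headers(): JSON content type always, Authorization only for truthy keys.
    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    return headers

assert build_headers() == {"Content-Type": "application/json"}
assert build_headers("secret")["Authorization"] == "Bearer secret"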
+
+     def _validate_model(self):
+         """Validate that the model exists on the OpenAI-compatible server"""
+         # Skip validation for "default" placeholder (used by registry for model listing)
+         if self.model == "default":
+             return
+
+         try:
+             # Use base_url as-is (should include /v1) for model discovery
+             available_models = self.list_available_models(base_url=self.base_url)
+             if available_models and self.model not in available_models:
+                 error_message = format_model_error("OpenAI-compatible server", self.model, available_models)
+                 raise ModelNotFoundError(error_message)
+         except httpx.ConnectError:
+             # Server not running - will fail later when trying to generate
+             if hasattr(self, 'logger'):
+                 self.logger.debug(f"OpenAI-compatible server not accessible at {self.base_url} - model validation skipped")
+             pass
+         except ModelNotFoundError:
+             # Re-raise model not found errors
+             raise
+         except Exception as e:
+             # Other errors (like timeout, None type errors) - continue, will fail later if needed
+             if hasattr(self, 'logger'):
+                 self.logger.debug(f"Model validation failed with error: {e} - continuing anyway")
+             pass
+
+     def unload(self) -> None:
+         """
+         Close HTTP client connection.
+
+         Note: Most OpenAI-compatible servers manage model memory automatically.
+         This method only closes the HTTP client connection for cleanup.
+         """
+         try:
+             # Close the HTTP client connection
+             if hasattr(self, 'client') and self.client is not None:
+                 self.client.close()
+
+             # Close async client if it was created
+             if self._async_client is not None:
+                 import asyncio
+                 try:
+                     loop = asyncio.get_running_loop()
+                     loop.create_task(self._async_client.aclose())
+                 except RuntimeError:
+                     # No running loop
+                     import asyncio
+                     asyncio.run(self._async_client.aclose())
+
+         except Exception as e:
+             # Log but don't raise - unload should be best-effort
+             if hasattr(self, 'logger'):
+                 self.logger.warning(f"Error during unload: {e}")
+
+     def generate(self, *args, **kwargs):
+         """Public generate method that includes telemetry"""
+         return self.generate_with_telemetry(*args, **kwargs)
+
+     def _generate_internal(self,
+                            prompt: str,
+                            messages: Optional[List[Dict[str, str]]] = None,
+                            system_prompt: Optional[str] = None,
+                            tools: Optional[List[Dict[str, Any]]] = None,
+                            media: Optional[List['MediaContent']] = None,
+                            stream: bool = False,
+                            response_model: Optional[Type[BaseModel]] = None,
+                            execute_tools: Optional[bool] = None,
+                            tool_call_tags: Optional[str] = None,
+                            **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
+         """Generate response using OpenAI-compatible server"""
+
+         # Build messages for chat completions with tool support
+         chat_messages = []
+
+         # Add tools to system prompt if provided
+         enhanced_system_prompt = system_prompt
+         if tools and self.tool_handler.supports_prompted:
+             tool_prompt = self.tool_handler.format_tools_prompt(tools)
+             if enhanced_system_prompt:
+                 enhanced_system_prompt += f"\n\n{tool_prompt}"
+             else:
+                 enhanced_system_prompt = tool_prompt
+
+         # Add system message if provided
+         if enhanced_system_prompt:
+             chat_messages.append({
+                 "role": "system",
+                 "content": enhanced_system_prompt
+             })
+
+         # Add conversation history
+         if messages:
+             chat_messages.extend(messages)
+
+         # Handle media content regardless of prompt (media can be used with messages too)
+         if media:
+             # Get the last user message content to combine with media
+             user_message_text = prompt.strip() if prompt else ""
+             if not user_message_text and chat_messages:
+                 # If no prompt, try to get text from the last user message
+                 for msg in reversed(chat_messages):
+                     if msg.get("role") == "user" and msg.get("content"):
+                         user_message_text = msg["content"]
+                         break
+             try:
+                 # Process media files into MediaContent objects first
+                 processed_media = self._process_media_content(media)
+
+                 # Use capability-based media handler selection
+                 media_handler = self._get_media_handler_for_model(self.model)
+
+                 # Create multimodal message combining text and processed media
+                 multimodal_message = media_handler.create_multimodal_message(user_message_text, processed_media)
+
+                 # For OpenAI-compatible servers, we might get a string (embedded text) or dict (structured)
+                 if isinstance(multimodal_message, str):
+                     # Replace the last user message with the multimodal message, or add new one
+                     if chat_messages and chat_messages[-1].get("role") == "user":
+                         chat_messages[-1]["content"] = multimodal_message
+                     else:
+                         chat_messages.append({
+                             "role": "user",
+                             "content": multimodal_message
+                         })
+                 else:
+                     if chat_messages and chat_messages[-1].get("role") == "user":
+                         # Replace last user message with structured multimodal message
+                         chat_messages[-1] = multimodal_message
+                     else:
+                         chat_messages.append(multimodal_message)
+             except ImportError:
+                 self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
+                 if user_message_text:
+                     chat_messages.append({
+                         "role": "user",
+                         "content": user_message_text
+                     })
+             except Exception as e:
+                 self.logger.warning(f"Failed to process media content: {e}")
+                 if user_message_text:
+                     chat_messages.append({
+                         "role": "user",
+                         "content": user_message_text
+                     })
+
+         # Add prompt as separate message if provided (for backward compatibility)
+         elif prompt and prompt.strip():
+             chat_messages.append({
+                 "role": "user",
+                 "content": prompt
+             })
+
+         # Build request payload using unified system
+         generation_kwargs = self._prepare_generation_kwargs(**kwargs)
+         max_output_tokens = self._get_provider_max_tokens_param(generation_kwargs)
+
+         payload = {
+             "model": self.model,
+             "messages": chat_messages,
+             "stream": stream,
+             "temperature": kwargs.get("temperature", self.temperature),
+             "max_tokens": max_output_tokens,
+             "top_p": kwargs.get("top_p", 0.9),
+         }
+
+         # Add additional generation parameters if provided (OpenAI-compatible)
+         if "frequency_penalty" in kwargs:
+             payload["frequency_penalty"] = kwargs["frequency_penalty"]
+         if "presence_penalty" in kwargs:
+             payload["presence_penalty"] = kwargs["presence_penalty"]
+         if "repetition_penalty" in kwargs:
+             # Some models support repetition_penalty directly
+             payload["repetition_penalty"] = kwargs["repetition_penalty"]
+
+         # Add seed if provided (many servers support seed via OpenAI-compatible API)
+         seed_value = kwargs.get("seed", self.seed)
+         if seed_value is not None:
+             payload["seed"] = seed_value
+
+         # Add structured output support (OpenAI-compatible format)
+         # Many servers support native structured outputs using the response_format parameter
+         if response_model and PYDANTIC_AVAILABLE:
+             json_schema = response_model.model_json_schema()
+             payload["response_format"] = {
+                 "type": "json_schema",
+                 "json_schema": {
+                     "name": response_model.__name__,
+                     "schema": json_schema
+                 }
+             }
+
+         if stream:
+             # Return streaming response - BaseProvider will handle tag rewriting via UnifiedStreamProcessor
+             return self._stream_generate(payload)
+         else:
+             response = self._single_generate(payload)
+
+             # Execute tools if enabled and tools are present
+             if self.execute_tools and tools and self.tool_handler.supports_prompted and response.content:
+                 response = self._handle_prompted_tool_execution(response, tools, execute_tools)
+
+             return response
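
Because the payload follows the OpenAI chat-completions schema, structured output can be requested by passing a Pydantic class as response_model; it is translated into the response_format=json_schema block built above. A sketch, assuming the public generate() forwards response_model to this method and that the server supports json_schema response formats:

# Hypothetical structured-output call; class, prompt, and import path are illustrative.
from pydantic import BaseModel
from abstractcore import create_llm  # assumed import path

class CityInfo(BaseModel):
    name: str
    country: str
    population: int

llm = create_llm("openai-compatible", base_url="http://localhost:8080/v1", model="llama-3.1-8b")
response = llm.generate("Describe Paris as JSON.", response_model=CityInfo)
print(response.content)  # JSON text constrained to the CityInfo schema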
+
+     def _single_generate(self, payload: Dict[str, Any]) -> GenerateResponse:
+         """Generate single response"""
+         try:
+             # Ensure client is available
+             if not hasattr(self, 'client') or self.client is None:
+                 raise ProviderAPIError("HTTP client not initialized")
+
+             # Track generation time
+             start_time = time.time()
+             response = self.client.post(
+                 f"{self.base_url}/chat/completions",
+                 json=payload,
+                 headers=self._get_headers()
+             )
+             response.raise_for_status()
+             gen_time = round((time.time() - start_time) * 1000, 1)
+
+             result = response.json()
+
+             # Extract response from OpenAI format
+             if "choices" in result and len(result["choices"]) > 0:
+                 choice = result["choices"][0]
+                 content = choice.get("message", {}).get("content", "")
+                 finish_reason = choice.get("finish_reason", "stop")
+             else:
+                 content = "No response generated"
+                 finish_reason = "error"
+
+             # Extract usage info
+             usage = result.get("usage", {})
+
+             return GenerateResponse(
+                 content=content,
+                 model=self.model,
+                 finish_reason=finish_reason,
+                 raw_response=result,
+                 usage={
+                     "input_tokens": usage.get("prompt_tokens", 0),
+                     "output_tokens": usage.get("completion_tokens", 0),
+                     "total_tokens": usage.get("total_tokens", 0),
+                     # Keep legacy keys for backward compatibility
+                     "prompt_tokens": usage.get("prompt_tokens", 0),
+                     "completion_tokens": usage.get("completion_tokens", 0)
+                 },
+                 gen_time=gen_time
+             )
+
+         except AttributeError as e:
+             # Handle None type errors specifically
+             if "'NoneType'" in str(e):
+                 raise ProviderAPIError(f"OpenAI-compatible provider not properly initialized: {str(e)}")
+             else:
+                 raise ProviderAPIError(f"OpenAI-compatible provider configuration error: {str(e)}")
+         except Exception as e:
+             error_str = str(e).lower()
+             if ('404' in error_str or 'not found' in error_str or 'model' in error_str) and ('not found' in error_str):
+                 # Model not found - show available models
+                 try:
+                     available_models = self.list_available_models(base_url=self.base_url)
+                     error_message = format_model_error("OpenAI-compatible server", self.model, available_models)
+                 except Exception:
+                     # If model discovery also fails, provide a generic error
+                     raise ModelNotFoundError(f"Model '{self.model}' not found on OpenAI-compatible server and could not fetch available models")
+                 raise ModelNotFoundError(error_message)
+             else:
+                 raise ProviderAPIError(f"OpenAI-compatible server API error: {str(e)}")
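
For reference, this is roughly the JSON shape the method expects back from POST {base_url}/chat/completions and how it is folded into GenerateResponse (field values below are illustrative, not taken from a real server):

# Minimal illustration of the extraction performed above.
result = {
    "choices": [{"message": {"role": "assistant", "content": "Hello!"}, "finish_reason": "stop"}],
    "usage": {"prompt_tokens": 12, "completion_tokens": 3, "total_tokens": 15},
}
choice = result["choices"][0]
content = choice.get("message", {}).get("content", "")   # -> "Hello!"
usage = result.get("usage", {})
input_tokens = usage.get("prompt_tokens", 0)              # -> 12
output_tokens = usage.get("completion_tokens", 0)         # -> 3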
+
+     def _stream_generate(self, payload: Dict[str, Any]) -> Iterator[GenerateResponse]:
+         """Generate streaming response"""
+         try:
+             with self.client.stream(
+                 "POST",
+                 f"{self.base_url}/chat/completions",
+                 json=payload,
+                 headers=self._get_headers()
+             ) as response:
+                 response.raise_for_status()
+
+                 for line in response.iter_lines():
+                     if line:
+                         # Decode bytes to string if necessary
+                         if isinstance(line, bytes):
+                             line = line.decode('utf-8')
+                         line = line.strip()
+
+                         if line.startswith("data: "):
+                             data = line[6:] # Remove "data: " prefix
+
+                             if data == "[DONE]":
+                                 break
+
+                             try:
+                                 chunk = json.loads(data)
+
+                                 if "choices" in chunk and len(chunk["choices"]) > 0:
+                                     choice = chunk["choices"][0]
+                                     delta = choice.get("delta", {})
+                                     content = delta.get("content", "")
+                                     finish_reason = choice.get("finish_reason")
+
+                                     yield GenerateResponse(
+                                         content=content,
+                                         model=self.model,
+                                         finish_reason=finish_reason,
+                                         raw_response=chunk
+                                     )
+
+                             except json.JSONDecodeError:
+                                 continue
+
+         except Exception as e:
+             yield GenerateResponse(
+                 content=f"Error: {str(e)}",
+                 model=self.model,
+                 finish_reason="error"
+             )
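
Each SSE "data:" line thus becomes one GenerateResponse chunk carrying only the delta text. A consumption sketch, assuming stream=True is forwarded through the public generate() to this generator:

# Illustrative streaming loop; prints deltas as they arrive.
from abstractcore import create_llm  # assumed import path

llm = create_llm("openai-compatible", base_url="http://localhost:8080/v1", model="llama-3.1-8b")
for chunk in llm.generate("Write a haiku about local LLMs.", stream=True):
    print(chunk.content, end="", flush=True)
print()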
+
+     async def _agenerate_internal(self,
+                                   prompt: str,
+                                   messages: Optional[List[Dict[str, str]]] = None,
+                                   system_prompt: Optional[str] = None,
+                                   tools: Optional[List[Dict[str, Any]]] = None,
+                                   media: Optional[List['MediaContent']] = None,
+                                   stream: bool = False,
+                                   response_model: Optional[Type[BaseModel]] = None,
+                                   execute_tools: Optional[bool] = None,
+                                   tool_call_tags: Optional[str] = None,
+                                   **kwargs) -> Union[GenerateResponse, AsyncIterator[GenerateResponse]]:
+         """Native async implementation using httpx.AsyncClient - 3-10x faster for batch operations."""
+
+         # Build messages for chat completions with tool support (same logic as sync)
+         chat_messages = []
+
+         # Add tools to system prompt if provided
+         enhanced_system_prompt = system_prompt
+         if tools and self.tool_handler.supports_prompted:
+             tool_prompt = self.tool_handler.format_tools_prompt(tools)
+             if enhanced_system_prompt:
+                 enhanced_system_prompt += f"\n\n{tool_prompt}"
+             else:
+                 enhanced_system_prompt = tool_prompt
+
+         # Add system message if provided
+         if enhanced_system_prompt:
+             chat_messages.append({
+                 "role": "system",
+                 "content": enhanced_system_prompt
+             })
+
+         # Add conversation history
+         if messages:
+             chat_messages.extend(messages)
+
+         # Handle media content
+         if media:
+             user_message_text = prompt.strip() if prompt else ""
+             if not user_message_text and chat_messages:
+                 for msg in reversed(chat_messages):
+                     if msg.get("role") == "user" and msg.get("content"):
+                         user_message_text = msg["content"]
+                         break
+             try:
+                 processed_media = self._process_media_content(media)
+                 media_handler = self._get_media_handler_for_model(self.model)
+                 multimodal_message = media_handler.create_multimodal_message(user_message_text, processed_media)
+
+                 if isinstance(multimodal_message, str):
+                     if chat_messages and chat_messages[-1].get("role") == "user":
+                         chat_messages[-1]["content"] = multimodal_message
+                     else:
+                         chat_messages.append({"role": "user", "content": multimodal_message})
+                 else:
+                     if chat_messages and chat_messages[-1].get("role") == "user":
+                         chat_messages[-1] = multimodal_message
+                     else:
+                         chat_messages.append(multimodal_message)
+             except ImportError:
+                 self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
+                 if user_message_text:
+                     chat_messages.append({"role": "user", "content": user_message_text})
+             except Exception as e:
+                 self.logger.warning(f"Failed to process media content: {e}")
+                 if user_message_text:
+                     chat_messages.append({"role": "user", "content": user_message_text})
+
+         # Add prompt as separate message if provided
+         elif prompt and prompt.strip():
+             chat_messages.append({"role": "user", "content": prompt})
+
+         # Build request payload
+         generation_kwargs = self._prepare_generation_kwargs(**kwargs)
+         max_output_tokens = self._get_provider_max_tokens_param(generation_kwargs)
+
+         payload = {
+             "model": self.model,
+             "messages": chat_messages,
+             "stream": stream,
+             "temperature": kwargs.get("temperature", self.temperature),
+             "max_tokens": max_output_tokens,
+             "top_p": kwargs.get("top_p", 0.9),
+         }
+
+         # Add additional parameters
+         if "frequency_penalty" in kwargs:
+             payload["frequency_penalty"] = kwargs["frequency_penalty"]
+         if "presence_penalty" in kwargs:
+             payload["presence_penalty"] = kwargs["presence_penalty"]
+         if "repetition_penalty" in kwargs:
+             payload["repetition_penalty"] = kwargs["repetition_penalty"]
+
+         # Add seed if provided
+         seed_value = kwargs.get("seed", self.seed)
+         if seed_value is not None:
+             payload["seed"] = seed_value
+
+         # Add structured output support
+         if response_model and PYDANTIC_AVAILABLE:
+             json_schema = response_model.model_json_schema()
+             payload["response_format"] = {
+                 "type": "json_schema",
+                 "json_schema": {
+                     "name": response_model.__name__,
+                     "schema": json_schema
+                 }
+             }
+
+         if stream:
+             return self._async_stream_generate(payload)
+         else:
+             response = await self._async_single_generate(payload)
+
+             # Execute tools if enabled
+             if self.execute_tools and tools and self.tool_handler.supports_prompted and response.content:
+                 response = self._handle_prompted_tool_execution(response, tools, execute_tools)
+
+             return response
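
The "3-10x faster for batch operations" claim rests on issuing requests concurrently over the shared httpx.AsyncClient. A sketch of such a batch, assuming the public async entry point is an agenerate() counterpart to generate() (that name does not appear in this diff and may differ):

# Hypothetical concurrent batch; agenerate() is assumed to wrap _agenerate_internal().
import asyncio
from abstractcore import create_llm  # assumed import path

async def summarize_all(texts):
    llm = create_llm("openai-compatible", base_url="http://localhost:8080/v1", model="llama-3.1-8b")
    tasks = [llm.agenerate(f"Summarize: {t}") for t in texts]
    return await asyncio.gather(*tasks)

results = asyncio.run(summarize_all(["first document", "second document"]))
print([r.content for r in results])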
+
+     async def _async_single_generate(self, payload: Dict[str, Any]) -> GenerateResponse:
+         """Native async single response generation."""
+         try:
+             # Track generation time
+             start_time = time.time()
+             response = await self.async_client.post(
+                 f"{self.base_url}/chat/completions",
+                 json=payload,
+                 headers=self._get_headers()
+             )
+             response.raise_for_status()
+             gen_time = round((time.time() - start_time) * 1000, 1)
+
+             result = response.json()
+
+             # Extract response from OpenAI format
+             if "choices" in result and len(result["choices"]) > 0:
+                 choice = result["choices"][0]
+                 content = choice.get("message", {}).get("content", "")
+                 finish_reason = choice.get("finish_reason", "stop")
+             else:
+                 content = "No response generated"
+                 finish_reason = "error"
+
+             # Extract usage info
+             usage = result.get("usage", {})
+
+             return GenerateResponse(
+                 content=content,
+                 model=self.model,
+                 finish_reason=finish_reason,
+                 raw_response=result,
+                 usage={
+                     "input_tokens": usage.get("prompt_tokens", 0),
+                     "output_tokens": usage.get("completion_tokens", 0),
+                     "total_tokens": usage.get("total_tokens", 0),
+                     "prompt_tokens": usage.get("prompt_tokens", 0),
+                     "completion_tokens": usage.get("completion_tokens", 0)
+                 },
+                 gen_time=gen_time
+             )
+
+         except Exception as e:
+             error_str = str(e).lower()
+             if ('404' in error_str or 'not found' in error_str or 'model' in error_str) and ('not found' in error_str):
+                 try:
+                     available_models = self.list_available_models(base_url=self.base_url)
+                     error_message = format_model_error("OpenAI-compatible server", self.model, available_models)
+                 except Exception:
+                     raise ModelNotFoundError(f"Model '{self.model}' not found on OpenAI-compatible server")
+                 raise ModelNotFoundError(error_message)
+             else:
+                 raise ProviderAPIError(f"OpenAI-compatible server API error: {str(e)}")
+
+     async def _async_stream_generate(self, payload: Dict[str, Any]) -> AsyncIterator[GenerateResponse]:
+         """Native async streaming response generation."""
+         try:
+             async with self.async_client.stream(
+                 "POST",
+                 f"{self.base_url}/chat/completions",
+                 json=payload,
+                 headers=self._get_headers()
+             ) as response:
+                 response.raise_for_status()
+
+                 async for line in response.aiter_lines():
+                     if line:
+                         line = line.strip()
+
+                         if line.startswith("data: "):
+                             data = line[6:] # Remove "data: " prefix
+
+                             if data == "[DONE]":
+                                 break
+
+                             try:
+                                 chunk = json.loads(data)
+
+                                 if "choices" in chunk and len(chunk["choices"]) > 0:
+                                     choice = chunk["choices"][0]
+                                     delta = choice.get("delta", {})
+                                     content = delta.get("content", "")
+                                     finish_reason = choice.get("finish_reason")
+
+                                     yield GenerateResponse(
+                                         content=content,
+                                         model=self.model,
+                                         finish_reason=finish_reason,
+                                         raw_response=chunk
+                                     )
+
+                             except json.JSONDecodeError:
+                                 continue
+
+         except Exception as e:
+             yield GenerateResponse(
+                 content=f"Error: {str(e)}",
+                 model=self.model,
+                 finish_reason="error"
+             )
+
+     def get_capabilities(self) -> List[str]:
+         """Get OpenAI-compatible server capabilities"""
+         return ["streaming", "chat", "tools"]
+
+     def validate_config(self) -> bool:
+         """Validate OpenAI-compatible server connection"""
+         try:
+             response = self.client.get(f"{self.base_url}/models", headers=self._get_headers())
+             return response.status_code == 200
+         except Exception:
+             return False
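
validate_config() is a cheap liveness probe against GET {base_url}/models, so it can be used to fail fast before any generation. A sketch using the provider class directly (model="default" skips model validation; BaseProvider is assumed to accept this minimal construction):

# Illustrative pre-flight check against a local server.
provider = OpenAICompatibleProvider(model="default", base_url="http://localhost:8080/v1")
if not provider.validate_config():
    raise RuntimeError("OpenAI-compatible server is not reachable at http://localhost:8080/v1")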
+
+     def _get_provider_max_tokens_param(self, kwargs: Dict[str, Any]) -> int:
+         """Get max tokens parameter for OpenAI-compatible API"""
+         # For OpenAI-compatible servers, max_tokens is the max output tokens
+         return kwargs.get("max_output_tokens", self.max_output_tokens)
+
+     def _update_http_client_timeout(self) -> None:
+         """Update HTTP client timeout when timeout is changed."""
+         if hasattr(self, 'client') and self.client is not None:
+             try:
+                 # Create new client with updated timeout
+                 self.client.close()
+
+                 # Get timeout value - None means unlimited timeout
+                 timeout_value = getattr(self, '_timeout', None)
+                 # Validate timeout if provided (None is allowed for unlimited)
+                 if timeout_value is not None and timeout_value <= 0:
+                     timeout_value = None # Invalid timeout becomes unlimited
+
+                 self.client = httpx.Client(timeout=timeout_value)
+             except Exception as e:
+                 # Log error but don't fail - timeout update is not critical
+                 if hasattr(self, 'logger'):
+                     self.logger.warning(f"Failed to update HTTP client timeout: {e}")
+                 # Try to create a new client with default timeout
+                 try:
+                     self.client = httpx.Client(timeout=300.0)
+                 except Exception:
+                     pass # Best effort - don't fail the operation
+
+     def _normalize_model_name(self, model_name: str) -> str:
+         """Remove common provider prefixes from model name."""
+         for prefix in ["openai-compatible/", "lmstudio/", "qwen/", "ollama/", "huggingface/"]:
+             if model_name.startswith(prefix):
+                 model_name = model_name[len(prefix):]
+         return model_name
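
The effect is simply to strip one known provider prefix, when present, before capability lookup. A standalone sketch of the same normalization (the helper name is illustrative):

def normalize(name, prefixes=("openai-compatible/", "lmstudio/", "qwen/", "ollama/", "huggingface/")):
    # Same prefix list and stripping rule as _normalize_model_name above.
    for prefix in prefixes:
        if name.startswith(prefix):
            name = name[len(prefix):]
    return name

assert normalize("lmstudio/qwen2.5-7b-instruct") == "qwen2.5-7b-instruct"
assert normalize("llama-3.1-8b") == "llama-3.1-8b"  # unchanged: no known prefix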
+
+     def _get_media_handler_for_model(self, model_name: str):
+         """Get appropriate media handler based on model vision capabilities."""
+         from ..media.handlers import OpenAIMediaHandler, LocalMediaHandler
+
+         # Normalize model name by removing provider prefixes
+         clean_model_name = self._normalize_model_name(model_name)
+
+         # Determine if model supports vision
+         try:
+             from ..architectures.detection import supports_vision
+             use_vision_handler = supports_vision(clean_model_name)
+         except Exception as e:
+             self.logger.debug(f"Vision detection failed: {e}, defaulting to LocalMediaHandler")
+             use_vision_handler = False
+
+         # Create appropriate handler
+         if use_vision_handler:
+             handler = OpenAIMediaHandler(self.model_capabilities, model_name=model_name)
+             self.logger.debug(f"Using OpenAIMediaHandler for vision model: {clean_model_name}")
+         else:
+             handler = LocalMediaHandler("openai-compatible", self.model_capabilities, model_name=model_name)
+             self.logger.debug(f"Using LocalMediaHandler for model: {clean_model_name}")
+
+         return handler
+
+     def list_available_models(self, **kwargs) -> List[str]:
+         """
+         List available models from OpenAI-compatible server.
+
+         Args:
+             **kwargs: Optional parameters including:
+                 - base_url: Server URL
+                 - input_capabilities: List of ModelInputCapability enums to filter by input capability
+                 - output_capabilities: List of ModelOutputCapability enums to filter by output capability
+
+         Returns:
+             List of model names, optionally filtered by capabilities
+         """
+         try:
+             from .model_capabilities import filter_models_by_capabilities
+
+             # Use provided base_url or fall back to instance base_url
+             base_url = kwargs.get('base_url', self.base_url)
+
+             response = self.client.get(f"{base_url}/models", headers=self._get_headers(), timeout=5.0)
+             if response.status_code == 200:
+                 data = response.json()
+                 models = [model["id"] for model in data.get("data", [])]
+                 models = sorted(models)
+
+                 # Apply capability filtering if provided
+                 input_capabilities = kwargs.get('input_capabilities')
+                 output_capabilities = kwargs.get('output_capabilities')
+
+                 if input_capabilities or output_capabilities:
+                     models = filter_models_by_capabilities(
+                         models,
+                         input_capabilities=input_capabilities,
+                         output_capabilities=output_capabilities
+                     )
+
+                 return models
+             else:
+                 self.logger.warning(f"OpenAI-compatible server API returned status {response.status_code}")
+                 return []
+         except Exception as e:
+             self.logger.warning(f"Failed to list models from OpenAI-compatible server: {e}")
+             return []
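
A discovery sketch follows; the capability filters are passed straight to filter_models_by_capabilities, so the commented-out enum import below is an assumption about that module's API rather than something shown in this diff:

# Illustrative model discovery against a local server.
provider = OpenAICompatibleProvider(model="default", base_url="http://localhost:8080/v1")
print(provider.list_available_models())  # all model ids reported by GET /models, sorted

# Hypothetical capability-filtered call (enum import path assumed):
# from abstractcore.providers.model_capabilities import ModelInputCapability
# vision_models = provider.list_available_models(input_capabilities=[ModelInputCapability.IMAGE])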
+
+     def embed(self, input_text: Union[str, List[str]], **kwargs) -> Dict[str, Any]:
+         """
+         Generate embeddings using OpenAI-compatible embedding API.
+
+         Args:
+             input_text: Single string or list of strings to embed
+             **kwargs: Additional parameters (encoding_format, dimensions, user, etc.)
+
+         Returns:
+             Dict with embeddings in OpenAI-compatible format:
+             {
+                 "object": "list",
+                 "data": [{"object": "embedding", "embedding": [...], "index": 0}, ...],
+                 "model": "model-name",
+                 "usage": {"prompt_tokens": N, "total_tokens": N}
+             }
+         """
+         try:
+             # Prepare request payload for OpenAI-compatible API
+             payload = {
+                 "input": input_text,
+                 "model": self.model
+             }
+
+             # Add optional parameters if provided
+             if "encoding_format" in kwargs:
+                 payload["encoding_format"] = kwargs["encoding_format"]
+             if "dimensions" in kwargs and kwargs["dimensions"]:
+                 payload["dimensions"] = kwargs["dimensions"]
+             if "user" in kwargs:
+                 payload["user"] = kwargs["user"]
+
+             # Call server's embeddings API (OpenAI-compatible)
+             response = self.client.post(
+                 f"{self.base_url}/embeddings",
+                 json=payload,
+                 headers=self._get_headers()
+             )
+             response.raise_for_status()
+
+             # Server returns OpenAI-compatible format
+             result = response.json()
+
+             # Ensure the model field uses our provider-prefixed format
+             result["model"] = self.model
+
+             return result
+
+         except Exception as e:
+             self.logger.error(f"Failed to generate embeddings: {e}")
+             raise ProviderAPIError(f"OpenAI-compatible server embedding error: {str(e)}")
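
Finally, the embeddings path simply proxies POST {base_url}/embeddings and echoes back the configured model name. A sketch, assuming the server hosts an embedding-capable model under the (illustrative) name shown:

# Illustrative embedding call; the result follows the OpenAI embeddings format documented above.
provider = OpenAICompatibleProvider(model="nomic-embed-text", base_url="http://localhost:8080/v1")
result = provider.embed(["first sentence", "second sentence"])
vectors = [item["embedding"] for item in result["data"]]
print(len(vectors), len(vectors[0]))  # number of inputs, embedding dimension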