abstractcore-2.4.2-py3-none-any.whl → abstractcore-2.4.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractcore/apps/app_config_utils.py +19 -0
- abstractcore/apps/summarizer.py +85 -56
- abstractcore/architectures/detection.py +15 -4
- abstractcore/assets/architecture_formats.json +1 -1
- abstractcore/assets/model_capabilities.json +420 -11
- abstractcore/core/interface.py +2 -0
- abstractcore/core/session.py +4 -0
- abstractcore/embeddings/manager.py +54 -16
- abstractcore/media/__init__.py +116 -148
- abstractcore/media/auto_handler.py +363 -0
- abstractcore/media/base.py +456 -0
- abstractcore/media/capabilities.py +335 -0
- abstractcore/media/types.py +300 -0
- abstractcore/media/vision_fallback.py +260 -0
- abstractcore/providers/anthropic_provider.py +18 -1
- abstractcore/providers/base.py +187 -0
- abstractcore/providers/huggingface_provider.py +111 -12
- abstractcore/providers/lmstudio_provider.py +88 -5
- abstractcore/providers/mlx_provider.py +33 -1
- abstractcore/providers/ollama_provider.py +37 -3
- abstractcore/providers/openai_provider.py +18 -1
- abstractcore/server/app.py +1390 -104
- abstractcore/tools/common_tools.py +12 -8
- abstractcore/utils/__init__.py +9 -5
- abstractcore/utils/cli.py +199 -17
- abstractcore/utils/message_preprocessor.py +182 -0
- abstractcore/utils/structured_logging.py +117 -16
- abstractcore/utils/version.py +1 -1
- {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/METADATA +214 -20
- {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/RECORD +34 -27
- {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/entry_points.txt +1 -0
- {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/WHEEL +0 -0
- {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/top_level.txt +0 -0
abstractcore/providers/huggingface_provider.py

@@ -473,21 +473,23 @@ class HuggingFaceProvider(BaseProvider):
                  messages: Optional[List[Dict[str, str]]] = None,
                  system_prompt: Optional[str] = None,
                  tools: Optional[List[Dict[str, Any]]] = None,
+                 media: Optional[List['MediaContent']] = None,
                  stream: bool = False,
                  response_model: Optional[Type[BaseModel]] = None,
                  **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
         """Generate response using appropriate backend"""

         if self.model_type == "gguf":
-            return self._generate_gguf(prompt, messages, system_prompt, tools, stream, **kwargs)
+            return self._generate_gguf(prompt, messages, system_prompt, tools, media, stream, **kwargs)
         else:
-            return self._generate_transformers(prompt, messages, system_prompt, tools, stream, **kwargs)
+            return self._generate_transformers(prompt, messages, system_prompt, tools, media, stream, **kwargs)

     def _generate_transformers(self,
                                prompt: str,
                                messages: Optional[List[Dict[str, str]]] = None,
                                system_prompt: Optional[str] = None,
                                tools: Optional[List[Dict[str, Any]]] = None,
+                               media: Optional[List['MediaContent']] = None,
                                stream: bool = False,
                                **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
         """Generate using transformers backend (original implementation)"""
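The practical effect of this hunk is that image (or other media) attachments can now be threaded through `generate()` and reach whichever backend is selected. A minimal caller-side sketch, assuming an already-constructed provider instance and a simple stand-in object for `MediaContent` (the real class is defined in the new `abstractcore/media/types.py`; its constructor is not shown in this diff):

```python
from dataclasses import dataclass
from typing import Optional

# Hypothetical stand-in for abstractcore's MediaContent; the real class lives in
# abstractcore/media/types.py and may expose a different constructor.
@dataclass
class FakeMediaContent:
    file_path: Optional[str] = None
    content: Optional[str] = None

def ask_about_image(provider, image_path: str) -> str:
    """Sketch: pass media alongside a text prompt via the new keyword argument."""
    media = [FakeMediaContent(file_path=image_path)]
    response = provider.generate(
        "Describe the chart in this image.",
        media=media,      # new in 2.4.4; degrades to text-only if media extras are missing
        stream=False,
    )
    return response.content
```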
@@ -499,7 +501,37 @@ class HuggingFaceProvider(BaseProvider):
                 finish_reason="error"
             )

-        # Build input text with tool support
+        # Build input text with tool and media support
+        # Handle media content first if present
+        if media:
+            try:
+                from ..media.handlers import LocalMediaHandler
+                media_handler = LocalMediaHandler("huggingface", self.model_capabilities, model_name=self.model)
+
+                # Create multimodal message combining text and media
+                multimodal_message = media_handler.create_multimodal_message(prompt, media)
+
+                # For local providers, we get text-embedded content
+                if isinstance(multimodal_message, str):
+                    prompt = multimodal_message
+                else:
+                    # If we get a structured message, extract the content
+                    if isinstance(multimodal_message, dict) and "content" in multimodal_message:
+                        if isinstance(multimodal_message["content"], list):
+                            # Find text content in the structured message
+                            text_content = ""
+                            for item in multimodal_message["content"]:
+                                if item.get("type") == "text":
+                                    text_content = item.get("text", "")
+                                    break
+                            prompt = text_content or prompt
+                        else:
+                            prompt = str(multimodal_message["content"])
+            except ImportError:
+                self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
+            except Exception as e:
+                self.logger.warning(f"Failed to process media content: {e}")
+
         input_text = self._build_input_text_transformers(prompt, messages, system_prompt, tools)

         # Generation parameters using unified system
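Because the transformers text path can only consume a plain string, the new branch collapses any structured multimodal message back to its text part. A standalone sketch of that collapse, using a hand-built dict in the `{"role": ..., "content": [...]}` shape that `create_multimodal_message` presumably returns (the exact return shape is an assumption here):

```python
from typing import Any, Dict, Union

def collapse_to_text(multimodal_message: Union[str, Dict[str, Any]], fallback: str) -> str:
    """Mirror of the extraction logic: keep the first text part, else fall back."""
    if isinstance(multimodal_message, str):
        return multimodal_message
    content = multimodal_message.get("content")
    if isinstance(content, list):
        for item in content:
            if item.get("type") == "text":
                return item.get("text", "") or fallback
        return fallback
    return str(content)

# Example message in the assumed structured shape:
message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Summarize this screenshot."},
        {"type": "image", "image": "file:///tmp/screenshot.png"},
    ],
}
assert collapse_to_text(message, "prompt") == "Summarize this screenshot."
```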
@@ -532,6 +564,7 @@ class HuggingFaceProvider(BaseProvider):
                        messages: Optional[List[Dict[str, str]]] = None,
                        system_prompt: Optional[str] = None,
                        tools: Optional[List[Dict[str, Any]]] = None,
+                       media: Optional[List['MediaContent']] = None,
                        stream: bool = False,
                        **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
         """Generate using GGUF backend with llama-cpp-python"""
@@ -552,7 +585,64 @@ class HuggingFaceProvider(BaseProvider):
         if messages:
             chat_messages.extend(messages)

-
+        # Handle media content for the user message - use proper vision format for GGUF models
+        if media:
+            try:
+                from ..architectures.detection import supports_vision
+
+                # Check if this model supports vision natively
+                if supports_vision(self.model):
+                    # Use HuggingFace multimodal format for vision-capable GGUF models
+                    user_message_content = []
+
+                    # Add text content
+                    user_message_content.append({"type": "text", "text": prompt})
+
+                    # Add media content
+                    for media_item in media:
+                        if hasattr(media_item, 'file_path') and media_item.file_path:
+                            # Use file:// URL format as specified in HuggingFace docs
+                            file_path = str(media_item.file_path)
+                            if not file_path.startswith('file://'):
+                                file_path = f"file://{file_path}"
+                            user_message_content.append({
+                                "type": "image",
+                                "image": file_path
+                            })
+                        elif hasattr(media_item, 'content') and media_item.content:
+                            # For base64 or other content, we might need to save to temp file
+                            import tempfile
+                            import base64
+                            with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp_file:
+                                if isinstance(media_item.content, str) and media_item.content.startswith('data:'):
+                                    # Handle base64 data URLs
+                                    header, data = media_item.content.split(',', 1)
+                                    decoded_data = base64.b64decode(data)
+                                    tmp_file.write(decoded_data)
+                                else:
+                                    tmp_file.write(media_item.content)
+                                tmp_file.flush()
+                                user_message_content.append({
+                                    "type": "image",
+                                    "image": f"file://{tmp_file.name}"
+                                })
+                else:
+                    # Fallback to text-based media handling for non-vision models
+                    from ..media.handlers import LocalMediaHandler
+                    media_handler = LocalMediaHandler("huggingface", self.model_capabilities, model_name=self.model)
+                    multimodal_message = media_handler.create_multimodal_message(prompt, media)
+                    user_message_content = multimodal_message if isinstance(multimodal_message, str) else prompt
+
+            except ImportError:
+                self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
+                user_message_content = prompt
+            except Exception as e:
+                self.logger.warning(f"Failed to process media content: {e}")
+                user_message_content = prompt
+        else:
+            user_message_content = prompt
+
+        chat_messages.append({"role": "user", "content": user_message_content})

         # Prepare parameters using unified system
         unified_kwargs = self._prepare_generation_kwargs(**kwargs)
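For vision-capable GGUF models the user turn becomes a content list instead of a plain string, with images referenced by `file://` URLs (base64 data URLs are first decoded to a temporary file). A small sketch of the resulting message shape; the helper below is illustrative, not part of the package:

```python
import base64
import tempfile

def image_part(file_path: str = None, data_url: str = None) -> dict:
    """Build the {'type': 'image', 'image': 'file://...'} part used in the GGUF branch."""
    if file_path:
        path = file_path if file_path.startswith("file://") else f"file://{file_path}"
        return {"type": "image", "image": path}
    # data: URL -> decode and persist to a temp file, then reference that file
    header, data = data_url.split(",", 1)
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
        tmp.write(base64.b64decode(data))
        return {"type": "image", "image": f"file://{tmp.name}"}

# The user turn that ends up in chat_messages for a vision-capable GGUF model:
user_turn = {
    "role": "user",
    "content": [
        {"type": "text", "text": "What is in this photo?"},
        image_part(file_path="/tmp/photo.jpg"),
    ],
}
```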
@@ -774,19 +864,14 @@ class HuggingFaceProvider(BaseProvider):
             if outputs and len(outputs) > 0:
                 response_text = outputs[0]['generated_text'].strip()

-                # Calculate token usage
-
-                output_tokens = len(self.tokenizer.encode(response_text))
+                # Calculate token usage using centralized utilities
+                usage = self._calculate_usage(input_text, response_text)

                 return GenerateResponse(
                     content=response_text,
                     model=self.model,
                     finish_reason="stop",
-                    usage={
-                        "prompt_tokens": input_tokens,
-                        "completion_tokens": output_tokens,
-                        "total_tokens": input_tokens + output_tokens
-                    }
+                    usage=usage
                 )
             else:
                 return GenerateResponse(
@@ -802,6 +887,20 @@ class HuggingFaceProvider(BaseProvider):
                 finish_reason="error"
             )

+    def _calculate_usage(self, prompt: str, response: str) -> Dict[str, int]:
+        """Calculate token usage using centralized token utilities."""
+        from ..utils.token_utils import TokenUtils
+
+        prompt_tokens = TokenUtils.estimate_tokens(prompt, self.model)
+        completion_tokens = TokenUtils.estimate_tokens(response, self.model)
+        total_tokens = prompt_tokens + completion_tokens
+
+        return {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": total_tokens
+        }
+
     def _stream_generate_transformers(self, input_text: str, max_new_tokens: int,
                                       temperature: float, top_p: float, tool_call_tags: Optional[str] = None) -> Iterator[GenerateResponse]:
         """Stream response using transformers (simulated, original implementation) with tool tag rewriting support"""
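The new `_calculate_usage` helper defers to `TokenUtils.estimate_tokens`, whose implementation is not part of this diff. The sketch below reproduces only the usage-dict shape, with a crude characters-per-token heuristic standing in for the real estimator (an assumption, not the library's algorithm):

```python
from typing import Dict

def rough_estimate_tokens(text: str, chars_per_token: float = 4.0) -> int:
    """Crude stand-in for TokenUtils.estimate_tokens; purely a heuristic."""
    return max(1, round(len(text) / chars_per_token))

def calculate_usage(prompt: str, response: str) -> Dict[str, int]:
    """Same dict shape the provider now attaches to GenerateResponse.usage."""
    prompt_tokens = rough_estimate_tokens(prompt)
    completion_tokens = rough_estimate_tokens(response)
    return {
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "total_tokens": prompt_tokens + completion_tokens,
    }

print(calculate_usage("Describe the image.", "It shows a bar chart of monthly sales."))
# e.g. {'prompt_tokens': 5, 'completion_tokens': 10, 'total_tokens': 15}
```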
abstractcore/providers/lmstudio_provider.py

@@ -99,6 +99,7 @@ class LMStudioProvider(BaseProvider):
                  messages: Optional[List[Dict[str, str]]] = None,
                  system_prompt: Optional[str] = None,
                  tools: Optional[List[Dict[str, Any]]] = None,
+                 media: Optional[List['MediaContent']] = None,
                  stream: bool = False,
                  response_model: Optional[Type[BaseModel]] = None,
                  execute_tools: Optional[bool] = None,
@@ -129,11 +130,63 @@ class LMStudioProvider(BaseProvider):
         if messages:
             chat_messages.extend(messages)

-        #
-
-
-        "
-
+        # Handle media content regardless of prompt (media can be used with messages too)
+        if media:
+            # Get the last user message content to combine with media
+            user_message_text = prompt.strip() if prompt else ""
+            if not user_message_text and chat_messages:
+                # If no prompt, try to get text from the last user message
+                for msg in reversed(chat_messages):
+                    if msg.get("role") == "user" and msg.get("content"):
+                        user_message_text = msg["content"]
+                        break
+            try:
+                # CRITICAL FIX: Process media files into MediaContent objects first
+                processed_media = self._process_media_content(media)
+
+                # Use capability-based media handler selection
+                media_handler = self._get_media_handler_for_model(self.model)
+
+                # Create multimodal message combining text and processed media
+                multimodal_message = media_handler.create_multimodal_message(user_message_text, processed_media)
+
+                # For LMStudio (OpenAI-compatible), we might get a string (embedded text) or dict (structured)
+                if isinstance(multimodal_message, str):
+                    # Replace the last user message with the multimodal message, or add new one
+                    if chat_messages and chat_messages[-1].get("role") == "user":
+                        chat_messages[-1]["content"] = multimodal_message
+                    else:
+                        chat_messages.append({
+                            "role": "user",
+                            "content": multimodal_message
+                        })
+                else:
+                    if chat_messages and chat_messages[-1].get("role") == "user":
+                        # Replace last user message with structured multimodal message
+                        chat_messages[-1] = multimodal_message
+                    else:
+                        chat_messages.append(multimodal_message)
+            except ImportError:
+                self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
+                if user_message_text:
+                    chat_messages.append({
+                        "role": "user",
+                        "content": user_message_text
+                    })
+            except Exception as e:
+                self.logger.warning(f"Failed to process media content: {e}")
+                if user_message_text:
+                    chat_messages.append({
+                        "role": "user",
+                        "content": user_message_text
+                    })
+
+        # Add prompt as separate message if provided (for backward compatibility)
+        elif prompt and prompt.strip():
+            chat_messages.append({
+                "role": "user",
+                "content": prompt
+            })

         # Build request payload using unified system
         generation_kwargs = self._prepare_generation_kwargs(**kwargs)
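The LMStudio branch either rewrites the trailing user turn in place or appends a new one, so media always attaches to the most recent user message. A standalone sketch of that replace-or-append rule:

```python
from typing import Dict, List, Union

Message = Dict[str, object]

def attach_to_last_user_turn(chat: List[Message], new_content: Union[str, Message]) -> None:
    """Replace the trailing user message with multimodal content, else append one."""
    if isinstance(new_content, str):
        if chat and chat[-1].get("role") == "user":
            chat[-1]["content"] = new_content
        else:
            chat.append({"role": "user", "content": new_content})
    else:
        # A structured multimodal message replaces the whole trailing user turn
        if chat and chat[-1].get("role") == "user":
            chat[-1] = new_content
        else:
            chat.append(new_content)

chat = [{"role": "system", "content": "You are terse."},
        {"role": "user", "content": "Describe the attachment."}]
attach_to_last_user_turn(chat, "Describe the attachment.\n[image: /tmp/chart.png]")
```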
@@ -313,7 +366,37 @@ class LMStudioProvider(BaseProvider):
         except Exception:
             pass # Best effort - don't fail the operation

+    def _normalize_model_name(self, model_name: str) -> str:
+        """Remove common provider prefixes from model name."""
+        for prefix in ["lmstudio/", "qwen/", "ollama/", "huggingface/"]:
+            if model_name.startswith(prefix):
+                model_name = model_name[len(prefix):]
+        return model_name
+
+    def _get_media_handler_for_model(self, model_name: str):
+        """Get appropriate media handler based on model vision capabilities."""
+        from ..media.handlers import OpenAIMediaHandler, LocalMediaHandler
+
+        # Normalize model name by removing provider prefixes
+        clean_model_name = self._normalize_model_name(model_name)
+
+        # Determine if model supports vision
+        try:
+            from ..architectures.detection import supports_vision
+            use_vision_handler = supports_vision(clean_model_name)
+        except Exception as e:
+            self.logger.debug(f"Vision detection failed: {e}, defaulting to LocalMediaHandler")
+            use_vision_handler = False
+
+        # Create appropriate handler
+        if use_vision_handler:
+            handler = OpenAIMediaHandler(self.model_capabilities, model_name=model_name)
+            self.logger.debug(f"Using OpenAIMediaHandler for vision model: {clean_model_name}")
+        else:
+            handler = LocalMediaHandler("lmstudio", self.model_capabilities, model_name=model_name)
+            self.logger.debug(f"Using LocalMediaHandler for model: {clean_model_name}")

+        return handler

     def list_available_models(self, **kwargs) -> List[str]:
         """List available models from LMStudio server."""
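The two helpers are easy to exercise in isolation: prefix stripping is plain string handling, and handler selection reduces to a single boolean from `supports_vision`. A sketch of the expected behaviour (the prefix list comes from the diff; `supports_vision` lives in `abstractcore/architectures/detection.py` and is stubbed with a lambda here):

```python
def normalize_model_name(model_name: str) -> str:
    """Strip known provider prefixes, mirroring LMStudioProvider._normalize_model_name."""
    for prefix in ("lmstudio/", "qwen/", "ollama/", "huggingface/"):
        if model_name.startswith(prefix):
            model_name = model_name[len(prefix):]
    return model_name

assert normalize_model_name("qwen/qwen2.5-vl-7b-instruct") == "qwen2.5-vl-7b-instruct"
assert normalize_model_name("llama-3.2-3b") == "llama-3.2-3b"

# Handler choice then hinges on a single capability check (names are illustrative):
def pick_handler_kind(model_name: str, supports_vision) -> str:
    return "openai-style" if supports_vision(normalize_model_name(model_name)) else "local-text"

print(pick_handler_kind("lmstudio/qwen2-vl-7b", lambda name: "vl" in name))  # openai-style
```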
abstractcore/providers/mlx_provider.py

@@ -139,6 +139,7 @@ class MLXProvider(BaseProvider):
                  messages: Optional[List[Dict[str, str]]] = None,
                  system_prompt: Optional[str] = None,
                  tools: Optional[List[Dict[str, Any]]] = None,
+                 media: Optional[List['MediaContent']] = None,
                  stream: bool = False,
                  response_model: Optional[Type[BaseModel]] = None,
                  **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
@@ -151,8 +152,39 @@ class MLXProvider(BaseProvider):
                 finish_reason="error"
             )

+        # Handle media content first if present
+        processed_prompt = prompt
+        if media:
+            try:
+                from ..media.handlers import LocalMediaHandler
+                media_handler = LocalMediaHandler("mlx", self.model_capabilities, model_name=self.model)
+
+                # Create multimodal message combining text and media
+                multimodal_message = media_handler.create_multimodal_message(prompt, media)
+
+                # For MLX (local provider), we get text-embedded content
+                if isinstance(multimodal_message, str):
+                    processed_prompt = multimodal_message
+                else:
+                    # If we get a structured message, extract the content
+                    if isinstance(multimodal_message, dict) and "content" in multimodal_message:
+                        if isinstance(multimodal_message["content"], list):
+                            # Find text content in the structured message
+                            text_content = ""
+                            for item in multimodal_message["content"]:
+                                if item.get("type") == "text":
+                                    text_content = item.get("text", "")
+                                    break
+                            processed_prompt = text_content or prompt
+                        else:
+                            processed_prompt = str(multimodal_message["content"])
+            except ImportError:
+                self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
+            except Exception as e:
+                self.logger.warning(f"Failed to process media content: {e}")
+
         # Build full prompt with tool support
-        full_prompt = self._build_prompt(prompt, messages, system_prompt, tools)
+        full_prompt = self._build_prompt(processed_prompt, messages, system_prompt, tools)

         # MLX generation parameters using unified system
         generation_kwargs = self._prepare_generation_kwargs(**kwargs)
abstractcore/providers/ollama_provider.py

@@ -109,6 +109,7 @@ class OllamaProvider(BaseProvider):
                  messages: Optional[List[Dict[str, str]]] = None,
                  system_prompt: Optional[str] = None,
                  tools: Optional[List[Dict[str, Any]]] = None,
+                 media: Optional[List['MediaContent']] = None,
                  stream: bool = False,
                  response_model: Optional[Type[BaseModel]] = None,
                  **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
@@ -160,9 +161,42 @@ class OllamaProvider(BaseProvider):
             converted_messages = self._convert_messages_for_ollama(messages)
             payload["messages"].extend(converted_messages)

-        #
-
-
+        # Handle media content regardless of prompt (media can be used with messages too)
+        if media:
+            # Get the text to combine with media
+            user_message_text = prompt.strip() if prompt else ""
+            try:
+                from ..media.handlers import LocalMediaHandler
+                media_handler = LocalMediaHandler("ollama", self.model_capabilities, model_name=self.model)
+
+                # Create multimodal message combining text and media
+                multimodal_message = media_handler.create_multimodal_message(user_message_text, media)
+
+                # For local providers, we might get a string (embedded text) or dict (structured)
+                if isinstance(multimodal_message, str):
+                    payload["messages"].append({
+                        "role": "user",
+                        "content": multimodal_message
+                    })
+                else:
+                    payload["messages"].append(multimodal_message)
+            except ImportError:
+                self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
+                if user_message_text:
+                    payload["messages"].append({
+                        "role": "user",
+                        "content": user_message_text
+                    })
+            except Exception as e:
+                self.logger.warning(f"Failed to process media content: {e}")
+                if user_message_text:
+                    payload["messages"].append({
+                        "role": "user",
+                        "content": user_message_text
+                    })
+
+        # Add prompt as separate message if provided (for backward compatibility)
+        elif prompt and prompt.strip():
             payload["messages"].append({
                 "role": "user",
                 "content": prompt
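On the Ollama side the processed media ends up as one more entry in `payload["messages"]`. A sketch of what the payload might look like after this branch when the handler returns an embedded-text string (the exact output of `LocalMediaHandler.create_multimodal_message` is not shown in this diff, so the content below is illustrative):

```python
# Hypothetical /api/chat payload after the media branch, assuming a text-embedded result.
payload = {
    "model": "llama3.2-vision",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": "What does this invoice total to?\n\n[Attached image: invoice.png]",
        },
    ],
    "stream": False,
}
```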
abstractcore/providers/openai_provider.py

@@ -65,6 +65,7 @@ class OpenAIProvider(BaseProvider):
                  messages: Optional[List[Dict[str, str]]] = None,
                  system_prompt: Optional[str] = None,
                  tools: Optional[List[Dict[str, Any]]] = None,
+                 media: Optional[List['MediaContent']] = None,
                  stream: bool = False,
                  response_model: Optional[Type[BaseModel]] = None,
                  **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
@@ -89,7 +90,23 @@ class OpenAIProvider(BaseProvider):

         # Add current prompt as user message
         if prompt and prompt not in [msg.get("content") for msg in (messages or [])]:
-
+            # Handle multimodal message with media content
+            if media:
+                try:
+                    from ..media.handlers import OpenAIMediaHandler
+                    media_handler = OpenAIMediaHandler(self.model_capabilities)
+
+                    # Create multimodal message combining text and media
+                    multimodal_message = media_handler.create_multimodal_message(prompt, media)
+                    api_messages.append(multimodal_message)
+                except ImportError:
+                    self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
+                    api_messages.append({"role": "user", "content": prompt})
+                except Exception as e:
+                    self.logger.warning(f"Failed to process media content: {e}")
+                    api_messages.append({"role": "user", "content": prompt})
+            else:
+                api_messages.append({"role": "user", "content": prompt})

         # Prepare API call parameters using unified system
         generation_kwargs = self._prepare_generation_kwargs(**kwargs)
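For OpenAI, no text flattening is needed because the API accepts structured multimodal content directly. The sketch below shows the kind of user message the `OpenAIMediaHandler` presumably appends, using the standard OpenAI chat `image_url` content part with a base64 data URL (an assumed shape; the handler's actual output is defined in the new media module, not in this hunk):

```python
import base64
from pathlib import Path

def openai_style_user_message(prompt: str, image_path: str) -> dict:
    """Assumed shape: a text part plus an image_url part carrying a data URL."""
    b64 = base64.b64encode(Path(image_path).read_bytes()).decode("ascii")
    return {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}},
        ],
    }
```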