abstractcore 2.4.2-py3-none-any.whl → 2.4.4-py3-none-any.whl
This diff reflects the changes between two publicly released package versions, as they appear in the public registry, and is provided for informational purposes only.
- abstractcore/apps/app_config_utils.py +19 -0
- abstractcore/apps/summarizer.py +85 -56
- abstractcore/architectures/detection.py +15 -4
- abstractcore/assets/architecture_formats.json +1 -1
- abstractcore/assets/model_capabilities.json +420 -11
- abstractcore/core/interface.py +2 -0
- abstractcore/core/session.py +4 -0
- abstractcore/embeddings/manager.py +54 -16
- abstractcore/media/__init__.py +116 -148
- abstractcore/media/auto_handler.py +363 -0
- abstractcore/media/base.py +456 -0
- abstractcore/media/capabilities.py +335 -0
- abstractcore/media/types.py +300 -0
- abstractcore/media/vision_fallback.py +260 -0
- abstractcore/providers/anthropic_provider.py +18 -1
- abstractcore/providers/base.py +187 -0
- abstractcore/providers/huggingface_provider.py +111 -12
- abstractcore/providers/lmstudio_provider.py +88 -5
- abstractcore/providers/mlx_provider.py +33 -1
- abstractcore/providers/ollama_provider.py +37 -3
- abstractcore/providers/openai_provider.py +18 -1
- abstractcore/server/app.py +1390 -104
- abstractcore/tools/common_tools.py +12 -8
- abstractcore/utils/__init__.py +9 -5
- abstractcore/utils/cli.py +199 -17
- abstractcore/utils/message_preprocessor.py +182 -0
- abstractcore/utils/structured_logging.py +117 -16
- abstractcore/utils/version.py +1 -1
- {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/METADATA +214 -20
- {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/RECORD +34 -27
- {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/entry_points.txt +1 -0
- {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/WHEEL +0 -0
- {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/top_level.txt +0 -0
```diff
--- /dev/null
+++ b/abstractcore/media/capabilities.py
@@ -0,0 +1,335 @@
+"""
+Media capability detection and management for AbstractCore.
+
+This module provides comprehensive capability detection for multimodal models,
+leveraging the existing model_capabilities.json infrastructure to determine
+what media types and formats each model supports.
+"""
+
+from dataclasses import dataclass, field
+from typing import Dict, List, Any, Optional, Union
+from pathlib import Path
+import logging
+
+from ..architectures import get_model_capabilities
+from .types import MediaType
+
+
+@dataclass
+class MediaCapabilities:
+    """
+    Comprehensive media capabilities for a specific model.
+
+    This class aggregates all media-related capabilities from model_capabilities.json
+    and provides convenient methods for checking media support.
+    """
+    model_name: str
+
+    # Core media support
+    vision_support: bool = False
+    audio_support: bool = False
+    video_support: bool = False
+
+    # Image capabilities
+    max_images_per_message: int = 1
+    supported_image_formats: List[str] = field(default_factory=lambda: ['jpg', 'jpeg', 'png'])
+    image_resolutions: List[str] = field(default_factory=list)
+    max_image_size_bytes: int = 5 * 1024 * 1024  # 5MB default
+
+    # Document capabilities
+    document_support: bool = True  # Most models can handle text documents
+    max_document_size_bytes: int = 50 * 1024 * 1024  # 50MB default
+
+    # Provider-specific features
+    multimodal_message_support: bool = True
+    text_embedding_preferred: bool = False  # For local models
+    streaming_media_support: bool = False
+
+    # Advanced features
+    parallel_media_processing: bool = False
+    media_token_estimation: bool = True
+
+    @classmethod
+    def from_model_capabilities(cls, model: str, provider: str = None) -> 'MediaCapabilities':
+        """
+        Create MediaCapabilities from model_capabilities.json data.
+
+        Args:
+            model: Model name to look up capabilities for
+            provider: Optional provider name for provider-specific adjustments
+
+        Returns:
+            MediaCapabilities instance with detected capabilities
+        """
+        caps = get_model_capabilities(model)
+        if not caps:
+            caps = {}
+
+        # Base capabilities from JSON
+        instance = cls(
+            model_name=model,
+            vision_support=caps.get('vision_support', False),
+            audio_support=caps.get('audio_support', False),
+            video_support=caps.get('video_support', False),
+            image_resolutions=caps.get('image_resolutions', [])
+        )
+
+        # Provider-specific adjustments
+        if provider:
+            instance._apply_provider_adjustments(provider, caps)
+
+        # Model-specific adjustments based on model name patterns
+        instance._apply_model_adjustments(caps)
+
+        return instance
+
+    def _apply_provider_adjustments(self, provider: str, caps: Dict[str, Any]):
+        """Apply provider-specific capability adjustments."""
+        provider_lower = provider.lower()
+
+        if provider_lower == "openai":
+            self.max_images_per_message = 10 if "gpt-4o" in self.model_name.lower() else 1
+            self.max_image_size_bytes = 20 * 1024 * 1024  # 20MB for OpenAI
+            self.supported_image_formats = ['png', 'jpeg', 'jpg', 'gif', 'webp']
+            self.streaming_media_support = True
+
+        elif provider_lower == "anthropic":
+            self.max_images_per_message = 20  # Claude supports up to 20 images
+            self.max_image_size_bytes = 5 * 1024 * 1024  # 5MB for Anthropic
+            self.supported_image_formats = ['png', 'jpeg', 'jpg', 'gif', 'webp']
+            self.streaming_media_support = True
+
+        elif provider_lower in ["ollama", "mlx", "lmstudio"]:
+            self.text_embedding_preferred = True  # Local models often prefer text
+            self.multimodal_message_support = True
+            self.streaming_media_support = False
+            self.max_image_size_bytes = 10 * 1024 * 1024  # 10MB for local
+
+        elif provider_lower == "huggingface":
+            self.streaming_media_support = False
+            self.max_image_size_bytes = 15 * 1024 * 1024  # 15MB for HF
+
+    def _apply_model_adjustments(self, caps: Dict[str, Any]):
+        """Apply model-specific capability adjustments based on model patterns."""
+        model_lower = self.model_name.lower()
+
+        # Vision model patterns
+        if any(pattern in model_lower for pattern in ['vision', 'vl', 'visual']):
+            self.vision_support = True
+            if 'qwen' in model_lower:
+                self.max_images_per_message = 5  # Qwen-VL supports multiple images
+
+        # Multimodal model patterns
+        if any(pattern in model_lower for pattern in ['4o', 'multimodal', 'omni']):
+            self.vision_support = True
+            if 'audio' not in caps or caps.get('audio_support'):
+                self.audio_support = True
+
+        # Local model adjustments
+        if any(pattern in model_lower for pattern in ['llama', 'qwen', 'phi', 'gemma']):
+            self.text_embedding_preferred = True
+
+    def supports_media_type(self, media_type: MediaType) -> bool:
+        """Check if the model supports a specific media type."""
+        if media_type == MediaType.IMAGE:
+            return self.vision_support
+        elif media_type == MediaType.AUDIO:
+            return self.audio_support
+        elif media_type == MediaType.VIDEO:
+            return self.video_support
+        elif media_type in [MediaType.DOCUMENT, MediaType.TEXT]:
+            return self.document_support
+        return False
+
+    def get_image_limits(self) -> Dict[str, Any]:
+        """Get image-specific limits and capabilities."""
+        return {
+            'max_images_per_message': self.max_images_per_message,
+            'supported_formats': self.supported_image_formats,
+            'max_size_bytes': self.max_image_size_bytes,
+            'supported_resolutions': self.image_resolutions,
+            'vision_support': self.vision_support
+        }
+
+    def get_document_limits(self) -> Dict[str, Any]:
+        """Get document-specific limits and capabilities."""
+        return {
+            'max_size_bytes': self.max_document_size_bytes,
+            'document_support': self.document_support,
+            'text_embedding_preferred': self.text_embedding_preferred
+        }
+
+    def estimate_media_tokens(self, media_type: MediaType, content_size: int = 0) -> int:
+        """
+        Estimate token usage for media content.
+
+        Args:
+            media_type: Type of media
+            content_size: Size of content in bytes (optional)
+
+        Returns:
+            Estimated token count
+        """
+        if not self.media_token_estimation:
+            return 0
+
+        if media_type == MediaType.IMAGE and self.vision_support:
+            # Base token cost for images varies by model
+            model_lower = self.model_name.lower()
+            if 'gpt-4o' in model_lower:
+                return 85 + (170 * 4)  # Simplified GPT-4o calculation
+            elif 'claude' in model_lower:
+                return 1600  # Anthropic standard
+            else:
+                return 512  # Conservative estimate for local models
+
+        elif media_type in [MediaType.TEXT, MediaType.DOCUMENT]:
+            # Text content token estimation
+            if content_size > 0:
+                return content_size // 4  # ~4 chars per token
+            return 100  # Default estimate
+
+        return 0
+
+    def validate_media_content(self, media_type: MediaType, file_size: int = 0,
+                               format: str = None) -> tuple[bool, Optional[str]]:
+        """
+        Validate if media content meets model requirements.
+
+        Args:
+            media_type: Type of media
+            file_size: Size of file in bytes
+            format: File format/extension
+
+        Returns:
+            Tuple of (is_valid, error_message)
+        """
+        if not self.supports_media_type(media_type):
+            return False, f"Model {self.model_name} does not support {media_type.value} content"
+
+        if media_type == MediaType.IMAGE:
+            if file_size > self.max_image_size_bytes:
+                return False, f"Image size ({file_size} bytes) exceeds limit ({self.max_image_size_bytes} bytes)"
+
+            if format and format.lower() not in [f.lower() for f in self.supported_image_formats]:
+                return False, f"Image format '{format}' not supported. Supported: {self.supported_image_formats}"
+
+        elif media_type in [MediaType.DOCUMENT, MediaType.TEXT]:
+            if file_size > self.max_document_size_bytes:
+                return False, f"Document size ({file_size} bytes) exceeds limit ({self.max_document_size_bytes} bytes)"
+
+        return True, None
+
+    def get_processing_strategy(self, media_type: MediaType) -> str:
+        """
+        Get the recommended processing strategy for this media type.
+
+        Returns:
+            Processing strategy: 'multimodal', 'text_embedding', or 'unsupported'
+        """
+        if not self.supports_media_type(media_type):
+            return 'unsupported'
+
+        if media_type == MediaType.IMAGE and self.vision_support:
+            if self.text_embedding_preferred:
+                return 'text_embedding'  # Local models often prefer text description
+            else:
+                return 'multimodal'
+
+        elif media_type in [MediaType.DOCUMENT, MediaType.TEXT]:
+            return 'text_embedding'  # Always embed documents as text
+
+        return 'unsupported'
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for serialization."""
+        return {
+            'model_name': self.model_name,
+            'vision_support': self.vision_support,
+            'audio_support': self.audio_support,
+            'video_support': self.video_support,
+            'max_images_per_message': self.max_images_per_message,
+            'supported_image_formats': self.supported_image_formats,
+            'image_resolutions': self.image_resolutions,
+            'max_image_size_bytes': self.max_image_size_bytes,
+            'document_support': self.document_support,
+            'max_document_size_bytes': self.max_document_size_bytes,
+            'multimodal_message_support': self.multimodal_message_support,
+            'text_embedding_preferred': self.text_embedding_preferred,
+            'streaming_media_support': self.streaming_media_support,
+            'parallel_media_processing': self.parallel_media_processing,
+            'media_token_estimation': self.media_token_estimation
+        }
+
+
+def get_media_capabilities(model: str, provider: str = None) -> MediaCapabilities:
+    """
+    Get comprehensive media capabilities for a model.
+
+    Args:
+        model: Model name
+        provider: Optional provider name for provider-specific adjustments
+
+    Returns:
+        MediaCapabilities instance
+    """
+    return MediaCapabilities.from_model_capabilities(model, provider)
+
+
+def is_vision_model(model: str) -> bool:
+    """Quick check if a model supports vision."""
+    caps = get_media_capabilities(model)
+    return caps.vision_support
+
+
+def is_multimodal_model(model: str) -> bool:
+    """Quick check if a model supports any multimodal content."""
+    caps = get_media_capabilities(model)
+    return caps.vision_support or caps.audio_support or caps.video_support
+
+
+def get_supported_media_types(model: str, provider: str = None) -> List[MediaType]:
+    """
+    Get list of supported media types for a model.
+
+    Args:
+        model: Model name
+        provider: Optional provider name
+
+    Returns:
+        List of supported MediaType values
+    """
+    caps = get_media_capabilities(model, provider)
+    supported = []
+
+    if caps.vision_support:
+        supported.append(MediaType.IMAGE)
+    if caps.audio_support:
+        supported.append(MediaType.AUDIO)
+    if caps.video_support:
+        supported.append(MediaType.VIDEO)
+    if caps.document_support:
+        supported.extend([MediaType.DOCUMENT, MediaType.TEXT])
+
+    return supported
+
+
+# Convenience functions for common capability checks
+def supports_images(model: str, provider: str = None) -> bool:
+    """Check if model supports image processing."""
+    return get_media_capabilities(model, provider).vision_support
+
+
+def supports_documents(model: str, provider: str = None) -> bool:
+    """Check if model supports document processing."""
+    return get_media_capabilities(model, provider).document_support
+
+
+def get_max_images(model: str, provider: str = None) -> int:
+    """Get maximum images per message for model."""
+    return get_media_capabilities(model, provider).max_images_per_message
+
+
+def should_use_text_embedding(model: str, provider: str = None) -> bool:
+    """Check if model prefers text embedding over multimodal messages."""
+    return get_media_capabilities(model, provider).text_embedding_preferred
```
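
For orientation, here is a minimal usage sketch of the new capability API, inferred from the code above rather than from any package documentation. The import paths mirror the file locations in this diff, and the model/provider values are illustrative; unknown model names simply fall back to the dataclass defaults.

```python
from abstractcore.media.capabilities import get_media_capabilities
from abstractcore.media.types import MediaType

# Illustrative model/provider pair: the "openai" provider branch and the
# '4o' name pattern both fire, regardless of the model_capabilities.json entry.
caps = get_media_capabilities("gpt-4o", provider="openai")

print(caps.max_images_per_message)                    # 10 (OpenAI gpt-4o branch)
print(caps.get_processing_strategy(MediaType.IMAGE))  # 'multimodal'

# Pre-flight check: a 6 MB PNG is within the 20 MB OpenAI image limit.
ok, error = caps.validate_media_content(
    MediaType.IMAGE, file_size=6 * 1024 * 1024, format="png"
)
print(ok, error)  # True None
```

Note the ordering in `from_model_capabilities`: provider adjustments run before the name-pattern heuristics, so patterns such as `'4o'` or `'vl'` can enable `vision_support` even when the JSON entry omits it.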
```diff
--- /dev/null
+++ b/abstractcore/media/types.py
@@ -0,0 +1,300 @@
+"""
+Core media types and models for AbstractCore multimodal support.
+
+This module defines the fundamental data structures for handling various media types
+across different LLM providers, following AbstractCore's unified interface patterns.
+"""
+
+import base64
+import mimetypes
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Union, Dict, Any, Optional, List, Literal
+from enum import Enum
+
+from pydantic import BaseModel, Field, validator
+
+
+class MediaType(Enum):
+    """Supported media types for multimodal processing."""
+    IMAGE = "image"
+    DOCUMENT = "document"
+    AUDIO = "audio"
+    VIDEO = "video"
+    TEXT = "text"
+
+
+class ContentFormat(Enum):
+    """Different ways media content can be represented."""
+    BASE64 = "base64"
+    URL = "url"
+    FILE_PATH = "file_path"
+    TEXT = "text"
+    BINARY = "binary"
+    AUTO = "auto"
+
+
+@dataclass
+class MediaContent:
+    """
+    Represents a piece of media content with metadata.
+
+    This is the core data structure for all media handling in AbstractCore.
+    It provides a unified way to represent different types of content regardless
+    of the underlying provider.
+    """
+    media_type: MediaType
+    content: Union[str, bytes]
+    content_format: ContentFormat
+    mime_type: str
+    file_path: Optional[str] = None
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+    def __post_init__(self):
+        """Validate and normalize the MediaContent after initialization."""
+        # Auto-detect MIME type if not provided and we have a file path
+        if self.mime_type == "auto" and self.file_path:
+            detected_mime, _ = mimetypes.guess_type(self.file_path)
+            self.mime_type = detected_mime or "application/octet-stream"
+
+        # Ensure content format matches content type
+        if self.content_format == ContentFormat.BASE64 and isinstance(self.content, bytes):
+            self.content = base64.b64encode(self.content).decode('utf-8')
+        elif self.content_format == ContentFormat.TEXT and isinstance(self.content, bytes):
+            self.content = self.content.decode('utf-8')
+
+
+class MultimodalMessage(BaseModel):
+    """
+    A message that can contain both text and media content.
+
+    This follows the pattern of modern multimodal APIs where a single message
+    can contain multiple content elements of different types.
+    """
+    role: str = Field(..., description="Message role (user, assistant, system)")
+    content: List[Union[str, Dict[str, Any]]] = Field(
+        default_factory=list,
+        description="Mixed content list containing text strings and media objects"
+    )
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+
+    @validator('role')
+    def validate_role(cls, v):
+        valid_roles = {'user', 'assistant', 'system', 'tool'}
+        if v not in valid_roles:
+            raise ValueError(f"Role must be one of {valid_roles}")
+        return v
+
+    def add_text(self, text: str) -> None:
+        """Add text content to the message."""
+        self.content.append(text)
+
+    def add_media(self, media: MediaContent) -> None:
+        """Add media content to the message."""
+        media_dict = {
+            "type": "media",
+            "media_type": media.media_type.value,
+            "content": media.content,
+            "content_format": media.content_format.value,
+            "mime_type": media.mime_type,
+            "metadata": media.metadata
+        }
+        if media.file_path:
+            media_dict["file_path"] = media.file_path
+
+        self.content.append(media_dict)
+
+    def has_media(self) -> bool:
+        """Check if this message contains any media content."""
+        return any(
+            isinstance(item, dict) and item.get("type") == "media"
+            for item in self.content
+        )
+
+    def get_text_content(self) -> str:
+        """Extract all text content from the message."""
+        text_parts = [
+            item for item in self.content
+            if isinstance(item, str)
+        ]
+        return " ".join(text_parts)
+
+    def get_media_content(self) -> List[Dict[str, Any]]:
+        """Extract all media content from the message."""
+        return [
+            item for item in self.content
+            if isinstance(item, dict) and item.get("type") == "media"
+        ]
+
+
+@dataclass
+class MediaCapabilities:
+    """
+    Represents what media capabilities a provider/model supports.
+
+    This is used for intelligent routing and validation of media content
+    based on the target provider and model capabilities.
+    """
+    vision_support: bool = False
+    audio_support: bool = False
+    video_support: bool = False
+    document_support: bool = False
+
+    # Image-specific capabilities
+    max_image_resolution: Optional[str] = None
+    supported_image_formats: List[str] = field(default_factory=lambda: ["jpg", "png"])
+
+    # Document-specific capabilities
+    supported_document_formats: List[str] = field(default_factory=lambda: ["pdf", "txt"])
+
+    # Audio/Video capabilities
+    max_audio_duration: Optional[int] = None  # in seconds
+    max_video_duration: Optional[int] = None  # in seconds
+
+    # Provider-specific limits
+    max_file_size: Optional[int] = None  # in bytes
+    max_concurrent_media: int = 1
+
+    def supports_media_type(self, media_type: MediaType) -> bool:
+        """Check if this provider supports the given media type."""
+        support_map = {
+            MediaType.IMAGE: self.vision_support,
+            MediaType.AUDIO: self.audio_support,
+            MediaType.VIDEO: self.video_support,
+            MediaType.DOCUMENT: self.document_support,
+            MediaType.TEXT: True  # All providers support text
+        }
+        return support_map.get(media_type, False)
+
+    def supports_format(self, media_type: MediaType, format_ext: str) -> bool:
+        """Check if this provider supports the specific format."""
+        format_ext = format_ext.lower().lstrip('.')
+
+        if media_type == MediaType.IMAGE:
+            return format_ext in self.supported_image_formats
+        elif media_type == MediaType.DOCUMENT:
+            return format_ext in self.supported_document_formats
+        elif media_type in [MediaType.AUDIO, MediaType.VIDEO]:
+            # For now, assume basic support if the media type is supported
+            return self.supports_media_type(media_type)
+        else:
+            return True
+
+
+class MediaProcessingResult(BaseModel):
+    """
+    Result of processing a media file.
+
+    Contains the processed content and metadata about the processing operation.
+    """
+    success: bool
+    media_content: Optional[MediaContent] = None
+    error_message: Optional[str] = None
+    processing_time: Optional[float] = None
+    extracted_text: Optional[str] = None
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+
+    @property
+    def failed(self) -> bool:
+        """Check if processing failed."""
+        return not self.success
+
+
+# File extension mappings for quick media type detection
+FILE_TYPE_MAPPINGS = {
+    # Images
+    'jpg': MediaType.IMAGE, 'jpeg': MediaType.IMAGE, 'png': MediaType.IMAGE,
+    'gif': MediaType.IMAGE, 'bmp': MediaType.IMAGE, 'tif': MediaType.IMAGE,
+    'tiff': MediaType.IMAGE, 'webp': MediaType.IMAGE, 'ico': MediaType.IMAGE,
+
+    # Documents
+    'pdf': MediaType.DOCUMENT, 'doc': MediaType.DOCUMENT, 'docx': MediaType.DOCUMENT,
+    'xls': MediaType.DOCUMENT, 'xlsx': MediaType.DOCUMENT, 'ppt': MediaType.DOCUMENT,
+    'pptx': MediaType.DOCUMENT, 'odt': MediaType.DOCUMENT, 'rtf': MediaType.DOCUMENT,
+
+    # Text formats
+    'txt': MediaType.TEXT, 'md': MediaType.TEXT, 'csv': MediaType.TEXT,
+    'tsv': MediaType.TEXT, 'json': MediaType.TEXT, 'xml': MediaType.TEXT,
+    'html': MediaType.TEXT, 'htm': MediaType.TEXT,
+
+    # Audio
+    'mp3': MediaType.AUDIO, 'wav': MediaType.AUDIO, 'm4a': MediaType.AUDIO,
+    'ogg': MediaType.AUDIO, 'flac': MediaType.AUDIO, 'aac': MediaType.AUDIO,
+
+    # Video
+    'mp4': MediaType.VIDEO, 'avi': MediaType.VIDEO, 'mov': MediaType.VIDEO,
+    'mkv': MediaType.VIDEO, 'webm': MediaType.VIDEO, 'wmv': MediaType.VIDEO,
+}
+
+
+def detect_media_type(file_path: Union[str, Path]) -> MediaType:
+    """
+    Detect the media type of a file based on its extension.
+
+    Args:
+        file_path: Path to the file
+
+    Returns:
+        MediaType enum value
+    """
+    path = Path(file_path)
+    extension = path.suffix.lower().lstrip('.')
+
+    return FILE_TYPE_MAPPINGS.get(extension, MediaType.DOCUMENT)
+
+
+def create_media_content(
+    file_path: Union[str, Path],
+    content_format: ContentFormat = ContentFormat.AUTO,
+    mime_type: str = "auto"
+) -> MediaContent:
+    """
+    Create a MediaContent object from a file path.
+
+    Args:
+        file_path: Path to the media file
+        content_format: How to represent the content
+        mime_type: MIME type of the content (auto-detected if "auto")
+
+    Returns:
+        MediaContent object
+    """
+    path = Path(file_path)
+
+    if not path.exists():
+        raise FileNotFoundError(f"File not found: {file_path}")
+
+    media_type = detect_media_type(path)
+
+    # Auto-select content format based on media type
+    if content_format == ContentFormat.AUTO:
+        if media_type in [MediaType.IMAGE, MediaType.AUDIO, MediaType.VIDEO]:
+            content_format = ContentFormat.BASE64
+        else:
+            content_format = ContentFormat.TEXT
+
+    # Read and encode content based on format
+    if content_format == ContentFormat.BASE64:
+        with open(path, 'rb') as f:
+            content = base64.b64encode(f.read()).decode('utf-8')
+    elif content_format == ContentFormat.TEXT:
+        with open(path, 'r', encoding='utf-8') as f:
+            content = f.read()
+    elif content_format == ContentFormat.FILE_PATH:
+        content = str(path)
+    else:
+        with open(path, 'rb') as f:
+            content = f.read()
+
+    return MediaContent(
+        media_type=media_type,
+        content=content,
+        content_format=content_format,
+        mime_type=mime_type,
+        file_path=str(path),
+        metadata={
+            'file_size': path.stat().st_size,
+            'file_name': path.name,
+            'file_extension': path.suffix
+        }
+    )
```