abstractcore-2.4.4-py3-none-any.whl → abstractcore-2.4.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractcore/cli/__init__.py +9 -0
- abstractcore/cli/main.py +759 -0
- abstractcore/cli/vision_config.py +491 -0
- abstractcore/core/interface.py +7 -0
- abstractcore/core/session.py +27 -2
- abstractcore/media/handlers/__init__.py +16 -0
- abstractcore/media/handlers/anthropic_handler.py +326 -0
- abstractcore/media/handlers/local_handler.py +541 -0
- abstractcore/media/handlers/openai_handler.py +281 -0
- abstractcore/media/processors/__init__.py +13 -0
- abstractcore/media/processors/image_processor.py +610 -0
- abstractcore/media/processors/office_processor.py +490 -0
- abstractcore/media/processors/pdf_processor.py +485 -0
- abstractcore/media/processors/text_processor.py +557 -0
- abstractcore/media/utils/__init__.py +22 -0
- abstractcore/media/utils/image_scaler.py +306 -0
- abstractcore/providers/anthropic_provider.py +14 -2
- abstractcore/providers/base.py +24 -0
- abstractcore/providers/huggingface_provider.py +23 -9
- abstractcore/providers/lmstudio_provider.py +6 -1
- abstractcore/providers/mlx_provider.py +20 -7
- abstractcore/providers/ollama_provider.py +6 -1
- abstractcore/providers/openai_provider.py +6 -2
- abstractcore/tools/common_tools.py +651 -1
- abstractcore/utils/version.py +1 -1
- {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/METADATA +59 -9
- {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/RECORD +31 -17
- {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/entry_points.txt +2 -0
- {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/WHEEL +0 -0
- {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/top_level.txt +0 -0
abstractcore/media/handlers/local_handler.py
@@ -0,0 +1,541 @@
+"""
+Local provider media handler.
+
+This module provides media formatting capabilities for local providers
+like Ollama, MLX, LMStudio that handle media differently than cloud APIs.
+"""
+
+from pathlib import Path
+from typing import Dict, Any, List, Optional, Union
+
+from ..base import BaseProviderMediaHandler, MediaProcessingError
+from ..types import MediaContent, MediaType, ContentFormat
+
+# Import vision detection from existing architecture system
+try:
+    from ...architectures.detection import supports_vision
+    VISION_DETECTION_AVAILABLE = True
+except ImportError:
+    VISION_DETECTION_AVAILABLE = False
+    supports_vision = None
+
+
+class LocalMediaHandler(BaseProviderMediaHandler):
+    """
+    Media handler for local providers (Ollama, MLX, LMStudio).
+
+    Formats media content for local model providers that may have different
+    capabilities and formatting requirements than cloud APIs.
+    """
+
+    def __init__(self, provider_name: str, model_capabilities: Optional[Dict[str, Any]] = None, **kwargs):
+        """
+        Initialize local media handler.
+
+        Args:
+            provider_name: Name of the local provider ('ollama', 'mlx', 'lmstudio')
+            model_capabilities: Model capabilities from model_capabilities.json
+            **kwargs: Additional configuration including:
+                - model_name: Name of the specific model (for vision detection)
+        """
+        super().__init__(provider_name, model_capabilities, **kwargs)
+
+        # Store model name for vision capability detection
+        self.model_name = kwargs.get('model_name', None)
+
+        # Local provider configuration
+        self.max_image_size = kwargs.get('max_image_size', 10 * 1024 * 1024)  # 10MB for local
+        self.prefer_text_extraction = kwargs.get('prefer_text_extraction', True)
+        self.embed_images_in_text = kwargs.get('embed_images_in_text', False)
+
+        self.logger.debug(f"Initialized {provider_name} local media handler with model={self.model_name}, capabilities: {self.capabilities}")
+
+    def _process_internal(self, file_path: Path, media_type: MediaType, **kwargs) -> MediaContent:
+        """
+        Process file using appropriate processor for local providers.
+
+        Args:
+            file_path: Path to the file
+            media_type: Type of media
+            **kwargs: Processing options
+
+        Returns:
+            MediaContent formatted for local providers
+        """
+        # Local providers often prefer text extraction over binary content
+        processing_kwargs = kwargs.copy()
+
+        if media_type == MediaType.IMAGE:
+            if self.capabilities.vision_support:
+                from ..processors import ImageProcessor
+                # Pass model name for model-specific resolution optimization
+                if self.model_name:
+                    processing_kwargs['model_name'] = self.model_name
+                    processing_kwargs['prefer_max_resolution'] = True
+                processor = ImageProcessor(**processing_kwargs)
+            else:
+                # If no vision support, skip image processing
+                raise MediaProcessingError(f"Provider {self.provider_name} does not support image processing")
+
+        elif media_type == MediaType.DOCUMENT:
+            if file_path.suffix.lower() == '.pdf':
+                from ..processors import PDFProcessor
+                # Configure PDF processor for text-optimized output
+                processing_kwargs.update({
+                    'extract_images': False,  # Local providers typically don't need embedded images
+                    'markdown_output': True,  # Prefer markdown for better structure
+                    'preserve_tables': True
+                })
+                processor = PDFProcessor(**processing_kwargs)
+            else:
+                from ..processors import TextProcessor
+                processor = TextProcessor(**processing_kwargs)
+        else:
+            from ..processors import TextProcessor
+            processor = TextProcessor(**processing_kwargs)
+
+        # Process the file
+        result = processor.process_file(file_path, **processing_kwargs)
+
+        if not result.success:
+            raise MediaProcessingError(f"Failed to process {file_path}: {result.error_message}")
+
+        return result.media_content
+
+    def format_for_provider(self, media_content: MediaContent) -> Dict[str, Any]:
+        """
+        Format media content for local provider.
+
+        Args:
+            media_content: MediaContent to format
+
+        Returns:
+            Dictionary formatted for local provider
+        """
+        if self.provider_name == "ollama":
+            return self._format_for_ollama(media_content)
+        elif self.provider_name == "mlx":
+            return self._format_for_mlx(media_content)
+        elif self.provider_name == "lmstudio":
+            return self._format_for_lmstudio(media_content)
+        else:
+            # Generic local provider format
+            return self._format_generic_local(media_content)
+
+    def _format_for_ollama(self, media_content: MediaContent) -> Dict[str, Any]:
+        """
+        Format content for Ollama.
+
+        Ollama supports vision models and can handle base64 images.
+        """
+        if media_content.media_type == MediaType.IMAGE:
+            if not self.capabilities.vision_support:
+                raise MediaProcessingError("Ollama model does not support vision")
+
+            if media_content.content_format != ContentFormat.BASE64:
+                raise MediaProcessingError("Ollama image formatting requires base64 content")
+
+            # Ollama uses a simple base64 format
+            return {
+                "type": "image",
+                "data": media_content.content,
+                "mime_type": media_content.mime_type
+            }
+
+        else:
+            # Text content for Ollama
+            content = str(media_content.content)
+            return {
+                "type": "text",
+                "content": content
+            }
+
+    def _format_for_mlx(self, media_content: MediaContent) -> Dict[str, Any]:
+        """
+        Format content for MLX (Apple Silicon).
+
+        MLX may have specific optimizations for Apple hardware.
+        """
+        if media_content.media_type == MediaType.IMAGE:
+            if not self.capabilities.vision_support:
+                raise MediaProcessingError("MLX model does not support vision")
+
+            # MLX might prefer direct tensor conversion, but fall back to base64
+            return {
+                "type": "image_base64",
+                "content": media_content.content,
+                "mime_type": media_content.mime_type,
+                "metadata": media_content.metadata
+            }
+
+        else:
+            # Text content for MLX
+            content = str(media_content.content)
+            return {
+                "type": "text",
+                "content": content,
+                "format": media_content.content_format.value
+            }
+
+    def _format_for_lmstudio(self, media_content: MediaContent) -> Dict[str, Any]:
+        """
+        Format content for LMStudio.
+
+        LMStudio typically follows OpenAI-compatible formats but may have limitations.
+        """
+        if media_content.media_type == MediaType.IMAGE:
+            if not self.capabilities.vision_support:
+                raise MediaProcessingError("LMStudio model does not support vision")
+
+            # LMStudio may use OpenAI-compatible format
+            data_url = f"data:{media_content.mime_type};base64,{media_content.content}"
+            return {
+                "type": "image_url",
+                "image_url": {
+                    "url": data_url
+                }
+            }
+
+        else:
+            # Text content for LMStudio
+            content = str(media_content.content)
+            return {
+                "type": "text",
+                "text": content
+            }
+
+    def _format_generic_local(self, media_content: MediaContent) -> Dict[str, Any]:
+        """
+        Generic format for unknown local providers.
+        """
+        if media_content.media_type == MediaType.IMAGE:
+            return {
+                "type": "image",
+                "content": media_content.content,
+                "content_format": media_content.content_format.value,
+                "mime_type": media_content.mime_type
+            }
+        else:
+            return {
+                "type": "text",
+                "content": str(media_content.content)
+            }
+
+    def create_multimodal_message(self, text: str, media_contents: List[MediaContent]) -> Union[Dict[str, Any], str]:
+        """
+        Create a multimodal message for local provider with intelligent vision routing.
+
+        Args:
+            text: Text content
+            media_contents: List of media contents
+
+        Returns:
+            Formatted message (structured dict for vision models, string for text-only)
+        """
+        # Check if we have images in the media contents
+        has_images = any(mc.media_type == MediaType.IMAGE for mc in media_contents)
+
+        if not has_images:
+            # No images - use text embedding for efficiency
+            self.logger.debug("No images detected, using text-embedded format")
+            return self._create_text_embedded_message(text, media_contents)
+
+        # We have images - check vision capabilities
+        provider_vision_support = self.capabilities.vision_support if self.capabilities else False
+
+        # Check model-level vision support using existing detection system
+        model_vision_support = False
+        if VISION_DETECTION_AVAILABLE and self.model_name and supports_vision:
+            try:
+                model_vision_support = supports_vision(self.model_name)
+                self.logger.debug(f"Model '{self.model_name}' vision support: {model_vision_support}")
+            except Exception as e:
+                self.logger.warning(f"Failed to detect vision support for model '{self.model_name}': {e}")
+                model_vision_support = False
+        elif not VISION_DETECTION_AVAILABLE:
+            self.logger.warning("Vision detection system not available - falling back to provider capabilities only")
+        elif not self.model_name:
+            self.logger.warning("No model name provided - cannot check model-specific vision capabilities")
+
+        # Decision logic: require BOTH provider AND model support for structured format
+        if provider_vision_support and model_vision_support:
+            self.logger.debug(f"Using structured format for vision model '{self.model_name}' on provider '{self.provider_name}'")
+            try:
+                return self._create_structured_message(text, media_contents)
+            except Exception as e:
+                self.logger.error(f"Failed to create structured message for vision model: {e}")
+                self.logger.warning("Falling back to text-embedded format")
+                return self._create_text_embedded_message(text, media_contents)
+
+        # Handle capability mismatches with detailed warnings
+        if has_images and not model_vision_support and self.model_name:
+            self.logger.info(
+                f"Model '{self.model_name}' does not support vision. "
+                f"Using vision fallback system for image analysis."
+            )
+        elif has_images and not provider_vision_support and not self.model_name:
+            self.logger.info(
+                f"No model-specific vision capabilities detected for provider '{self.provider_name}'. "
+                f"Using vision fallback system for image analysis."
+            )
+        elif has_images and not self.model_name:
+            self.logger.info(
+                f"No model name available for vision detection. "
+                f"Using vision fallback system for image analysis."
+            )
+
+        # Fallback to text-embedded format
+        self.logger.debug("Using text-embedded format due to insufficient vision capabilities")
+        return self._create_text_embedded_message(text, media_contents)
+
+    def _create_text_embedded_message(self, text: str, media_contents: List[MediaContent]) -> str:
+        """
+        Create a message with media content embedded as text.
+
+        This is often more reliable for local providers that don't have
+        robust multimodal support. For images on text-only models, uses vision fallback.
+        """
+        message_parts = []
+
+        # Add main text
+        if text.strip():
+            message_parts.append(text)
+
+        # Add processed content from media
+        for i, media_content in enumerate(media_contents):
+            if media_content.media_type == MediaType.IMAGE:
+                if self.capabilities.vision_support:
+                    # For vision models, we'll still need to handle images specially
+                    # This will be handled by the provider's generate method
+                    message_parts.append(f"[Image {i+1}: {media_content.metadata.get('file_name', 'image')}]")
+                else:
+                    # Use vision fallback for text-only models
+                    try:
+                        from ..vision_fallback import VisionFallbackHandler, VisionNotConfiguredError
+                        fallback_handler = VisionFallbackHandler()
+
+                        # Get the actual file path from media_content object
+                        file_path = media_content.file_path or media_content.metadata.get('file_path') or media_content.metadata.get('file_name', 'image')
+
+                        # Generate description using vision fallback
+                        description = fallback_handler.create_description(str(file_path), text)
+                        # Remove the original question from message_parts if it exists
+                        if message_parts and text.strip() in message_parts[0]:
+                            message_parts.clear()
+                        # Completely different approach: make model think it's continuing its own observation
+                        # No questions, no external framing - just natural continuation
+                        simple_prompt = f"{description}"
+                        message_parts.append(simple_prompt)
+
+                    except VisionNotConfiguredError as e:
+                        # Vision not configured - show warning to USER, not model
+                        self.logger.warning("Vision capability not configured for text-only models")
+                        self.logger.warning("To enable image analysis with text-only models:")
+                        self.logger.warning("🔸 EASIEST: Download BLIP vision model (990MB): abstractcore --download-vision-model")
+                        self.logger.warning("🔸 Use existing Ollama model: abstractcore --set-vision-caption qwen2.5vl:7b")
+                        self.logger.warning("🔸 Use cloud API: abstractcore --set-vision-provider openai --model gpt-4o")
+                        self.logger.warning("🔸 Interactive setup: abstractcore --configure")
+                        self.logger.warning("Current status: abstractcore --status")
+
+                        # Provide minimal placeholder to model (not configuration instructions!)
+                        file_name = media_content.metadata.get('file_name', 'image')
+                        message_parts.append(f"[Image {i+1}: {file_name}]")
+
+                    except Exception as e:
+                        self.logger.warning(f"Vision fallback failed: {e}")
+                        # Fallback to basic placeholder
+                        file_name = media_content.metadata.get('file_name', 'image')
+                        message_parts.append(f"[Image {i+1}: {file_name} - vision processing unavailable]")
+            else:
+                # Embed text/document content directly
+                content = str(media_content.content)
+                file_name = media_content.metadata.get('file_name', f'document_{i+1}')
+                message_parts.append(f"\n\n--- Content from {file_name} ---\n{content}\n--- End of {file_name} ---")
+
+        return "\n\n".join(message_parts)
+
+    def _create_structured_message(self, text: str, media_contents: List[MediaContent]) -> Dict[str, Any]:
+        """
+        Create a structured message for local providers using provider-specific format.
+        """
+        if self.provider_name == "ollama":
+            return self._create_ollama_message(text, media_contents)
+        elif self.provider_name == "lmstudio":
+            return self._create_lmstudio_message(text, media_contents)
+        else:
+            # Generic structured format for other providers
+            return self._create_generic_structured_message(text, media_contents)
+
+    def _create_ollama_message(self, text: str, media_contents: List[MediaContent]) -> Dict[str, Any]:
+        """
+        Create Ollama-specific multimodal message format.
+
+        Ollama expects: {"role": "user", "content": "text", "images": ["base64..."]}
+        """
+        message = {
+            "role": "user",
+            "content": text.strip() if text.strip() else "What's in this image?"
+        }
+
+        # Extract base64 images for Ollama's images array
+        images = []
+        for media_content in media_contents:
+            if media_content.media_type == MediaType.IMAGE and self.can_handle_media(media_content):
+                if media_content.content_format == ContentFormat.BASE64:
+                    # Ollama expects raw base64 without data URL prefix
+                    images.append(media_content.content)
+                else:
+                    self.logger.warning(f"Ollama requires base64 image format, got {media_content.content_format}")
+
+        if images:
+            message["images"] = images
+
+        return message
+
+    def _create_lmstudio_message(self, text: str, media_contents: List[MediaContent]) -> Dict[str, Any]:
+        """
+        Create LMStudio-specific multimodal message format.
+
+        LMStudio follows OpenAI-compatible format with structured content array.
+        """
+        content = []
+
+        # Add text content
+        if text.strip():
+            content.append({
+                "type": "text",
+                "text": text
+            })
+
+        # Add images in OpenAI format
+        for media_content in media_contents:
+            if media_content.media_type == MediaType.IMAGE and self.can_handle_media(media_content):
+                if media_content.content_format == ContentFormat.BASE64:
+                    data_url = f"data:{media_content.mime_type};base64,{media_content.content}"
+                    content.append({
+                        "type": "image_url",
+                        "image_url": {
+                            "url": data_url
+                        }
+                    })
+                else:
+                    self.logger.warning(f"LMStudio requires base64 image format, got {media_content.content_format}")
+
+        return {
+            "role": "user",
+            "content": content
+        }
+
+    def _create_generic_structured_message(self, text: str, media_contents: List[MediaContent]) -> Dict[str, Any]:
+        """
+        Create generic structured message for unknown local providers.
+        """
+        content = []
+
+        # Add text content
+        if text.strip():
+            content.append({
+                "type": "text",
+                "content": text
+            })
+
+        # Add media contents using provider-specific formatting
+        for media_content in media_contents:
+            if self.can_handle_media(media_content):
+                formatted_content = self.format_for_provider(media_content)
+                content.append(formatted_content)
+
+        return {
+            "role": "user",
+            "content": content
+        }
+
+    def validate_media_for_model(self, media_content: MediaContent, model: str) -> bool:
+        """
+        Validate if media content is compatible with local model.
+
+        Args:
+            media_content: MediaContent to validate
+            model: Local model name
+
+        Returns:
+            True if compatible, False otherwise
+        """
+        model_lower = model.lower()
+
+        # Image validation
+        if media_content.media_type == MediaType.IMAGE:
+            # Check if model supports vision
+            vision_support = self.model_capabilities.get('vision_support', False)
+            if not vision_support:
+                return False
+
+            # Local models are generally more permissive with image sizes
+            # but still check reasonable limits
+            file_size = media_content.metadata.get('file_size', 0)
+            if file_size > self.max_image_size:
+                return False
+
+            # Model-specific checks for known vision models
+            vision_keywords = ['vision', 'vl', 'multimodal', 'llava', 'qwen2-vl', 'qwen3-vl']
+            if any(keyword in model_lower for keyword in vision_keywords):
+                return True
+
+            return vision_support
+
+        # Text/document validation
+        elif media_content.media_type in [MediaType.TEXT, MediaType.DOCUMENT]:
+            # All local models support text
+            return True
+
+        return False
+
+    def estimate_tokens_for_media(self, media_content: MediaContent) -> int:
+        """
+        Estimate token usage for media content with local models.
+
+        Args:
+            media_content: MediaContent to estimate
+
+        Returns:
+            Estimated token count
+        """
+        if media_content.media_type == MediaType.IMAGE:
+            # Local vision models typically use fewer tokens than cloud models
+            # but this varies significantly by model architecture
+            return 512  # Conservative estimate
+
+        elif media_content.media_type in [MediaType.TEXT, MediaType.DOCUMENT]:
+            # Local models typically use similar tokenization to their base models
+            content_length = len(str(media_content.content))
+            return content_length // 4  # Rough estimate
+
+        return 0
+
+    def get_model_media_limits(self, model: str) -> Dict[str, Any]:
+        """
+        Get media-specific limits for local model.
+
+        Args:
+            model: Local model name
+
+        Returns:
+            Dictionary of limits
+        """
+        limits = {
+            'max_images_per_message': 1,  # Most local models support only 1 image
+            'max_image_size_bytes': self.max_image_size,
+            'supported_image_formats': ['png', 'jpeg', 'jpg'],
+            'prefers_text_extraction': self.prefer_text_extraction
+        }
+
+        model_lower = model.lower()
+
+        # Adjust limits based on known model capabilities
+        if 'qwen' in model_lower and ('vl' in model_lower or 'vision' in model_lower):
+            limits.update({
+                'max_images_per_message': 5,  # Qwen-VL models can handle multiple images
+                'supported_image_formats': ['png', 'jpeg', 'jpg', 'gif', 'bmp']
+            })
+
+        return limits
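
For orientation, a minimal usage sketch of the new handler, based only on the signatures visible in this diff. The import path is inferred from the file location, and the empty media list is a stand-in: MediaContent objects are normally produced by the processors added elsewhere in this release, and when images are attached, create_multimodal_message routes between the structured and text-embedded formats shown above.

# Sketch only: import path inferred from abstractcore/media/handlers/local_handler.py
from abstractcore.media.handlers.local_handler import LocalMediaHandler

# Handler for Ollama; passing model_name lets the handler check model-level vision support
handler = LocalMediaHandler("ollama", model_name="qwen2.5vl:7b")

# With no images attached, the handler falls through to the text-embedded path
# and returns a plain string ready to send to the local model.
message = handler.create_multimodal_message("Summarize the attached documents.", [])
print(message)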