abstractcore-2.4.4-py3-none-any.whl → abstractcore-2.4.6-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (31)
  1. abstractcore/cli/__init__.py +9 -0
  2. abstractcore/cli/main.py +759 -0
  3. abstractcore/cli/vision_config.py +491 -0
  4. abstractcore/core/interface.py +7 -0
  5. abstractcore/core/session.py +27 -2
  6. abstractcore/media/handlers/__init__.py +16 -0
  7. abstractcore/media/handlers/anthropic_handler.py +326 -0
  8. abstractcore/media/handlers/local_handler.py +541 -0
  9. abstractcore/media/handlers/openai_handler.py +281 -0
  10. abstractcore/media/processors/__init__.py +13 -0
  11. abstractcore/media/processors/image_processor.py +610 -0
  12. abstractcore/media/processors/office_processor.py +490 -0
  13. abstractcore/media/processors/pdf_processor.py +485 -0
  14. abstractcore/media/processors/text_processor.py +557 -0
  15. abstractcore/media/utils/__init__.py +22 -0
  16. abstractcore/media/utils/image_scaler.py +306 -0
  17. abstractcore/providers/anthropic_provider.py +14 -2
  18. abstractcore/providers/base.py +24 -0
  19. abstractcore/providers/huggingface_provider.py +23 -9
  20. abstractcore/providers/lmstudio_provider.py +6 -1
  21. abstractcore/providers/mlx_provider.py +20 -7
  22. abstractcore/providers/ollama_provider.py +6 -1
  23. abstractcore/providers/openai_provider.py +6 -2
  24. abstractcore/tools/common_tools.py +651 -1
  25. abstractcore/utils/version.py +1 -1
  26. {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/METADATA +59 -9
  27. {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/RECORD +31 -17
  28. {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/entry_points.txt +2 -0
  29. {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/WHEEL +0 -0
  30. {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/licenses/LICENSE +0 -0
  31. {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/top_level.txt +0 -0
abstractcore/media/handlers/local_handler.py (new file)
@@ -0,0 +1,541 @@
+"""
+Local provider media handler.
+
+This module provides media formatting capabilities for local providers
+like Ollama, MLX, LMStudio that handle media differently than cloud APIs.
+"""
+
+from pathlib import Path
+from typing import Dict, Any, List, Optional, Union
+
+from ..base import BaseProviderMediaHandler, MediaProcessingError
+from ..types import MediaContent, MediaType, ContentFormat
+
+# Import vision detection from existing architecture system
+try:
+    from ...architectures.detection import supports_vision
+    VISION_DETECTION_AVAILABLE = True
+except ImportError:
+    VISION_DETECTION_AVAILABLE = False
+    supports_vision = None
+
+
+class LocalMediaHandler(BaseProviderMediaHandler):
+    """
+    Media handler for local providers (Ollama, MLX, LMStudio).
+
+    Formats media content for local model providers that may have different
+    capabilities and formatting requirements than cloud APIs.
+    """
+
+    def __init__(self, provider_name: str, model_capabilities: Optional[Dict[str, Any]] = None, **kwargs):
+        """
+        Initialize local media handler.
+
+        Args:
+            provider_name: Name of the local provider ('ollama', 'mlx', 'lmstudio')
+            model_capabilities: Model capabilities from model_capabilities.json
+            **kwargs: Additional configuration including:
+                - model_name: Name of the specific model (for vision detection)
+        """
+        super().__init__(provider_name, model_capabilities, **kwargs)
+
+        # Store model name for vision capability detection
+        self.model_name = kwargs.get('model_name', None)
+
+        # Local provider configuration
+        self.max_image_size = kwargs.get('max_image_size', 10 * 1024 * 1024)  # 10MB for local
+        self.prefer_text_extraction = kwargs.get('prefer_text_extraction', True)
+        self.embed_images_in_text = kwargs.get('embed_images_in_text', False)
+
+        self.logger.debug(f"Initialized {provider_name} local media handler with model={self.model_name}, capabilities: {self.capabilities}")
+
+    def _process_internal(self, file_path: Path, media_type: MediaType, **kwargs) -> MediaContent:
+        """
+        Process file using appropriate processor for local providers.
+
+        Args:
+            file_path: Path to the file
+            media_type: Type of media
+            **kwargs: Processing options
+
+        Returns:
+            MediaContent formatted for local providers
+        """
+        # Local providers often prefer text extraction over binary content
+        processing_kwargs = kwargs.copy()
+
+        if media_type == MediaType.IMAGE:
+            if self.capabilities.vision_support:
+                from ..processors import ImageProcessor
+                # Pass model name for model-specific resolution optimization
+                if self.model_name:
+                    processing_kwargs['model_name'] = self.model_name
+                    processing_kwargs['prefer_max_resolution'] = True
+                processor = ImageProcessor(**processing_kwargs)
+            else:
+                # If no vision support, skip image processing
+                raise MediaProcessingError(f"Provider {self.provider_name} does not support image processing")
+
+        elif media_type == MediaType.DOCUMENT:
+            if file_path.suffix.lower() == '.pdf':
+                from ..processors import PDFProcessor
+                # Configure PDF processor for text-optimized output
+                processing_kwargs.update({
+                    'extract_images': False,  # Local providers typically don't need embedded images
+                    'markdown_output': True,  # Prefer markdown for better structure
+                    'preserve_tables': True
+                })
+                processor = PDFProcessor(**processing_kwargs)
+            else:
+                from ..processors import TextProcessor
+                processor = TextProcessor(**processing_kwargs)
+        else:
+            from ..processors import TextProcessor
+            processor = TextProcessor(**processing_kwargs)
+
+        # Process the file
+        result = processor.process_file(file_path, **processing_kwargs)
+
+        if not result.success:
+            raise MediaProcessingError(f"Failed to process {file_path}: {result.error_message}")
+
+        return result.media_content
+
+    def format_for_provider(self, media_content: MediaContent) -> Dict[str, Any]:
+        """
+        Format media content for local provider.
+
+        Args:
+            media_content: MediaContent to format
+
+        Returns:
+            Dictionary formatted for local provider
+        """
+        if self.provider_name == "ollama":
+            return self._format_for_ollama(media_content)
+        elif self.provider_name == "mlx":
+            return self._format_for_mlx(media_content)
+        elif self.provider_name == "lmstudio":
+            return self._format_for_lmstudio(media_content)
+        else:
+            # Generic local provider format
+            return self._format_generic_local(media_content)
+
+    def _format_for_ollama(self, media_content: MediaContent) -> Dict[str, Any]:
+        """
+        Format content for Ollama.
+
+        Ollama supports vision models and can handle base64 images.
+        """
+        if media_content.media_type == MediaType.IMAGE:
+            if not self.capabilities.vision_support:
+                raise MediaProcessingError("Ollama model does not support vision")
+
+            if media_content.content_format != ContentFormat.BASE64:
+                raise MediaProcessingError("Ollama image formatting requires base64 content")
+
+            # Ollama uses a simple base64 format
+            return {
+                "type": "image",
+                "data": media_content.content,
+                "mime_type": media_content.mime_type
+            }
+
+        else:
+            # Text content for Ollama
+            content = str(media_content.content)
+            return {
+                "type": "text",
+                "content": content
+            }
+
+    def _format_for_mlx(self, media_content: MediaContent) -> Dict[str, Any]:
+        """
+        Format content for MLX (Apple Silicon).
+
+        MLX may have specific optimizations for Apple hardware.
+        """
+        if media_content.media_type == MediaType.IMAGE:
+            if not self.capabilities.vision_support:
+                raise MediaProcessingError("MLX model does not support vision")
+
+            # MLX might prefer direct tensor conversion, but fall back to base64
+            return {
+                "type": "image_base64",
+                "content": media_content.content,
+                "mime_type": media_content.mime_type,
+                "metadata": media_content.metadata
+            }
+
+        else:
+            # Text content for MLX
+            content = str(media_content.content)
+            return {
+                "type": "text",
+                "content": content,
+                "format": media_content.content_format.value
+            }
+
+    def _format_for_lmstudio(self, media_content: MediaContent) -> Dict[str, Any]:
+        """
+        Format content for LMStudio.
+
+        LMStudio typically follows OpenAI-compatible formats but may have limitations.
+        """
+        if media_content.media_type == MediaType.IMAGE:
+            if not self.capabilities.vision_support:
+                raise MediaProcessingError("LMStudio model does not support vision")
+
+            # LMStudio may use OpenAI-compatible format
+            data_url = f"data:{media_content.mime_type};base64,{media_content.content}"
+            return {
+                "type": "image_url",
+                "image_url": {
+                    "url": data_url
+                }
+            }
+
+        else:
+            # Text content for LMStudio
+            content = str(media_content.content)
+            return {
+                "type": "text",
+                "text": content
+            }
+
+    def _format_generic_local(self, media_content: MediaContent) -> Dict[str, Any]:
+        """
+        Generic format for unknown local providers.
+        """
+        if media_content.media_type == MediaType.IMAGE:
+            return {
+                "type": "image",
+                "content": media_content.content,
+                "content_format": media_content.content_format.value,
+                "mime_type": media_content.mime_type
+            }
+        else:
+            return {
+                "type": "text",
+                "content": str(media_content.content)
+            }
+
+    def create_multimodal_message(self, text: str, media_contents: List[MediaContent]) -> Union[Dict[str, Any], str]:
+        """
+        Create a multimodal message for local provider with intelligent vision routing.
+
+        Args:
+            text: Text content
+            media_contents: List of media contents
+
+        Returns:
+            Formatted message (structured dict for vision models, string for text-only)
+        """
+        # Check if we have images in the media contents
+        has_images = any(mc.media_type == MediaType.IMAGE for mc in media_contents)
+
+        if not has_images:
+            # No images - use text embedding for efficiency
+            self.logger.debug("No images detected, using text-embedded format")
+            return self._create_text_embedded_message(text, media_contents)
+
+        # We have images - check vision capabilities
+        provider_vision_support = self.capabilities.vision_support if self.capabilities else False
+
+        # Check model-level vision support using existing detection system
+        model_vision_support = False
+        if VISION_DETECTION_AVAILABLE and self.model_name and supports_vision:
+            try:
+                model_vision_support = supports_vision(self.model_name)
+                self.logger.debug(f"Model '{self.model_name}' vision support: {model_vision_support}")
+            except Exception as e:
+                self.logger.warning(f"Failed to detect vision support for model '{self.model_name}': {e}")
+                model_vision_support = False
+        elif not VISION_DETECTION_AVAILABLE:
+            self.logger.warning("Vision detection system not available - falling back to provider capabilities only")
+        elif not self.model_name:
+            self.logger.warning("No model name provided - cannot check model-specific vision capabilities")
+
+        # Decision logic: require BOTH provider AND model support for structured format
+        if provider_vision_support and model_vision_support:
+            self.logger.debug(f"Using structured format for vision model '{self.model_name}' on provider '{self.provider_name}'")
+            try:
+                return self._create_structured_message(text, media_contents)
+            except Exception as e:
+                self.logger.error(f"Failed to create structured message for vision model: {e}")
+                self.logger.warning("Falling back to text-embedded format")
+                return self._create_text_embedded_message(text, media_contents)
+
+        # Handle capability mismatches with detailed warnings
+        if has_images and not model_vision_support and self.model_name:
+            self.logger.info(
+                f"Model '{self.model_name}' does not support vision. "
+                f"Using vision fallback system for image analysis."
+            )
+        elif has_images and not provider_vision_support and not self.model_name:
+            self.logger.info(
+                f"No model-specific vision capabilities detected for provider '{self.provider_name}'. "
+                f"Using vision fallback system for image analysis."
+            )
+        elif has_images and not self.model_name:
+            self.logger.info(
+                f"No model name available for vision detection. "
+                f"Using vision fallback system for image analysis."
+            )
+
+        # Fallback to text-embedded format
+        self.logger.debug("Using text-embedded format due to insufficient vision capabilities")
+        return self._create_text_embedded_message(text, media_contents)
+
+    def _create_text_embedded_message(self, text: str, media_contents: List[MediaContent]) -> str:
+        """
+        Create a message with media content embedded as text.
+
+        This is often more reliable for local providers that don't have
+        robust multimodal support. For images on text-only models, uses vision fallback.
+        """
+        message_parts = []
+
+        # Add main text
+        if text.strip():
+            message_parts.append(text)
+
+        # Add processed content from media
+        for i, media_content in enumerate(media_contents):
+            if media_content.media_type == MediaType.IMAGE:
+                if self.capabilities.vision_support:
+                    # For vision models, we'll still need to handle images specially
+                    # This will be handled by the provider's generate method
+                    message_parts.append(f"[Image {i+1}: {media_content.metadata.get('file_name', 'image')}]")
+                else:
+                    # Use vision fallback for text-only models
+                    try:
+                        from ..vision_fallback import VisionFallbackHandler, VisionNotConfiguredError
+                        fallback_handler = VisionFallbackHandler()
+
+                        # Get the actual file path from media_content object
+                        file_path = media_content.file_path or media_content.metadata.get('file_path') or media_content.metadata.get('file_name', 'image')
+
+                        # Generate description using vision fallback
+                        description = fallback_handler.create_description(str(file_path), text)
+                        # Remove the original question from message_parts if it exists
+                        if message_parts and text.strip() in message_parts[0]:
+                            message_parts.clear()
+                        # Completely different approach: make model think it's continuing its own observation
+                        # No questions, no external framing - just natural continuation
+                        simple_prompt = f"{description}"
+                        message_parts.append(simple_prompt)
+
+                    except VisionNotConfiguredError as e:
+                        # Vision not configured - show warning to USER, not model
+                        self.logger.warning("Vision capability not configured for text-only models")
+                        self.logger.warning("To enable image analysis with text-only models:")
+                        self.logger.warning("🔸 EASIEST: Download BLIP vision model (990MB): abstractcore --download-vision-model")
+                        self.logger.warning("🔸 Use existing Ollama model: abstractcore --set-vision-caption qwen2.5vl:7b")
+                        self.logger.warning("🔸 Use cloud API: abstractcore --set-vision-provider openai --model gpt-4o")
+                        self.logger.warning("🔸 Interactive setup: abstractcore --configure")
+                        self.logger.warning("Current status: abstractcore --status")
+
+                        # Provide minimal placeholder to model (not configuration instructions!)
+                        file_name = media_content.metadata.get('file_name', 'image')
+                        message_parts.append(f"[Image {i+1}: {file_name}]")
+
+                    except Exception as e:
+                        self.logger.warning(f"Vision fallback failed: {e}")
+                        # Fallback to basic placeholder
+                        file_name = media_content.metadata.get('file_name', 'image')
+                        message_parts.append(f"[Image {i+1}: {file_name} - vision processing unavailable]")
+            else:
+                # Embed text/document content directly
+                content = str(media_content.content)
+                file_name = media_content.metadata.get('file_name', f'document_{i+1}')
+                message_parts.append(f"\n\n--- Content from {file_name} ---\n{content}\n--- End of {file_name} ---")
+
+        return "\n\n".join(message_parts)
+
+    def _create_structured_message(self, text: str, media_contents: List[MediaContent]) -> Dict[str, Any]:
+        """
+        Create a structured message for local providers using provider-specific format.
+        """
+        if self.provider_name == "ollama":
+            return self._create_ollama_message(text, media_contents)
+        elif self.provider_name == "lmstudio":
+            return self._create_lmstudio_message(text, media_contents)
+        else:
+            # Generic structured format for other providers
+            return self._create_generic_structured_message(text, media_contents)
+
+    def _create_ollama_message(self, text: str, media_contents: List[MediaContent]) -> Dict[str, Any]:
+        """
+        Create Ollama-specific multimodal message format.
+
+        Ollama expects: {"role": "user", "content": "text", "images": ["base64..."]}
+        """
+        message = {
+            "role": "user",
+            "content": text.strip() if text.strip() else "What's in this image?"
+        }
+
+        # Extract base64 images for Ollama's images array
+        images = []
+        for media_content in media_contents:
+            if media_content.media_type == MediaType.IMAGE and self.can_handle_media(media_content):
+                if media_content.content_format == ContentFormat.BASE64:
+                    # Ollama expects raw base64 without data URL prefix
+                    images.append(media_content.content)
+                else:
+                    self.logger.warning(f"Ollama requires base64 image format, got {media_content.content_format}")
+
+        if images:
+            message["images"] = images
+
+        return message
+
+    def _create_lmstudio_message(self, text: str, media_contents: List[MediaContent]) -> Dict[str, Any]:
+        """
+        Create LMStudio-specific multimodal message format.
+
+        LMStudio follows OpenAI-compatible format with structured content array.
+        """
+        content = []
+
+        # Add text content
+        if text.strip():
+            content.append({
+                "type": "text",
+                "text": text
+            })
+
+        # Add images in OpenAI format
+        for media_content in media_contents:
+            if media_content.media_type == MediaType.IMAGE and self.can_handle_media(media_content):
+                if media_content.content_format == ContentFormat.BASE64:
+                    data_url = f"data:{media_content.mime_type};base64,{media_content.content}"
+                    content.append({
+                        "type": "image_url",
+                        "image_url": {
+                            "url": data_url
+                        }
+                    })
+                else:
+                    self.logger.warning(f"LMStudio requires base64 image format, got {media_content.content_format}")
+
+        return {
+            "role": "user",
+            "content": content
+        }
+
+    def _create_generic_structured_message(self, text: str, media_contents: List[MediaContent]) -> Dict[str, Any]:
+        """
+        Create generic structured message for unknown local providers.
+        """
+        content = []
+
+        # Add text content
+        if text.strip():
+            content.append({
+                "type": "text",
+                "content": text
+            })
+
+        # Add media contents using provider-specific formatting
+        for media_content in media_contents:
+            if self.can_handle_media(media_content):
+                formatted_content = self.format_for_provider(media_content)
+                content.append(formatted_content)
+
+        return {
+            "role": "user",
+            "content": content
+        }
+
+    def validate_media_for_model(self, media_content: MediaContent, model: str) -> bool:
+        """
+        Validate if media content is compatible with local model.
+
+        Args:
+            media_content: MediaContent to validate
+            model: Local model name
+
+        Returns:
+            True if compatible, False otherwise
+        """
+        model_lower = model.lower()
+
+        # Image validation
+        if media_content.media_type == MediaType.IMAGE:
+            # Check if model supports vision
+            vision_support = self.model_capabilities.get('vision_support', False)
+            if not vision_support:
+                return False
+
+            # Local models are generally more permissive with image sizes
+            # but still check reasonable limits
+            file_size = media_content.metadata.get('file_size', 0)
+            if file_size > self.max_image_size:
+                return False
+
+            # Model-specific checks for known vision models
+            vision_keywords = ['vision', 'vl', 'multimodal', 'llava', 'qwen2-vl', 'qwen3-vl']
+            if any(keyword in model_lower for keyword in vision_keywords):
+                return True
+
+            return vision_support
+
+        # Text/document validation
+        elif media_content.media_type in [MediaType.TEXT, MediaType.DOCUMENT]:
+            # All local models support text
+            return True
+
+        return False
+
+    def estimate_tokens_for_media(self, media_content: MediaContent) -> int:
+        """
+        Estimate token usage for media content with local models.
+
+        Args:
+            media_content: MediaContent to estimate
+
+        Returns:
+            Estimated token count
+        """
+        if media_content.media_type == MediaType.IMAGE:
+            # Local vision models typically use fewer tokens than cloud models
+            # but this varies significantly by model architecture
+            return 512  # Conservative estimate
+
+        elif media_content.media_type in [MediaType.TEXT, MediaType.DOCUMENT]:
+            # Local models typically use similar tokenization to their base models
+            content_length = len(str(media_content.content))
+            return content_length // 4  # Rough estimate
+
+        return 0
+
+    def get_model_media_limits(self, model: str) -> Dict[str, Any]:
+        """
+        Get media-specific limits for local model.
+
+        Args:
+            model: Local model name
+
+        Returns:
+            Dictionary of limits
+        """
+        limits = {
+            'max_images_per_message': 1,  # Most local models support only 1 image
+            'max_image_size_bytes': self.max_image_size,
+            'supported_image_formats': ['png', 'jpeg', 'jpg'],
+            'prefers_text_extraction': self.prefer_text_extraction
+        }
+
+        model_lower = model.lower()
+
+        # Adjust limits based on known model capabilities
+        if 'qwen' in model_lower and ('vl' in model_lower or 'vision' in model_lower):
+            limits.update({
+                'max_images_per_message': 5,  # Qwen-VL models can handle multiple images
+                'supported_image_formats': ['png', 'jpeg', 'jpg', 'gif', 'bmp']
+            })
+
+        return limits
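
For orientation, a minimal usage sketch of the new handler (illustrative only, not part of the packaged diff). It assumes that BaseProviderMediaHandler accepts a plain capabilities dict and exposes it as self.capabilities with a vision_support flag, and that MediaContent takes the fields referenced in the handler as keyword arguments; neither is defined in this file, so the construction details below are assumptions.

# Illustrative sketch -- not part of the 2.4.6 diff above.
from abstractcore.media.handlers.local_handler import LocalMediaHandler
from abstractcore.media.types import MediaContent, MediaType, ContentFormat

# Assumption: the base class turns this dict into self.capabilities.vision_support.
handler = LocalMediaHandler(
    provider_name="ollama",
    model_capabilities={"vision_support": True},
    model_name="qwen2.5vl:7b",  # used for model-level vision detection
)

# Assumption: MediaContent accepts these fields as keyword arguments.
image = MediaContent(
    media_type=MediaType.IMAGE,
    content="<base64-encoded PNG>",  # the Ollama path expects raw base64, no data URL prefix
    content_format=ContentFormat.BASE64,
    mime_type="image/png",
    metadata={"file_name": "chart.png", "file_size": 123_456},
)

# When both provider- and model-level vision support are detected, this returns the
# structured Ollama message: {"role": "user", "content": "...", "images": ["<base64>"]}.
# Otherwise the handler falls back to a text-embedded string message.
message = handler.create_multimodal_message("What's in this image?", [image])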