abstractcore 2.4.2__py3-none-any.whl → 2.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractcore/apps/app_config_utils.py +19 -0
- abstractcore/apps/summarizer.py +85 -56
- abstractcore/architectures/detection.py +15 -4
- abstractcore/assets/architecture_formats.json +1 -1
- abstractcore/assets/model_capabilities.json +420 -11
- abstractcore/core/interface.py +2 -0
- abstractcore/core/session.py +4 -0
- abstractcore/embeddings/manager.py +54 -16
- abstractcore/media/__init__.py +116 -148
- abstractcore/media/auto_handler.py +363 -0
- abstractcore/media/base.py +456 -0
- abstractcore/media/capabilities.py +335 -0
- abstractcore/media/types.py +300 -0
- abstractcore/media/vision_fallback.py +260 -0
- abstractcore/providers/anthropic_provider.py +18 -1
- abstractcore/providers/base.py +187 -0
- abstractcore/providers/huggingface_provider.py +111 -12
- abstractcore/providers/lmstudio_provider.py +88 -5
- abstractcore/providers/mlx_provider.py +33 -1
- abstractcore/providers/ollama_provider.py +37 -3
- abstractcore/providers/openai_provider.py +18 -1
- abstractcore/server/app.py +1390 -104
- abstractcore/tools/common_tools.py +12 -8
- abstractcore/utils/__init__.py +9 -5
- abstractcore/utils/cli.py +199 -17
- abstractcore/utils/message_preprocessor.py +182 -0
- abstractcore/utils/structured_logging.py +117 -16
- abstractcore/utils/version.py +1 -1
- {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/METADATA +214 -20
- {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/RECORD +34 -27
- {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/entry_points.txt +1 -0
- {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/WHEEL +0 -0
- {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/top_level.txt +0 -0
abstractcore/media/__init__.py
CHANGED
|
@@ -1,151 +1,119 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Media
|
|
2
|
+
AbstractCore Media Handler System.
|
|
3
|
+
|
|
4
|
+
This module provides unified media handling capabilities across all providers,
|
|
5
|
+
supporting various file types including images, documents, audio, and video.
|
|
6
|
+
|
|
7
|
+
The system follows AbstractCore's proven architectural patterns:
|
|
8
|
+
- Interface → Base → Provider-Specific implementations
|
|
9
|
+
- Capability detection using model_capabilities.json
|
|
10
|
+
- Unified API across all providers
|
|
3
11
|
"""
|
|
4
12
|
|
|
5
|
-
|
|
6
|
-
from
|
|
7
|
-
from
|
|
8
|
-
from
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
Check if a file is an image.
|
|
113
|
-
|
|
114
|
-
Args:
|
|
115
|
-
path: Path to check
|
|
116
|
-
|
|
117
|
-
Returns:
|
|
118
|
-
True if the file is an image
|
|
119
|
-
"""
|
|
120
|
-
image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.ico', '.tiff'}
|
|
121
|
-
return Path(path).suffix.lower() in image_extensions
|
|
122
|
-
|
|
123
|
-
@staticmethod
def get_media_type(path: Union[str, Path]) -> MediaType:
    """
    Classify a file by its extension.

    Only the lower-cased suffix of ``path`` is examined; the file itself is
    never opened.

    Args:
        path: Path to the file

    Returns:
        MediaType.IMAGE, MediaType.AUDIO or MediaType.VIDEO for the listed
        suffixes, and MediaType.DOCUMENT for document suffixes as well as
        for any suffix that is not listed
    """
    suffix = Path(path).suffix.lower()

    # Checked in order; the first group containing the suffix wins.
    suffix_groups = (
        (MediaType.IMAGE, {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'}),
        (MediaType.AUDIO, {'.mp3', '.wav', '.m4a', '.ogg', '.flac'}),
        (MediaType.VIDEO, {'.mp4', '.avi', '.mov', '.mkv', '.webm'}),
        (MediaType.DOCUMENT, {'.pdf', '.doc', '.docx', '.txt', '.md'}),
    )
    for kind, suffixes in suffix_groups:
        if suffix in suffixes:
            return kind

    # Unknown suffixes are treated as documents.
    return MediaType.DOCUMENT
|
|
13
|
+
# Core types and base classes
|
|
14
|
+
from .base import BaseMediaHandler, BaseProviderMediaHandler
|
|
15
|
+
from .types import MediaContent, MediaType, ContentFormat, MultimodalMessage
|
|
16
|
+
from .auto_handler import AutoMediaHandler
|
|
17
|
+
|
|
18
|
+
# Media processing capabilities
|
|
19
|
+
from .capabilities import (
|
|
20
|
+
MediaCapabilities,
|
|
21
|
+
get_media_capabilities,
|
|
22
|
+
is_vision_model,
|
|
23
|
+
is_multimodal_model,
|
|
24
|
+
get_supported_media_types,
|
|
25
|
+
supports_images,
|
|
26
|
+
supports_documents,
|
|
27
|
+
get_max_images,
|
|
28
|
+
should_use_text_embedding
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
# Processors for different file types
|
|
32
|
+
from .processors import ImageProcessor, TextProcessor, PDFProcessor, OfficeProcessor
|
|
33
|
+
|
|
34
|
+
# Provider-specific handlers
|
|
35
|
+
from .handlers import OpenAIMediaHandler, AnthropicMediaHandler, LocalMediaHandler
|
|
36
|
+
|
|
37
|
+
# Default media handler - automatically selects appropriate processor
|
|
38
|
+
class MediaHandler(AutoMediaHandler):
    """
    Default media handler with automatic processor selection.

    This subclass defines no attributes or methods of its own; file type
    detection and processor selection are inherited unchanged from
    AutoMediaHandler.
    """
|
|
46
|
+
|
|
47
|
+
# Convenience functions for common operations
|
|
48
|
+
def process_file(file_path: str) -> MediaContent:
    """
    Run one file through a freshly created AutoMediaHandler.

    Args:
        file_path: Path to the file to process

    Returns:
        The ``media_content`` of the handler's result

    Raises:
        MediaProcessingError: When the handler's result is not marked as
            successful; the exception carries the result's error message
    """
    outcome = AutoMediaHandler().process_file(file_path)

    if not outcome.success:
        # Imported only on the failure path, as in the original code.
        from .base import MediaProcessingError
        raise MediaProcessingError(outcome.error_message)

    return outcome.media_content
|
|
65
|
+
|
|
66
|
+
def get_media_type_from_path(file_path: str) -> MediaType:
    """
    Look up the media type for a file path.

    The path is wrapped in a ``pathlib.Path`` and handed to
    ``detect_media_type`` from the ``types`` submodule.

    Args:
        file_path: Path to the file

    Returns:
        MediaType enum value reported by ``detect_media_type``
    """
    from pathlib import Path
    from .types import detect_media_type

    as_path = Path(file_path)
    return detect_media_type(as_path)
|
|
79
|
+
|
|
80
|
+
# Export all public components
|
|
81
|
+
# Names exported by ``from abstractcore.media import *``. Every entry is
# either imported or defined earlier in this module.
__all__ = [
    # Core data types (from .types)
    'MediaContent',
    'MediaType',
    'ContentFormat',
    'MultimodalMessage',

    # Handler base classes and the automatic handler
    'BaseMediaHandler',
    'BaseProviderMediaHandler',
    'AutoMediaHandler',

    # Capability detection helpers (from .capabilities)
    'MediaCapabilities',
    'get_media_capabilities',
    'is_vision_model',
    'is_multimodal_model',
    'get_supported_media_types',
    'supports_images',
    'supports_documents',
    'get_max_images',
    'should_use_text_embedding',

    # File-type processors (from .processors)
    'ImageProcessor',
    'TextProcessor',
    'PDFProcessor',
    'OfficeProcessor',

    # Provider-specific handlers (from .handlers)
    'OpenAIMediaHandler',
    'AnthropicMediaHandler',
    'LocalMediaHandler',

    # Default handler class and module-level convenience functions
    'MediaHandler',
    'process_file',
    'get_media_type_from_path'
]
|
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Automatic media handler that selects the appropriate processor for each file type.
|
|
3
|
+
|
|
4
|
+
This module provides a unified interface that automatically chooses the best
|
|
5
|
+
processor (ImageProcessor, TextProcessor, PDFProcessor, or OfficeProcessor)
|
|
6
|
+
based on the file type and content.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Dict, Any, Optional, List
|
|
12
|
+
|
|
13
|
+
from .base import BaseMediaHandler
|
|
14
|
+
from .types import MediaContent, MediaType, ContentFormat, detect_media_type
|
|
15
|
+
from .processors import ImageProcessor, TextProcessor, PDFProcessor, OfficeProcessor
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class AutoMediaHandler(BaseMediaHandler):
    """
    Automatic media handler that delegates to specialized processors.

    For each file the handler looks at the detected media type and the file
    extension and picks ImageProcessor, TextProcessor, PDFProcessor or
    OfficeProcessor. Processor instances are created lazily on first use.
    When no processor can be selected, or the selected processor raises, a
    plain-text fallback MediaContent is returned instead.
    """

    # Extensions (without the leading dot) shared by supports_format() and
    # get_supported_formats() so the two methods cannot drift apart.
    _IMAGE_FORMATS = ('jpg', 'jpeg', 'png', 'gif', 'bmp', 'tiff', 'webp')
    _TEXT_FORMATS = ('txt', 'md', 'csv', 'tsv', 'json', 'yaml', 'yml')
    _OFFICE_FORMATS = ('docx', 'xlsx', 'pptx')

    # Document suffixes (with the dot) that the fallback path reads verbatim.
    _FALLBACK_READABLE_SUFFIXES = frozenset({'.txt', '.md', '.csv', '.tsv'})

    def __init__(self, **kwargs):
        """
        Initialize the auto media handler.

        Args:
            **kwargs: Configuration parameters; forwarded to the base class
                constructor and, later, to every processor that is created
        """
        super().__init__(**kwargs)

        # Configuration handed to each processor when it is first created
        self.processor_config = kwargs

        # Processors are created lazily (only when needed)
        self._image_processor = None
        self._text_processor = None
        self._pdf_processor = None
        self._office_processor = None

        # Maps processor name -> whether its dependency can be imported
        self._available_processors = self._check_processor_availability()

        # Log only the processors that are really usable; the mapping's keys
        # alone would always list all four.
        self.logger.debug(f"AutoMediaHandler initialized with processors: {self._available_processor_names()}")

    @staticmethod
    def _is_importable(module_name: str) -> bool:
        """Return True if ``module_name`` can be imported, False on ImportError."""
        import importlib

        try:
            importlib.import_module(module_name)
        except ImportError:
            return False
        return True

    def _available_processor_names(self) -> List[str]:
        """Return the names of the processors whose availability flag is true."""
        return [name for name, usable in self._available_processors.items() if usable]

    def _check_processor_availability(self) -> Dict[str, bool]:
        """
        Check which processors are available.

        Returns:
            Dictionary with the keys 'image', 'text', 'pdf' and 'office'
            mapped to whether the corresponding dependency imports cleanly
        """
        return {
            # ImageProcessor (requires PIL)
            'image': self._is_importable('PIL.Image'),
            # TextProcessor (always available - uses built-in libraries)
            'text': True,
            # PDFProcessor (requires PyMuPDF4LLM)
            'pdf': self._is_importable('pymupdf4llm'),
            # OfficeProcessor (requires unstructured)
            'office': self._is_importable('unstructured'),
        }

    def _get_image_processor(self) -> ImageProcessor:
        """Get or create ImageProcessor instance."""
        if self._image_processor is None:
            self._image_processor = ImageProcessor(**self.processor_config)
        return self._image_processor

    def _get_text_processor(self) -> TextProcessor:
        """Get or create TextProcessor instance."""
        if self._text_processor is None:
            self._text_processor = TextProcessor(**self.processor_config)
        return self._text_processor

    def _get_pdf_processor(self) -> PDFProcessor:
        """Get or create PDFProcessor instance."""
        if self._pdf_processor is None:
            self._pdf_processor = PDFProcessor(**self.processor_config)
        return self._pdf_processor

    def _get_office_processor(self) -> OfficeProcessor:
        """Get or create OfficeProcessor instance."""
        if self._office_processor is None:
            self._office_processor = OfficeProcessor(**self.processor_config)
        return self._office_processor

    def _select_processor(self, file_path: Path, media_type: MediaType) -> Optional[BaseMediaHandler]:
        """
        Select the appropriate processor for the file.

        Args:
            file_path: Path to the file
            media_type: Detected media type

        Returns:
            Appropriate processor instance, or None for images without PIL
            and for media types other than image, text and document
        """
        file_extension = file_path.suffix.lower()

        # Handle images
        if media_type == MediaType.IMAGE:
            if self._available_processors.get('image', False):
                return self._get_image_processor()
            self.logger.warning("Image processing requested but PIL not available")
            return None

        # Handle text files
        if media_type == MediaType.TEXT:
            return self._get_text_processor()

        # Handle documents
        if media_type == MediaType.DOCUMENT:
            # PDF files
            if file_extension == '.pdf':
                if self._available_processors.get('pdf', False):
                    return self._get_pdf_processor()
                self.logger.warning("PDF processing requested but PyMuPDF4LLM not available")
                # Fall back to text processor for basic extraction
                return self._get_text_processor()

            # Office documents
            if file_extension[1:] in self._OFFICE_FORMATS:
                if self._available_processors.get('office', False):
                    return self._get_office_processor()
                self.logger.warning(f"Office document processing requested but unstructured library not available for {file_extension}")
                # Fall back to text processor (limited functionality)
                return self._get_text_processor()

            # Text-based documents
            return self._get_text_processor()

        # Handle other media types (audio, video) - not yet implemented
        self.logger.warning(f"Media type {media_type.value} not yet supported")
        return None

    def _process_internal(self, file_path: Path, media_type: MediaType, **kwargs) -> MediaContent:
        """
        Internal processing that delegates to the appropriate processor.

        Args:
            file_path: Path to the file to process
            media_type: Detected media type
            **kwargs: Additional processing parameters passed to the processor

        Returns:
            MediaContent from the selected processor, or fallback content
            when no processor was selected or the processor raised
        """
        processor = self._select_processor(file_path, media_type)

        if processor is None:
            # Create a basic text representation as fallback
            return self._create_fallback_content(file_path, media_type)

        # Delegate to the selected processor
        try:
            return processor._process_internal(file_path, media_type, **kwargs)
        except Exception as e:
            # Deliberately broad: any processor failure degrades to fallback
            # content instead of propagating to the caller.
            self.logger.error(f"Processor {processor.__class__.__name__} failed for {file_path}: {e}")
            return self._create_fallback_content(file_path, media_type)

    def _create_fallback_content(self, file_path: Path, media_type: MediaType) -> MediaContent:
        """
        Create fallback content when processors are not available.

        Plain-text document types are read as UTF-8; everything else becomes
        a one-line placeholder naming the media type and the file.

        Args:
            file_path: Path to the file
            media_type: Media type

        Returns:
            Basic MediaContent object in text format
        """
        file_extension = file_path.suffix.lower()

        # Try to read as text for document types
        if media_type == MediaType.DOCUMENT and file_extension in self._FALLBACK_READABLE_SUFFIXES:
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
            except Exception as e:
                # Best effort: keep going with a placeholder, but leave a
                # trace of why the file could not be read.
                self.logger.debug(f"Fallback read failed for {file_path}: {e}")
                content = f"[Unable to read file: {file_path.name}]"
        else:
            # For other types, create a placeholder
            content = f"[{media_type.value.title()}: {file_path.name}]"

        return self._create_media_content(
            content=content,
            file_path=file_path,
            media_type=media_type,
            content_format=ContentFormat.TEXT,
            mime_type="auto",
            fallback_processing=True,
            # Only processors that can really be used, not every known name
            available_processors=self._available_processor_names()
        )

    def supports_media_type(self, media_type: MediaType) -> bool:
        """
        Check if this handler supports the given media type.

        Args:
            media_type: MediaType to check

        Returns:
            True if any processor can handle this type
        """
        if media_type == MediaType.IMAGE:
            return self._available_processors.get('image', False)
        if media_type in (MediaType.TEXT, MediaType.DOCUMENT):
            return True  # Always supported via text processor at minimum
        # Audio, video and anything else: not yet implemented
        return False

    def supports_format(self, media_type: MediaType, format_ext: str) -> bool:
        """
        Check if this handler supports the specific format.

        Args:
            media_type: MediaType of the content
            format_ext: File extension; case-insensitive, a leading dot is
                tolerated

        Returns:
            True if supported
        """
        fmt = format_ext.lower().lstrip('.')

        if media_type == MediaType.IMAGE:
            return self._available_processors.get('image', False) and fmt in self._IMAGE_FORMATS

        if media_type == MediaType.TEXT:
            # Text formats (always available)
            return fmt in self._TEXT_FORMATS

        if media_type == MediaType.DOCUMENT:
            # PDF and office documents are always accepted: without their
            # dedicated processor they fall back to the text processor.
            if fmt == 'pdf' or fmt in self._OFFICE_FORMATS:
                return True
            # Text document support (always available)
            return fmt in self._TEXT_FORMATS

        return False

    def get_supported_formats(self) -> Dict[str, List[str]]:
        """
        Get supported formats organized by media type.

        Returns:
            Dictionary mapping media type to list of supported extensions;
            'image' is present only when the image processor is available,
            and 'pdf' / office extensions are listed only when their
            dedicated processor is available
        """
        formats = {}

        # Image formats
        if self._available_processors.get('image', False):
            formats['image'] = list(self._IMAGE_FORMATS)

        # Document formats
        doc_formats = list(self._TEXT_FORMATS)

        if self._available_processors.get('pdf', False):
            doc_formats.append('pdf')

        if self._available_processors.get('office', False):
            doc_formats.extend(self._OFFICE_FORMATS)

        formats['document'] = doc_formats

        return formats

    def get_processor_info(self) -> Dict[str, Any]:
        """
        Get information about available processors and their capabilities.

        Note that this creates every available processor, because each one
        is asked for its own processing info.

        Returns:
            Dictionary with processor information
        """
        info = {
            'handler_type': 'AutoMediaHandler',
            'available_processors': self._available_processors.copy(),
            'supported_formats': self.get_supported_formats(),
            'capabilities': {
                'images': self._available_processors.get('image', False),
                'pdf_documents': self._available_processors.get('pdf', False),
                'office_documents': self._available_processors.get('office', False),
                'text_documents': True,
                'automatic_selection': True,
                'fallback_processing': True
            }
        }

        # Add processor-specific information
        if self._available_processors.get('image', False):
            info['image_processor'] = self._get_image_processor().get_processing_info()

        if self._available_processors.get('pdf', False):
            info['pdf_processor'] = self._get_pdf_processor().get_processing_info()

        if self._available_processors.get('office', False):
            info['office_processor'] = self._get_office_processor().get_processing_info()

        info['text_processor'] = self._get_text_processor().get_processing_info()

        return info

    def estimate_processing_time(self, file_path: Path) -> float:
        """
        Estimate processing time for a file.

        Args:
            file_path: Path to the file (a plain string is accepted too)

        Returns:
            Estimated processing time in seconds; 0.0 if the file does not
            exist
        """
        file_path = Path(file_path)

        if not file_path.exists():
            return 0.0

        media_type = detect_media_type(file_path)
        processor = self._select_processor(file_path, media_type)

        if processor is not None and hasattr(processor, 'estimate_processing_time'):
            return processor.estimate_processing_time(file_path)

        # Basic estimation based on file size
        file_size_mb = file_path.stat().st_size / (1024 * 1024)
        return max(0.1, file_size_mb / 10.0)  # ~10MB/second processing rate
|