abstractcore 2.5.2__py3-none-any.whl → 2.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractcore/__init__.py +12 -0
- abstractcore/architectures/detection.py +250 -4
- abstractcore/assets/architecture_formats.json +14 -1
- abstractcore/assets/model_capabilities.json +533 -10
- abstractcore/compression/__init__.py +29 -0
- abstractcore/compression/analytics.py +420 -0
- abstractcore/compression/cache.py +250 -0
- abstractcore/compression/config.py +279 -0
- abstractcore/compression/exceptions.py +30 -0
- abstractcore/compression/glyph_processor.py +381 -0
- abstractcore/compression/optimizer.py +388 -0
- abstractcore/compression/orchestrator.py +380 -0
- abstractcore/compression/pil_text_renderer.py +818 -0
- abstractcore/compression/quality.py +226 -0
- abstractcore/compression/text_formatter.py +666 -0
- abstractcore/compression/vision_compressor.py +371 -0
- abstractcore/config/main.py +64 -0
- abstractcore/config/manager.py +100 -5
- abstractcore/core/session.py +61 -6
- abstractcore/events/__init__.py +1 -1
- abstractcore/media/auto_handler.py +312 -18
- abstractcore/media/handlers/local_handler.py +14 -2
- abstractcore/media/handlers/openai_handler.py +62 -3
- abstractcore/media/processors/__init__.py +11 -1
- abstractcore/media/processors/direct_pdf_processor.py +210 -0
- abstractcore/media/processors/glyph_pdf_processor.py +227 -0
- abstractcore/media/processors/image_processor.py +7 -1
- abstractcore/media/processors/text_processor.py +18 -3
- abstractcore/media/types.py +164 -7
- abstractcore/providers/__init__.py +18 -0
- abstractcore/providers/anthropic_provider.py +28 -2
- abstractcore/providers/base.py +278 -6
- abstractcore/providers/huggingface_provider.py +563 -23
- abstractcore/providers/lmstudio_provider.py +38 -2
- abstractcore/providers/mlx_provider.py +27 -2
- abstractcore/providers/model_capabilities.py +352 -0
- abstractcore/providers/ollama_provider.py +38 -4
- abstractcore/providers/openai_provider.py +28 -2
- abstractcore/providers/registry.py +85 -13
- abstractcore/server/app.py +91 -81
- abstractcore/utils/__init__.py +4 -1
- abstractcore/utils/trace_export.py +287 -0
- abstractcore/utils/version.py +1 -1
- abstractcore/utils/vlm_token_calculator.py +655 -0
- {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/METADATA +107 -6
- {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/RECORD +50 -33
- {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/WHEEL +0 -0
- {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/entry_points.txt +0 -0
- {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,380 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Compression orchestrator for intelligent decision-making.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import Optional, Union, Dict, Any, List
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from ..utils.token_utils import TokenUtils
|
|
9
|
+
from ..utils.structured_logging import get_logger
|
|
10
|
+
from .config import GlyphConfig
|
|
11
|
+
# Import GlyphProcessor lazily to avoid circular imports
|
|
12
|
+
from .exceptions import CompressionError
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class CompressionOrchestrator:
    """Intelligent decision-making for when and how to apply Glyph compression.

    Decides whether a piece of text (or a text file) should be rendered to
    images via the Glyph pipeline before being sent to a model, based on the
    user's preference, the provider/model's vision capability and context
    window, and the size/character of the content itself.
    """

    def __init__(self, config: Optional["GlyphConfig"] = None):
        """
        Initialize compression orchestrator.

        Args:
            config: Glyph configuration; when omitted, the global
                AbstractCore configuration is loaded.
        """
        self.config = config or GlyphConfig.from_abstractcore_config()
        self.logger = get_logger(self.__class__.__name__)
        self.glyph_processor = None  # Lazy initialization (see _get_glyph_processor)

        self.logger.debug("CompressionOrchestrator initialized")

    def _get_glyph_processor(self):
        """Get or create the Glyph processor instance."""
        if self.glyph_processor is None:
            # Lazy import to avoid circular dependency
            from .glyph_processor import GlyphProcessor
            self.glyph_processor = GlyphProcessor(self.config)
        return self.glyph_processor

    @staticmethod
    def _load_text(content: Union[str, Path]) -> str:
        """Return *content* as a string, reading it from disk when it is a Path."""
        if isinstance(content, Path):
            with open(content, 'r', encoding='utf-8') as f:
                return f.read()
        return content

    def should_compress(
        self,
        content: Union[str, Path],
        provider: str,
        model: str,
        user_preference: str = "auto"
    ) -> bool:
        """
        Intelligent compression decision based on multiple factors.

        Args:
            content: Text content or file path
            provider: Provider name
            model: Model name
            user_preference: User compression preference (auto, always, never)

        Returns:
            True if compression should be applied
        """
        # Check user preference first
        if user_preference == "never":
            return False
        elif user_preference == "always":
            # "always" still requires technical feasibility.
            return self._can_compress(content, provider, model)

        # Auto-decision logic
        try:
            text_content = self._load_text(content)

            # Basic feasibility checks
            if not self._can_compress(text_content, provider, model):
                return False

            # Token-based decision
            token_count = TokenUtils.estimate_tokens(text_content, model)
            model_context = self._get_model_context_window(provider, model)

            # Decision matrix based on Glyph research
            if token_count < self.config.min_token_threshold:
                return False  # Too small to benefit
            elif token_count > model_context * 0.8:
                return True  # Necessary if approaching limits
            elif token_count > 50000:
                return True  # Beneficial for large texts
            else:
                return False  # Standard processing sufficient

        except Exception as e:
            self.logger.warning(f"Compression decision failed, defaulting to False: {e}")
            return False

    def _can_compress(self, content: Union[str, Path], provider: str, model: str) -> bool:
        """Check if compression is technically feasible.

        Requires compression to be enabled, the provider/model to support
        vision input, and the content (file or text) to be a suitable type.
        Any internal error is treated as "not feasible".
        """
        try:
            # Check if compression is enabled
            if not self.config.enabled:
                return False

            # Check provider vision support
            if not self._supports_vision(provider, model):
                return False

            # Check content type suitability
            if isinstance(content, Path):
                if not self._is_compressible_file(content):
                    return False
            else:
                if not self._is_compressible_text(content):
                    return False

            return True

        except Exception:
            return False

    def _supports_vision(self, provider: str, model: str) -> bool:
        """Check if provider/model supports vision.

        Falls back to a conservative provider-name heuristic when the
        capabilities registry is unavailable.
        """
        try:
            from ..media.capabilities import get_model_capabilities
            capabilities = get_model_capabilities(provider, model)
            return capabilities.get('vision_support', False)
        except Exception:
            # Conservative approach for unknown providers
            vision_providers = ['openai', 'anthropic', 'ollama', 'lmstudio']
            return any(vp in provider.lower() for vp in vision_providers)

    def _get_model_context_window(self, provider: str, model: str) -> int:
        """Get model context window size (tokens), with per-provider fallbacks."""
        try:
            from ..media.capabilities import get_model_capabilities
            capabilities = get_model_capabilities(provider, model)
            return capabilities.get('context_window', 32000)
        except Exception:
            # Default context windows by provider
            defaults = {
                'openai': 128000,
                'anthropic': 200000,
                'ollama': 32000,
                'lmstudio': 32000,
                'mlx': 32000,
                'huggingface': 32000
            }
            # Normalize case so "OpenAI" matches, consistent with _supports_vision.
            return defaults.get(provider.lower(), 32000)

    def _is_compressible_file(self, file_path: Path) -> bool:
        """Check if file type is suitable for compression.

        The file must exist, carry a plain-text extension, and be at most
        10MB (rendering very large files is not worthwhile).
        """
        if not file_path.exists():
            return False

        # Check file extension
        compressible_extensions = {'.txt', '.md', '.csv', '.tsv', '.json', '.yaml', '.yml'}
        if file_path.suffix.lower() not in compressible_extensions:
            return False

        # Check file size (avoid very large files)
        try:
            file_size = file_path.stat().st_size
            if file_size > 10 * 1024 * 1024:  # 10MB limit
                return False
        except Exception:
            return False

        return True

    def _is_compressible_text(self, text: str) -> bool:
        """Check if text content is suitable for compression.

        Rejects trivially small text, math-heavy text (hard for OCR), and
        text dominated by special characters.
        """
        if not text or len(text.strip()) < 100:
            return False

        # Check for problematic content types
        # Mathematical notation (challenging for OCR)
        math_indicators = ['∑', '∫', '∂', '√', '±', '≤', '≥', '≠', '∞']
        # Count occurrences, not merely distinct symbols: a long document
        # saturated with one math symbol must still be rejected.
        math_count = sum(text.count(indicator) for indicator in math_indicators)
        if math_count > len(text) * 0.01:  # >1% mathematical symbols
            return False

        # Very dense special characters
        special_chars = sum(1 for c in text if not c.isalnum() and not c.isspace())
        if special_chars > len(text) * 0.5:  # >50% special characters
            return False

        return True

    def compress_content(
        self,
        content: Union[str, Path],
        provider: str,
        model: str,
        user_preference: str = "auto"
    ) -> Optional[List[Any]]:
        """
        Compress content if beneficial.

        Args:
            content: Text content or file path
            provider: Provider name
            model: Model name
            user_preference: User compression preference

        Returns:
            List of MediaContent objects if compressed, None if not compressed

        Raises:
            CompressionError: If compression was attempted but failed.
        """
        try:
            # Check if compression should be applied
            if not self.should_compress(content, provider, model, user_preference):
                return None

            text_content = self._load_text(content)

            # Apply compression
            processor = self._get_glyph_processor()
            compressed_content = processor.process_text(text_content, provider, model, user_preference)

            self.logger.info(f"Content compressed successfully: {len(compressed_content)} images")
            return compressed_content

        except Exception as e:
            self.logger.error(f"Content compression failed: {e}")
            raise CompressionError(f"Compression failed: {e}") from e

    def get_compression_recommendation(
        self,
        content: Union[str, Path],
        provider: str,
        model: str
    ) -> Dict[str, Any]:
        """
        Get detailed compression recommendation.

        Args:
            content: Text content or file path
            provider: Provider name
            model: Model name

        Returns:
            Dictionary with recommendation details (decision, content and
            provider analysis, savings estimate, human-readable reason).
            On analysis failure, returns a dict with 'should_compress': False
            and an 'error' key instead of raising.
        """
        try:
            # Get content as string
            if isinstance(content, Path):
                text_content = self._load_text(content)
                content_source = "file"
            else:
                text_content = content
                content_source = "text"

            # Analyze content
            token_count = TokenUtils.estimate_tokens(text_content, model)
            model_context = self._get_model_context_window(provider, model)
            supports_vision = self._supports_vision(provider, model)
            is_compressible = self._is_compressible_text(text_content)

            # Calculate potential benefits
            estimated_compression_ratio = self._estimate_compression_ratio(text_content, provider)
            estimated_savings = token_count - (token_count / estimated_compression_ratio)

            # Make recommendation
            should_compress = self.should_compress(text_content, provider, model, "auto")

            recommendation = {
                'should_compress': should_compress,
                'content_analysis': {
                    'source': content_source,
                    'length_chars': len(text_content),
                    'estimated_tokens': token_count,
                    'is_compressible': is_compressible
                },
                'provider_analysis': {
                    'provider': provider,
                    'model': model,
                    'supports_vision': supports_vision,
                    'context_window': model_context,
                    'utilization': token_count / model_context if model_context > 0 else 0
                },
                'compression_estimate': {
                    'estimated_ratio': estimated_compression_ratio,
                    'estimated_token_savings': int(estimated_savings),
                    'estimated_cost_savings': self._estimate_cost_savings(estimated_savings, provider)
                },
                'recommendation_reason': self._get_recommendation_reason(
                    should_compress, token_count, model_context, supports_vision, is_compressible
                )
            }

            return recommendation

        except Exception as e:
            self.logger.error(f"Failed to generate compression recommendation: {e}")
            return {
                'should_compress': False,
                'error': str(e),
                'recommendation_reason': f"Analysis failed: {e}"
            }

    def _estimate_compression_ratio(self, text: str, provider: str) -> float:
        """Estimate compression ratio based on content and provider."""
        base_ratio = 3.0  # Default from Glyph research

        # Adjust based on content type
        if self._is_code_content(text):
            base_ratio *= 0.8  # Code compresses less well
        elif self._is_prose_content(text):
            base_ratio *= 1.1  # Prose compresses better

        # Adjust based on provider OCR quality
        provider_multipliers = {
            'openai': 1.1,       # Excellent OCR
            'anthropic': 1.0,    # Good OCR
            'ollama': 0.9,       # Variable OCR
            'lmstudio': 0.9,     # Variable OCR
            'mlx': 0.8,          # Limited OCR
            'huggingface': 0.8   # Variable OCR
        }

        # Case-insensitive lookup, consistent with _supports_vision.
        multiplier = provider_multipliers.get(provider.lower(), 0.9)
        return base_ratio * multiplier

    def _is_code_content(self, text: str) -> bool:
        """Check if content appears to be code (heuristic: >3 distinct code markers)."""
        code_indicators = ['def ', 'class ', 'import ', 'function', '{', '}', '#!/', 'var ', 'const ']
        return sum(1 for indicator in code_indicators if indicator in text) > 3

    def _is_prose_content(self, text: str) -> bool:
        """Check if content appears to be prose."""
        # Simple heuristic: high ratio of common words
        common_words = ['the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by']
        word_count = len(text.split())
        common_count = sum(1 for word in text.lower().split() if word in common_words)
        return word_count > 0 and (common_count / word_count) > 0.1

    def _estimate_cost_savings(self, token_savings: float, provider: str) -> float:
        """Estimate cost savings (USD) from token reduction."""
        # Rough cost estimates per 1K tokens (as of 2024)
        cost_per_1k = {
            'openai': 0.01,       # GPT-4o pricing
            'anthropic': 0.015,   # Claude pricing
            'ollama': 0.0,        # Local models
            'lmstudio': 0.0,      # Local models
            'mlx': 0.0,           # Local models
            'huggingface': 0.002  # API pricing
        }

        # Case-insensitive lookup, consistent with _supports_vision.
        rate = cost_per_1k.get(provider.lower(), 0.01)
        return (token_savings / 1000) * rate

    def _get_recommendation_reason(
        self,
        should_compress: bool,
        token_count: int,
        model_context: int,
        supports_vision: bool,
        is_compressible: bool
    ) -> str:
        """Get human-readable recommendation reason."""
        if not should_compress:
            if not supports_vision:
                return "Provider does not support vision processing"
            elif not is_compressible:
                return "Content type not suitable for visual compression"
            elif token_count < self.config.min_token_threshold:
                return f"Content too small ({token_count} tokens < {self.config.min_token_threshold} threshold)"
            else:
                return "Standard processing is sufficient for this content size"
        else:
            if token_count > model_context * 0.8:
                return f"Compression necessary - approaching context limit ({token_count}/{model_context} tokens)"
            elif token_count > 50000:
                return f"Compression beneficial - large content ({token_count} tokens) will benefit from 3-4x reduction"
            else:
                return "Compression recommended based on content analysis"