abstractcore 2.5.2__py3-none-any.whl → 2.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50):
  1. abstractcore/__init__.py +12 -0
  2. abstractcore/architectures/detection.py +250 -4
  3. abstractcore/assets/architecture_formats.json +14 -1
  4. abstractcore/assets/model_capabilities.json +533 -10
  5. abstractcore/compression/__init__.py +29 -0
  6. abstractcore/compression/analytics.py +420 -0
  7. abstractcore/compression/cache.py +250 -0
  8. abstractcore/compression/config.py +279 -0
  9. abstractcore/compression/exceptions.py +30 -0
  10. abstractcore/compression/glyph_processor.py +381 -0
  11. abstractcore/compression/optimizer.py +388 -0
  12. abstractcore/compression/orchestrator.py +380 -0
  13. abstractcore/compression/pil_text_renderer.py +818 -0
  14. abstractcore/compression/quality.py +226 -0
  15. abstractcore/compression/text_formatter.py +666 -0
  16. abstractcore/compression/vision_compressor.py +371 -0
  17. abstractcore/config/main.py +64 -0
  18. abstractcore/config/manager.py +100 -5
  19. abstractcore/core/session.py +61 -6
  20. abstractcore/events/__init__.py +1 -1
  21. abstractcore/media/auto_handler.py +312 -18
  22. abstractcore/media/handlers/local_handler.py +14 -2
  23. abstractcore/media/handlers/openai_handler.py +62 -3
  24. abstractcore/media/processors/__init__.py +11 -1
  25. abstractcore/media/processors/direct_pdf_processor.py +210 -0
  26. abstractcore/media/processors/glyph_pdf_processor.py +227 -0
  27. abstractcore/media/processors/image_processor.py +7 -1
  28. abstractcore/media/processors/text_processor.py +18 -3
  29. abstractcore/media/types.py +164 -7
  30. abstractcore/providers/__init__.py +18 -0
  31. abstractcore/providers/anthropic_provider.py +28 -2
  32. abstractcore/providers/base.py +278 -6
  33. abstractcore/providers/huggingface_provider.py +563 -23
  34. abstractcore/providers/lmstudio_provider.py +38 -2
  35. abstractcore/providers/mlx_provider.py +27 -2
  36. abstractcore/providers/model_capabilities.py +352 -0
  37. abstractcore/providers/ollama_provider.py +38 -4
  38. abstractcore/providers/openai_provider.py +28 -2
  39. abstractcore/providers/registry.py +85 -13
  40. abstractcore/server/app.py +91 -81
  41. abstractcore/utils/__init__.py +4 -1
  42. abstractcore/utils/trace_export.py +287 -0
  43. abstractcore/utils/version.py +1 -1
  44. abstractcore/utils/vlm_token_calculator.py +655 -0
  45. {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/METADATA +107 -6
  46. {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/RECORD +50 -33
  47. {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/WHEEL +0 -0
  48. {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/entry_points.txt +0 -0
  49. {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/licenses/LICENSE +0 -0
  50. {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,380 @@
1
+ """
2
+ Compression orchestrator for intelligent decision-making.
3
+ """
4
+
5
+ from typing import Optional, Union, Dict, Any, List
6
+ from pathlib import Path
7
+
8
+ from ..utils.token_utils import TokenUtils
9
+ from ..utils.structured_logging import get_logger
10
+ from .config import GlyphConfig
11
+ # Import GlyphProcessor lazily to avoid circular imports
12
+ from .exceptions import CompressionError
13
+
14
+
15
+ class CompressionOrchestrator:
16
+ """Intelligent decision-making for when and how to apply Glyph compression."""
17
+
18
+ def __init__(self, config: Optional[GlyphConfig] = None):
19
+ """
20
+ Initialize compression orchestrator.
21
+
22
+ Args:
23
+ config: Glyph configuration
24
+ """
25
+ self.config = config or GlyphConfig.from_abstractcore_config()
26
+ self.logger = get_logger(self.__class__.__name__)
27
+ self.glyph_processor = None # Lazy initialization
28
+
29
+ self.logger.debug("CompressionOrchestrator initialized")
30
+
31
+ def _get_glyph_processor(self):
32
+ """Get or create Glyph processor instance."""
33
+ if self.glyph_processor is None:
34
+ # Lazy import to avoid circular dependency
35
+ from .glyph_processor import GlyphProcessor
36
+ self.glyph_processor = GlyphProcessor(self.config)
37
+ return self.glyph_processor
38
+
39
+ def should_compress(
40
+ self,
41
+ content: Union[str, Path],
42
+ provider: str,
43
+ model: str,
44
+ user_preference: str = "auto"
45
+ ) -> bool:
46
+ """
47
+ Intelligent compression decision based on multiple factors.
48
+
49
+ Args:
50
+ content: Text content or file path
51
+ provider: Provider name
52
+ model: Model name
53
+ user_preference: User compression preference (auto, always, never)
54
+
55
+ Returns:
56
+ True if compression should be applied
57
+ """
58
+ # Check user preference first
59
+ if user_preference == "never":
60
+ return False
61
+ elif user_preference == "always":
62
+ return self._can_compress(content, provider, model)
63
+
64
+ # Auto-decision logic
65
+ try:
66
+ # Get content as string
67
+ if isinstance(content, Path):
68
+ with open(content, 'r', encoding='utf-8') as f:
69
+ text_content = f.read()
70
+ else:
71
+ text_content = content
72
+
73
+ # Basic feasibility checks
74
+ if not self._can_compress(text_content, provider, model):
75
+ return False
76
+
77
+ # Token-based decision
78
+ token_count = TokenUtils.estimate_tokens(text_content, model)
79
+ model_context = self._get_model_context_window(provider, model)
80
+
81
+ # Decision matrix based on Glyph research
82
+ if token_count < self.config.min_token_threshold:
83
+ return False # Too small to benefit
84
+ elif token_count > model_context * 0.8:
85
+ return True # Necessary if approaching limits
86
+ elif token_count > 50000:
87
+ return True # Beneficial for large texts
88
+ else:
89
+ return False # Standard processing sufficient
90
+
91
+ except Exception as e:
92
+ self.logger.warning(f"Compression decision failed, defaulting to False: {e}")
93
+ return False
94
+
95
+ def _can_compress(self, content: Union[str, Path], provider: str, model: str) -> bool:
96
+ """Check if compression is technically feasible."""
97
+ try:
98
+ # Check if compression is enabled
99
+ if not self.config.enabled:
100
+ return False
101
+
102
+ # Check provider vision support
103
+ if not self._supports_vision(provider, model):
104
+ return False
105
+
106
+ # Check content type suitability
107
+ if isinstance(content, Path):
108
+ if not self._is_compressible_file(content):
109
+ return False
110
+ else:
111
+ if not self._is_compressible_text(content):
112
+ return False
113
+
114
+ return True
115
+
116
+ except Exception:
117
+ return False
118
+
119
+ def _supports_vision(self, provider: str, model: str) -> bool:
120
+ """Check if provider/model supports vision."""
121
+ try:
122
+ from ..media.capabilities import get_model_capabilities
123
+ capabilities = get_model_capabilities(provider, model)
124
+ return capabilities.get('vision_support', False)
125
+ except Exception:
126
+ # Conservative approach for unknown providers
127
+ vision_providers = ['openai', 'anthropic', 'ollama', 'lmstudio']
128
+ return any(vp in provider.lower() for vp in vision_providers)
129
+
130
+ def _get_model_context_window(self, provider: str, model: str) -> int:
131
+ """Get model context window size."""
132
+ try:
133
+ from ..media.capabilities import get_model_capabilities
134
+ capabilities = get_model_capabilities(provider, model)
135
+ return capabilities.get('context_window', 32000)
136
+ except Exception:
137
+ # Default context windows by provider
138
+ defaults = {
139
+ 'openai': 128000,
140
+ 'anthropic': 200000,
141
+ 'ollama': 32000,
142
+ 'lmstudio': 32000,
143
+ 'mlx': 32000,
144
+ 'huggingface': 32000
145
+ }
146
+ return defaults.get(provider, 32000)
147
+
148
+ def _is_compressible_file(self, file_path: Path) -> bool:
149
+ """Check if file type is suitable for compression."""
150
+ if not file_path.exists():
151
+ return False
152
+
153
+ # Check file extension
154
+ compressible_extensions = {'.txt', '.md', '.csv', '.tsv', '.json', '.yaml', '.yml'}
155
+ if file_path.suffix.lower() not in compressible_extensions:
156
+ return False
157
+
158
+ # Check file size (avoid very large files)
159
+ try:
160
+ file_size = file_path.stat().st_size
161
+ if file_size > 10 * 1024 * 1024: # 10MB limit
162
+ return False
163
+ except Exception:
164
+ return False
165
+
166
+ return True
167
+
168
+ def _is_compressible_text(self, text: str) -> bool:
169
+ """Check if text content is suitable for compression."""
170
+ if not text or len(text.strip()) < 100:
171
+ return False
172
+
173
+ # Check for problematic content types
174
+ # Mathematical notation (challenging for OCR)
175
+ math_indicators = ['∑', '∫', '∂', '√', '±', '≤', '≥', '≠', '∞']
176
+ math_count = sum(1 for indicator in math_indicators if indicator in text)
177
+ if math_count > len(text) * 0.01: # >1% mathematical symbols
178
+ return False
179
+
180
+ # Very dense special characters
181
+ special_chars = sum(1 for c in text if not c.isalnum() and not c.isspace())
182
+ if special_chars > len(text) * 0.5: # >50% special characters
183
+ return False
184
+
185
+ return True
186
+
187
+ def compress_content(
188
+ self,
189
+ content: Union[str, Path],
190
+ provider: str,
191
+ model: str,
192
+ user_preference: str = "auto"
193
+ ) -> Optional[List[Any]]:
194
+ """
195
+ Compress content if beneficial.
196
+
197
+ Args:
198
+ content: Text content or file path
199
+ provider: Provider name
200
+ model: Model name
201
+ user_preference: User compression preference
202
+
203
+ Returns:
204
+ List of MediaContent objects if compressed, None if not compressed
205
+ """
206
+ try:
207
+ # Check if compression should be applied
208
+ if not self.should_compress(content, provider, model, user_preference):
209
+ return None
210
+
211
+ # Get content as string
212
+ if isinstance(content, Path):
213
+ with open(content, 'r', encoding='utf-8') as f:
214
+ text_content = f.read()
215
+ else:
216
+ text_content = content
217
+
218
+ # Apply compression
219
+ processor = self._get_glyph_processor()
220
+ compressed_content = processor.process_text(text_content, provider, model, user_preference)
221
+
222
+ self.logger.info(f"Content compressed successfully: {len(compressed_content)} images")
223
+ return compressed_content
224
+
225
+ except Exception as e:
226
+ self.logger.error(f"Content compression failed: {e}")
227
+ raise CompressionError(f"Compression failed: {e}") from e
228
+
229
+ def get_compression_recommendation(
230
+ self,
231
+ content: Union[str, Path],
232
+ provider: str,
233
+ model: str
234
+ ) -> Dict[str, Any]:
235
+ """
236
+ Get detailed compression recommendation.
237
+
238
+ Args:
239
+ content: Text content or file path
240
+ provider: Provider name
241
+ model: Model name
242
+
243
+ Returns:
244
+ Dictionary with recommendation details
245
+ """
246
+ try:
247
+ # Get content as string
248
+ if isinstance(content, Path):
249
+ with open(content, 'r', encoding='utf-8') as f:
250
+ text_content = f.read()
251
+ content_source = "file"
252
+ else:
253
+ text_content = content
254
+ content_source = "text"
255
+
256
+ # Analyze content
257
+ token_count = TokenUtils.estimate_tokens(text_content, model)
258
+ model_context = self._get_model_context_window(provider, model)
259
+ supports_vision = self._supports_vision(provider, model)
260
+ is_compressible = self._is_compressible_text(text_content)
261
+
262
+ # Calculate potential benefits
263
+ estimated_compression_ratio = self._estimate_compression_ratio(text_content, provider)
264
+ estimated_savings = token_count - (token_count / estimated_compression_ratio)
265
+
266
+ # Make recommendation
267
+ should_compress = self.should_compress(text_content, provider, model, "auto")
268
+
269
+ recommendation = {
270
+ 'should_compress': should_compress,
271
+ 'content_analysis': {
272
+ 'source': content_source,
273
+ 'length_chars': len(text_content),
274
+ 'estimated_tokens': token_count,
275
+ 'is_compressible': is_compressible
276
+ },
277
+ 'provider_analysis': {
278
+ 'provider': provider,
279
+ 'model': model,
280
+ 'supports_vision': supports_vision,
281
+ 'context_window': model_context,
282
+ 'utilization': token_count / model_context if model_context > 0 else 0
283
+ },
284
+ 'compression_estimate': {
285
+ 'estimated_ratio': estimated_compression_ratio,
286
+ 'estimated_token_savings': int(estimated_savings),
287
+ 'estimated_cost_savings': self._estimate_cost_savings(estimated_savings, provider)
288
+ },
289
+ 'recommendation_reason': self._get_recommendation_reason(
290
+ should_compress, token_count, model_context, supports_vision, is_compressible
291
+ )
292
+ }
293
+
294
+ return recommendation
295
+
296
+ except Exception as e:
297
+ self.logger.error(f"Failed to generate compression recommendation: {e}")
298
+ return {
299
+ 'should_compress': False,
300
+ 'error': str(e),
301
+ 'recommendation_reason': f"Analysis failed: {e}"
302
+ }
303
+
304
+ def _estimate_compression_ratio(self, text: str, provider: str) -> float:
305
+ """Estimate compression ratio based on content and provider."""
306
+ base_ratio = 3.0 # Default from Glyph research
307
+
308
+ # Adjust based on content type
309
+ if self._is_code_content(text):
310
+ base_ratio *= 0.8 # Code compresses less well
311
+ elif self._is_prose_content(text):
312
+ base_ratio *= 1.1 # Prose compresses better
313
+
314
+ # Adjust based on provider OCR quality
315
+ provider_multipliers = {
316
+ 'openai': 1.1, # Excellent OCR
317
+ 'anthropic': 1.0, # Good OCR
318
+ 'ollama': 0.9, # Variable OCR
319
+ 'lmstudio': 0.9, # Variable OCR
320
+ 'mlx': 0.8, # Limited OCR
321
+ 'huggingface': 0.8 # Variable OCR
322
+ }
323
+
324
+ multiplier = provider_multipliers.get(provider, 0.9)
325
+ return base_ratio * multiplier
326
+
327
+ def _is_code_content(self, text: str) -> bool:
328
+ """Check if content appears to be code."""
329
+ code_indicators = ['def ', 'class ', 'import ', 'function', '{', '}', '#!/', 'var ', 'const ']
330
+ return sum(1 for indicator in code_indicators if indicator in text) > 3
331
+
332
+ def _is_prose_content(self, text: str) -> bool:
333
+ """Check if content appears to be prose."""
334
+ # Simple heuristic: high ratio of common words
335
+ common_words = ['the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by']
336
+ word_count = len(text.split())
337
+ common_count = sum(1 for word in text.lower().split() if word in common_words)
338
+ return word_count > 0 and (common_count / word_count) > 0.1
339
+
340
+ def _estimate_cost_savings(self, token_savings: float, provider: str) -> float:
341
+ """Estimate cost savings from token reduction."""
342
+ # Rough cost estimates per 1K tokens (as of 2024)
343
+ cost_per_1k = {
344
+ 'openai': 0.01, # GPT-4o pricing
345
+ 'anthropic': 0.015, # Claude pricing
346
+ 'ollama': 0.0, # Local models
347
+ 'lmstudio': 0.0, # Local models
348
+ 'mlx': 0.0, # Local models
349
+ 'huggingface': 0.002 # API pricing
350
+ }
351
+
352
+ rate = cost_per_1k.get(provider, 0.01)
353
+ return (token_savings / 1000) * rate
354
+
355
+ def _get_recommendation_reason(
356
+ self,
357
+ should_compress: bool,
358
+ token_count: int,
359
+ model_context: int,
360
+ supports_vision: bool,
361
+ is_compressible: bool
362
+ ) -> str:
363
+ """Get human-readable recommendation reason."""
364
+ if not should_compress:
365
+ if not supports_vision:
366
+ return "Provider does not support vision processing"
367
+ elif not is_compressible:
368
+ return "Content type not suitable for visual compression"
369
+ elif token_count < self.config.min_token_threshold:
370
+ return f"Content too small ({token_count} tokens < {self.config.min_token_threshold} threshold)"
371
+ else:
372
+ return "Standard processing is sufficient for this content size"
373
+ else:
374
+ if token_count > model_context * 0.8:
375
+ return f"Compression necessary - approaching context limit ({token_count}/{model_context} tokens)"
376
+ elif token_count > 50000:
377
+ return f"Compression beneficial - large content ({token_count} tokens) will benefit from 3-4x reduction"
378
+ else:
379
+ return "Compression recommended based on content analysis"
380
+