abstractcore-2.5.2-py3-none-any.whl → abstractcore-2.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. abstractcore/__init__.py +19 -1
  2. abstractcore/architectures/detection.py +252 -6
  3. abstractcore/assets/architecture_formats.json +14 -1
  4. abstractcore/assets/model_capabilities.json +533 -10
  5. abstractcore/compression/__init__.py +29 -0
  6. abstractcore/compression/analytics.py +420 -0
  7. abstractcore/compression/cache.py +250 -0
  8. abstractcore/compression/config.py +279 -0
  9. abstractcore/compression/exceptions.py +30 -0
  10. abstractcore/compression/glyph_processor.py +381 -0
  11. abstractcore/compression/optimizer.py +388 -0
  12. abstractcore/compression/orchestrator.py +380 -0
  13. abstractcore/compression/pil_text_renderer.py +818 -0
  14. abstractcore/compression/quality.py +226 -0
  15. abstractcore/compression/text_formatter.py +666 -0
  16. abstractcore/compression/vision_compressor.py +371 -0
  17. abstractcore/config/main.py +64 -0
  18. abstractcore/config/manager.py +100 -5
  19. abstractcore/core/retry.py +2 -2
  20. abstractcore/core/session.py +193 -7
  21. abstractcore/download.py +253 -0
  22. abstractcore/embeddings/manager.py +2 -2
  23. abstractcore/events/__init__.py +113 -2
  24. abstractcore/exceptions/__init__.py +49 -2
  25. abstractcore/media/auto_handler.py +312 -18
  26. abstractcore/media/handlers/local_handler.py +14 -2
  27. abstractcore/media/handlers/openai_handler.py +62 -3
  28. abstractcore/media/processors/__init__.py +11 -1
  29. abstractcore/media/processors/direct_pdf_processor.py +210 -0
  30. abstractcore/media/processors/glyph_pdf_processor.py +227 -0
  31. abstractcore/media/processors/image_processor.py +7 -1
  32. abstractcore/media/processors/office_processor.py +2 -2
  33. abstractcore/media/processors/text_processor.py +18 -3
  34. abstractcore/media/types.py +164 -7
  35. abstractcore/media/utils/image_scaler.py +2 -2
  36. abstractcore/media/vision_fallback.py +2 -2
  37. abstractcore/providers/__init__.py +18 -0
  38. abstractcore/providers/anthropic_provider.py +228 -8
  39. abstractcore/providers/base.py +378 -11
  40. abstractcore/providers/huggingface_provider.py +563 -23
  41. abstractcore/providers/lmstudio_provider.py +284 -4
  42. abstractcore/providers/mlx_provider.py +27 -2
  43. abstractcore/providers/model_capabilities.py +352 -0
  44. abstractcore/providers/ollama_provider.py +282 -6
  45. abstractcore/providers/openai_provider.py +286 -8
  46. abstractcore/providers/registry.py +85 -13
  47. abstractcore/providers/streaming.py +2 -2
  48. abstractcore/server/app.py +91 -81
  49. abstractcore/tools/common_tools.py +2 -2
  50. abstractcore/tools/handler.py +2 -2
  51. abstractcore/tools/parser.py +2 -2
  52. abstractcore/tools/registry.py +2 -2
  53. abstractcore/tools/syntax_rewriter.py +2 -2
  54. abstractcore/tools/tag_rewriter.py +3 -3
  55. abstractcore/utils/__init__.py +4 -1
  56. abstractcore/utils/self_fixes.py +2 -2
  57. abstractcore/utils/trace_export.py +287 -0
  58. abstractcore/utils/version.py +1 -1
  59. abstractcore/utils/vlm_token_calculator.py +655 -0
  60. {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/METADATA +207 -8
  61. abstractcore-2.6.0.dist-info/RECORD +108 -0
  62. abstractcore-2.5.2.dist-info/RECORD +0 -90
  63. {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/WHEEL +0 -0
  64. {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/entry_points.txt +0 -0
  65. {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/licenses/LICENSE +0 -0
  66. {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/top_level.txt +0 -0
abstractcore/compression/quality.py
@@ -0,0 +1,226 @@
+ """
+ Quality validation and assessment for Glyph compression.
+ """
+
+ import time
+ from dataclasses import dataclass
+ from typing import List, Optional, Dict, Any
+ from pathlib import Path
+
+
+ @dataclass
+ class CompressionStats:
+     """Statistics for compression operation."""
+
+     compression_ratio: float
+     quality_score: float
+     token_savings: int
+     processing_time: float
+     provider_optimized: str
+     original_tokens: int
+     compressed_tokens: int
+     cost_savings: float = 0.0
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary."""
+         return {
+             'compression_ratio': self.compression_ratio,
+             'quality_score': self.quality_score,
+             'token_savings': self.token_savings,
+             'processing_time': self.processing_time,
+             'provider_optimized': self.provider_optimized,
+             'original_tokens': self.original_tokens,
+             'compressed_tokens': self.compressed_tokens,
+             'cost_savings': self.cost_savings
+         }
+
+
+ class QualityValidator:
+     """Validates compression quality using multiple metrics."""
+
+     def __init__(self):
+         """Initialize quality validator."""
+         from ..utils.structured_logging import get_logger
+         self.logger = get_logger(self.__class__.__name__)
+         self.validation_methods = [
+             self._validate_compression_ratio,
+             self._validate_content_preservation,
+             self._validate_readability
+         ]
+
+     def assess(self, original_content: str, rendered_images: List[Path],
+                provider: Optional[str] = None) -> float:
+         """
+         Assess compression quality using multiple metrics.
+
+         Args:
+             original_content: Original text content
+             rendered_images: List of rendered image paths
+             provider: Provider name for provider-specific assessment
+
+         Returns:
+             Quality score between 0.0 and 1.0
+         """
+         scores = []
+
+         # Basic compression ratio validation
+         ratio_score = self._validate_compression_ratio(original_content, rendered_images)
+         scores.append(ratio_score)
+
+         # Content preservation assessment
+         preservation_score = self._validate_content_preservation(original_content, rendered_images)
+         scores.append(preservation_score)
+
+         # Readability assessment
+         readability_score = self._validate_readability(original_content, rendered_images, provider)
+         scores.append(readability_score)
+
+         # Weighted average (can be tuned based on empirical results)
+         weights = [0.3, 0.4, 0.3]  # ratio, preservation, readability
+         weighted_score = sum(score * weight for score, weight in zip(scores, weights))
+
+         return min(1.0, max(0.0, weighted_score))
+
+     def _validate_compression_ratio(self, original_content: str, rendered_images: List[Path]) -> float:
+         """Validate compression ratio is within expected range."""
+         from ..utils.token_utils import TokenUtils
+
+         # Estimate original tokens
+         original_tokens = TokenUtils.estimate_tokens(original_content)
+
+         # Estimate compressed tokens using the VLM token calculator
+         from ..utils.vlm_token_calculator import VLMTokenCalculator
+
+         calculator = VLMTokenCalculator()
+         try:
+             # Get provider from context or use default
+             provider_name = getattr(self, '_provider', 'openai')
+             model_name = getattr(self, '_model', '')
+
+             # Calculate tokens for all rendered images
+             if rendered_images:
+                 # rendered_images may hold file paths or objects with a .path attribute
+                 image_paths = []
+                 for img in rendered_images:
+                     if hasattr(img, 'path') and img.path:
+                         image_paths.append(Path(img.path))
+                     elif isinstance(img, (str, Path)):
+                         image_paths.append(Path(img))
+
+                 if image_paths:
+                     token_analysis = calculator.calculate_tokens_for_images(
+                         image_paths=image_paths,
+                         provider=provider_name,
+                         model=model_name
+                     )
+                     compressed_tokens = token_analysis['total_tokens']
+                     self.logger.debug(f"Calculated {compressed_tokens} tokens for {len(image_paths)} images using {provider_name}")
+                 else:
+                     # Fallback to provider-specific base estimation
+                     base_tokens = calculator.PROVIDER_CONFIGS.get(provider_name, {}).get('base_tokens', 512)
+                     compressed_tokens = len(rendered_images) * base_tokens
+                     self.logger.warning(f"Using fallback estimation: {compressed_tokens} tokens for {len(rendered_images)} images")
+             else:
+                 compressed_tokens = 0
+
+         except Exception as e:
+             # Fallback to conservative estimate if calculation fails
+             self.logger.warning(f"VLM token calculation failed, using fallback: {e}")
+             compressed_tokens = len(rendered_images) * 1500  # Conservative fallback
+
+         if original_tokens == 0 or compressed_tokens == 0:
+             # Guard against division by zero when either estimate is empty
+             return 0.0
+
+         compression_ratio = original_tokens / compressed_tokens
+
+         # Score based on compression ratio
+         # Target: 3-4x compression
+         if 2.5 <= compression_ratio <= 5.0:
+             return 1.0  # Excellent compression
+         elif 2.0 <= compression_ratio < 2.5 or 5.0 < compression_ratio <= 6.0:
+             return 0.8  # Good compression
+         elif 1.5 <= compression_ratio < 2.0 or 6.0 < compression_ratio <= 8.0:
+             return 0.6  # Acceptable compression
+         else:
+             return 0.3  # Poor compression ratio
+
+     def _validate_content_preservation(self, original_content: str, rendered_images: List[Path]) -> float:
+         """Validate that content is preserved in rendering."""
+         # Basic heuristics for content preservation
+         score = 1.0
+
+         # Check for content length preservation
+         # Images should roughly correspond to content length
+         content_length = len(original_content)
+         expected_images = max(1, content_length // 5000)  # ~5000 chars per image
+         actual_images = len(rendered_images)
+
+         # Penalize significant deviations
+         if actual_images < expected_images * 0.5 or actual_images > expected_images * 2:
+             score *= 0.8
+
+         # Check for special characters and formatting
+         special_chars = sum(1 for c in original_content if not c.isalnum() and not c.isspace())
+         if special_chars > len(original_content) * 0.3:  # >30% special chars
+             score *= 0.9  # Slight penalty for complex formatting
+
+         # Check for very long lines (may cause rendering issues)
+         lines = original_content.split('\n')
+         long_lines = sum(1 for line in lines if len(line) > 200)
+         if long_lines > len(lines) * 0.2:  # >20% long lines
+             score *= 0.95
+
+         return score
+
+     def _validate_readability(self, original_content: str, rendered_images: List[Path],
+                               provider: Optional[str] = None) -> float:
+         """Validate readability for the target provider."""
+         score = 1.0
+
+         # Provider-specific readability assessment
+         if provider == "openai":
+             # GPT-4o has excellent OCR, can handle dense text
+             score = 1.0
+         elif provider == "anthropic":
+             # Claude is font-sensitive, prefers clear rendering
+             score = 0.95
+         elif provider and "qwen" in provider.lower():
+             # Qwen models similar to Glyph's base model
+             score = 1.0
+         elif provider and "llava" in provider.lower():
+             # LLaVA models have limited OCR
+             score = 0.8
+         else:
+             # Conservative score for unknown providers
+             score = 0.85
+
+         # Adjust based on content characteristics
+         # Check for code content (harder to read in images)
+         code_indicators = ['def ', 'class ', 'import ', 'function', '{', '}', '#!/']
+         code_score = sum(1 for indicator in code_indicators if indicator in original_content)
+         if code_score > 5:
+             score *= 0.9  # Slight penalty for code content
+
+         # Check for mathematical notation (challenging for OCR)
+         math_indicators = ['∑', '∫', '∂', '√', '±', '≤', '≥', '≠', '∞']
+         math_score = sum(1 for indicator in math_indicators if indicator in original_content)
+         if math_score > 0:
+             score *= 0.85  # Penalty for mathematical notation
+
+         return score
+
+     def get_provider_threshold(self, provider: str) -> float:
+         """Get quality threshold for specific provider."""
+         thresholds = {
+             "openai": 0.93,
+             "anthropic": 0.96,
+             "ollama": 0.90,
+             "lmstudio": 0.94,
+             "mlx": 0.88,
+             "huggingface": 0.85
+         }
+
+         return thresholds.get(provider, 0.90)  # Default threshold
+
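
For orientation, a minimal usage sketch of the new validator, based only on the signatures visible in this diff and the import path from the file list above. The document text and image paths are hypothetical placeholders, not files shipped with the package:

from pathlib import Path

from abstractcore.compression.quality import QualityValidator

validator = QualityValidator()

# Hypothetical pages rendered by the Glyph compression pipeline.
pages = [Path("page_001.png"), Path("page_002.png")]

# Score combines compression ratio, content preservation, and readability
# (weighted 0.3 / 0.4 / 0.3 per the assess() implementation above).
score = validator.assess(
    original_content="...original document text...",
    rendered_images=pages,
    provider="openai",
)

# Accept or reject against the provider-specific threshold.
if score >= validator.get_provider_threshold("openai"):
    print(f"compression accepted (quality={score:.2f})")
else:
    print(f"compression rejected (quality={score:.2f})")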