abstractcore 2.5.2__py3-none-any.whl → 2.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractcore/__init__.py +19 -1
- abstractcore/architectures/detection.py +252 -6
- abstractcore/assets/architecture_formats.json +14 -1
- abstractcore/assets/model_capabilities.json +533 -10
- abstractcore/compression/__init__.py +29 -0
- abstractcore/compression/analytics.py +420 -0
- abstractcore/compression/cache.py +250 -0
- abstractcore/compression/config.py +279 -0
- abstractcore/compression/exceptions.py +30 -0
- abstractcore/compression/glyph_processor.py +381 -0
- abstractcore/compression/optimizer.py +388 -0
- abstractcore/compression/orchestrator.py +380 -0
- abstractcore/compression/pil_text_renderer.py +818 -0
- abstractcore/compression/quality.py +226 -0
- abstractcore/compression/text_formatter.py +666 -0
- abstractcore/compression/vision_compressor.py +371 -0
- abstractcore/config/main.py +64 -0
- abstractcore/config/manager.py +100 -5
- abstractcore/core/retry.py +2 -2
- abstractcore/core/session.py +193 -7
- abstractcore/download.py +253 -0
- abstractcore/embeddings/manager.py +2 -2
- abstractcore/events/__init__.py +113 -2
- abstractcore/exceptions/__init__.py +49 -2
- abstractcore/media/auto_handler.py +312 -18
- abstractcore/media/handlers/local_handler.py +14 -2
- abstractcore/media/handlers/openai_handler.py +62 -3
- abstractcore/media/processors/__init__.py +11 -1
- abstractcore/media/processors/direct_pdf_processor.py +210 -0
- abstractcore/media/processors/glyph_pdf_processor.py +227 -0
- abstractcore/media/processors/image_processor.py +7 -1
- abstractcore/media/processors/office_processor.py +2 -2
- abstractcore/media/processors/text_processor.py +18 -3
- abstractcore/media/types.py +164 -7
- abstractcore/media/utils/image_scaler.py +2 -2
- abstractcore/media/vision_fallback.py +2 -2
- abstractcore/providers/__init__.py +18 -0
- abstractcore/providers/anthropic_provider.py +228 -8
- abstractcore/providers/base.py +378 -11
- abstractcore/providers/huggingface_provider.py +563 -23
- abstractcore/providers/lmstudio_provider.py +284 -4
- abstractcore/providers/mlx_provider.py +27 -2
- abstractcore/providers/model_capabilities.py +352 -0
- abstractcore/providers/ollama_provider.py +282 -6
- abstractcore/providers/openai_provider.py +286 -8
- abstractcore/providers/registry.py +85 -13
- abstractcore/providers/streaming.py +2 -2
- abstractcore/server/app.py +91 -81
- abstractcore/tools/common_tools.py +2 -2
- abstractcore/tools/handler.py +2 -2
- abstractcore/tools/parser.py +2 -2
- abstractcore/tools/registry.py +2 -2
- abstractcore/tools/syntax_rewriter.py +2 -2
- abstractcore/tools/tag_rewriter.py +3 -3
- abstractcore/utils/__init__.py +4 -1
- abstractcore/utils/self_fixes.py +2 -2
- abstractcore/utils/trace_export.py +287 -0
- abstractcore/utils/version.py +1 -1
- abstractcore/utils/vlm_token_calculator.py +655 -0
- {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/METADATA +207 -8
- abstractcore-2.6.0.dist-info/RECORD +108 -0
- abstractcore-2.5.2.dist-info/RECORD +0 -90
- {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/WHEEL +0 -0
- {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/entry_points.txt +0 -0
- {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Quality validation and assessment for Glyph compression.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import List, Optional, Dict, Any
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class CompressionStats:
    """Statistics describing a single compression operation."""

    # Ratio of original to compressed token counts (e.g. 3.0 == 3x smaller).
    compression_ratio: float
    # Overall quality score in [0.0, 1.0].
    quality_score: float
    # Number of tokens saved by compression.
    token_savings: int
    # Wall-clock duration of the operation, in seconds.
    processing_time: float
    # Name of the provider this compression was optimized for.
    provider_optimized: str
    # Token count of the original content.
    original_tokens: int
    # Token count after compression.
    compressed_tokens: int
    # Estimated monetary savings; defaults to 0.0 when unknown.
    cost_savings: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        field_names = (
            'compression_ratio',
            'quality_score',
            'token_savings',
            'processing_time',
            'provider_optimized',
            'original_tokens',
            'compressed_tokens',
            'cost_savings',
        )
        return {name: getattr(self, name) for name in field_names}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class QualityValidator:
|
|
39
|
+
"""Validates compression quality using multiple metrics."""
|
|
40
|
+
|
|
41
|
+
def __init__(self):
|
|
42
|
+
"""Initialize quality validator."""
|
|
43
|
+
from ..utils.structured_logging import get_logger
|
|
44
|
+
self.logger = get_logger(self.__class__.__name__)
|
|
45
|
+
self.validation_methods = [
|
|
46
|
+
self._validate_compression_ratio,
|
|
47
|
+
self._validate_content_preservation,
|
|
48
|
+
self._validate_readability
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
def assess(self, original_content: str, rendered_images: List[Path],
|
|
52
|
+
provider: str = None) -> float:
|
|
53
|
+
"""
|
|
54
|
+
Assess compression quality using multiple metrics.
|
|
55
|
+
|
|
56
|
+
Args:
|
|
57
|
+
original_content: Original text content
|
|
58
|
+
rendered_images: List of rendered image paths
|
|
59
|
+
provider: Provider name for provider-specific assessment
|
|
60
|
+
|
|
61
|
+
Returns:
|
|
62
|
+
Quality score between 0.0 and 1.0
|
|
63
|
+
"""
|
|
64
|
+
scores = []
|
|
65
|
+
|
|
66
|
+
# Basic compression ratio validation
|
|
67
|
+
ratio_score = self._validate_compression_ratio(original_content, rendered_images)
|
|
68
|
+
scores.append(ratio_score)
|
|
69
|
+
|
|
70
|
+
# Content preservation assessment
|
|
71
|
+
preservation_score = self._validate_content_preservation(original_content, rendered_images)
|
|
72
|
+
scores.append(preservation_score)
|
|
73
|
+
|
|
74
|
+
# Readability assessment
|
|
75
|
+
readability_score = self._validate_readability(original_content, rendered_images, provider)
|
|
76
|
+
scores.append(readability_score)
|
|
77
|
+
|
|
78
|
+
# Weighted average (can be tuned based on empirical results)
|
|
79
|
+
weights = [0.3, 0.4, 0.3] # ratio, preservation, readability
|
|
80
|
+
weighted_score = sum(score * weight for score, weight in zip(scores, weights))
|
|
81
|
+
|
|
82
|
+
return min(1.0, max(0.0, weighted_score))
|
|
83
|
+
|
|
84
|
+
def _validate_compression_ratio(self, original_content: str, rendered_images: List[Path]) -> float:
|
|
85
|
+
"""Validate compression ratio is within expected range."""
|
|
86
|
+
from ..utils.token_utils import TokenUtils
|
|
87
|
+
|
|
88
|
+
# Estimate original tokens
|
|
89
|
+
original_tokens = TokenUtils.estimate_tokens(original_content)
|
|
90
|
+
|
|
91
|
+
# Estimate compressed tokens (rough approximation)
|
|
92
|
+
# Calculate accurate token count using proper VLM token calculation
|
|
93
|
+
from ..utils.vlm_token_calculator import VLMTokenCalculator
|
|
94
|
+
from pathlib import Path
|
|
95
|
+
|
|
96
|
+
calculator = VLMTokenCalculator()
|
|
97
|
+
try:
|
|
98
|
+
# Get provider from context or use default
|
|
99
|
+
provider_name = getattr(self, '_provider', 'openai')
|
|
100
|
+
model_name = getattr(self, '_model', '')
|
|
101
|
+
|
|
102
|
+
# Calculate tokens for all rendered images
|
|
103
|
+
if rendered_images and len(rendered_images) > 0:
|
|
104
|
+
# Assume rendered_images contains file paths or can be converted to paths
|
|
105
|
+
image_paths = []
|
|
106
|
+
for img in rendered_images:
|
|
107
|
+
if hasattr(img, 'path') and img.path:
|
|
108
|
+
image_paths.append(Path(img.path))
|
|
109
|
+
elif isinstance(img, (str, Path)):
|
|
110
|
+
image_paths.append(Path(img))
|
|
111
|
+
|
|
112
|
+
if image_paths:
|
|
113
|
+
token_analysis = calculator.calculate_tokens_for_images(
|
|
114
|
+
image_paths=image_paths,
|
|
115
|
+
provider=provider_name,
|
|
116
|
+
model=model_name
|
|
117
|
+
)
|
|
118
|
+
compressed_tokens = token_analysis['total_tokens']
|
|
119
|
+
self.logger.debug(f"Calculated {compressed_tokens} tokens for {len(image_paths)} images using {provider_name}")
|
|
120
|
+
else:
|
|
121
|
+
# Fallback to provider-specific base estimation
|
|
122
|
+
base_tokens = calculator.PROVIDER_CONFIGS.get(provider_name, {}).get('base_tokens', 512)
|
|
123
|
+
compressed_tokens = len(rendered_images) * base_tokens
|
|
124
|
+
self.logger.warning(f"Using fallback estimation: {compressed_tokens} tokens for {len(rendered_images)} images")
|
|
125
|
+
else:
|
|
126
|
+
compressed_tokens = 0
|
|
127
|
+
|
|
128
|
+
except Exception as e:
|
|
129
|
+
# Fallback to conservative estimate if calculation fails
|
|
130
|
+
self.logger.warning(f"VLM token calculation failed, using fallback: {e}")
|
|
131
|
+
compressed_tokens = len(rendered_images) * 1500 # Conservative fallback
|
|
132
|
+
|
|
133
|
+
if original_tokens == 0:
|
|
134
|
+
return 0.0
|
|
135
|
+
|
|
136
|
+
compression_ratio = original_tokens / compressed_tokens
|
|
137
|
+
|
|
138
|
+
# Score based on compression ratio
|
|
139
|
+
# Target: 3-4x compression
|
|
140
|
+
if 2.5 <= compression_ratio <= 5.0:
|
|
141
|
+
return 1.0 # Excellent compression
|
|
142
|
+
elif 2.0 <= compression_ratio < 2.5 or 5.0 < compression_ratio <= 6.0:
|
|
143
|
+
return 0.8 # Good compression
|
|
144
|
+
elif 1.5 <= compression_ratio < 2.0 or 6.0 < compression_ratio <= 8.0:
|
|
145
|
+
return 0.6 # Acceptable compression
|
|
146
|
+
else:
|
|
147
|
+
return 0.3 # Poor compression ratio
|
|
148
|
+
|
|
149
|
+
def _validate_content_preservation(self, original_content: str, rendered_images: List[Path]) -> float:
|
|
150
|
+
"""Validate that content is preserved in rendering."""
|
|
151
|
+
# Basic heuristics for content preservation
|
|
152
|
+
score = 1.0
|
|
153
|
+
|
|
154
|
+
# Check for content length preservation
|
|
155
|
+
# Images should roughly correspond to content length
|
|
156
|
+
content_length = len(original_content)
|
|
157
|
+
expected_images = max(1, content_length // 5000) # ~5000 chars per image
|
|
158
|
+
actual_images = len(rendered_images)
|
|
159
|
+
|
|
160
|
+
# Penalize significant deviations
|
|
161
|
+
if actual_images < expected_images * 0.5 or actual_images > expected_images * 2:
|
|
162
|
+
score *= 0.8
|
|
163
|
+
|
|
164
|
+
# Check for special characters and formatting
|
|
165
|
+
special_chars = sum(1 for c in original_content if not c.isalnum() and not c.isspace())
|
|
166
|
+
if special_chars > len(original_content) * 0.3: # >30% special chars
|
|
167
|
+
score *= 0.9 # Slight penalty for complex formatting
|
|
168
|
+
|
|
169
|
+
# Check for very long lines (may cause rendering issues)
|
|
170
|
+
lines = original_content.split('\n')
|
|
171
|
+
long_lines = sum(1 for line in lines if len(line) > 200)
|
|
172
|
+
if long_lines > len(lines) * 0.2: # >20% long lines
|
|
173
|
+
score *= 0.95
|
|
174
|
+
|
|
175
|
+
return score
|
|
176
|
+
|
|
177
|
+
def _validate_readability(self, original_content: str, rendered_images: List[Path],
|
|
178
|
+
provider: str = None) -> float:
|
|
179
|
+
"""Validate readability for the target provider."""
|
|
180
|
+
score = 1.0
|
|
181
|
+
|
|
182
|
+
# Provider-specific readability assessment
|
|
183
|
+
if provider == "openai":
|
|
184
|
+
# GPT-4o has excellent OCR, can handle dense text
|
|
185
|
+
score = 1.0
|
|
186
|
+
elif provider == "anthropic":
|
|
187
|
+
# Claude is font-sensitive, prefers clear rendering
|
|
188
|
+
score = 0.95
|
|
189
|
+
elif provider and "qwen" in provider.lower():
|
|
190
|
+
# Qwen models similar to Glyph's base model
|
|
191
|
+
score = 1.0
|
|
192
|
+
elif provider and "llava" in provider.lower():
|
|
193
|
+
# LLaVA models have limited OCR
|
|
194
|
+
score = 0.8
|
|
195
|
+
else:
|
|
196
|
+
# Conservative score for unknown providers
|
|
197
|
+
score = 0.85
|
|
198
|
+
|
|
199
|
+
# Adjust based on content characteristics
|
|
200
|
+
# Check for code content (harder to read in images)
|
|
201
|
+
code_indicators = ['def ', 'class ', 'import ', 'function', '{', '}', '#!/']
|
|
202
|
+
code_score = sum(1 for indicator in code_indicators if indicator in original_content)
|
|
203
|
+
if code_score > 5:
|
|
204
|
+
score *= 0.9 # Slight penalty for code content
|
|
205
|
+
|
|
206
|
+
# Check for mathematical notation (challenging for OCR)
|
|
207
|
+
math_indicators = ['∑', '∫', '∂', '√', '±', '≤', '≥', '≠', '∞']
|
|
208
|
+
math_score = sum(1 for indicator in math_indicators if indicator in original_content)
|
|
209
|
+
if math_score > 0:
|
|
210
|
+
score *= 0.85 # Penalty for mathematical notation
|
|
211
|
+
|
|
212
|
+
return score
|
|
213
|
+
|
|
214
|
+
def get_provider_threshold(self, provider: str) -> float:
|
|
215
|
+
"""Get quality threshold for specific provider."""
|
|
216
|
+
thresholds = {
|
|
217
|
+
"openai": 0.93,
|
|
218
|
+
"anthropic": 0.96,
|
|
219
|
+
"ollama": 0.90,
|
|
220
|
+
"lmstudio": 0.94,
|
|
221
|
+
"mlx": 0.88,
|
|
222
|
+
"huggingface": 0.85
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
return thresholds.get(provider, 0.90) # Default threshold
|
|
226
|
+
|