abstractcore 2.5.0-py3-none-any.whl → 2.5.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractcore/__init__.py +12 -0
- abstractcore/apps/__main__.py +8 -1
- abstractcore/apps/deepsearch.py +644 -0
- abstractcore/apps/intent.py +614 -0
- abstractcore/architectures/detection.py +250 -4
- abstractcore/assets/architecture_formats.json +14 -1
- abstractcore/assets/model_capabilities.json +583 -44
- abstractcore/compression/__init__.py +29 -0
- abstractcore/compression/analytics.py +420 -0
- abstractcore/compression/cache.py +250 -0
- abstractcore/compression/config.py +279 -0
- abstractcore/compression/exceptions.py +30 -0
- abstractcore/compression/glyph_processor.py +381 -0
- abstractcore/compression/optimizer.py +388 -0
- abstractcore/compression/orchestrator.py +380 -0
- abstractcore/compression/pil_text_renderer.py +818 -0
- abstractcore/compression/quality.py +226 -0
- abstractcore/compression/text_formatter.py +666 -0
- abstractcore/compression/vision_compressor.py +371 -0
- abstractcore/config/main.py +66 -1
- abstractcore/config/manager.py +111 -5
- abstractcore/core/session.py +105 -5
- abstractcore/events/__init__.py +1 -1
- abstractcore/media/auto_handler.py +312 -18
- abstractcore/media/handlers/local_handler.py +14 -2
- abstractcore/media/handlers/openai_handler.py +62 -3
- abstractcore/media/processors/__init__.py +11 -1
- abstractcore/media/processors/direct_pdf_processor.py +210 -0
- abstractcore/media/processors/glyph_pdf_processor.py +227 -0
- abstractcore/media/processors/image_processor.py +7 -1
- abstractcore/media/processors/text_processor.py +18 -3
- abstractcore/media/types.py +164 -7
- abstractcore/processing/__init__.py +5 -1
- abstractcore/processing/basic_deepsearch.py +2173 -0
- abstractcore/processing/basic_intent.py +690 -0
- abstractcore/providers/__init__.py +18 -0
- abstractcore/providers/anthropic_provider.py +29 -2
- abstractcore/providers/base.py +279 -6
- abstractcore/providers/huggingface_provider.py +658 -27
- abstractcore/providers/lmstudio_provider.py +52 -2
- abstractcore/providers/mlx_provider.py +103 -4
- abstractcore/providers/model_capabilities.py +352 -0
- abstractcore/providers/ollama_provider.py +44 -6
- abstractcore/providers/openai_provider.py +29 -2
- abstractcore/providers/registry.py +91 -19
- abstractcore/server/app.py +91 -81
- abstractcore/structured/handler.py +161 -1
- abstractcore/tools/common_tools.py +98 -3
- abstractcore/utils/__init__.py +4 -1
- abstractcore/utils/cli.py +114 -1
- abstractcore/utils/trace_export.py +287 -0
- abstractcore/utils/version.py +1 -1
- abstractcore/utils/vlm_token_calculator.py +655 -0
- {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/METADATA +140 -23
- abstractcore-2.5.3.dist-info/RECORD +107 -0
- {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/entry_points.txt +4 -0
- abstractcore-2.5.0.dist-info/RECORD +0 -86
- {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/WHEEL +0 -0
- {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/top_level.txt +0 -0
abstractcore/utils/vlm_token_calculator.py (new file)

@@ -0,0 +1,655 @@
+"""
+VLM Token Calculator - Research-Based Accurate Token Estimation
+
+This module provides state-of-the-art token calculations for images processed by different VLM providers,
+integrating with AbstractCore's detection system and model capabilities database.
+
+Key Research Insights Integrated:
+- OpenAI GPT-4V: 85 base + 170 tokens per 512x512 tile (with resizing logic)
+- Anthropic Claude: ~(width * height) / 750 with 1600 token cap
+- Google Gemini: 258 tokens for small images, 258 per 768x768 tile for large
+- Qwen-VL models: Variable patch sizes (14px, 16px) with adaptive resolution
+- LLaMA Vision: 14px patches with specific resolution tiers
+- Local models: Architecture-specific optimizations
+
+References:
+- Image Tokenization for Visual Models research
+- Glyph Visual Text Compression framework
+- OpenAI, Anthropic, Google official documentation
+- Recent VLM architecture papers (2024-2025)
+"""
+
+import math
+from typing import Tuple, Dict, Any, Optional, List
+from pathlib import Path
+from PIL import Image
+import logging
+
+from ..utils.structured_logging import get_logger
+from ..architectures.detection import get_model_capabilities, detect_architecture
+
+logger = get_logger(__name__)
+
+
+class VLMTokenCalculator:
+    """
+    Research-based VLM token calculator that integrates with AbstractCore's
+    model detection and capabilities system for maximum accuracy.
+    """
+
+    # Provider-specific base configurations (enhanced with research data)
+    PROVIDER_CONFIGS = {
+        'openai': {
+            'base_tokens': 85,
+            'tokens_per_tile': 170,
+            'tile_size': 512,
+            'max_dimension': 2048,
+            'short_side_target': 768,  # From research: resized to 768px short side
+            'detail_levels': {
+                'low': 85,
+                'high': 'calculated',
+                'auto': 'calculated'
+            }
+        },
+        'anthropic': {
+            'base_formula': 'pixel_area',  # (width * height) / 750
+            'pixel_divisor': 750,
+            'max_dimension': 1568,
+            'token_cap': 1600,
+            'min_dimension_warning': 200
+        },
+        'google': {
+            'small_image_threshold': 384,  # Both dimensions <= 384
+            'small_image_tokens': 258,
+            'tile_size': 768,
+            'tokens_per_tile': 258
+        },
+        'ollama': {
+            'base_tokens': 256,
+            'scaling_factor': 0.5,
+            'max_dimension': 1024
+        },
+        'lmstudio': {
+            'base_tokens': 512,
+            'scaling_factor': 0.7,
+            'max_dimension': 1024
+        }
+    }
+
+    # Model-specific overrides based on research and model capabilities
+    MODEL_SPECIFIC_CONFIGS = {
+        # Qwen VL models (from research: 28x28 pixel patches for 2.5, 32x32 for 3.0)
+        'qwen2.5-vl': {
+            'patch_size': 14,
+            'pixel_grouping': '28x28',
+            'max_image_tokens': 16384,
+            'adaptive_resolution': True,
+            'resolution_range': (56, 3584)
+        },
+        'qwen3-vl': {
+            'patch_size': 16,
+            'pixel_grouping': '32x32',
+            'max_image_tokens': 24576,
+            'adaptive_resolution': True,
+            'resolution_range': (64, 4096)
+        },
+        # LLaMA Vision models (from research: 14px patches, specific resolutions)
+        'llama3.2-vision': {
+            'patch_size': 14,
+            'supported_resolutions': [(560, 560), (1120, 560), (560, 1120), (1120, 1120)],
+            'max_image_tokens': 6400,
+            'base_tokens': 256
+        },
+        # Gemma Vision models (from research: SigLIP encoder, 896x896 fixed)
+        'gemma3': {
+            'fixed_resolution': (896, 896),
+            'vision_encoder': 'SigLIP-400M',
+            'tokens_per_image': 256,
+            'adaptive_windowing': True
+        },
+        # GLM models (Glyph research base model)
+        'glm-4': {
+            'optimized_for_glyph': True,
+            'text_image_processing': True,
+            'base_tokens': 512
+        }
+    }
+
+    def __init__(self):
+        """Initialize the VLM token calculator."""
+        self.logger = get_logger(self.__class__.__name__)
+
+    def calculate_tokens_for_image(self,
+                                   image_path: Optional[Path] = None,
+                                   width: Optional[int] = None,
+                                   height: Optional[int] = None,
+                                   provider: str = 'openai',
+                                   model: str = '',
+                                   detail_level: str = 'auto') -> Dict[str, Any]:
+        """
+        Calculate accurate token count using model capabilities and research-based formulas.
+
+        Args:
+            image_path: Path to image file
+            width: Image width in pixels
+            height: Image height in pixels
+            provider: VLM provider
+            model: Specific model name
+            detail_level: Detail level for applicable models
+
+        Returns:
+            Dictionary with token count and calculation details
+        """
+        # Get image dimensions
+        if image_path and image_path.exists():
+            try:
+                with Image.open(image_path) as img:
+                    width, height = img.size
+                    self.logger.debug(f"Loaded image dimensions: {width}x{height} from {image_path}")
+            except Exception as e:
+                self.logger.warning(f"Failed to load image {image_path}: {e}")
+                if width is None or height is None:
+                    raise ValueError(f"Cannot determine image dimensions: {e}")
+
+        if width is None or height is None:
+            raise ValueError("Must provide either image_path or both width and height")
+
+        # Get model capabilities from AbstractCore's detection system
+        model_caps = get_model_capabilities(model) if model else {}
+        architecture = detect_architecture(model) if model else provider.lower()
+
+        # Determine calculation method based on model capabilities and research
+        calculation_result = self._calculate_with_model_awareness(
+            width, height, provider, model, architecture, model_caps, detail_level
+        )
+
+        # Add metadata about the calculation
+        calculation_result.update({
+            'image_dimensions': f"{width}x{height}",
+            'total_pixels': width * height,
+            'provider': provider,
+            'model': model,
+            'architecture': architecture,
+            'calculation_timestamp': self._get_timestamp()
+        })
+
+        return calculation_result
+
+    def _calculate_with_model_awareness(self, width: int, height: int, provider: str,
+                                        model: str, architecture: str, model_caps: Dict[str, Any],
+                                        detail_level: str) -> Dict[str, Any]:
+        """Calculate tokens using model-specific knowledge and research insights."""
+
+        # Check for model-specific configurations first
+        model_config = self._get_model_specific_config(model, architecture)
+        if model_config:
+            return self._calculate_with_model_config(width, height, model_config, model_caps)
+
+        # Fall back to provider-specific calculations
+        provider_lower = provider.lower()
+
+        if provider_lower == 'openai':
+            return self._calculate_openai_tokens_enhanced(width, height, model, detail_level, model_caps)
+        elif provider_lower == 'anthropic':
+            return self._calculate_anthropic_tokens_enhanced(width, height, model, model_caps)
+        elif provider_lower == 'google':
+            return self._calculate_google_tokens(width, height, model, model_caps)
+        elif provider_lower in ['ollama', 'lmstudio']:
+            return self._calculate_local_tokens_enhanced(width, height, provider_lower, model, model_caps)
+        else:
+            self.logger.warning(f"Unknown provider '{provider}', using research-based estimation")
+            return self._calculate_research_based_fallback(width, height, model_caps)
+
+    def _get_model_specific_config(self, model: str, architecture: str) -> Optional[Dict[str, Any]]:
+        """Get model-specific configuration based on research data."""
+        model_lower = model.lower()
+
+        # Check exact model matches first
+        for model_key, config in self.MODEL_SPECIFIC_CONFIGS.items():
+            if model_key in model_lower:
+                return config
+
+        # Check architecture-based matches
+        if 'qwen' in model_lower and 'vl' in model_lower:
+            if '2.5' in model_lower:
+                return self.MODEL_SPECIFIC_CONFIGS.get('qwen2.5-vl')
+            elif '3' in model_lower:
+                return self.MODEL_SPECIFIC_CONFIGS.get('qwen3-vl')
+
+        if 'llama' in model_lower and 'vision' in model_lower:
+            return self.MODEL_SPECIFIC_CONFIGS.get('llama3.2-vision')
+
+        if 'gemma' in model_lower and ('vision' in model_lower or '3' in model_lower):
+            return self.MODEL_SPECIFIC_CONFIGS.get('gemma3')
+
+        if 'glm' in model_lower:
+            return self.MODEL_SPECIFIC_CONFIGS.get('glm-4')
+
+        return None
+
+    def _calculate_with_model_config(self, width: int, height: int,
+                                     model_config: Dict[str, Any],
+                                     model_caps: Dict[str, Any]) -> Dict[str, Any]:
+        """Calculate tokens using model-specific configuration."""
+
+        # Handle fixed resolution models (like Gemma3)
+        if 'fixed_resolution' in model_config:
+            target_width, target_height = model_config['fixed_resolution']
+            tokens = model_config.get('tokens_per_image', 256)
+
+            return {
+                'tokens': tokens,
+                'method': 'fixed_resolution',
+                'target_resolution': f"{target_width}x{target_height}",
+                'vision_encoder': model_config.get('vision_encoder', 'unknown'),
+                'adaptive_windowing': model_config.get('adaptive_windowing', False)
+            }
+
+        # Handle patch-based models (Qwen-VL, LLaMA Vision)
+        if 'patch_size' in model_config:
+            return self._calculate_patch_based_tokens(width, height, model_config, model_caps)
+
+        # Handle supported resolution models (LLaMA Vision)
+        if 'supported_resolutions' in model_config:
+            return self._calculate_resolution_tier_tokens(width, height, model_config, model_caps)
+
+        # Fallback to base tokens
+        tokens = model_config.get('base_tokens', 512)
+        return {
+            'tokens': tokens,
+            'method': 'model_specific_base',
+            'config_used': model_config
+        }
+
+    def _calculate_patch_based_tokens(self, width: int, height: int,
+                                      model_config: Dict[str, Any],
+                                      model_caps: Dict[str, Any]) -> Dict[str, Any]:
+        """Calculate tokens for patch-based models like Qwen-VL."""
+        patch_size = model_config['patch_size']
+        max_tokens = model_config.get('max_image_tokens', 16384)
+
+        # Handle adaptive resolution
+        if model_config.get('adaptive_resolution'):
+            min_res, max_res = model_config.get('resolution_range', (56, 3584))
+            # Resize if outside supported range
+            if max(width, height) > max_res:
+                scale = max_res / max(width, height)
+                width = int(width * scale)
+                height = int(height * scale)
+            elif min(width, height) < min_res:
+                scale = min_res / min(width, height)
+                width = int(width * scale)
+                height = int(height * scale)
+
+        # Calculate patches
+        patches_width = math.ceil(width / patch_size)
+        patches_height = math.ceil(height / patch_size)
+        total_patches = patches_width * patches_height
+
+        # Apply token limit
+        tokens = min(total_patches, max_tokens)
+
+        return {
+            'tokens': tokens,
+            'method': 'patch_based',
+            'patch_size': patch_size,
+            'patches': f"{patches_width}x{patches_height}",
+            'total_patches': total_patches,
+            'max_tokens': max_tokens,
+            'pixel_grouping': model_config.get('pixel_grouping', f"{patch_size}x{patch_size}"),
+            'resized_to': f"{width}x{height}" if model_config.get('adaptive_resolution') else None
+        }
+
+    def _calculate_resolution_tier_tokens(self, width: int, height: int,
+                                          model_config: Dict[str, Any],
+                                          model_caps: Dict[str, Any]) -> Dict[str, Any]:
+        """Calculate tokens for models with specific supported resolutions."""
+        supported_resolutions = model_config['supported_resolutions']
+        max_tokens = model_config.get('max_image_tokens', 6400)
+        base_tokens = model_config.get('base_tokens', 256)
+
+        # Find the best matching resolution
+        best_resolution = None
+        min_scale_factor = float('inf')
+
+        for res_width, res_height in supported_resolutions:
+            scale_w = res_width / width
+            scale_h = res_height / height
+            scale_factor = max(scale_w, scale_h)  # Scale to fit
+
+            if scale_factor < min_scale_factor:
+                min_scale_factor = scale_factor
+                best_resolution = (res_width, res_height)
+
+        # Calculate tokens based on resolution tier
+        if best_resolution:
+            res_area = best_resolution[0] * best_resolution[1]
+            base_area = 560 * 560  # Base resolution area
+            tokens = int(base_tokens * (res_area / base_area))
+            tokens = min(tokens, max_tokens)
+        else:
+            tokens = base_tokens
+
+        return {
+            'tokens': tokens,
+            'method': 'resolution_tier',
+            'best_resolution': f"{best_resolution[0]}x{best_resolution[1]}" if best_resolution else None,
+            'scale_factor': min_scale_factor,
+            'max_tokens': max_tokens,
+            'supported_resolutions': supported_resolutions
+        }
+
+    def _calculate_openai_tokens_enhanced(self, width: int, height: int, model: str,
+                                          detail_level: str, model_caps: Dict[str, Any]) -> Dict[str, Any]:
+        """Enhanced OpenAI token calculation with research-based improvements."""
+        config = self.PROVIDER_CONFIGS['openai']
+
+        if detail_level == 'low':
+            return {
+                'tokens': config['detail_levels']['low'],
+                'method': 'openai_low_detail',
+                'detail_level': 'low'
+            }
+
+        # Step 1: Resize to fit 2048x2048 square (preserving aspect ratio)
+        max_dim = config['max_dimension']
+        if width > max_dim or height > max_dim:
+            scale = min(max_dim / width, max_dim / height)
+            width = int(width * scale)
+            height = int(height * scale)
+
+        # Step 2: Resize so shortest side is 768px (from research)
+        short_side_target = config['short_side_target']
+        if min(width, height) != short_side_target:
+            scale = short_side_target / min(width, height)
+            width = int(width * scale)
+            height = int(height * scale)
+
+        # Step 3: Calculate tiles
+        tile_size = config['tile_size']
+        tiles_width = math.ceil(width / tile_size)
+        tiles_height = math.ceil(height / tile_size)
+        total_tiles = tiles_width * tiles_height
+
+        # Step 4: Apply formula
+        base_tokens = config['base_tokens']
+        tile_tokens = config['tokens_per_tile']
+        total_tokens = base_tokens + (total_tiles * tile_tokens)
+
+        return {
+            'tokens': total_tokens,
+            'method': 'openai_tile_based',
+            'detail_level': detail_level,
+            'resized_to': f"{width}x{height}",
+            'tiles': f"{tiles_width}x{tiles_height}",
+            'total_tiles': total_tiles,
+            'base_tokens': base_tokens,
+            'tile_tokens': tile_tokens
+        }
+
+    def _calculate_anthropic_tokens_enhanced(self, width: int, height: int, model: str,
+                                             model_caps: Dict[str, Any]) -> Dict[str, Any]:
+        """Enhanced Anthropic token calculation based on research formula."""
+        config = self.PROVIDER_CONFIGS['anthropic']
+
+        # Resize if exceeds limits
+        max_dim = config['max_dimension']
+        original_size = f"{width}x{height}"
+
+        if width > max_dim or height > max_dim:
+            scale = min(max_dim / width, max_dim / height)
+            width = int(width * scale)
+            height = int(height * scale)
+
+        # Apply Anthropic's formula: (width * height) / 750
+        pixel_area = width * height
+        calculated_tokens = pixel_area / config['pixel_divisor']
+
+        # Apply token cap
+        tokens = min(int(calculated_tokens), config['token_cap'])
+
+        # Check for small image warning
+        min_dim_warning = None
+        if min(width, height) < config['min_dimension_warning']:
+            min_dim_warning = f"Image dimension below {config['min_dimension_warning']}px may degrade performance"
+
+        return {
+            'tokens': tokens,
+            'method': 'anthropic_pixel_area',
+            'formula': f"({width} * {height}) / {config['pixel_divisor']}",
+            'calculated_tokens': calculated_tokens,
+            'token_cap': config['token_cap'],
+            'resized_from': original_size if f"{width}x{height}" != original_size else None,
+            'warning': min_dim_warning
+        }
+
+    def _calculate_google_tokens(self, width: int, height: int, model: str,
+                                 model_caps: Dict[str, Any]) -> Dict[str, Any]:
+        """Calculate tokens for Google Gemini models using hybrid approach."""
+        config = self.PROVIDER_CONFIGS['google']
+
+        # Check if it's a small image
+        threshold = config['small_image_threshold']
+        if width <= threshold and height <= threshold:
+            return {
+                'tokens': config['small_image_tokens'],
+                'method': 'google_small_image',
+                'threshold': f"{threshold}x{threshold}",
+                'classification': 'small'
+            }
+
+        # Large image: calculate tiles
+        tile_size = config['tile_size']
+        tiles_width = math.ceil(width / tile_size)
+        tiles_height = math.ceil(height / tile_size)
+        total_tiles = tiles_width * tiles_height
+
+        tokens = total_tiles * config['tokens_per_tile']
+
+        return {
+            'tokens': tokens,
+            'method': 'google_tiled',
+            'classification': 'large',
+            'tile_size': f"{tile_size}x{tile_size}",
+            'tiles': f"{tiles_width}x{tiles_height}",
+            'total_tiles': total_tiles,
+            'tokens_per_tile': config['tokens_per_tile']
+        }
+
+    def _calculate_local_tokens_enhanced(self, width: int, height: int, provider: str,
+                                         model: str, model_caps: Dict[str, Any]) -> Dict[str, Any]:
+        """Enhanced local model token calculation with model capabilities integration."""
+        config = self.PROVIDER_CONFIGS[provider]
+        base_tokens = config['base_tokens']
+
+        # Get model-specific information from capabilities
+        vision_support = model_caps.get('vision_support', False)
+        max_image_tokens = model_caps.get('max_image_tokens', base_tokens)
+        image_patch_size = model_caps.get('image_patch_size', 16)
+
+        # Use patch-based calculation if patch size is available
+        if image_patch_size and vision_support:
+            patches_width = math.ceil(width / image_patch_size)
+            patches_height = math.ceil(height / image_patch_size)
+            total_patches = patches_width * patches_height
+
+            # Scale by efficiency factor
+            tokens = int(total_patches * config['scaling_factor'])
+            tokens = min(tokens, max_image_tokens)
+
+            return {
+                'tokens': tokens,
+                'method': f'{provider}_patch_based',
+                'patch_size': image_patch_size,
+                'patches': f"{patches_width}x{patches_height}",
+                'scaling_factor': config['scaling_factor'],
+                'max_tokens': max_image_tokens,
+                'vision_support': vision_support
+            }
+
+        # Fallback to area-based calculation
+        standard_pixels = 512 * 512
+        actual_pixels = width * height
+        scaling_factor = math.sqrt(actual_pixels / standard_pixels)
+
+        tokens = int(base_tokens * scaling_factor * config['scaling_factor'])
+
+        return {
+            'tokens': tokens,
+            'method': f'{provider}_area_based',
+            'base_tokens': base_tokens,
+            'scaling_factor': config['scaling_factor'],
+            'pixel_scaling': scaling_factor
+        }
+
+    def _calculate_research_based_fallback(self, width: int, height: int,
+                                           model_caps: Dict[str, Any]) -> Dict[str, Any]:
+        """Research-based fallback calculation for unknown models."""
+        # Use Vision Transformer patch-based approach as fallback
+        patch_size = model_caps.get('image_patch_size', 16)  # Default ViT patch size
+
+        patches_width = math.ceil(width / patch_size)
+        patches_height = math.ceil(height / patch_size)
+        total_patches = patches_width * patches_height
+
+        # Conservative token estimate
+        tokens = min(total_patches, 2048)  # Cap at reasonable limit
+
+        return {
+            'tokens': tokens,
+            'method': 'research_based_fallback',
+            'patch_size': patch_size,
+            'patches': f"{patches_width}x{patches_height}",
+            'note': 'Using Vision Transformer patch-based estimation'
+        }
+
+    def calculate_tokens_for_images(self,
+                                    image_paths: List[Path],
+                                    provider: str = 'openai',
+                                    model: str = '',
+                                    detail_level: str = 'auto') -> Dict[str, Any]:
+        """Calculate tokens for multiple images with detailed breakdown."""
+        results = {
+            'total_tokens': 0,
+            'image_count': len(image_paths),
+            'per_image_results': [],
+            'average_tokens_per_image': 0,
+            'provider': provider,
+            'model': model,
+            'calculation_summary': {}
+        }
+
+        method_counts = {}
+
+        for i, image_path in enumerate(image_paths):
+            try:
+                result = self.calculate_tokens_for_image(
+                    image_path=image_path,
+                    provider=provider,
+                    model=model,
+                    detail_level=detail_level
+                )
+
+                results['per_image_results'].append(result)
+                results['total_tokens'] += result['tokens']
+
+                # Track calculation methods
+                method = result.get('method', 'unknown')
+                method_counts[method] = method_counts.get(method, 0) + 1
+
+            except Exception as e:
+                self.logger.error(f"Failed to calculate tokens for {image_path}: {e}")
+                fallback_result = {
+                    'tokens': 512,  # Conservative fallback
+                    'method': 'error_fallback',
+                    'error': str(e),
+                    'image_path': str(image_path)
+                }
+                results['per_image_results'].append(fallback_result)
+                results['total_tokens'] += 512
+
+        if results['image_count'] > 0:
+            results['average_tokens_per_image'] = results['total_tokens'] / results['image_count']
+
+        results['calculation_summary'] = {
+            'methods_used': method_counts,
+            'primary_method': max(method_counts.items(), key=lambda x: x[1])[0] if method_counts else 'none'
+        }
+
+        return results
+
+    def get_compression_ratio(self,
+                              original_text_tokens: int,
+                              image_paths: List[Path],
+                              provider: str = 'openai',
+                              model: str = '') -> Dict[str, Any]:
+        """Calculate accurate compression ratio with enhanced analysis."""
+        image_analysis = self.calculate_tokens_for_images(
+            image_paths=image_paths,
+            provider=provider,
+            model=model
+        )
+
+        compressed_tokens = image_analysis['total_tokens']
+        compression_ratio = original_text_tokens / compressed_tokens if compressed_tokens > 0 else 0
+
+        return {
+            'original_tokens': original_text_tokens,
+            'compressed_tokens': compressed_tokens,
+            'compression_ratio': compression_ratio,
+            'images_created': len(image_paths),
+            'average_tokens_per_image': image_analysis['average_tokens_per_image'],
+            'provider': provider,
+            'model': model,
+            'calculation_methods': image_analysis['calculation_summary'],
+            'per_image_breakdown': image_analysis['per_image_results'],
+            'token_savings': original_text_tokens - compressed_tokens,
+            'efficiency_analysis': self._analyze_efficiency(compression_ratio, provider, model)
+        }
+
+    def _analyze_efficiency(self, ratio: float, provider: str, model: str) -> Dict[str, Any]:
+        """Analyze compression efficiency and provide insights."""
+        if ratio > 10:
+            efficiency = "excellent"
+            insight = "Exceptional compression achieved, ideal for long-context processing"
+        elif ratio > 4:
+            efficiency = "very_good"
+            insight = "Strong compression ratio, significant token savings"
+        elif ratio > 2:
+            efficiency = "good"
+            insight = "Moderate compression, suitable for most use cases"
+        elif ratio > 1:
+            efficiency = "marginal"
+            insight = "Limited compression benefit, consider alternative approaches"
+        else:
+            efficiency = "poor"
+            insight = "No compression benefit, text processing may be more efficient"
+
+        return {
+            'efficiency_rating': efficiency,
+            'insight': insight,
+            'compression_ratio': ratio,
+            'recommended_use': ratio > 1.5
+        }
+
+    def _get_timestamp(self) -> str:
+        """Get current timestamp for calculation metadata."""
+        from datetime import datetime
+        return datetime.now().isoformat()
+
+
+# Convenience functions for backward compatibility
+def calculate_image_tokens(image_path: Path, provider: str = 'openai', model: str = '') -> int:
+    """Calculate tokens for a single image."""
+    calculator = VLMTokenCalculator()
+    result = calculator.calculate_tokens_for_image(image_path=image_path, provider=provider, model=model)
+    return result['tokens']


+def calculate_glyph_compression_ratio(original_tokens: int,
+                                      image_paths: List[Path],
+                                      provider: str = 'openai',
+                                      model: str = '') -> Dict[str, Any]:
+    """Calculate accurate Glyph compression ratio."""
+    calculator = VLMTokenCalculator()
+    return calculator.get_compression_ratio(original_tokens, image_paths, provider, model)