abstractcore-2.5.2-py3-none-any.whl → abstractcore-2.5.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. abstractcore/__init__.py +12 -0
  2. abstractcore/architectures/detection.py +250 -4
  3. abstractcore/assets/architecture_formats.json +14 -1
  4. abstractcore/assets/model_capabilities.json +533 -10
  5. abstractcore/compression/__init__.py +29 -0
  6. abstractcore/compression/analytics.py +420 -0
  7. abstractcore/compression/cache.py +250 -0
  8. abstractcore/compression/config.py +279 -0
  9. abstractcore/compression/exceptions.py +30 -0
  10. abstractcore/compression/glyph_processor.py +381 -0
  11. abstractcore/compression/optimizer.py +388 -0
  12. abstractcore/compression/orchestrator.py +380 -0
  13. abstractcore/compression/pil_text_renderer.py +818 -0
  14. abstractcore/compression/quality.py +226 -0
  15. abstractcore/compression/text_formatter.py +666 -0
  16. abstractcore/compression/vision_compressor.py +371 -0
  17. abstractcore/config/main.py +64 -0
  18. abstractcore/config/manager.py +100 -5
  19. abstractcore/core/session.py +61 -6
  20. abstractcore/events/__init__.py +1 -1
  21. abstractcore/media/auto_handler.py +312 -18
  22. abstractcore/media/handlers/local_handler.py +14 -2
  23. abstractcore/media/handlers/openai_handler.py +62 -3
  24. abstractcore/media/processors/__init__.py +11 -1
  25. abstractcore/media/processors/direct_pdf_processor.py +210 -0
  26. abstractcore/media/processors/glyph_pdf_processor.py +227 -0
  27. abstractcore/media/processors/image_processor.py +7 -1
  28. abstractcore/media/processors/text_processor.py +18 -3
  29. abstractcore/media/types.py +164 -7
  30. abstractcore/providers/__init__.py +18 -0
  31. abstractcore/providers/anthropic_provider.py +28 -2
  32. abstractcore/providers/base.py +278 -6
  33. abstractcore/providers/huggingface_provider.py +563 -23
  34. abstractcore/providers/lmstudio_provider.py +38 -2
  35. abstractcore/providers/mlx_provider.py +27 -2
  36. abstractcore/providers/model_capabilities.py +352 -0
  37. abstractcore/providers/ollama_provider.py +38 -4
  38. abstractcore/providers/openai_provider.py +28 -2
  39. abstractcore/providers/registry.py +85 -13
  40. abstractcore/server/app.py +91 -81
  41. abstractcore/utils/__init__.py +4 -1
  42. abstractcore/utils/trace_export.py +287 -0
  43. abstractcore/utils/version.py +1 -1
  44. abstractcore/utils/vlm_token_calculator.py +655 -0
  45. {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/METADATA +107 -6
  46. {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/RECORD +50 -33
  47. {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/WHEEL +0 -0
  48. {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/entry_points.txt +0 -0
  49. {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/licenses/LICENSE +0 -0
  50. {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/top_level.txt +0 -0
abstractcore/utils/vlm_token_calculator.py (new file)
@@ -0,0 +1,655 @@
+ """
+ VLM Token Calculator - Research-Based Accurate Token Estimation
+
+ This module provides state-of-the-art token calculations for images processed by different VLM providers,
+ integrating with AbstractCore's detection system and model capabilities database.
+
+ Key Research Insights Integrated:
+ - OpenAI GPT-4V: 85 base + 170 tokens per 512x512 tile (with resizing logic)
+ - Anthropic Claude: ~(width * height) / 750 with 1600 token cap
+ - Google Gemini: 258 tokens for small images, 258 per 768x768 tile for large
+ - Qwen-VL models: Variable patch sizes (14px, 16px) with adaptive resolution
+ - LLaMA Vision: 14px patches with specific resolution tiers
+ - Local models: Architecture-specific optimizations
+
+ References:
+ - Image Tokenization for Visual Models research
+ - Glyph Visual Text Compression framework
+ - OpenAI, Anthropic, Google official documentation
+ - Recent VLM architecture papers (2024-2025)
+ """
+
+ import math
+ from typing import Tuple, Dict, Any, Optional, List
+ from pathlib import Path
+ from PIL import Image
+ import logging
+
+ from ..utils.structured_logging import get_logger
+ from ..architectures.detection import get_model_capabilities, detect_architecture
+
+ logger = get_logger(__name__)
+
+
+ class VLMTokenCalculator:
+     """
+     Research-based VLM token calculator that integrates with AbstractCore's
+     model detection and capabilities system for maximum accuracy.
+     """
+
+     # Provider-specific base configurations (enhanced with research data)
+     PROVIDER_CONFIGS = {
+         'openai': {
+             'base_tokens': 85,
+             'tokens_per_tile': 170,
+             'tile_size': 512,
+             'max_dimension': 2048,
+             'short_side_target': 768,  # From research: resized to 768px short side
+             'detail_levels': {
+                 'low': 85,
+                 'high': 'calculated',
+                 'auto': 'calculated'
+             }
+         },
+         'anthropic': {
+             'base_formula': 'pixel_area',  # (width * height) / 750
+             'pixel_divisor': 750,
+             'max_dimension': 1568,
+             'token_cap': 1600,
+             'min_dimension_warning': 200
+         },
+         'google': {
+             'small_image_threshold': 384,  # Both dimensions <= 384
+             'small_image_tokens': 258,
+             'tile_size': 768,
+             'tokens_per_tile': 258
+         },
+         'ollama': {
+             'base_tokens': 256,
+             'scaling_factor': 0.5,
+             'max_dimension': 1024
+         },
+         'lmstudio': {
+             'base_tokens': 512,
+             'scaling_factor': 0.7,
+             'max_dimension': 1024
+         }
+     }
+
+     # Model-specific overrides based on research and model capabilities
+     MODEL_SPECIFIC_CONFIGS = {
+         # Qwen VL models (from research: 28x28 pixel patches for 2.5, 32x32 for 3.0)
+         'qwen2.5-vl': {
+             'patch_size': 14,
+             'pixel_grouping': '28x28',
+             'max_image_tokens': 16384,
+             'adaptive_resolution': True,
+             'resolution_range': (56, 3584)
+         },
+         'qwen3-vl': {
+             'patch_size': 16,
+             'pixel_grouping': '32x32',
+             'max_image_tokens': 24576,
+             'adaptive_resolution': True,
+             'resolution_range': (64, 4096)
+         },
+         # LLaMA Vision models (from research: 14px patches, specific resolutions)
+         'llama3.2-vision': {
+             'patch_size': 14,
+             'supported_resolutions': [(560, 560), (1120, 560), (560, 1120), (1120, 1120)],
+             'max_image_tokens': 6400,
+             'base_tokens': 256
+         },
+         # Gemma Vision models (from research: SigLIP encoder, 896x896 fixed)
+         'gemma3': {
+             'fixed_resolution': (896, 896),
+             'vision_encoder': 'SigLIP-400M',
+             'tokens_per_image': 256,
+             'adaptive_windowing': True
+         },
+         # GLM models (Glyph research base model)
+         'glm-4': {
+             'optimized_for_glyph': True,
+             'text_image_processing': True,
+             'base_tokens': 512
+         }
+     }
+
+     def __init__(self):
+         """Initialize the VLM token calculator."""
+         self.logger = get_logger(self.__class__.__name__)
+
+     def calculate_tokens_for_image(self,
+                                    image_path: Optional[Path] = None,
+                                    width: Optional[int] = None,
+                                    height: Optional[int] = None,
+                                    provider: str = 'openai',
+                                    model: str = '',
+                                    detail_level: str = 'auto') -> Dict[str, Any]:
+         """
+         Calculate accurate token count using model capabilities and research-based formulas.
+
+         Args:
+             image_path: Path to image file
+             width: Image width in pixels
+             height: Image height in pixels
+             provider: VLM provider
+             model: Specific model name
+             detail_level: Detail level for applicable models
+
+         Returns:
+             Dictionary with token count and calculation details
+         """
+         # Get image dimensions
+         if image_path and image_path.exists():
+             try:
+                 with Image.open(image_path) as img:
+                     width, height = img.size
+                     self.logger.debug(f"Loaded image dimensions: {width}x{height} from {image_path}")
+             except Exception as e:
+                 self.logger.warning(f"Failed to load image {image_path}: {e}")
+                 if width is None or height is None:
+                     raise ValueError(f"Cannot determine image dimensions: {e}")
+
+         if width is None or height is None:
+             raise ValueError("Must provide either image_path or both width and height")
+
+         # Get model capabilities from AbstractCore's detection system
+         model_caps = get_model_capabilities(model) if model else {}
+         architecture = detect_architecture(model) if model else provider.lower()
+
+         # Determine calculation method based on model capabilities and research
+         calculation_result = self._calculate_with_model_awareness(
+             width, height, provider, model, architecture, model_caps, detail_level
+         )
+
+         # Add metadata about the calculation
+         calculation_result.update({
+             'image_dimensions': f"{width}x{height}",
+             'total_pixels': width * height,
+             'provider': provider,
+             'model': model,
+             'architecture': architecture,
+             'calculation_timestamp': self._get_timestamp()
+         })
+
+         return calculation_result
+
+     def _calculate_with_model_awareness(self, width: int, height: int, provider: str,
+                                         model: str, architecture: str, model_caps: Dict[str, Any],
+                                         detail_level: str) -> Dict[str, Any]:
+         """Calculate tokens using model-specific knowledge and research insights."""
+
+         # Check for model-specific configurations first
+         model_config = self._get_model_specific_config(model, architecture)
+         if model_config:
+             return self._calculate_with_model_config(width, height, model_config, model_caps)
+
+         # Fall back to provider-specific calculations
+         provider_lower = provider.lower()
+
+         if provider_lower == 'openai':
+             return self._calculate_openai_tokens_enhanced(width, height, model, detail_level, model_caps)
+         elif provider_lower == 'anthropic':
+             return self._calculate_anthropic_tokens_enhanced(width, height, model, model_caps)
+         elif provider_lower == 'google':
+             return self._calculate_google_tokens(width, height, model, model_caps)
+         elif provider_lower in ['ollama', 'lmstudio']:
+             return self._calculate_local_tokens_enhanced(width, height, provider_lower, model, model_caps)
+         else:
+             self.logger.warning(f"Unknown provider '{provider}', using research-based estimation")
+             return self._calculate_research_based_fallback(width, height, model_caps)
+
+     def _get_model_specific_config(self, model: str, architecture: str) -> Optional[Dict[str, Any]]:
+         """Get model-specific configuration based on research data."""
+         model_lower = model.lower()
+
+         # Check exact model matches first
+         for model_key, config in self.MODEL_SPECIFIC_CONFIGS.items():
+             if model_key in model_lower:
+                 return config
+
+         # Check architecture-based matches
+         if 'qwen' in model_lower and 'vl' in model_lower:
+             if '2.5' in model_lower:
+                 return self.MODEL_SPECIFIC_CONFIGS.get('qwen2.5-vl')
+             elif '3' in model_lower:
+                 return self.MODEL_SPECIFIC_CONFIGS.get('qwen3-vl')
+
+         if 'llama' in model_lower and 'vision' in model_lower:
+             return self.MODEL_SPECIFIC_CONFIGS.get('llama3.2-vision')
+
+         if 'gemma' in model_lower and ('vision' in model_lower or '3' in model_lower):
+             return self.MODEL_SPECIFIC_CONFIGS.get('gemma3')
+
+         if 'glm' in model_lower:
+             return self.MODEL_SPECIFIC_CONFIGS.get('glm-4')
+
+         return None
+
+     def _calculate_with_model_config(self, width: int, height: int,
+                                      model_config: Dict[str, Any],
+                                      model_caps: Dict[str, Any]) -> Dict[str, Any]:
+         """Calculate tokens using model-specific configuration."""
+
+         # Handle fixed resolution models (like Gemma3)
+         if 'fixed_resolution' in model_config:
+             target_width, target_height = model_config['fixed_resolution']
+             tokens = model_config.get('tokens_per_image', 256)
+
+             return {
+                 'tokens': tokens,
+                 'method': 'fixed_resolution',
+                 'target_resolution': f"{target_width}x{target_height}",
+                 'vision_encoder': model_config.get('vision_encoder', 'unknown'),
+                 'adaptive_windowing': model_config.get('adaptive_windowing', False)
+             }
+
+         # Handle patch-based models (Qwen-VL, LLaMA Vision)
+         if 'patch_size' in model_config:
+             return self._calculate_patch_based_tokens(width, height, model_config, model_caps)
+
+         # Handle supported resolution models (LLaMA Vision)
+         if 'supported_resolutions' in model_config:
+             return self._calculate_resolution_tier_tokens(width, height, model_config, model_caps)
+
+         # Fallback to base tokens
+         tokens = model_config.get('base_tokens', 512)
+         return {
+             'tokens': tokens,
+             'method': 'model_specific_base',
+             'config_used': model_config
+         }
+
+     def _calculate_patch_based_tokens(self, width: int, height: int,
+                                       model_config: Dict[str, Any],
+                                       model_caps: Dict[str, Any]) -> Dict[str, Any]:
+         """Calculate tokens for patch-based models like Qwen-VL."""
+         patch_size = model_config['patch_size']
+         max_tokens = model_config.get('max_image_tokens', 16384)
+
+         # Handle adaptive resolution
+         if model_config.get('adaptive_resolution'):
+             min_res, max_res = model_config.get('resolution_range', (56, 3584))
+             # Resize if outside supported range
+             if max(width, height) > max_res:
+                 scale = max_res / max(width, height)
+                 width = int(width * scale)
+                 height = int(height * scale)
+             elif min(width, height) < min_res:
+                 scale = min_res / min(width, height)
+                 width = int(width * scale)
+                 height = int(height * scale)
+
+         # Calculate patches
+         patches_width = math.ceil(width / patch_size)
+         patches_height = math.ceil(height / patch_size)
+         total_patches = patches_width * patches_height
+
+         # Apply token limit
+         tokens = min(total_patches, max_tokens)
+
+         return {
+             'tokens': tokens,
+             'method': 'patch_based',
+             'patch_size': patch_size,
+             'patches': f"{patches_width}x{patches_height}",
+             'total_patches': total_patches,
+             'max_tokens': max_tokens,
+             'pixel_grouping': model_config.get('pixel_grouping', f"{patch_size}x{patch_size}"),
+             'resized_to': f"{width}x{height}" if model_config.get('adaptive_resolution') else None
+         }
+
+     def _calculate_resolution_tier_tokens(self, width: int, height: int,
+                                           model_config: Dict[str, Any],
+                                           model_caps: Dict[str, Any]) -> Dict[str, Any]:
+         """Calculate tokens for models with specific supported resolutions."""
+         supported_resolutions = model_config['supported_resolutions']
+         max_tokens = model_config.get('max_image_tokens', 6400)
+         base_tokens = model_config.get('base_tokens', 256)
+
+         # Find the best matching resolution
+         best_resolution = None
+         min_scale_factor = float('inf')
+
+         for res_width, res_height in supported_resolutions:
+             scale_w = res_width / width
+             scale_h = res_height / height
+             scale_factor = max(scale_w, scale_h)  # Scale to fit
+
+             if scale_factor < min_scale_factor:
+                 min_scale_factor = scale_factor
+                 best_resolution = (res_width, res_height)
+
+         # Calculate tokens based on resolution tier
+         if best_resolution:
+             res_area = best_resolution[0] * best_resolution[1]
+             base_area = 560 * 560  # Base resolution area
+             tokens = int(base_tokens * (res_area / base_area))
+             tokens = min(tokens, max_tokens)
+         else:
+             tokens = base_tokens
+
+         return {
+             'tokens': tokens,
+             'method': 'resolution_tier',
+             'best_resolution': f"{best_resolution[0]}x{best_resolution[1]}" if best_resolution else None,
+             'scale_factor': min_scale_factor,
+             'max_tokens': max_tokens,
+             'supported_resolutions': supported_resolutions
+         }
+
+     def _calculate_openai_tokens_enhanced(self, width: int, height: int, model: str,
+                                           detail_level: str, model_caps: Dict[str, Any]) -> Dict[str, Any]:
+         """Enhanced OpenAI token calculation with research-based improvements."""
+         config = self.PROVIDER_CONFIGS['openai']
+
+         if detail_level == 'low':
+             return {
+                 'tokens': config['detail_levels']['low'],
+                 'method': 'openai_low_detail',
+                 'detail_level': 'low'
+             }
+
+         # Step 1: Resize to fit 2048x2048 square (preserving aspect ratio)
+         max_dim = config['max_dimension']
+         if width > max_dim or height > max_dim:
+             scale = min(max_dim / width, max_dim / height)
+             width = int(width * scale)
+             height = int(height * scale)
+
+         # Step 2: Resize so shortest side is 768px (from research)
+         short_side_target = config['short_side_target']
+         if min(width, height) != short_side_target:
+             scale = short_side_target / min(width, height)
+             width = int(width * scale)
+             height = int(height * scale)
+
+         # Step 3: Calculate tiles
+         tile_size = config['tile_size']
+         tiles_width = math.ceil(width / tile_size)
+         tiles_height = math.ceil(height / tile_size)
+         total_tiles = tiles_width * tiles_height
+
+         # Step 4: Apply formula
+         base_tokens = config['base_tokens']
+         tile_tokens = config['tokens_per_tile']
+         total_tokens = base_tokens + (total_tiles * tile_tokens)
+
+         return {
+             'tokens': total_tokens,
+             'method': 'openai_tile_based',
+             'detail_level': detail_level,
+             'resized_to': f"{width}x{height}",
+             'tiles': f"{tiles_width}x{tiles_height}",
+             'total_tiles': total_tiles,
+             'base_tokens': base_tokens,
+             'tile_tokens': tile_tokens
+         }
+
+     def _calculate_anthropic_tokens_enhanced(self, width: int, height: int, model: str,
+                                              model_caps: Dict[str, Any]) -> Dict[str, Any]:
+         """Enhanced Anthropic token calculation based on research formula."""
+         config = self.PROVIDER_CONFIGS['anthropic']
+
+         # Resize if exceeds limits
+         max_dim = config['max_dimension']
+         original_size = f"{width}x{height}"
+
+         if width > max_dim or height > max_dim:
+             scale = min(max_dim / width, max_dim / height)
+             width = int(width * scale)
+             height = int(height * scale)
+
+         # Apply Anthropic's formula: (width * height) / 750
+         pixel_area = width * height
+         calculated_tokens = pixel_area / config['pixel_divisor']
+
+         # Apply token cap
+         tokens = min(int(calculated_tokens), config['token_cap'])
+
+         # Check for small image warning
+         min_dim_warning = None
+         if min(width, height) < config['min_dimension_warning']:
+             min_dim_warning = f"Image dimension below {config['min_dimension_warning']}px may degrade performance"
+
+         return {
+             'tokens': tokens,
+             'method': 'anthropic_pixel_area',
+             'formula': f"({width} * {height}) / {config['pixel_divisor']}",
+             'calculated_tokens': calculated_tokens,
+             'token_cap': config['token_cap'],
+             'resized_from': original_size if f"{width}x{height}" != original_size else None,
+             'warning': min_dim_warning
+         }
+
+     def _calculate_google_tokens(self, width: int, height: int, model: str,
+                                  model_caps: Dict[str, Any]) -> Dict[str, Any]:
+         """Calculate tokens for Google Gemini models using hybrid approach."""
+         config = self.PROVIDER_CONFIGS['google']
+
+         # Check if it's a small image
+         threshold = config['small_image_threshold']
+         if width <= threshold and height <= threshold:
+             return {
+                 'tokens': config['small_image_tokens'],
+                 'method': 'google_small_image',
+                 'threshold': f"{threshold}x{threshold}",
+                 'classification': 'small'
+             }
+
+         # Large image: calculate tiles
+         tile_size = config['tile_size']
+         tiles_width = math.ceil(width / tile_size)
+         tiles_height = math.ceil(height / tile_size)
+         total_tiles = tiles_width * tiles_height
+
+         tokens = total_tiles * config['tokens_per_tile']
+
+         return {
+             'tokens': tokens,
+             'method': 'google_tiled',
+             'classification': 'large',
+             'tile_size': f"{tile_size}x{tile_size}",
+             'tiles': f"{tiles_width}x{tiles_height}",
+             'total_tiles': total_tiles,
+             'tokens_per_tile': config['tokens_per_tile']
+         }
+
+     def _calculate_local_tokens_enhanced(self, width: int, height: int, provider: str,
+                                          model: str, model_caps: Dict[str, Any]) -> Dict[str, Any]:
+         """Enhanced local model token calculation with model capabilities integration."""
+         config = self.PROVIDER_CONFIGS[provider]
+         base_tokens = config['base_tokens']
+
+         # Get model-specific information from capabilities
+         vision_support = model_caps.get('vision_support', False)
+         max_image_tokens = model_caps.get('max_image_tokens', base_tokens)
+         image_patch_size = model_caps.get('image_patch_size', 16)
+
+         # Use patch-based calculation if patch size is available
+         if image_patch_size and vision_support:
+             patches_width = math.ceil(width / image_patch_size)
+             patches_height = math.ceil(height / image_patch_size)
+             total_patches = patches_width * patches_height
+
+             # Scale by efficiency factor
+             tokens = int(total_patches * config['scaling_factor'])
+             tokens = min(tokens, max_image_tokens)
+
+             return {
+                 'tokens': tokens,
+                 'method': f'{provider}_patch_based',
+                 'patch_size': image_patch_size,
+                 'patches': f"{patches_width}x{patches_height}",
+                 'scaling_factor': config['scaling_factor'],
+                 'max_tokens': max_image_tokens,
+                 'vision_support': vision_support
+             }
+
+         # Fallback to area-based calculation
+         standard_pixels = 512 * 512
+         actual_pixels = width * height
+         scaling_factor = math.sqrt(actual_pixels / standard_pixels)
+
+         tokens = int(base_tokens * scaling_factor * config['scaling_factor'])
+
+         return {
+             'tokens': tokens,
+             'method': f'{provider}_area_based',
+             'base_tokens': base_tokens,
+             'scaling_factor': config['scaling_factor'],
+             'pixel_scaling': scaling_factor
+         }
+
+     def _calculate_research_based_fallback(self, width: int, height: int,
+                                            model_caps: Dict[str, Any]) -> Dict[str, Any]:
+         """Research-based fallback calculation for unknown models."""
+         # Use Vision Transformer patch-based approach as fallback
+         patch_size = model_caps.get('image_patch_size', 16)  # Default ViT patch size
+
+         patches_width = math.ceil(width / patch_size)
+         patches_height = math.ceil(height / patch_size)
+         total_patches = patches_width * patches_height
+
+         # Conservative token estimate
+         tokens = min(total_patches, 2048)  # Cap at reasonable limit
+
+         return {
+             'tokens': tokens,
+             'method': 'research_based_fallback',
+             'patch_size': patch_size,
+             'patches': f"{patches_width}x{patches_height}",
+             'note': 'Using Vision Transformer patch-based estimation'
+         }
+
+     def calculate_tokens_for_images(self,
+                                     image_paths: List[Path],
+                                     provider: str = 'openai',
+                                     model: str = '',
+                                     detail_level: str = 'auto') -> Dict[str, Any]:
+         """Calculate tokens for multiple images with detailed breakdown."""
+         results = {
+             'total_tokens': 0,
+             'image_count': len(image_paths),
+             'per_image_results': [],
+             'average_tokens_per_image': 0,
+             'provider': provider,
+             'model': model,
+             'calculation_summary': {}
+         }
+
+         method_counts = {}
+
+         for i, image_path in enumerate(image_paths):
+             try:
+                 result = self.calculate_tokens_for_image(
+                     image_path=image_path,
+                     provider=provider,
+                     model=model,
+                     detail_level=detail_level
+                 )
+
+                 results['per_image_results'].append(result)
+                 results['total_tokens'] += result['tokens']
+
+                 # Track calculation methods
+                 method = result.get('method', 'unknown')
+                 method_counts[method] = method_counts.get(method, 0) + 1
+
+             except Exception as e:
+                 self.logger.error(f"Failed to calculate tokens for {image_path}: {e}")
+                 fallback_result = {
+                     'tokens': 512,  # Conservative fallback
+                     'method': 'error_fallback',
+                     'error': str(e),
+                     'image_path': str(image_path)
+                 }
+                 results['per_image_results'].append(fallback_result)
+                 results['total_tokens'] += 512
+
+         if results['image_count'] > 0:
+             results['average_tokens_per_image'] = results['total_tokens'] / results['image_count']
+
+         results['calculation_summary'] = {
+             'methods_used': method_counts,
+             'primary_method': max(method_counts.items(), key=lambda x: x[1])[0] if method_counts else 'none'
+         }
+
+         return results
+
+     def get_compression_ratio(self,
+                               original_text_tokens: int,
+                               image_paths: List[Path],
+                               provider: str = 'openai',
+                               model: str = '') -> Dict[str, Any]:
+         """Calculate accurate compression ratio with enhanced analysis."""
+         image_analysis = self.calculate_tokens_for_images(
+             image_paths=image_paths,
+             provider=provider,
+             model=model
+         )
+
+         compressed_tokens = image_analysis['total_tokens']
+         compression_ratio = original_text_tokens / compressed_tokens if compressed_tokens > 0 else 0
+
+         return {
+             'original_tokens': original_text_tokens,
+             'compressed_tokens': compressed_tokens,
+             'compression_ratio': compression_ratio,
+             'images_created': len(image_paths),
+             'average_tokens_per_image': image_analysis['average_tokens_per_image'],
+             'provider': provider,
+             'model': model,
+             'calculation_methods': image_analysis['calculation_summary'],
+             'per_image_breakdown': image_analysis['per_image_results'],
+             'token_savings': original_text_tokens - compressed_tokens,
+             'efficiency_analysis': self._analyze_efficiency(compression_ratio, provider, model)
+         }
+
+     def _analyze_efficiency(self, ratio: float, provider: str, model: str) -> Dict[str, Any]:
+         """Analyze compression efficiency and provide insights."""
+         if ratio > 10:
+             efficiency = "excellent"
+             insight = "Exceptional compression achieved, ideal for long-context processing"
+         elif ratio > 4:
+             efficiency = "very_good"
+             insight = "Strong compression ratio, significant token savings"
+         elif ratio > 2:
+             efficiency = "good"
+             insight = "Moderate compression, suitable for most use cases"
+         elif ratio > 1:
+             efficiency = "marginal"
+             insight = "Limited compression benefit, consider alternative approaches"
+         else:
+             efficiency = "poor"
+             insight = "No compression benefit, text processing may be more efficient"
+
+         return {
+             'efficiency_rating': efficiency,
+             'insight': insight,
+             'compression_ratio': ratio,
+             'recommended_use': ratio > 1.5
+         }
+
+     def _get_timestamp(self) -> str:
+         """Get current timestamp for calculation metadata."""
+         from datetime import datetime
+         return datetime.now().isoformat()
+
+
+ # Convenience functions for backward compatibility
+ def calculate_image_tokens(image_path: Path, provider: str = 'openai', model: str = '') -> int:
+     """Calculate tokens for a single image."""
+     calculator = VLMTokenCalculator()
+     result = calculator.calculate_tokens_for_image(image_path=image_path, provider=provider, model=model)
+     return result['tokens']
+
+
+ def calculate_glyph_compression_ratio(original_tokens: int,
+                                       image_paths: List[Path],
+                                       provider: str = 'openai',
+                                       model: str = '') -> Dict[str, Any]:
+     """Calculate accurate Glyph compression ratio."""
+     calculator = VLMTokenCalculator()
+     return calculator.get_compression_ratio(original_tokens, image_paths, provider, model)
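For context, here is a minimal usage sketch of the newly added module, based only on the signatures visible in this diff. The import path is inferred from the file's location in the wheel, the model names and image path are illustrative assumptions, and the commented arithmetic follows the formulas in PROVIDER_CONFIGS.

from pathlib import Path

# Import path inferred from abstractcore/utils/vlm_token_calculator.py.
from abstractcore.utils.vlm_token_calculator import (
    VLMTokenCalculator,
    calculate_image_tokens,
    calculate_glyph_compression_ratio,
)

calc = VLMTokenCalculator()

# Passing explicit dimensions avoids touching the filesystem.
# OpenAI path: 1024x1536 -> short side scaled to 768 -> 768x1152 -> 2x3 tiles
# -> 85 + 6 * 170 = 1105 tokens.
openai_est = calc.calculate_tokens_for_image(width=1024, height=1536,
                                             provider="openai", detail_level="high")

# Anthropic path: 1024 * 1536 / 750 = 2097 -> capped at 1600 tokens.
anthropic_est = calc.calculate_tokens_for_image(width=1024, height=1536,
                                                provider="anthropic")
print(openai_est["tokens"], anthropic_est["tokens"])

# The convenience helpers take an on-disk image; this path and model name are hypothetical.
page = Path("rendered_page_001.png")
if page.exists():
    print(calculate_image_tokens(page, provider="openai", model="gpt-4o"))
    report = calculate_glyph_compression_ratio(
        original_tokens=12_000, image_paths=[page], provider="openai", model="gpt-4o"
    )
    print(report["compression_ratio"], report["efficiency_analysis"]["efficiency_rating"])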