abstractcore 2.5.2__py3-none-any.whl → 2.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. abstractcore/__init__.py +19 -1
  2. abstractcore/architectures/detection.py +252 -6
  3. abstractcore/assets/architecture_formats.json +14 -1
  4. abstractcore/assets/model_capabilities.json +533 -10
  5. abstractcore/compression/__init__.py +29 -0
  6. abstractcore/compression/analytics.py +420 -0
  7. abstractcore/compression/cache.py +250 -0
  8. abstractcore/compression/config.py +279 -0
  9. abstractcore/compression/exceptions.py +30 -0
  10. abstractcore/compression/glyph_processor.py +381 -0
  11. abstractcore/compression/optimizer.py +388 -0
  12. abstractcore/compression/orchestrator.py +380 -0
  13. abstractcore/compression/pil_text_renderer.py +818 -0
  14. abstractcore/compression/quality.py +226 -0
  15. abstractcore/compression/text_formatter.py +666 -0
  16. abstractcore/compression/vision_compressor.py +371 -0
  17. abstractcore/config/main.py +64 -0
  18. abstractcore/config/manager.py +100 -5
  19. abstractcore/core/retry.py +2 -2
  20. abstractcore/core/session.py +193 -7
  21. abstractcore/download.py +253 -0
  22. abstractcore/embeddings/manager.py +2 -2
  23. abstractcore/events/__init__.py +113 -2
  24. abstractcore/exceptions/__init__.py +49 -2
  25. abstractcore/media/auto_handler.py +312 -18
  26. abstractcore/media/handlers/local_handler.py +14 -2
  27. abstractcore/media/handlers/openai_handler.py +62 -3
  28. abstractcore/media/processors/__init__.py +11 -1
  29. abstractcore/media/processors/direct_pdf_processor.py +210 -0
  30. abstractcore/media/processors/glyph_pdf_processor.py +227 -0
  31. abstractcore/media/processors/image_processor.py +7 -1
  32. abstractcore/media/processors/office_processor.py +2 -2
  33. abstractcore/media/processors/text_processor.py +18 -3
  34. abstractcore/media/types.py +164 -7
  35. abstractcore/media/utils/image_scaler.py +2 -2
  36. abstractcore/media/vision_fallback.py +2 -2
  37. abstractcore/providers/__init__.py +18 -0
  38. abstractcore/providers/anthropic_provider.py +228 -8
  39. abstractcore/providers/base.py +378 -11
  40. abstractcore/providers/huggingface_provider.py +563 -23
  41. abstractcore/providers/lmstudio_provider.py +284 -4
  42. abstractcore/providers/mlx_provider.py +27 -2
  43. abstractcore/providers/model_capabilities.py +352 -0
  44. abstractcore/providers/ollama_provider.py +282 -6
  45. abstractcore/providers/openai_provider.py +286 -8
  46. abstractcore/providers/registry.py +85 -13
  47. abstractcore/providers/streaming.py +2 -2
  48. abstractcore/server/app.py +91 -81
  49. abstractcore/tools/common_tools.py +2 -2
  50. abstractcore/tools/handler.py +2 -2
  51. abstractcore/tools/parser.py +2 -2
  52. abstractcore/tools/registry.py +2 -2
  53. abstractcore/tools/syntax_rewriter.py +2 -2
  54. abstractcore/tools/tag_rewriter.py +3 -3
  55. abstractcore/utils/__init__.py +4 -1
  56. abstractcore/utils/self_fixes.py +2 -2
  57. abstractcore/utils/trace_export.py +287 -0
  58. abstractcore/utils/version.py +1 -1
  59. abstractcore/utils/vlm_token_calculator.py +655 -0
  60. {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/METADATA +207 -8
  61. abstractcore-2.6.0.dist-info/RECORD +108 -0
  62. abstractcore-2.5.2.dist-info/RECORD +0 -90
  63. {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/WHEEL +0 -0
  64. {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/entry_points.txt +0 -0
  65. {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/licenses/LICENSE +0 -0
  66. {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,279 @@
1
+ """
2
+ Glyph compression configuration classes.
3
+ """
4
+
5
+ import os
6
+ from dataclasses import dataclass, field
7
+ from typing import Dict, Optional, Any
8
+ from pathlib import Path
9
+
10
+
11
@dataclass
class RenderingConfig:
    """Configuration for rendering text to images.

    All fields are plain scalars (str / int / bool / None), so the
    dataclass helpers used by ``copy()`` and ``to_dict()`` produce
    independent, JSON-serializable results.
    """

    # Font configuration (EXTREME density optimization)
    font_path: str = "Verdana.ttf"
    font_name: Optional[str] = "OCRB"  # Default to OCRB for optimal readability
    font_size: int = 7  # Minimum readable font size
    line_height: int = 8  # Ultra-tight line spacing

    # Layout configuration (VLM-optimized defaults)
    dpi: int = 72  # 72 for higher compression, 96 for better quality
    # Target image size in pixels. The default here is None ("not set").
    # NOTE(review): the original comments mention 1024x768 as the VLM
    # default; that fallback is not applied in this class - confirm where
    # (if anywhere) the renderer substitutes it.
    target_width: Optional[int] = None
    target_height: Optional[int] = None
    margin_x: int = 10  # Horizontal margin
    margin_y: int = 10  # Vertical margin
    page_width: int = 595  # A4 width in points (used when target dimensions not set)
    page_height: int = 842  # A4 height in points (used when target dimensions not set)

    # Optimization settings
    auto_crop_width: bool = True
    auto_crop_last_page: bool = True
    newline_markup: str = '<font color="#FF0000"> \\n </font>'

    # Multi-column layout (optimized for readability)
    columns: int = 2  # 2-column layout for optimal balance
    column_gap: int = 10  # Optimal gap between columns

    # Text formatting options
    render_format: bool = True  # Enable markdown-like formatting

    def copy(self) -> 'RenderingConfig':
        """Return a new instance carrying the same field values.

        Uses ``dataclasses.replace`` so every declared field is copied
        automatically; a hand-written field list would silently drop any
        field added later.
        """
        from dataclasses import replace

        return replace(self)

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a ``{field_name: value}`` dictionary.

        Keys appear in field-declaration order. Derived from the dataclass
        definition so that new fields are always included (this dictionary
        is also hashed to build cache keys).
        """
        from dataclasses import asdict

        return asdict(self)
85
+
86
+
87
@dataclass
class GlyphConfig:
    """Main configuration for Glyph compression.

    Holds the global switches, cache settings, the base ``RenderingConfig``
    and per-provider override profiles. Use ``default()`` to get an instance
    pre-populated with provider profiles and app defaults, or
    ``from_abstractcore_config()`` to layer user settings on top of those.
    """

    # Core settings
    enabled: bool = True
    global_default: str = "auto"  # auto, always, never
    quality_threshold: float = 0.95
    min_token_threshold: int = 10000  # Minimum tokens to consider compression
    target_compression_ratio: float = 3.0

    # Cache settings
    cache_directory: str = field(default_factory=lambda: str(Path.home() / ".abstractcore" / "glyph_cache"))
    cache_size_gb: float = 1.0
    cache_ttl_days: int = 7

    # Provider optimization
    provider_optimization: bool = True
    preferred_provider: str = "openai/gpt-4o"

    # Rendering configuration
    rendering: RenderingConfig = field(default_factory=RenderingConfig)

    # App-specific defaults
    app_defaults: Dict[str, str] = field(default_factory=dict)

    # Provider-specific profiles
    provider_profiles: Dict[str, Dict[str, Any]] = field(default_factory=dict)

    # Processing settings
    temp_dir: Optional[str] = None
    max_concurrent_compressions: int = 2
    processing_timeout: int = 300  # 5 minutes

    @classmethod
    def default(cls) -> 'GlyphConfig':
        """Create a configuration with built-in provider profiles and app defaults."""
        config = cls()

        # Set up default provider profiles based on Glyph research
        config.provider_profiles = {
            "openai": {
                "dpi": 72,
                "font_size": 9,
                "quality_threshold": 0.93,
                "newline_markup": '<font color="#FF0000"> \\n </font>'
            },
            "anthropic": {
                "dpi": 96,
                "font_size": 10,
                "quality_threshold": 0.96,
                "font_path": "Verdana.ttf"
            },
            "ollama": {
                "dpi": 72,
                "font_size": 9,
                "auto_crop_width": True,
                "auto_crop_last_page": True
            },
            "lmstudio": {
                "dpi": 96,
                "font_size": 10,
                "quality_threshold": 0.94
            }
        }

        # Set up default app preferences
        config.app_defaults = {
            "summarizer": "always",
            "extractor": "never",
            "judge": "auto",
            "cli": "auto"
        }

        return config

    @classmethod
    def from_abstractcore_config(cls) -> 'GlyphConfig':
        """Load Glyph config from AbstractCore's centralized configuration.

        Starts from ``default()`` and overlays whatever attributes the
        ``glyph_compression`` section of the config manager exposes.

        Returns:
            The merged configuration, or plain ``default()`` when the section
            is absent or the config system cannot be imported / lacks the
            expected attributes.
        """
        try:
            # Try to load from AbstractCore config system
            from ..config import get_config_manager
            config_manager = get_config_manager()

            # No glyph config section: return defaults
            if not hasattr(config_manager.config, 'glyph_compression'):
                return cls.default()

            glyph_section = config_manager.config.glyph_compression
            config = cls.default()

            # Update with user settings, keeping the default for anything missing
            config.enabled = getattr(glyph_section, 'enabled', config.enabled)
            config.global_default = getattr(glyph_section, 'global_default', config.global_default)
            config.quality_threshold = getattr(glyph_section, 'quality_threshold', config.quality_threshold)
            config.cache_directory = getattr(glyph_section, 'cache_directory', config.cache_directory)
            config.preferred_provider = getattr(glyph_section, 'preferred_provider', config.preferred_provider)

            # Update app defaults
            if hasattr(glyph_section, 'app_defaults'):
                config.app_defaults.update(glyph_section.app_defaults)

            # Update provider profiles: merge into known providers, add new ones as-is
            if hasattr(glyph_section, 'provider_profiles'):
                for provider, profile in glyph_section.provider_profiles.items():
                    if provider in config.provider_profiles:
                        config.provider_profiles[provider].update(profile)
                    else:
                        config.provider_profiles[provider] = profile

            return config

        except (ImportError, AttributeError):
            # Fallback to default if config system not available
            return cls.default()

    def save_to_abstractcore_config(self) -> None:
        """Save Glyph config to AbstractCore's centralized configuration.

        Only the subset of settings listed below is persisted. This is a
        deliberate best-effort operation: if the config system cannot be
        imported or lacks the expected API, nothing is saved and no error is
        raised.
        """
        try:
            # Try to save to AbstractCore config system
            from ..config import get_config_manager
            config_manager = get_config_manager()

            # Create glyph_compression section
            glyph_config = {
                'enabled': self.enabled,
                'global_default': self.global_default,
                'quality_threshold': self.quality_threshold,
                'cache_directory': self.cache_directory,
                'preferred_provider': self.preferred_provider,
                'app_defaults': self.app_defaults,
                'provider_profiles': self.provider_profiles
            }

            # Save configuration
            config_manager.set_glyph_compression(glyph_config)
            config_manager.save()

        except (ImportError, AttributeError):
            # Silently fail if config system not available
            # Could implement file-based fallback here if needed
            pass

    def get_provider_config(self, provider: str, model: Optional[str] = None) -> RenderingConfig:
        """Get provider-specific rendering configuration.

        Args:
            provider: Key into ``provider_profiles``; unknown providers get an
                unmodified copy of the base rendering config.
            model: Accepted for interface compatibility; not used here.

        Returns:
            A copy of ``self.rendering`` with the provider's overrides applied.
            Only the rendering keys listed below are honored; other profile
            entries (e.g. ``quality_threshold``) are not rendering settings
            and are ignored by this method.
        """
        base_config = self.rendering.copy()

        profile = self.provider_profiles.get(provider)
        if profile is not None:
            overridable_keys = (
                'dpi',
                'font_size',
                'font_path',
                'newline_markup',
                'auto_crop_width',
                'auto_crop_last_page',
            )
            for key in overridable_keys:
                if key in profile:
                    setattr(base_config, key, profile[key])

        return base_config

    def should_compress(self, content_length: int, provider: str, model: str, user_preference: Optional[str] = None) -> bool:
        """Determine if content should be compressed.

        Args:
            content_length: Size of the content. It is compared directly
                against ``min_token_threshold``.
                NOTE(review): callers presumably pass a token count - confirm.
            provider: Provider name used for the capability lookup.
            model: Model name used for the capability lookup.
            user_preference: "always", "never" or "auto"; falls back to
                ``global_default`` when empty or None.

        Returns:
            True if compression should be applied. ``app_defaults`` is not
            consulted here.
        """
        # Check user preference first
        preference = user_preference or self.global_default

        if preference == "never":
            return False
        if preference == "always":
            return True

        # Auto-decision logic
        if content_length < self.min_token_threshold:
            return False  # Too small to benefit

        # Check if provider supports vision
        try:
            from ..media.capabilities import get_model_capabilities
            capabilities = get_model_capabilities(provider, model)
            if not capabilities.get('vision_support', False):
                return False  # Provider doesn't support vision
        except Exception:
            # Conservative approach if capabilities unknown. `Exception`
            # rather than a bare `except` so KeyboardInterrupt / SystemExit
            # still propagate.
            return False

        return True  # Beneficial for large content with vision support
@@ -0,0 +1,30 @@
1
+ """
2
+ Glyph compression exceptions.
3
+ """
4
+
5
+ from ..exceptions import AbstractCoreError
6
+
7
+
8
class CompressionError(AbstractCoreError):
    """Root of the compression exception hierarchy; catch this to handle any compression failure."""
11
+
12
+
13
class CompressionQualityError(CompressionError):
    """Raised when the assessed compression quality falls below the required threshold.

    Attributes:
        quality_score: The score that was measured (None if not supplied).
        threshold: The minimum score that was required (None if not supplied).
    """

    def __init__(self, message: str, quality_score: float = None, threshold: float = None):
        super().__init__(message)
        # Keep the numbers on the exception so handlers can inspect them
        # without parsing the message.
        self.threshold = threshold
        self.quality_score = quality_score
20
+
21
+
22
class RenderingError(CompressionError):
    """Raised when rendering text to an image fails."""
25
+
26
+
27
class CompressionCacheError(CompressionError):
    """Raised when an operation on the compression cache fails."""
30
+
@@ -0,0 +1,381 @@
1
+ """
2
+ Glyph visual-text compression processor for AbstractCore.
3
+
4
+ Renders text to images with the PIL-based text renderer (the earlier
5
+ ReportLab/pdf2image pipeline was removed), with provider-specific optimization.
6
+ """
7
+
8
+ import time
9
+ import json
10
+ from pathlib import Path
11
+ from typing import List, Optional, Union, Dict, Any
12
+
13
+ from ..media.base import BaseMediaHandler
14
+ from ..media.types import MediaContent, MediaType, ContentFormat
15
+ from ..utils.structured_logging import get_logger
16
+ from ..utils.token_utils import TokenUtils
17
+
18
+ from .config import GlyphConfig, RenderingConfig
19
+ from .quality import QualityValidator, CompressionStats
20
+ from .cache import CompressionCache
21
+ # ReportLab renderers removed - now using PIL renderer exclusively
22
+ from .pil_text_renderer import PILTextRenderer
23
+ from .text_formatter import TextFormatter, FormattingConfig
24
+ from .exceptions import CompressionError, CompressionQualityError
25
+
26
+
27
class GlyphProcessor(BaseMediaHandler):
    """
    Glyph visual-text compression processor for AbstractCore.

    Transforms long textual sequences into optimized images for processing
    by Vision-Language Models (VLMs), targeting 3-4x token compression.
    """

    def __init__(self, config: Optional[GlyphConfig] = None, **kwargs):
        """
        Initialize Glyph processor.

        Args:
            config: Glyph configuration; when omitted it is loaded via
                GlyphConfig.from_abstractcore_config()
            **kwargs: Additional configuration passed to base handler
        """
        super().__init__(**kwargs)

        self.config = config or GlyphConfig.from_abstractcore_config()
        self.logger = get_logger(self.__class__.__name__)

        # Initialize components
        self.pil_text_renderer = None  # Lazy initialization
        self.quality_validator = QualityValidator()
        self.cache = CompressionCache(
            cache_dir=self.config.cache_directory,
            max_size_gb=self.config.cache_size_gb,
            ttl_days=self.config.cache_ttl_days
        )

        # Initialize text formatter
        self.text_formatter = TextFormatter(FormattingConfig())

        # Load provider profiles
        self.provider_profiles = self.config.provider_profiles

        self.logger.debug("GlyphProcessor initialized")

    def _get_pil_text_renderer(self) -> 'PILTextRenderer':
        """Get or create PIL text renderer instance (lazy initialization)."""
        if self.pil_text_renderer is None:
            self.pil_text_renderer = PILTextRenderer(self.config)
        return self.pil_text_renderer

    def can_process(self, content: str, provider: str, model: str) -> bool:
        """
        Determine if content should be compressed using Glyph.

        Args:
            content: Text content to evaluate
            provider: Provider name
            model: Model name

        Returns:
            True if compression should be applied
        """
        # Check if compression is enabled
        if not self.config.enabled:
            return False

        # Estimate token count
        token_count = TokenUtils.estimate_tokens(content, model)

        # Check minimum token threshold
        if token_count < self.config.min_token_threshold:
            return False

        # Check if provider supports vision
        try:
            # Lazy import to avoid circular dependency
            from ..media.capabilities import get_model_capabilities
            capabilities = get_model_capabilities(provider, model)
            if not capabilities.get('vision_support', False):
                return False
        except Exception:
            # Conservative approach if capabilities unknown
            return False

        return True

    def process_text(self, content: str, provider: str = None, model: str = None, user_preference: str = "auto") -> List[MediaContent]:
        """
        Process text content into compressed visual format.

        Args:
            content: Text content to compress
            provider: Provider name for optimization
            model: Model name for optimization
            user_preference: When "always", a below-threshold quality score
                is logged as a warning instead of raising

        Returns:
            List of MediaContent objects with compressed images

        Raises:
            CompressionQualityError: Quality score is below the provider
                threshold and user_preference is not "always"
            CompressionError: Any other failure during compression
        """
        start_time = time.time()

        try:
            # Get provider-specific configuration
            render_config = self._get_provider_config(provider, model)

            # Apply text formatting if enabled
            processed_content = content
            text_segments = None
            if render_config.render_format:
                self.logger.debug("Applying text formatting")
                text_segments = self.text_formatter.format_text(content)
                # For now, convert back to string for compatibility with existing renderer
                processed_content = self.text_formatter.format_text_to_string(content)
                self.logger.debug("Text formatting applied",
                                  original_length=len(content),
                                  formatted_length=len(processed_content),
                                  segments_count=len(text_segments))
            else:
                self.logger.debug("Text formatting disabled, using raw content")

            # Check cache first (use processed content for cache key)
            cache_key = self._generate_cache_key(processed_content, render_config)
            if cached_result := self.cache.get(cache_key):
                self.logger.debug(f"Using cached compression for key {cache_key[:16]}...")
                return self._create_media_content_from_images(cached_result, processed_content, provider, render_config)

            # Always use PIL text renderer (ReportLab removed)
            self.logger.debug("Using PIL text renderer")
            pil_renderer = self._get_pil_text_renderer()

            if render_config.render_format and text_segments:
                # Use formatted text segments
                self.logger.debug("Rendering with text formatting")
                segments = text_segments
            else:
                # Convert plain text to segments for PIL renderer
                self.logger.debug("Rendering plain text (no formatting)")
                from .text_formatter import TextSegment
                segments = [TextSegment(text=processed_content)]

            images = pil_renderer.segments_to_images(
                segments=segments,
                config=render_config,
                output_dir=self.config.temp_dir,
                unique_id=cache_key[:16]
            )

            # Quality validation (bypass if user explicitly wants compression)
            quality_score = self.quality_validator.assess(content, images, provider)
            min_threshold = self.quality_validator.get_provider_threshold(provider)

            if quality_score < min_threshold:
                if user_preference != "always":
                    raise CompressionQualityError(
                        f"Compression quality {quality_score:.3f} below threshold {min_threshold:.3f} for {provider}",
                        quality_score=quality_score,
                        threshold=min_threshold
                    )
                self.logger.warning(f"Compression quality {quality_score:.3f} below threshold {min_threshold:.3f} for {provider}, but proceeding due to 'always' preference")

            # Calculate compression statistics
            original_tokens = TokenUtils.estimate_tokens(processed_content, model)
            compressed_tokens = self._estimate_compressed_tokens(images, provider, model)
            compression_ratio = original_tokens / compressed_tokens if compressed_tokens > 0 else 1.0

            compression_stats = CompressionStats(
                compression_ratio=compression_ratio,
                quality_score=quality_score,
                token_savings=original_tokens - compressed_tokens,
                processing_time=time.time() - start_time,
                provider_optimized=provider or "unknown",
                original_tokens=original_tokens,
                compressed_tokens=compressed_tokens
            )

            # Cache successful compression
            self.cache.set(cache_key, images, compression_stats.to_dict())

            # Create MediaContent objects
            media_contents = self._create_media_content_from_images(
                images, processed_content, provider, render_config, compression_stats
            )

            self.logger.info(
                f"Glyph compression completed: {compression_ratio:.1f}x ratio, "
                f"{quality_score:.1%} quality, {len(images)} images"
            )

            return media_contents

        except CompressionError as e:
            # Already a domain error (e.g. CompressionQualityError): re-raise
            # unchanged so callers keep the specific type and its attributes.
            self.logger.error(f"Glyph compression failed: {e}")
            raise
        except Exception as e:
            self.logger.error(f"Glyph compression failed: {e}")
            raise CompressionError(f"Compression failed: {e}") from e

    def _estimate_compressed_tokens(self, images: List[Any], provider: Optional[str], model: Optional[str]) -> int:
        """Estimate how many tokens the rendered images will cost.

        Uses VLMTokenCalculator on the image files when their paths can be
        determined; otherwise falls back to a per-image estimate.
        """
        from ..utils.vlm_token_calculator import VLMTokenCalculator

        calculator = VLMTokenCalculator()
        try:
            # Collect image paths. Items may be plain paths (the form the
            # rest of this class works with) or objects exposing
            # `metadata` / `file_path`.
            image_paths = []
            for img in images:
                if isinstance(img, (str, Path)):
                    image_paths.append(Path(img))
                elif hasattr(img, 'metadata') and img.metadata.get('temp_file_path'):
                    image_paths.append(Path(img.metadata['temp_file_path']))
                elif hasattr(img, 'file_path') and img.file_path:
                    image_paths.append(Path(img.file_path))

            if image_paths:
                token_analysis = calculator.calculate_tokens_for_images(
                    image_paths=image_paths,
                    provider=provider or 'openai',
                    model=model or ''
                )
                compressed_tokens = token_analysis['total_tokens']
                self.logger.info(f"Accurate token calculation: {compressed_tokens} tokens for {len(image_paths)} images")
            else:
                # Fallback calculation
                base_tokens = calculator.PROVIDER_CONFIGS.get(provider or 'openai', {}).get('base_tokens', 512)
                compressed_tokens = len(images) * base_tokens
                self.logger.warning(f"Using fallback token estimation: {compressed_tokens} tokens")

        except Exception as e:
            self.logger.warning(f"VLM token calculation failed, using fallback: {e}")
            compressed_tokens = len(images) * 1500  # Conservative fallback

        return compressed_tokens

    def _get_provider_config(self, provider: str, model: str) -> RenderingConfig:
        """Get provider-specific rendering configuration."""
        return self.config.get_provider_config(provider, model)

    def _generate_cache_key(self, content: str, config: RenderingConfig) -> str:
        """Generate cache key from content and configuration.

        The key is "<16 hex chars of content hash>_<8 hex chars of config hash>".
        """
        import hashlib

        # Create hash of content + configuration
        content_hash = hashlib.sha256(content.encode('utf-8')).hexdigest()[:16]
        config_hash = hashlib.sha256(
            json.dumps(config.to_dict(), sort_keys=True).encode('utf-8')
        ).hexdigest()[:8]

        return f"{content_hash}_{config_hash}"

    def _create_media_content_from_images(
        self,
        images: List[Path],
        original_content: str,
        provider: str,
        render_config: RenderingConfig,
        compression_stats: Optional[CompressionStats] = None
    ) -> List[MediaContent]:
        """Create MediaContent objects from rendered images.

        Image files that no longer exist are skipped. When compression_stats
        is None (e.g. a cache hit), the statistics fields in the metadata
        are set to None.
        """
        import base64

        media_contents = []

        for i, img_path in enumerate(images):
            # Accept str entries as well as Path objects
            img_path = Path(img_path)
            if not img_path.exists():
                continue

            # Read and encode image
            with open(img_path, 'rb') as f:
                image_data = f.read()

            base64_data = base64.b64encode(image_data).decode('utf-8')

            # Create metadata
            metadata = {
                "compression_ratio": compression_stats.compression_ratio if compression_stats else None,
                "quality_score": compression_stats.quality_score if compression_stats else None,
                "rendering_config": render_config.to_dict(),
                "provider_optimized": provider,
                "glyph_version": "1.0",
                "processing_time": compression_stats.processing_time if compression_stats else None,
                "image_index": i,
                "total_images": len(images),
                "original_content_length": len(original_content),
                "dpi": render_config.dpi,
                "font_config": {
                    "font_path": render_config.font_path,
                    "font_size": render_config.font_size,
                    "line_height": render_config.line_height
                }
            }

            # Create MediaContent
            media_content = MediaContent(
                media_type=MediaType.IMAGE,
                content=base64_data,
                content_format=ContentFormat.BASE64,
                mime_type="image/png",
                file_path=str(img_path),
                metadata=metadata
            )

            media_contents.append(media_content)

        return media_contents

    def _process_internal(self, file_path: Path, media_type: MediaType, **kwargs) -> MediaContent:
        """
        Internal processing method for file-based compression.

        Args:
            file_path: Path to text file
            media_type: Detected media type
            **kwargs: Additional parameters including provider and model

        Returns:
            MediaContent with compressed representation (only the first
            rendered image is returned)

        Raises:
            CompressionError: The file cannot be read, compression fails, or
                no image was produced
        """
        # Read file content
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            raise CompressionError(f"Failed to read file {file_path}: {e}") from e

        # Extract provider and model from kwargs
        provider = kwargs.get('provider')
        model = kwargs.get('model')

        # Process text content
        media_contents = self.process_text(content, provider, model)

        # Return first image (for single file processing)
        if not media_contents:
            raise CompressionError("No compressed content generated")
        return media_contents[0]

    def supports_media_type(self, media_type: MediaType) -> bool:
        """Check if this processor supports the media type."""
        # Glyph processor handles text content for compression
        return media_type in [MediaType.TEXT, MediaType.DOCUMENT]

    def supports_format(self, media_type: MediaType, format_ext: str) -> bool:
        """Check if this processor supports the format (extension without a dot, case-insensitive)."""
        if media_type in [MediaType.TEXT, MediaType.DOCUMENT]:
            # Support text-based formats
            supported_formats = {'txt', 'md', 'csv', 'tsv', 'json', 'yaml', 'yml'}
            return format_ext.lower() in supported_formats
        return False

    def get_compression_stats(self) -> Dict[str, Any]:
        """Get processor info, cache statistics and the active configuration summary."""
        cache_stats = self.cache.get_stats()

        return {
            'processor': 'GlyphProcessor',
            'version': '1.0',
            'cache_stats': cache_stats,
            'config': {
                'enabled': self.config.enabled,
                'quality_threshold': self.config.quality_threshold,
                'min_token_threshold': self.config.min_token_threshold,
                'target_compression_ratio': self.config.target_compression_ratio
            },
            'provider_profiles': list(self.provider_profiles.keys())
        }
381
+