abstractcore-2.5.0-py3-none-any.whl → abstractcore-2.5.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. abstractcore/__init__.py +12 -0
  2. abstractcore/apps/__main__.py +8 -1
  3. abstractcore/apps/deepsearch.py +644 -0
  4. abstractcore/apps/intent.py +614 -0
  5. abstractcore/architectures/detection.py +250 -4
  6. abstractcore/assets/architecture_formats.json +14 -1
  7. abstractcore/assets/model_capabilities.json +583 -44
  8. abstractcore/compression/__init__.py +29 -0
  9. abstractcore/compression/analytics.py +420 -0
  10. abstractcore/compression/cache.py +250 -0
  11. abstractcore/compression/config.py +279 -0
  12. abstractcore/compression/exceptions.py +30 -0
  13. abstractcore/compression/glyph_processor.py +381 -0
  14. abstractcore/compression/optimizer.py +388 -0
  15. abstractcore/compression/orchestrator.py +380 -0
  16. abstractcore/compression/pil_text_renderer.py +818 -0
  17. abstractcore/compression/quality.py +226 -0
  18. abstractcore/compression/text_formatter.py +666 -0
  19. abstractcore/compression/vision_compressor.py +371 -0
  20. abstractcore/config/main.py +66 -1
  21. abstractcore/config/manager.py +111 -5
  22. abstractcore/core/session.py +105 -5
  23. abstractcore/events/__init__.py +1 -1
  24. abstractcore/media/auto_handler.py +312 -18
  25. abstractcore/media/handlers/local_handler.py +14 -2
  26. abstractcore/media/handlers/openai_handler.py +62 -3
  27. abstractcore/media/processors/__init__.py +11 -1
  28. abstractcore/media/processors/direct_pdf_processor.py +210 -0
  29. abstractcore/media/processors/glyph_pdf_processor.py +227 -0
  30. abstractcore/media/processors/image_processor.py +7 -1
  31. abstractcore/media/processors/text_processor.py +18 -3
  32. abstractcore/media/types.py +164 -7
  33. abstractcore/processing/__init__.py +5 -1
  34. abstractcore/processing/basic_deepsearch.py +2173 -0
  35. abstractcore/processing/basic_intent.py +690 -0
  36. abstractcore/providers/__init__.py +18 -0
  37. abstractcore/providers/anthropic_provider.py +29 -2
  38. abstractcore/providers/base.py +279 -6
  39. abstractcore/providers/huggingface_provider.py +658 -27
  40. abstractcore/providers/lmstudio_provider.py +52 -2
  41. abstractcore/providers/mlx_provider.py +103 -4
  42. abstractcore/providers/model_capabilities.py +352 -0
  43. abstractcore/providers/ollama_provider.py +44 -6
  44. abstractcore/providers/openai_provider.py +29 -2
  45. abstractcore/providers/registry.py +91 -19
  46. abstractcore/server/app.py +91 -81
  47. abstractcore/structured/handler.py +161 -1
  48. abstractcore/tools/common_tools.py +98 -3
  49. abstractcore/utils/__init__.py +4 -1
  50. abstractcore/utils/cli.py +114 -1
  51. abstractcore/utils/trace_export.py +287 -0
  52. abstractcore/utils/version.py +1 -1
  53. abstractcore/utils/vlm_token_calculator.py +655 -0
  54. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/METADATA +140 -23
  55. abstractcore-2.5.3.dist-info/RECORD +107 -0
  56. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/entry_points.txt +4 -0
  57. abstractcore-2.5.0.dist-info/RECORD +0 -86
  58. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/WHEEL +0 -0
  59. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/licenses/LICENSE +0 -0
  60. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/top_level.txt +0 -0
abstractcore/media/handlers/openai_handler.py
@@ -30,6 +30,9 @@ class OpenAIMediaHandler(BaseProviderMediaHandler):
         """
         super().__init__("openai", model_capabilities, **kwargs)

+        # Store model name for Qwen-specific optimizations
+        self.model_name = kwargs.get('model_name', '')
+
         # OpenAI-specific configuration
         self.max_image_size = kwargs.get('max_image_size', 20 * 1024 * 1024)  # 20MB
         self.supported_image_detail = kwargs.get('supported_image_detail', ['auto', 'low', 'high'])
@@ -118,11 +121,64 @@ class OpenAIMediaHandler(BaseProviderMediaHandler):
         # Add detail level if supported by model
         if self.model_capabilities.get('vision_support'):
             detail_level = media_content.metadata.get('detail_level', 'auto')
+            self.logger.debug(f"OpenAI Handler - MediaContent metadata: {media_content.metadata}")
+            self.logger.debug(f"OpenAI Handler - Found detail_level: {detail_level}")
+
+            # Auto-adjust detail level for Qwen models to prevent context overflow
+            if self._is_qwen_model() and detail_level == 'auto':
+                detail_level = self._get_optimal_detail_for_qwen(media_content)
+                self.logger.debug(f"OpenAI Handler - Qwen auto-adjusted detail_level: {detail_level}")
+
             if detail_level in self.supported_image_detail:
                 image_obj["image_url"]["detail"] = detail_level
+                self.logger.info(f"OpenAI Handler - Setting detail level to '{detail_level}' for image")
+            else:
+                self.logger.warning(f"OpenAI Handler - Invalid detail level '{detail_level}', supported: {self.supported_image_detail}")

         return image_obj

+    def _is_qwen_model(self) -> bool:
+        """Check if the current model is a Qwen vision model."""
+        if not hasattr(self, 'model_name') or not self.model_name:
+            return False
+
+        model_name_lower = self.model_name.lower()
+        return any(qwen_variant in model_name_lower for qwen_variant in [
+            'qwen3-vl', 'qwen2.5-vl', 'qwen-vl', 'qwen/qwen3-vl', 'qwen/qwen2.5-vl'
+        ])
+
+    def _get_optimal_detail_for_qwen(self, media_content: MediaContent) -> str:
+        """
+        Determine optimal detail level for Qwen models based on context constraints.
+
+        According to SiliconFlow documentation:
+        - detail=low: 256 tokens per image (448x448 resize)
+        - detail=high: Variable tokens based on resolution (can be 24,576+ tokens)
+
+        For Qwen3-VL-30B with 131,072 token context limit, we should use detail=low
+        when processing multiple images to avoid context overflow.
+        """
+        # Get model context limit
+        max_tokens = self.model_capabilities.get('max_tokens', 32768)
+        max_image_tokens = self.model_capabilities.get('max_image_tokens', 24576)
+
+        # Estimate how many images we might be processing
+        # This is a heuristic - in practice we'd need the full batch context
+        estimated_images = getattr(self, '_estimated_image_count', 1)
+
+        # Calculate potential token usage with high detail
+        high_detail_tokens = estimated_images * max_image_tokens
+
+        # Use low detail if high detail would consume >60% of context
+        context_threshold = max_tokens * 0.6
+
+        if high_detail_tokens > context_threshold:
+            self.logger.info(f"Using detail=low for Qwen model: {estimated_images} images would consume "
+                             f"{high_detail_tokens:,} tokens (>{context_threshold:,} threshold)")
+            return 'low'
+        else:
+            return 'high'
+
     def _format_text_for_openai(self, media_content: MediaContent) -> Dict[str, Any]:
         """
         Format text/document content for OpenAI API.
@@ -226,12 +282,15 @@ class OpenAIMediaHandler(BaseProviderMediaHandler):
            Estimated token count
        """
        if media_content.media_type == MediaType.IMAGE:
-            # OpenAI image token estimation
-            # Base cost varies by detail level and image size
+            # Image token estimation varies by model
            detail_level = media_content.metadata.get('detail_level', 'auto')

            if detail_level == 'low':
-                return 85  # Low detail images use 85 tokens
+                # Qwen models use 256 tokens for low detail, OpenAI uses 85
+                if self._is_qwen_model():
+                    return 256  # Qwen low detail token count
+                else:
+                    return 85  # OpenAI low detail token count
            else:
                # High detail calculation based on image dimensions
                width = media_content.metadata.get('final_size', [512, 512])[0]
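The Qwen auto-adjustment above boils down to a single token-budget comparison. A minimal standalone sketch of that check (hypothetical function name; the defaults are the values quoted in the diff, not confirmed API limits):

# Standalone sketch of the detail-level heuristic above (hypothetical name;
# defaults taken from the diff: 24,576 tokens per high-detail image, 60% budget).
def choose_detail_level(estimated_images: int,
                        max_tokens: int = 131_072,
                        max_image_tokens: int = 24_576,
                        budget_fraction: float = 0.6) -> str:
    """Return 'low' when high-detail images would eat too much of the context."""
    high_detail_tokens = estimated_images * max_image_tokens
    return 'low' if high_detail_tokens > max_tokens * budget_fraction else 'high'

# Example: with a 131,072-token context the budget is ~78,643 tokens, so
# 4 high-detail images (~98,304 tokens) trigger the fallback, 3 do not.
assert choose_detail_level(4) == 'low'
assert choose_detail_level(3) == 'high'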
abstractcore/media/processors/__init__.py
@@ -10,4 +10,14 @@ from .text_processor import TextProcessor
 from .pdf_processor import PDFProcessor
 from .office_processor import OfficeProcessor

-__all__ = ['ImageProcessor', 'TextProcessor', 'PDFProcessor', 'OfficeProcessor']
+# Import Glyph processor if available
+try:
+    from ...compression.glyph_processor import GlyphProcessor
+    GLYPH_AVAILABLE = True
+except ImportError:
+    GlyphProcessor = None
+    GLYPH_AVAILABLE = False
+
+__all__ = ['ImageProcessor', 'TextProcessor', 'PDFProcessor', 'OfficeProcessor']
+if GLYPH_AVAILABLE:
+    __all__.append('GlyphProcessor')
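The guarded import keeps the compression extras optional. A caller-side sketch of how the module-level flag might be consumed (hypothetical usage, not code from the package):

# Hypothetical consumer of the optional GlyphProcessor export.
from abstractcore.media import processors

if getattr(processors, 'GLYPH_AVAILABLE', False):
    ProcessorCls = processors.GlyphProcessor   # compression extras are installed
else:
    ProcessorCls = processors.PDFProcessor     # plain PDF text extraction fallback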
abstractcore/media/processors/direct_pdf_processor.py (new file)
@@ -0,0 +1,210 @@
+"""
+Direct PDF-to-image processor for Glyph compression.
+
+This processor converts PDF pages directly to images without text extraction,
+preserving all visual elements including mathematical formulas, tables, and images.
+Supports multi-page layouts (e.g., 2 pages per image) for optimal compression.
+"""
+
+from pathlib import Path
+from typing import Optional, Dict, Any, List, Union, Tuple
+import tempfile
+import os
+import hashlib
+import math
+
+try:
+    import pdf2image
+    PDF2IMAGE_AVAILABLE = True
+except ImportError:
+    PDF2IMAGE_AVAILABLE = False
+    pdf2image = None
+
+try:
+    from PIL import Image
+    PIL_AVAILABLE = True
+except ImportError:
+    PIL_AVAILABLE = False
+    Image = None
+
+from ..base import BaseMediaHandler, MediaProcessingError
+from ..types import MediaContent, MediaType, ContentFormat
+
+
+class DirectPDFProcessor(BaseMediaHandler):
+    """
+    Direct PDF-to-image processor that preserves all visual elements.
+
+    Converts PDF pages directly to images without text extraction,
+    maintaining perfect fidelity of mathematical formulas, tables, and images.
+    """
+
+    def __init__(self, **kwargs):
+        """Initialize the direct PDF processor."""
+        super().__init__(**kwargs)
+
+        if not PDF2IMAGE_AVAILABLE:
+            raise MediaProcessingError("pdf2image is required for DirectPDFProcessor")
+
+        if not PIL_AVAILABLE:
+            raise MediaProcessingError("PIL/Pillow is required for DirectPDFProcessor")
+
+        # Configuration
+        self.pages_per_image = kwargs.get('pages_per_image', 2)  # 2 pages per image by default
+        self.dpi = kwargs.get('dpi', 150)  # Higher DPI for better quality
+        self.layout = kwargs.get('layout', 'horizontal')  # 'horizontal' or 'vertical'
+        self.gap = kwargs.get('gap', 20)  # Gap between pages in pixels
+
+        self.logger.debug(f"DirectPDFProcessor initialized: {self.pages_per_image} pages per image")
+
+    def _process_internal(self, file_path: Path, media_type: MediaType, **kwargs) -> MediaContent:
+        """Process PDF directly to images."""
+        if media_type != MediaType.DOCUMENT:
+            raise MediaProcessingError(f"DirectPDFProcessor only handles documents, got {media_type}")
+
+        try:
+            # Convert PDF pages to images
+            image_paths = self._convert_pdf_to_combined_images(file_path)
+
+            # For now, return the first combined image
+            # In a full implementation, this would return all images
+            if image_paths:
+                with open(image_paths[0], 'rb') as f:
+                    image_data = f.read()
+
+                # Encode as base64 for MediaContent
+                import base64
+                encoded_data = base64.b64encode(image_data).decode('utf-8')
+
+                # Get session info for metadata
+                from ...config import get_config_manager
+                config_manager = get_config_manager()
+                glyph_cache_base = Path(config_manager.config.cache.glyph_cache_dir).expanduser()
+                pdf_hash = hashlib.md5(str(file_path).encode()).hexdigest()[:8]
+                session_id = f"pdf_{pdf_hash}_{len(image_paths)}pages"
+
+                metadata = {
+                    'processing_method': 'direct_pdf_conversion',
+                    'pages_per_image': self.pages_per_image,
+                    'total_images': len(image_paths),
+                    'dpi': self.dpi,
+                    'layout': self.layout,
+                    'image_paths': [str(p) for p in image_paths],
+                    'glyph_session_id': session_id,
+                    'glyph_cache_dir': str(glyph_cache_base / session_id)
+                }
+
+                return self._create_media_content(
+                    content=encoded_data,
+                    file_path=file_path,
+                    media_type=MediaType.IMAGE,  # Return as image
+                    content_format=ContentFormat.BASE64,
+                    mime_type="image/png",
+                    **metadata
+                )
+            else:
+                raise MediaProcessingError("No images generated from PDF")
+
+        except Exception as e:
+            raise MediaProcessingError(f"Failed to process PDF directly: {str(e)}") from e
+
+    def _convert_pdf_to_combined_images(self, pdf_path: Path) -> List[Path]:
+        """Convert PDF to combined images with multiple pages per image."""
+
+        # Convert all PDF pages to individual images
+        individual_images = pdf2image.convert_from_path(
+            pdf_path,
+            dpi=self.dpi,
+            fmt='PNG'
+        )
+
+        self.logger.info(f"Converted PDF to {len(individual_images)} individual page images")
+
+        # Use AbstractCore's centralized Glyph cache directory
+        from ...config import get_config_manager
+        config_manager = get_config_manager()
+        glyph_cache_base = Path(config_manager.config.cache.glyph_cache_dir).expanduser()
+
+        # Calculate number of combined images that will be created
+        num_combined_images = math.ceil(len(individual_images) / self.pages_per_image)
+
+        # Create a unique subdirectory for this PDF processing session
+        pdf_hash = hashlib.md5(str(pdf_path).encode()).hexdigest()[:8]
+        session_id = f"pdf_{pdf_hash}_{num_combined_images}pages"
+        glyph_dir = glyph_cache_base / session_id
+        glyph_dir.mkdir(parents=True, exist_ok=True)
+
+        # CRITICAL DEBUG LOG: Show exactly where images are being generated
+        self.logger.debug(f"🎯 GENERATING GLYPH IMAGES IN CACHE DIRECTORY: {glyph_dir}")
+        self.logger.info(f"DirectPDFProcessor: Creating {self.pages_per_image} pages per image in {glyph_dir}")
+
+        # Combine pages into multi-page images
+        combined_images = []
+
+        try:
+            for i in range(0, len(individual_images), self.pages_per_image):
+                # Get pages for this combined image
+                pages_batch = individual_images[i:i + self.pages_per_image]
+
+                # Create combined image
+                combined_image = self._combine_pages(pages_batch, i // self.pages_per_image)
+
+                # Save combined image in Glyph cache
+                output_path = glyph_dir / f"image_{i // self.pages_per_image + 1:03d}.png"
+                combined_image.save(output_path, 'PNG', optimize=True)
+                combined_images.append(output_path)
+
+                # CRITICAL DEBUG LOG: Show each image as it's created
+                self.logger.debug(f"📄 CREATED GLYPH IMAGE: {output_path} (pages {i+1}-{min(i+self.pages_per_image, len(individual_images))})")
+
+        except Exception as e:
+            # Clean up cache directory on error
+            import shutil
+            shutil.rmtree(glyph_dir, ignore_errors=True)
+            raise e
+
+        # CRITICAL DEBUG LOG: Final summary with exact paths
+        self.logger.info(f"Created {len(combined_images)} combined images from {len(individual_images)} pages")
+        self.logger.debug(f"🎯 ALL GLYPH IMAGES STORED IN CACHE: {glyph_dir}")
+        for i, img_path in enumerate(combined_images, 1):
+            self.logger.debug(f" 📄 Image {i}: {img_path}")
+
+        return combined_images
+
+    def _combine_pages(self, pages: List['Image.Image'], batch_index: int) -> 'Image.Image':
+        """Combine multiple PDF pages into a single image."""
+        if not pages:
+            raise ValueError("No pages to combine")
+
+        if len(pages) == 1:
+            return pages[0]
+
+        # Calculate dimensions for combined image
+        if self.layout == 'horizontal':
+            # Side-by-side layout (like an open book)
+            total_width = sum(page.width for page in pages) + self.gap * (len(pages) - 1)
+            total_height = max(page.height for page in pages)
+        else:
+            # Vertical layout (pages stacked)
+            total_width = max(page.width for page in pages)
+            total_height = sum(page.height for page in pages) + self.gap * (len(pages) - 1)

+        # Create new image with white background
+        combined = Image.new('RGB', (total_width, total_height), 'white')
+
+        # Paste pages into combined image
+        current_x, current_y = 0, 0
+
+        for page in pages:
+            combined.paste(page, (current_x, current_y))
+
+            if self.layout == 'horizontal':
+                current_x += page.width + self.gap
+            else:
+                current_y += page.height + self.gap
+
+        return combined
+
+    def get_combined_image_paths(self, pdf_path: Path) -> List[Path]:
+        """Get paths to all combined images (for external use)."""
+        return self._convert_pdf_to_combined_images(pdf_path)
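For orientation, the processor's public surface is small: the constructor kwargs shown above plus get_combined_image_paths(). A hedged usage sketch, assuming poppler/pdf2image and Pillow are installed and that BaseMediaHandler needs no additional constructor arguments:

# Usage sketch (assumptions noted above; not an official example from the package).
from pathlib import Path
from abstractcore.media.processors.direct_pdf_processor import DirectPDFProcessor

# Two PDF pages per rendered image, side by side like an open book.
processor = DirectPDFProcessor(pages_per_image=2, dpi=150, layout='horizontal', gap=20)

# Renders the PDF into combined PNGs under the configured Glyph cache directory
# and returns their paths (see _convert_pdf_to_combined_images above).
image_paths = processor.get_combined_image_paths(Path("paper.pdf"))
for p in image_paths:
    print(p)  # e.g. <glyph_cache_dir>/pdf_ab12cd34_3pages/image_001.png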
abstractcore/media/processors/glyph_pdf_processor.py (new file)
@@ -0,0 +1,227 @@
+"""
+Glyph-optimized PDF processor that preserves mathematical notation and table formatting.
+
+This processor extracts PDF content while maintaining compact mathematical expressions
+and tabular data formatting for optimal visual compression.
+"""
+
+from pathlib import Path
+from typing import Optional, Dict, Any, List, Union, Tuple
+import re
+
+try:
+    import pymupdf as fitz
+    PYMUPDF_AVAILABLE = True
+except ImportError:
+    PYMUPDF_AVAILABLE = False
+    fitz = None
+
+from ..base import BaseMediaHandler, MediaProcessingError
+from ..types import MediaContent, MediaType, ContentFormat
+
+
+class GlyphPDFProcessor(BaseMediaHandler):
+    """
+    Glyph-optimized PDF processor that preserves mathematical notation and tables.
+
+    Designed specifically for visual compression where maintaining compact
+    mathematical expressions and table layouts is crucial for compression ratio.
+    """
+
+    def __init__(self, **kwargs):
+        """Initialize the Glyph PDF processor."""
+        super().__init__(**kwargs)
+
+        if not PYMUPDF_AVAILABLE:
+            raise MediaProcessingError("PyMuPDF is required for GlyphPDFProcessor")
+
+        # Glyph-specific settings
+        self.preserve_math_notation = kwargs.get('preserve_math_notation', True)
+        self.preserve_table_layout = kwargs.get('preserve_table_layout', True)
+        self.compact_whitespace = kwargs.get('compact_whitespace', True)
+
+        self.logger.debug("GlyphPDFProcessor initialized for visual compression")
+
+    def _process_internal(self, file_path: Path, media_type: MediaType, **kwargs) -> MediaContent:
+        """Process PDF with Glyph optimization."""
+        if media_type != MediaType.DOCUMENT:
+            raise MediaProcessingError(f"GlyphPDFProcessor only handles documents, got {media_type}")
+
+        try:
+            # Extract content with Glyph optimizations
+            content, metadata = self._extract_glyph_optimized_content(file_path)
+
+            return self._create_media_content(
+                content=content,
+                file_path=file_path,
+                media_type=media_type,
+                content_format=ContentFormat.TEXT,
+                mime_type="application/pdf",
+                **metadata
+            )
+
+        except Exception as e:
+            raise MediaProcessingError(f"Failed to process PDF for Glyph: {str(e)}") from e
+
+    def _extract_glyph_optimized_content(self, file_path: Path) -> Tuple[str, Dict[str, Any]]:
+        """Extract PDF content optimized for Glyph visual compression."""
+        doc = fitz.open(str(file_path))
+
+        content_parts = []
+        total_chars = 0
+        page_count = 0
+
+        try:
+            for page_num in range(len(doc)):
+                page = doc[page_num]
+                page_count += 1
+
+                # Get text blocks with position information
+                blocks = page.get_text("dict")
+
+                # Process blocks to preserve mathematical notation and tables
+                page_content = self._process_page_blocks(blocks, page_num)
+
+                if page_content.strip():
+                    content_parts.append(page_content)
+                    total_chars += len(page_content)
+
+            # Combine all pages
+            full_content = "\n\n".join(content_parts)
+
+            # Apply Glyph-specific optimizations
+            optimized_content = self._apply_glyph_optimizations(full_content)
+
+            metadata = {
+                'page_count': page_count,
+                'character_count': len(optimized_content),
+                'processing_method': 'glyph_optimized',
+                'math_notation_preserved': self.preserve_math_notation,
+                'table_layout_preserved': self.preserve_table_layout
+            }
+
+            return optimized_content, metadata
+
+        finally:
+            doc.close()
+
+    def _process_page_blocks(self, blocks: Dict, page_num: int) -> str:
+        """Process page blocks while preserving mathematical and tabular content."""
+        page_lines = []
+
+        for block in blocks.get("blocks", []):
+            if "lines" not in block:
+                continue
+
+            # Check if this block looks like a table
+            if self._is_table_block(block):
+                table_content = self._extract_table_content(block)
+                if table_content:
+                    page_lines.append(table_content)
+            else:
+                # Process as regular text, preserving math notation
+                block_text = self._extract_block_text(block)
+                if block_text.strip():
+                    page_lines.append(block_text)
+
+        return "\n".join(page_lines)
+
+    def _is_table_block(self, block: Dict) -> bool:
+        """Detect if a block represents tabular data."""
+        lines = block.get("lines", [])
+        if len(lines) < 2:
+            return False
+
+        # Look for patterns indicating tabular structure
+        # - Multiple columns of aligned text
+        # - Consistent spacing patterns
+        # - Numeric data in columns
+
+        x_positions = []
+        for line in lines:
+            for span in line.get("spans", []):
+                bbox = span.get("bbox", [])
+                if len(bbox) >= 4:
+                    x_positions.append(bbox[0])  # Left x coordinate
+
+        if len(x_positions) < 4:
+            return False
+
+        # Check for multiple distinct column positions
+        unique_x = sorted(set(round(x, 1) for x in x_positions))
+        return len(unique_x) >= 3  # At least 3 columns suggests a table
+
+    def _extract_table_content(self, block: Dict) -> str:
+        """Extract table content in a compact format."""
+        lines = block.get("lines", [])
+        table_rows = []
+
+        for line in lines:
+            row_parts = []
+            spans = sorted(line.get("spans", []), key=lambda s: s.get("bbox", [0])[0])
+
+            for span in spans:
+                text = span.get("text", "").strip()
+                if text:
+                    row_parts.append(text)
+
+            if row_parts:
+                # Use compact table format (pipe-separated)
+                table_rows.append(" | ".join(row_parts))
+
+        if table_rows:
+            return "\n".join(table_rows)
+        return ""
+
+    def _extract_block_text(self, block: Dict) -> str:
+        """Extract text from a block, preserving mathematical notation."""
+        lines = block.get("lines", [])
+        block_lines = []
+
+        for line in lines:
+            line_text = ""
+            for span in line.get("spans", []):
+                text = span.get("text", "")
+
+                # Preserve mathematical symbols and notation
+                if self.preserve_math_notation:
+                    text = self._preserve_math_symbols(text)
+
+                line_text += text
+
+            if line_text.strip():
+                block_lines.append(line_text.strip())
+
+        return "\n".join(block_lines)
+
+    def _preserve_math_symbols(self, text: str) -> str:
+        """Preserve mathematical symbols and compact notation."""
+        # Don't expand mathematical symbols - keep them as-is
+        # This prevents "α" from becoming "alpha", "∑" from becoming "sum", etc.
+
+        # Remove excessive whitespace around mathematical operators
+        text = re.sub(r'\s*([+\-×÷=<>≤≥≠∈∉∪∩∀∃∑∏∫])\s*', r'\1', text)
+
+        # Preserve subscripts and superscripts in compact form
+        # Keep Unicode mathematical symbols intact
+
+        return text
+
+    def _apply_glyph_optimizations(self, content: str) -> str:
+        """Apply final optimizations for Glyph visual compression."""
+        if not self.compact_whitespace:
+            return content
+
+        # Remove excessive blank lines (keep max 1)
+        content = re.sub(r'\n\s*\n\s*\n+', '\n\n', content)
+
+        # Remove trailing whitespace
+        lines = [line.rstrip() for line in content.split('\n')]
+
+        # Remove excessive spaces (keep max 2 consecutive spaces)
+        optimized_lines = []
+        for line in lines:
+            line = re.sub(r' +', ' ', line)  # Max 2 spaces
+            optimized_lines.append(line)
+
+        return '\n'.join(optimized_lines)
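The table heuristic above keys entirely on the left-edge x positions of text spans in PyMuPDF's "dict" output: three or more distinct column positions across at least two lines is treated as a table. A self-contained re-implementation on a hand-built block, for illustration only (the sample data is invented):

# Illustrative re-implementation of the _is_table_block() heuristic, run on a
# fake PyMuPDF-style block dict instead of a real PDF page.
def looks_like_table(block: dict) -> bool:
    lines = block.get("lines", [])
    if len(lines) < 2:
        return False
    x_positions = [span["bbox"][0]
                   for line in lines
                   for span in line.get("spans", [])
                   if len(span.get("bbox", [])) >= 4]
    if len(x_positions) < 4:
        return False
    # >= 3 distinct left edges suggests columns, hence a table
    return len(set(round(x, 1) for x in x_positions)) >= 3

fake_block = {
    "lines": [
        {"spans": [{"text": "Name", "bbox": [50, 0, 90, 10]},
                   {"text": "Pages", "bbox": [150, 0, 190, 10]},
                   {"text": "DPI", "bbox": [250, 0, 280, 10]}]},
        {"spans": [{"text": "report.pdf", "bbox": [50, 12, 110, 22]},
                   {"text": "12", "bbox": [150, 12, 170, 22]},
                   {"text": "150", "bbox": [250, 12, 275, 22]}]},
    ]
}
print(looks_like_table(fake_block))  # True: three aligned columns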
abstractcore/media/processors/image_processor.py
@@ -5,6 +5,8 @@ This module provides comprehensive image processing capabilities using PIL,
 optimized for vision model inputs across different providers.
 """

+from __future__ import annotations  # PEP 563 - deferred type hint evaluation for optional dependencies
+
 import base64
 import io
 import mimetypes
@@ -99,7 +101,11 @@ class ImageProcessor(BaseMediaHandler):

         try:
             # Override defaults with kwargs
-            target_format = kwargs.get('target_format', 'jpeg')
+            # Preserve original format unless explicitly specified
+            original_format = file_path.suffix.lower().lstrip('.')
+            if original_format == 'jpg':
+                original_format = 'jpeg'
+            target_format = kwargs.get('target_format', original_format if original_format in ['png', 'jpeg', 'webp', 'gif'] else 'jpeg')
             model_name = kwargs.get('model_name', None)

             # Use model-specific maximum resolution if available
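The new default derives the target format from the file suffix and only falls back to JPEG for formats outside the supported set. The same rule in isolation (hypothetical helper that mirrors the logic above):

# Hypothetical helper mirroring the new default-format selection in ImageProcessor.
from pathlib import Path

def default_target_format(file_path: Path) -> str:
    original_format = file_path.suffix.lower().lstrip('.')
    if original_format == 'jpg':
        original_format = 'jpeg'
    return original_format if original_format in ['png', 'jpeg', 'webp', 'gif'] else 'jpeg'

print(default_target_format(Path("diagram.png")))  # png  (preserved, no lossy re-encode)
print(default_target_format(Path("photo.JPG")))    # jpeg
print(default_target_format(Path("scan.tiff")))    # jpeg (unsupported -> fallback)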
abstractcore/media/processors/text_processor.py
@@ -50,12 +50,25 @@ class TextProcessor(BaseMediaHandler):

         # Set capabilities for text processing
         from ..types import MediaCapabilities
+        # TextProcessor can handle any text file through its plain text fallback
+        # We list common formats but the processor is not limited to these
         self.capabilities = MediaCapabilities(
             vision_support=False,
             audio_support=False,
             video_support=False,
             document_support=True,
-            supported_document_formats=['txt', 'md', 'csv', 'tsv', 'json', 'xml', 'html', 'htm'],
+            supported_document_formats=[
+                # Core text formats
+                'txt', 'md', 'markdown', 'csv', 'tsv',
+                'json', 'jsonl', 'xml', 'html', 'htm',
+                'yaml', 'yml', 'toml', 'ini', 'cfg', 'conf',
+                # Programming languages (common examples)
+                'py', 'js', 'java', 'c', 'cpp', 'go', 'rs', 'rb', 'php',
+                'r', 'R', 'rmd', 'Rmd', 'sql', 'sh',
+                # Notebooks and documentation
+                'ipynb', 'qmd', 'rst', 'tex', 'bib',
+                # Any other text-based format through fallback processing
+            ],
             max_file_size=self.max_file_size
         )

@@ -541,7 +554,8 @@ class TextProcessor(BaseMediaHandler):
        """
        return {
            'processor_type': 'TextProcessor',
-            'supported_formats': ['txt', 'md', 'csv', 'tsv', 'json', 'xml', 'html', 'htm'],
+            'supported_formats': self.capabilities.supported_document_formats,
+            'supports_any_text_file': True,  # Through plain text fallback
            'capabilities': {
                'default_encoding': self.default_encoding,
                'csv_delimiter': self.csv_delimiter,
@@ -549,7 +563,8 @@ class TextProcessor(BaseMediaHandler):
                'preserve_structure': self.preserve_structure,
                'pandas_integration': PANDAS_AVAILABLE,
                'structured_formatting': True,
-                'metadata_extraction': True
+                'metadata_extraction': True,
+                'plain_text_fallback': True  # Can handle any text file
            },
            'dependencies': {
                'pandas': PANDAS_AVAILABLE
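The expanded list is a hint rather than a gate: as the added comments note, anything text-like still goes through the plain-text fallback. A rough caller-side illustration of that split (hypothetical names, not the package's API):

# Hypothetical sketch of "supported formats as a hint, plain text as fallback":
# known extensions get structured handling (CSV, JSON, ...), everything else
# is still read as plain text.
def pick_handling(extension: str, supported_formats: list) -> str:
    return "structured" if extension in supported_formats else "plain-text fallback"

supported_formats = ['txt', 'md', 'csv', 'json', 'py']  # abbreviated version of the list above
print(pick_handling("csv", supported_formats))          # structured
print(pick_handling("nim", supported_formats))          # plain-text fallback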