abstractcore-2.5.0-py3-none-any.whl → abstractcore-2.5.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractcore/__init__.py +12 -0
- abstractcore/apps/__main__.py +8 -1
- abstractcore/apps/deepsearch.py +644 -0
- abstractcore/apps/intent.py +614 -0
- abstractcore/architectures/detection.py +250 -4
- abstractcore/assets/architecture_formats.json +14 -1
- abstractcore/assets/model_capabilities.json +583 -44
- abstractcore/compression/__init__.py +29 -0
- abstractcore/compression/analytics.py +420 -0
- abstractcore/compression/cache.py +250 -0
- abstractcore/compression/config.py +279 -0
- abstractcore/compression/exceptions.py +30 -0
- abstractcore/compression/glyph_processor.py +381 -0
- abstractcore/compression/optimizer.py +388 -0
- abstractcore/compression/orchestrator.py +380 -0
- abstractcore/compression/pil_text_renderer.py +818 -0
- abstractcore/compression/quality.py +226 -0
- abstractcore/compression/text_formatter.py +666 -0
- abstractcore/compression/vision_compressor.py +371 -0
- abstractcore/config/main.py +66 -1
- abstractcore/config/manager.py +111 -5
- abstractcore/core/session.py +105 -5
- abstractcore/events/__init__.py +1 -1
- abstractcore/media/auto_handler.py +312 -18
- abstractcore/media/handlers/local_handler.py +14 -2
- abstractcore/media/handlers/openai_handler.py +62 -3
- abstractcore/media/processors/__init__.py +11 -1
- abstractcore/media/processors/direct_pdf_processor.py +210 -0
- abstractcore/media/processors/glyph_pdf_processor.py +227 -0
- abstractcore/media/processors/image_processor.py +7 -1
- abstractcore/media/processors/text_processor.py +18 -3
- abstractcore/media/types.py +164 -7
- abstractcore/processing/__init__.py +5 -1
- abstractcore/processing/basic_deepsearch.py +2173 -0
- abstractcore/processing/basic_intent.py +690 -0
- abstractcore/providers/__init__.py +18 -0
- abstractcore/providers/anthropic_provider.py +29 -2
- abstractcore/providers/base.py +279 -6
- abstractcore/providers/huggingface_provider.py +658 -27
- abstractcore/providers/lmstudio_provider.py +52 -2
- abstractcore/providers/mlx_provider.py +103 -4
- abstractcore/providers/model_capabilities.py +352 -0
- abstractcore/providers/ollama_provider.py +44 -6
- abstractcore/providers/openai_provider.py +29 -2
- abstractcore/providers/registry.py +91 -19
- abstractcore/server/app.py +91 -81
- abstractcore/structured/handler.py +161 -1
- abstractcore/tools/common_tools.py +98 -3
- abstractcore/utils/__init__.py +4 -1
- abstractcore/utils/cli.py +114 -1
- abstractcore/utils/trace_export.py +287 -0
- abstractcore/utils/version.py +1 -1
- abstractcore/utils/vlm_token_calculator.py +655 -0
- {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/METADATA +140 -23
- abstractcore-2.5.3.dist-info/RECORD +107 -0
- {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/entry_points.txt +4 -0
- abstractcore-2.5.0.dist-info/RECORD +0 -86
- {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/WHEEL +0 -0
- {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/top_level.txt +0 -0
abstractcore/media/handlers/openai_handler.py

@@ -30,6 +30,9 @@ class OpenAIMediaHandler(BaseProviderMediaHandler):
         """
         super().__init__("openai", model_capabilities, **kwargs)
 
+        # Store model name for Qwen-specific optimizations
+        self.model_name = kwargs.get('model_name', '')
+
         # OpenAI-specific configuration
         self.max_image_size = kwargs.get('max_image_size', 20 * 1024 * 1024)  # 20MB
         self.supported_image_detail = kwargs.get('supported_image_detail', ['auto', 'low', 'high'])
@@ -118,11 +121,64 @@ class OpenAIMediaHandler(BaseProviderMediaHandler):
         # Add detail level if supported by model
         if self.model_capabilities.get('vision_support'):
             detail_level = media_content.metadata.get('detail_level', 'auto')
+            self.logger.debug(f"OpenAI Handler - MediaContent metadata: {media_content.metadata}")
+            self.logger.debug(f"OpenAI Handler - Found detail_level: {detail_level}")
+
+            # Auto-adjust detail level for Qwen models to prevent context overflow
+            if self._is_qwen_model() and detail_level == 'auto':
+                detail_level = self._get_optimal_detail_for_qwen(media_content)
+                self.logger.debug(f"OpenAI Handler - Qwen auto-adjusted detail_level: {detail_level}")
+
             if detail_level in self.supported_image_detail:
                 image_obj["image_url"]["detail"] = detail_level
+                self.logger.info(f"OpenAI Handler - Setting detail level to '{detail_level}' for image")
+            else:
+                self.logger.warning(f"OpenAI Handler - Invalid detail level '{detail_level}', supported: {self.supported_image_detail}")
 
         return image_obj
 
+    def _is_qwen_model(self) -> bool:
+        """Check if the current model is a Qwen vision model."""
+        if not hasattr(self, 'model_name') or not self.model_name:
+            return False
+
+        model_name_lower = self.model_name.lower()
+        return any(qwen_variant in model_name_lower for qwen_variant in [
+            'qwen3-vl', 'qwen2.5-vl', 'qwen-vl', 'qwen/qwen3-vl', 'qwen/qwen2.5-vl'
+        ])
+
+    def _get_optimal_detail_for_qwen(self, media_content: MediaContent) -> str:
+        """
+        Determine optimal detail level for Qwen models based on context constraints.
+
+        According to SiliconFlow documentation:
+        - detail=low: 256 tokens per image (448x448 resize)
+        - detail=high: Variable tokens based on resolution (can be 24,576+ tokens)
+
+        For Qwen3-VL-30B with 131,072 token context limit, we should use detail=low
+        when processing multiple images to avoid context overflow.
+        """
+        # Get model context limit
+        max_tokens = self.model_capabilities.get('max_tokens', 32768)
+        max_image_tokens = self.model_capabilities.get('max_image_tokens', 24576)
+
+        # Estimate how many images we might be processing
+        # This is a heuristic - in practice we'd need the full batch context
+        estimated_images = getattr(self, '_estimated_image_count', 1)
+
+        # Calculate potential token usage with high detail
+        high_detail_tokens = estimated_images * max_image_tokens
+
+        # Use low detail if high detail would consume >60% of context
+        context_threshold = max_tokens * 0.6
+
+        if high_detail_tokens > context_threshold:
+            self.logger.info(f"Using detail=low for Qwen model: {estimated_images} images would consume "
+                             f"{high_detail_tokens:,} tokens (>{context_threshold:,} threshold)")
+            return 'low'
+        else:
+            return 'high'
+
     def _format_text_for_openai(self, media_content: MediaContent) -> Dict[str, Any]:
         """
         Format text/document content for OpenAI API.
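To make the >60% heuristic above concrete, here is a standalone sketch of the same arithmetic with illustrative capability values (the numbers are examples, not the package's actual model data):

max_tokens = 131072          # e.g. a Qwen3-VL-class context window
max_image_tokens = 24576     # worst-case tokens for one detail=high image
context_threshold = max_tokens * 0.6   # 78,643.2 tokens

for estimated_images in (1, 3, 4):
    high_detail_tokens = estimated_images * max_image_tokens
    detail = 'low' if high_detail_tokens > context_threshold else 'high'
    print(estimated_images, high_detail_tokens, detail)
# 1 image  -> 24,576 tokens -> 'high'
# 3 images -> 73,728 tokens -> 'high'
# 4 images -> 98,304 tokens -> 'low'

Note that with the 32,768-token default for max_tokens, even a single worst-case image (24,576 tokens against a 19,660.8 threshold) drops to detail=low.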
@@ -226,12 +282,15 @@ class OpenAIMediaHandler(BaseProviderMediaHandler):
         Estimated token count
         """
         if media_content.media_type == MediaType.IMAGE:
-            #
-            # Base cost varies by detail level and image size
+            # Image token estimation varies by model
            detail_level = media_content.metadata.get('detail_level', 'auto')
 
             if detail_level == 'low':
-                return 85
+                # Qwen models use 256 tokens for low detail, OpenAI uses 85
+                if self._is_qwen_model():
+                    return 256  # Qwen low detail token count
+                else:
+                    return 85  # OpenAI low detail token count
             else:
                 # High detail calculation based on image dimensions
                 width = media_content.metadata.get('final_size', [512, 512])[0]
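The high-detail branch is only partially visible in this hunk. For reference, a sketch of OpenAI's published high-detail estimate (85 base tokens plus 170 per 512x512 tile after rescaling); this reproduces the documented OpenAI rule, not necessarily the package's exact implementation:

import math

def estimate_high_detail_tokens(width: int, height: int) -> int:
    # Scale to fit within 2048x2048
    scale = min(1.0, 2048 / max(width, height))
    width, height = width * scale, height * scale
    # Scale so the shortest side is at most 768
    scale = min(1.0, 768 / min(width, height))
    width, height = width * scale, height * scale
    # 170 tokens per 512x512 tile plus a fixed 85-token base
    tiles = math.ceil(width / 512) * math.ceil(height / 512)
    return 85 + 170 * tiles

print(estimate_high_detail_tokens(1024, 1024))  # 765 (4 tiles)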
abstractcore/media/processors/__init__.py

@@ -10,4 +10,14 @@ from .text_processor import TextProcessor
 from .pdf_processor import PDFProcessor
 from .office_processor import OfficeProcessor
 
-
+# Import Glyph processor if available
+try:
+    from ...compression.glyph_processor import GlyphProcessor
+    GLYPH_AVAILABLE = True
+except ImportError:
+    GlyphProcessor = None
+    GLYPH_AVAILABLE = False
+
+__all__ = ['ImageProcessor', 'TextProcessor', 'PDFProcessor', 'OfficeProcessor']
+if GLYPH_AVAILABLE:
+    __all__.append('GlyphProcessor')
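Callers can key off the module-level flag instead of wrapping their own try/except; a minimal sketch, assuming the import path shown in the file list above:

from abstractcore.media.processors import GLYPH_AVAILABLE

if GLYPH_AVAILABLE:
    from abstractcore.media.processors import GlyphProcessor
    processor = GlyphProcessor()
else:
    processor = None  # fall back to the standard PDF/text processors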
abstractcore/media/processors/direct_pdf_processor.py

@@ -0,0 +1,210 @@
+"""
+Direct PDF-to-image processor for Glyph compression.
+
+This processor converts PDF pages directly to images without text extraction,
+preserving all visual elements including mathematical formulas, tables, and images.
+Supports multi-page layouts (e.g., 2 pages per image) for optimal compression.
+"""
+
+from pathlib import Path
+from typing import Optional, Dict, Any, List, Union, Tuple
+import tempfile
+import os
+import hashlib
+import math
+
+try:
+    import pdf2image
+    PDF2IMAGE_AVAILABLE = True
+except ImportError:
+    PDF2IMAGE_AVAILABLE = False
+    pdf2image = None
+
+try:
+    from PIL import Image
+    PIL_AVAILABLE = True
+except ImportError:
+    PIL_AVAILABLE = False
+    Image = None
+
+from ..base import BaseMediaHandler, MediaProcessingError
+from ..types import MediaContent, MediaType, ContentFormat
+
+
+class DirectPDFProcessor(BaseMediaHandler):
+    """
+    Direct PDF-to-image processor that preserves all visual elements.
+
+    Converts PDF pages directly to images without text extraction,
+    maintaining perfect fidelity of mathematical formulas, tables, and images.
+    """
+
+    def __init__(self, **kwargs):
+        """Initialize the direct PDF processor."""
+        super().__init__(**kwargs)
+
+        if not PDF2IMAGE_AVAILABLE:
+            raise MediaProcessingError("pdf2image is required for DirectPDFProcessor")
+
+        if not PIL_AVAILABLE:
+            raise MediaProcessingError("PIL/Pillow is required for DirectPDFProcessor")
+
+        # Configuration
+        self.pages_per_image = kwargs.get('pages_per_image', 2)  # 2 pages per image by default
+        self.dpi = kwargs.get('dpi', 150)  # Higher DPI for better quality
+        self.layout = kwargs.get('layout', 'horizontal')  # 'horizontal' or 'vertical'
+        self.gap = kwargs.get('gap', 20)  # Gap between pages in pixels
+
+        self.logger.debug(f"DirectPDFProcessor initialized: {self.pages_per_image} pages per image")
+
+    def _process_internal(self, file_path: Path, media_type: MediaType, **kwargs) -> MediaContent:
+        """Process PDF directly to images."""
+        if media_type != MediaType.DOCUMENT:
+            raise MediaProcessingError(f"DirectPDFProcessor only handles documents, got {media_type}")
+
+        try:
+            # Convert PDF pages to images
+            image_paths = self._convert_pdf_to_combined_images(file_path)
+
+            # For now, return the first combined image
+            # In a full implementation, this would return all images
+            if image_paths:
+                with open(image_paths[0], 'rb') as f:
+                    image_data = f.read()
+
+                # Encode as base64 for MediaContent
+                import base64
+                encoded_data = base64.b64encode(image_data).decode('utf-8')
+
+                # Get session info for metadata
+                from ...config import get_config_manager
+                config_manager = get_config_manager()
+                glyph_cache_base = Path(config_manager.config.cache.glyph_cache_dir).expanduser()
+                pdf_hash = hashlib.md5(str(file_path).encode()).hexdigest()[:8]
+                session_id = f"pdf_{pdf_hash}_{len(image_paths)}pages"
+
+                metadata = {
+                    'processing_method': 'direct_pdf_conversion',
+                    'pages_per_image': self.pages_per_image,
+                    'total_images': len(image_paths),
+                    'dpi': self.dpi,
+                    'layout': self.layout,
+                    'image_paths': [str(p) for p in image_paths],
+                    'glyph_session_id': session_id,
+                    'glyph_cache_dir': str(glyph_cache_base / session_id)
+                }
+
+                return self._create_media_content(
+                    content=encoded_data,
+                    file_path=file_path,
+                    media_type=MediaType.IMAGE,  # Return as image
+                    content_format=ContentFormat.BASE64,
+                    mime_type="image/png",
+                    **metadata
+                )
+            else:
+                raise MediaProcessingError("No images generated from PDF")
+
+        except Exception as e:
+            raise MediaProcessingError(f"Failed to process PDF directly: {str(e)}") from e
+
+    def _convert_pdf_to_combined_images(self, pdf_path: Path) -> List[Path]:
+        """Convert PDF to combined images with multiple pages per image."""
+
+        # Convert all PDF pages to individual images
+        individual_images = pdf2image.convert_from_path(
+            pdf_path,
+            dpi=self.dpi,
+            fmt='PNG'
+        )
+
+        self.logger.info(f"Converted PDF to {len(individual_images)} individual page images")
+
+        # Use AbstractCore's centralized Glyph cache directory
+        from ...config import get_config_manager
+        config_manager = get_config_manager()
+        glyph_cache_base = Path(config_manager.config.cache.glyph_cache_dir).expanduser()
+
+        # Calculate number of combined images that will be created
+        num_combined_images = math.ceil(len(individual_images) / self.pages_per_image)
+
+        # Create a unique subdirectory for this PDF processing session
+        pdf_hash = hashlib.md5(str(pdf_path).encode()).hexdigest()[:8]
+        session_id = f"pdf_{pdf_hash}_{num_combined_images}pages"
+        glyph_dir = glyph_cache_base / session_id
+        glyph_dir.mkdir(parents=True, exist_ok=True)
+
+        # CRITICAL DEBUG LOG: Show exactly where images are being generated
+        self.logger.debug(f"🎯 GENERATING GLYPH IMAGES IN CACHE DIRECTORY: {glyph_dir}")
+        self.logger.info(f"DirectPDFProcessor: Creating {self.pages_per_image} pages per image in {glyph_dir}")
+
+        # Combine pages into multi-page images
+        combined_images = []
+
+        try:
+            for i in range(0, len(individual_images), self.pages_per_image):
+                # Get pages for this combined image
+                pages_batch = individual_images[i:i + self.pages_per_image]
+
+                # Create combined image
+                combined_image = self._combine_pages(pages_batch, i // self.pages_per_image)
+
+                # Save combined image in Glyph cache
+                output_path = glyph_dir / f"image_{i // self.pages_per_image + 1:03d}.png"
+                combined_image.save(output_path, 'PNG', optimize=True)
+                combined_images.append(output_path)
+
+                # CRITICAL DEBUG LOG: Show each image as it's created
+                self.logger.debug(f"📄 CREATED GLYPH IMAGE: {output_path} (pages {i+1}-{min(i+self.pages_per_image, len(individual_images))})")
+
+        except Exception as e:
+            # Clean up cache directory on error
+            import shutil
+            shutil.rmtree(glyph_dir, ignore_errors=True)
+            raise e
+
+        # CRITICAL DEBUG LOG: Final summary with exact paths
+        self.logger.info(f"Created {len(combined_images)} combined images from {len(individual_images)} pages")
+        self.logger.debug(f"🎯 ALL GLYPH IMAGES STORED IN CACHE: {glyph_dir}")
+        for i, img_path in enumerate(combined_images, 1):
+            self.logger.debug(f"  📄 Image {i}: {img_path}")
+
+        return combined_images
+
+    def _combine_pages(self, pages: List['Image.Image'], batch_index: int) -> 'Image.Image':
+        """Combine multiple PDF pages into a single image."""
+        if not pages:
+            raise ValueError("No pages to combine")
+
+        if len(pages) == 1:
+            return pages[0]
+
+        # Calculate dimensions for combined image
+        if self.layout == 'horizontal':
+            # Side-by-side layout (like an open book)
+            total_width = sum(page.width for page in pages) + self.gap * (len(pages) - 1)
+            total_height = max(page.height for page in pages)
+        else:
+            # Vertical layout (pages stacked)
+            total_width = max(page.width for page in pages)
+            total_height = sum(page.height for page in pages) + self.gap * (len(pages) - 1)
+
+        # Create new image with white background
+        combined = Image.new('RGB', (total_width, total_height), 'white')
+
+        # Paste pages into combined image
+        current_x, current_y = 0, 0
+
+        for page in pages:
+            combined.paste(page, (current_x, current_y))
+
+            if self.layout == 'horizontal':
+                current_x += page.width + self.gap
+            else:
+                current_y += page.height + self.gap
+
+        return combined
+
+    def get_combined_image_paths(self, pdf_path: Path) -> List[Path]:
+        """Get paths to all combined images (for external use)."""
+        return self._convert_pdf_to_combined_images(pdf_path)
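A hypothetical usage sketch for the new processor; the public entry point is assumed to be a process() method on BaseMediaHandler that dispatches to _process_internal(), which this diff does not show:

from pathlib import Path
from abstractcore.media.processors.direct_pdf_processor import DirectPDFProcessor

processor = DirectPDFProcessor(pages_per_image=2, dpi=150, layout='horizontal')
content = processor.process(Path('paper.pdf'))  # assumed BaseMediaHandler entry point
print(content.metadata['total_images'])
print(content.metadata['glyph_cache_dir'])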
abstractcore/media/processors/glyph_pdf_processor.py

@@ -0,0 +1,227 @@
+"""
+Glyph-optimized PDF processor that preserves mathematical notation and table formatting.
+
+This processor extracts PDF content while maintaining compact mathematical expressions
+and tabular data formatting for optimal visual compression.
+"""
+
+from pathlib import Path
+from typing import Optional, Dict, Any, List, Union, Tuple
+import re
+
+try:
+    import pymupdf as fitz
+    PYMUPDF_AVAILABLE = True
+except ImportError:
+    PYMUPDF_AVAILABLE = False
+    fitz = None
+
+from ..base import BaseMediaHandler, MediaProcessingError
+from ..types import MediaContent, MediaType, ContentFormat
+
+
+class GlyphPDFProcessor(BaseMediaHandler):
+    """
+    Glyph-optimized PDF processor that preserves mathematical notation and tables.
+
+    Designed specifically for visual compression where maintaining compact
+    mathematical expressions and table layouts is crucial for compression ratio.
+    """
+
+    def __init__(self, **kwargs):
+        """Initialize the Glyph PDF processor."""
+        super().__init__(**kwargs)
+
+        if not PYMUPDF_AVAILABLE:
+            raise MediaProcessingError("PyMuPDF is required for GlyphPDFProcessor")
+
+        # Glyph-specific settings
+        self.preserve_math_notation = kwargs.get('preserve_math_notation', True)
+        self.preserve_table_layout = kwargs.get('preserve_table_layout', True)
+        self.compact_whitespace = kwargs.get('compact_whitespace', True)
+
+        self.logger.debug("GlyphPDFProcessor initialized for visual compression")
+
+    def _process_internal(self, file_path: Path, media_type: MediaType, **kwargs) -> MediaContent:
+        """Process PDF with Glyph optimization."""
+        if media_type != MediaType.DOCUMENT:
+            raise MediaProcessingError(f"GlyphPDFProcessor only handles documents, got {media_type}")
+
+        try:
+            # Extract content with Glyph optimizations
+            content, metadata = self._extract_glyph_optimized_content(file_path)
+
+            return self._create_media_content(
+                content=content,
+                file_path=file_path,
+                media_type=media_type,
+                content_format=ContentFormat.TEXT,
+                mime_type="application/pdf",
+                **metadata
+            )
+
+        except Exception as e:
+            raise MediaProcessingError(f"Failed to process PDF for Glyph: {str(e)}") from e
+
+    def _extract_glyph_optimized_content(self, file_path: Path) -> Tuple[str, Dict[str, Any]]:
+        """Extract PDF content optimized for Glyph visual compression."""
+        doc = fitz.open(str(file_path))
+
+        content_parts = []
+        total_chars = 0
+        page_count = 0
+
+        try:
+            for page_num in range(len(doc)):
+                page = doc[page_num]
+                page_count += 1
+
+                # Get text blocks with position information
+                blocks = page.get_text("dict")
+
+                # Process blocks to preserve mathematical notation and tables
+                page_content = self._process_page_blocks(blocks, page_num)
+
+                if page_content.strip():
+                    content_parts.append(page_content)
+                    total_chars += len(page_content)
+
+            # Combine all pages
+            full_content = "\n\n".join(content_parts)
+
+            # Apply Glyph-specific optimizations
+            optimized_content = self._apply_glyph_optimizations(full_content)
+
+            metadata = {
+                'page_count': page_count,
+                'character_count': len(optimized_content),
+                'processing_method': 'glyph_optimized',
+                'math_notation_preserved': self.preserve_math_notation,
+                'table_layout_preserved': self.preserve_table_layout
+            }
+
+            return optimized_content, metadata
+
+        finally:
+            doc.close()
+
+    def _process_page_blocks(self, blocks: Dict, page_num: int) -> str:
+        """Process page blocks while preserving mathematical and tabular content."""
+        page_lines = []
+
+        for block in blocks.get("blocks", []):
+            if "lines" not in block:
+                continue
+
+            # Check if this block looks like a table
+            if self._is_table_block(block):
+                table_content = self._extract_table_content(block)
+                if table_content:
+                    page_lines.append(table_content)
+            else:
+                # Process as regular text, preserving math notation
+                block_text = self._extract_block_text(block)
+                if block_text.strip():
+                    page_lines.append(block_text)
+
+        return "\n".join(page_lines)
+
+    def _is_table_block(self, block: Dict) -> bool:
+        """Detect if a block represents tabular data."""
+        lines = block.get("lines", [])
+        if len(lines) < 2:
+            return False
+
+        # Look for patterns indicating tabular structure
+        # - Multiple columns of aligned text
+        # - Consistent spacing patterns
+        # - Numeric data in columns
+
+        x_positions = []
+        for line in lines:
+            for span in line.get("spans", []):
+                bbox = span.get("bbox", [])
+                if len(bbox) >= 4:
+                    x_positions.append(bbox[0])  # Left x coordinate
+
+        if len(x_positions) < 4:
+            return False
+
+        # Check for multiple distinct column positions
+        unique_x = sorted(set(round(x, 1) for x in x_positions))
+        return len(unique_x) >= 3  # At least 3 columns suggests a table
+
+    def _extract_table_content(self, block: Dict) -> str:
+        """Extract table content in a compact format."""
+        lines = block.get("lines", [])
+        table_rows = []
+
+        for line in lines:
+            row_parts = []
+            spans = sorted(line.get("spans", []), key=lambda s: s.get("bbox", [0])[0])
+
+            for span in spans:
+                text = span.get("text", "").strip()
+                if text:
+                    row_parts.append(text)
+
+            if row_parts:
+                # Use compact table format (pipe-separated)
+                table_rows.append(" | ".join(row_parts))
+
+        if table_rows:
+            return "\n".join(table_rows)
+        return ""
+
+    def _extract_block_text(self, block: Dict) -> str:
+        """Extract text from a block, preserving mathematical notation."""
+        lines = block.get("lines", [])
+        block_lines = []
+
+        for line in lines:
+            line_text = ""
+            for span in line.get("spans", []):
+                text = span.get("text", "")
+
+                # Preserve mathematical symbols and notation
+                if self.preserve_math_notation:
+                    text = self._preserve_math_symbols(text)
+
+                line_text += text
+
+            if line_text.strip():
+                block_lines.append(line_text.strip())
+
+        return "\n".join(block_lines)
+
+    def _preserve_math_symbols(self, text: str) -> str:
+        """Preserve mathematical symbols and compact notation."""
+        # Don't expand mathematical symbols - keep them as-is
+        # This prevents "α" from becoming "alpha", "∑" from becoming "sum", etc.
+
+        # Remove excessive whitespace around mathematical operators
+        text = re.sub(r'\s*([+\-×÷=<>≤≥≠∈∉∪∩∀∃∑∏∫])\s*', r'\1', text)
+
+        # Preserve subscripts and superscripts in compact form
+        # Keep Unicode mathematical symbols intact
+
+        return text
+
+    def _apply_glyph_optimizations(self, content: str) -> str:
+        """Apply final optimizations for Glyph visual compression."""
+        if not self.compact_whitespace:
+            return content
+
+        # Remove excessive blank lines (keep max 1)
+        content = re.sub(r'\n\s*\n\s*\n+', '\n\n', content)
+
+        # Remove trailing whitespace
+        lines = [line.rstrip() for line in content.split('\n')]
+
+        # Remove excessive spaces (keep max 2 consecutive spaces)
+        optimized_lines = []
+        for line in lines:
+            line = re.sub(r'  +', '  ', line)  # Max 2 spaces
+            optimized_lines.append(line)
+
+        return '\n'.join(optimized_lines)
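The operator regex in _preserve_math_symbols() is easiest to read by example; a standalone check of the same pattern:

import re

MATH_OPS = r'\s*([+\-×÷=<>≤≥≠∈∉∪∩∀∃∑∏∫])\s*'
print(re.sub(MATH_OPS, r'\1', "a + b = c ≤ 10"))   # -> a+b=c≤10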
abstractcore/media/processors/image_processor.py

@@ -5,6 +5,8 @@ This module provides comprehensive image processing capabilities using PIL,
 optimized for vision model inputs across different providers.
 """
 
+from __future__ import annotations  # PEP 563 - deferred type hint evaluation for optional dependencies
+
 import base64
 import io
 import mimetypes
@@ -99,7 +101,11 @@ class ImageProcessor(BaseMediaHandler):
 
         try:
             # Override defaults with kwargs
-
+            # Preserve original format unless explicitly specified
+            original_format = file_path.suffix.lower().lstrip('.')
+            if original_format == 'jpg':
+                original_format = 'jpeg'
+            target_format = kwargs.get('target_format', original_format if original_format in ['png', 'jpeg', 'webp', 'gif'] else 'jpeg')
             model_name = kwargs.get('model_name', None)
 
             # Use model-specific maximum resolution if available
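The new default-format logic in isolation, as a standalone sketch of the lines added above:

from pathlib import Path

def default_target_format(file_path: Path) -> str:
    original = file_path.suffix.lower().lstrip('.')
    if original == 'jpg':
        original = 'jpeg'  # PIL spells the format 'jpeg'
    return original if original in ['png', 'jpeg', 'webp', 'gif'] else 'jpeg'

print(default_target_format(Path('photo.JPG')))   # jpeg (preserved)
print(default_target_format(Path('icon.png')))    # png  (preserved)
print(default_target_format(Path('scan.tiff')))   # jpeg (fallback)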
abstractcore/media/processors/text_processor.py

@@ -50,12 +50,25 @@ class TextProcessor(BaseMediaHandler):
 
         # Set capabilities for text processing
         from ..types import MediaCapabilities
+        # TextProcessor can handle any text file through its plain text fallback
+        # We list common formats but the processor is not limited to these
         self.capabilities = MediaCapabilities(
             vision_support=False,
             audio_support=False,
             video_support=False,
             document_support=True,
-            supported_document_formats=[
+            supported_document_formats=[
+                # Core text formats
+                'txt', 'md', 'markdown', 'csv', 'tsv',
+                'json', 'jsonl', 'xml', 'html', 'htm',
+                'yaml', 'yml', 'toml', 'ini', 'cfg', 'conf',
+                # Programming languages (common examples)
+                'py', 'js', 'java', 'c', 'cpp', 'go', 'rs', 'rb', 'php',
+                'r', 'R', 'rmd', 'Rmd', 'sql', 'sh',
+                # Notebooks and documentation
+                'ipynb', 'qmd', 'rst', 'tex', 'bib',
+                # Any other text-based format through fallback processing
+            ],
             max_file_size=self.max_file_size
         )
 
@@ -541,7 +554,8 @@ class TextProcessor(BaseMediaHandler):
         """
         return {
             'processor_type': 'TextProcessor',
-            'supported_formats':
+            'supported_formats': self.capabilities.supported_document_formats,
+            'supports_any_text_file': True,  # Through plain text fallback
             'capabilities': {
                 'default_encoding': self.default_encoding,
                 'csv_delimiter': self.csv_delimiter,
@@ -549,7 +563,8 @@ class TextProcessor(BaseMediaHandler):
                 'preserve_structure': self.preserve_structure,
                 'pandas_integration': PANDAS_AVAILABLE,
                 'structured_formatting': True,
-                'metadata_extraction': True
+                'metadata_extraction': True,
+                'plain_text_fallback': True  # Can handle any text file
             },
             'dependencies': {
                 'pandas': PANDAS_AVAILABLE