abstractcore 2.5.2__py3-none-any.whl → 2.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractcore/__init__.py +19 -1
- abstractcore/architectures/detection.py +252 -6
- abstractcore/assets/architecture_formats.json +14 -1
- abstractcore/assets/model_capabilities.json +533 -10
- abstractcore/compression/__init__.py +29 -0
- abstractcore/compression/analytics.py +420 -0
- abstractcore/compression/cache.py +250 -0
- abstractcore/compression/config.py +279 -0
- abstractcore/compression/exceptions.py +30 -0
- abstractcore/compression/glyph_processor.py +381 -0
- abstractcore/compression/optimizer.py +388 -0
- abstractcore/compression/orchestrator.py +380 -0
- abstractcore/compression/pil_text_renderer.py +818 -0
- abstractcore/compression/quality.py +226 -0
- abstractcore/compression/text_formatter.py +666 -0
- abstractcore/compression/vision_compressor.py +371 -0
- abstractcore/config/main.py +64 -0
- abstractcore/config/manager.py +100 -5
- abstractcore/core/retry.py +2 -2
- abstractcore/core/session.py +193 -7
- abstractcore/download.py +253 -0
- abstractcore/embeddings/manager.py +2 -2
- abstractcore/events/__init__.py +113 -2
- abstractcore/exceptions/__init__.py +49 -2
- abstractcore/media/auto_handler.py +312 -18
- abstractcore/media/handlers/local_handler.py +14 -2
- abstractcore/media/handlers/openai_handler.py +62 -3
- abstractcore/media/processors/__init__.py +11 -1
- abstractcore/media/processors/direct_pdf_processor.py +210 -0
- abstractcore/media/processors/glyph_pdf_processor.py +227 -0
- abstractcore/media/processors/image_processor.py +7 -1
- abstractcore/media/processors/office_processor.py +2 -2
- abstractcore/media/processors/text_processor.py +18 -3
- abstractcore/media/types.py +164 -7
- abstractcore/media/utils/image_scaler.py +2 -2
- abstractcore/media/vision_fallback.py +2 -2
- abstractcore/providers/__init__.py +18 -0
- abstractcore/providers/anthropic_provider.py +228 -8
- abstractcore/providers/base.py +378 -11
- abstractcore/providers/huggingface_provider.py +563 -23
- abstractcore/providers/lmstudio_provider.py +284 -4
- abstractcore/providers/mlx_provider.py +27 -2
- abstractcore/providers/model_capabilities.py +352 -0
- abstractcore/providers/ollama_provider.py +282 -6
- abstractcore/providers/openai_provider.py +286 -8
- abstractcore/providers/registry.py +85 -13
- abstractcore/providers/streaming.py +2 -2
- abstractcore/server/app.py +91 -81
- abstractcore/tools/common_tools.py +2 -2
- abstractcore/tools/handler.py +2 -2
- abstractcore/tools/parser.py +2 -2
- abstractcore/tools/registry.py +2 -2
- abstractcore/tools/syntax_rewriter.py +2 -2
- abstractcore/tools/tag_rewriter.py +3 -3
- abstractcore/utils/__init__.py +4 -1
- abstractcore/utils/self_fixes.py +2 -2
- abstractcore/utils/trace_export.py +287 -0
- abstractcore/utils/version.py +1 -1
- abstractcore/utils/vlm_token_calculator.py +655 -0
- {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/METADATA +207 -8
- abstractcore-2.6.0.dist-info/RECORD +108 -0
- abstractcore-2.5.2.dist-info/RECORD +0 -90
- {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/WHEEL +0 -0
- {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/entry_points.txt +0 -0
- {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Direct PDF-to-image processor for Glyph compression.
|
|
3
|
+
|
|
4
|
+
This processor converts PDF pages directly to images without text extraction,
|
|
5
|
+
preserving all visual elements including mathematical formulas, tables, and images.
|
|
6
|
+
Supports multi-page layouts (e.g., 2 pages per image) for optimal compression.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Optional, Dict, Any, List, Union, Tuple
|
|
11
|
+
import tempfile
|
|
12
|
+
import os
|
|
13
|
+
import hashlib
|
|
14
|
+
import math
|
|
15
|
+
|
|
16
|
+
try:
|
|
17
|
+
import pdf2image
|
|
18
|
+
PDF2IMAGE_AVAILABLE = True
|
|
19
|
+
except ImportError:
|
|
20
|
+
PDF2IMAGE_AVAILABLE = False
|
|
21
|
+
pdf2image = None
|
|
22
|
+
|
|
23
|
+
try:
|
|
24
|
+
from PIL import Image
|
|
25
|
+
PIL_AVAILABLE = True
|
|
26
|
+
except ImportError:
|
|
27
|
+
PIL_AVAILABLE = False
|
|
28
|
+
Image = None
|
|
29
|
+
|
|
30
|
+
from ..base import BaseMediaHandler, MediaProcessingError
|
|
31
|
+
from ..types import MediaContent, MediaType, ContentFormat
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class DirectPDFProcessor(BaseMediaHandler):
    """
    Direct PDF-to-image processor that preserves all visual elements.

    Converts PDF pages directly to images without text extraction,
    maintaining perfect fidelity of mathematical formulas, tables, and images.
    """

    def __init__(self, **kwargs):
        """Initialize the direct PDF processor.

        Keyword Args:
            pages_per_image (int): Number of PDF pages combined into each
                output image (default 2, like an open book).
            dpi (int): Rasterization resolution (default 150).
            layout (str): 'horizontal' (side by side) or 'vertical' (stacked).
            gap (int): Gap between pages in pixels (default 20).

        Raises:
            MediaProcessingError: If pdf2image or PIL/Pillow is unavailable.
        """
        super().__init__(**kwargs)

        if not PDF2IMAGE_AVAILABLE:
            raise MediaProcessingError("pdf2image is required for DirectPDFProcessor")

        if not PIL_AVAILABLE:
            raise MediaProcessingError("PIL/Pillow is required for DirectPDFProcessor")

        # Configuration
        self.pages_per_image = kwargs.get('pages_per_image', 2)  # 2 pages per image by default
        self.dpi = kwargs.get('dpi', 150)  # Higher DPI for better quality
        self.layout = kwargs.get('layout', 'horizontal')  # 'horizontal' or 'vertical'
        self.gap = kwargs.get('gap', 20)  # Gap between pages in pixels

        self.logger.debug(f"DirectPDFProcessor initialized: {self.pages_per_image} pages per image")

    def _session_info(self, pdf_path: Path, num_images: int) -> Tuple[str, Path]:
        """Compute the (session_id, cache_dir) pair for a PDF processing session.

        Shared by _process_internal and _convert_pdf_to_combined_images so the
        two can never disagree about where a session's images live.

        NOTE(review): the hash covers only the *path* string, not the file
        contents, so a modified PDF at the same path reuses the same session
        directory — confirm this is the intended cache semantics.
        """
        from ...config import get_config_manager
        config_manager = get_config_manager()
        glyph_cache_base = Path(config_manager.config.cache.glyph_cache_dir).expanduser()
        pdf_hash = hashlib.md5(str(pdf_path).encode()).hexdigest()[:8]
        session_id = f"pdf_{pdf_hash}_{num_images}pages"
        return session_id, glyph_cache_base / session_id

    def _process_internal(self, file_path: Path, media_type: MediaType, **kwargs) -> MediaContent:
        """Process a PDF directly to images.

        Returns the first combined image as base64-encoded PNG MediaContent;
        paths to all generated images are exposed via metadata['image_paths'].

        Raises:
            MediaProcessingError: If media_type is not DOCUMENT or conversion fails.
        """
        if media_type != MediaType.DOCUMENT:
            raise MediaProcessingError(f"DirectPDFProcessor only handles documents, got {media_type}")

        try:
            # Convert PDF pages to combined images in the Glyph cache
            image_paths = self._convert_pdf_to_combined_images(file_path)

            if not image_paths:
                raise MediaProcessingError("No images generated from PDF")

            # For now, return the first combined image.
            # In a full implementation, this would return all images.
            with open(image_paths[0], 'rb') as f:
                image_data = f.read()

            # Encode as base64 for MediaContent
            import base64
            encoded_data = base64.b64encode(image_data).decode('utf-8')

            # Session info for metadata (same derivation as the converter)
            session_id, session_dir = self._session_info(file_path, len(image_paths))

            metadata = {
                'processing_method': 'direct_pdf_conversion',
                'pages_per_image': self.pages_per_image,
                'total_images': len(image_paths),
                'dpi': self.dpi,
                'layout': self.layout,
                'image_paths': [str(p) for p in image_paths],
                'glyph_session_id': session_id,
                'glyph_cache_dir': str(session_dir)
            }

            return self._create_media_content(
                content=encoded_data,
                file_path=file_path,
                media_type=MediaType.IMAGE,  # Return as image
                content_format=ContentFormat.BASE64,
                mime_type="image/png",
                **metadata
            )

        except Exception as e:
            raise MediaProcessingError(f"Failed to process PDF directly: {str(e)}") from e

    def _convert_pdf_to_combined_images(self, pdf_path: Path) -> List[Path]:
        """Convert a PDF to combined images with multiple pages per image.

        Returns:
            List of paths to the combined PNG images, in page order.
        """
        # Convert all PDF pages to individual images
        individual_images = pdf2image.convert_from_path(
            pdf_path,
            dpi=self.dpi,
            fmt='PNG'
        )

        self.logger.info(f"Converted PDF to {len(individual_images)} individual page images")

        # Number of combined images that will be created
        num_combined_images = math.ceil(len(individual_images) / self.pages_per_image)

        # Unique subdirectory in AbstractCore's centralized Glyph cache
        _session_id, glyph_dir = self._session_info(pdf_path, num_combined_images)
        glyph_dir.mkdir(parents=True, exist_ok=True)

        # CRITICAL DEBUG LOG: Show exactly where images are being generated
        self.logger.debug(f"🎯 GENERATING GLYPH IMAGES IN CACHE DIRECTORY: {glyph_dir}")
        self.logger.info(f"DirectPDFProcessor: Creating {self.pages_per_image} pages per image in {glyph_dir}")

        # Combine pages into multi-page images
        combined_images = []

        try:
            for i in range(0, len(individual_images), self.pages_per_image):
                # Pages for this combined image
                pages_batch = individual_images[i:i + self.pages_per_image]

                combined_image = self._combine_pages(pages_batch, i // self.pages_per_image)

                # Save combined image in the Glyph cache
                output_path = glyph_dir / f"image_{i // self.pages_per_image + 1:03d}.png"
                combined_image.save(output_path, 'PNG', optimize=True)
                combined_images.append(output_path)

                # CRITICAL DEBUG LOG: Show each image as it's created
                self.logger.debug(f"📄 CREATED GLYPH IMAGE: {output_path} (pages {i+1}-{min(i+self.pages_per_image, len(individual_images))})")

        except Exception:
            # Clean up the partially-written cache directory, then re-raise
            # with the original traceback (bare raise, not `raise e`)
            import shutil
            shutil.rmtree(glyph_dir, ignore_errors=True)
            raise

        # CRITICAL DEBUG LOG: Final summary with exact paths
        self.logger.info(f"Created {len(combined_images)} combined images from {len(individual_images)} pages")
        self.logger.debug(f"🎯 ALL GLYPH IMAGES STORED IN CACHE: {glyph_dir}")
        for i, img_path in enumerate(combined_images, 1):
            self.logger.debug(f"  📄 Image {i}: {img_path}")

        return combined_images

    def _combine_pages(self, pages: List['Image.Image'], batch_index: int) -> 'Image.Image':
        """Combine multiple PDF page images into a single image.

        Args:
            pages: Page images to combine (must be non-empty).
            batch_index: Zero-based index of this batch (currently unused;
                kept for interface stability).

        Raises:
            ValueError: If pages is empty.
        """
        if not pages:
            raise ValueError("No pages to combine")

        if len(pages) == 1:
            return pages[0]

        # Calculate dimensions for the combined canvas
        if self.layout == 'horizontal':
            # Side-by-side layout (like an open book)
            total_width = sum(page.width for page in pages) + self.gap * (len(pages) - 1)
            total_height = max(page.height for page in pages)
        else:
            # Vertical layout (pages stacked)
            total_width = max(page.width for page in pages)
            total_height = sum(page.height for page in pages) + self.gap * (len(pages) - 1)

        # New image with white background
        combined = Image.new('RGB', (total_width, total_height), 'white')

        # Paste pages into the combined image, advancing along the layout axis
        current_x, current_y = 0, 0

        for page in pages:
            combined.paste(page, (current_x, current_y))

            if self.layout == 'horizontal':
                current_x += page.width + self.gap
            else:
                current_y += page.height + self.gap

        return combined

    def get_combined_image_paths(self, pdf_path: Path) -> List[Path]:
        """Get paths to all combined images (for external use)."""
        return self._convert_pdf_to_combined_images(pdf_path)
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Glyph-optimized PDF processor that preserves mathematical notation and table formatting.
|
|
3
|
+
|
|
4
|
+
This processor extracts PDF content while maintaining compact mathematical expressions
|
|
5
|
+
and tabular data formatting for optimal visual compression.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Optional, Dict, Any, List, Union, Tuple
|
|
10
|
+
import re
|
|
11
|
+
|
|
12
|
+
try:
|
|
13
|
+
import pymupdf as fitz
|
|
14
|
+
PYMUPDF_AVAILABLE = True
|
|
15
|
+
except ImportError:
|
|
16
|
+
PYMUPDF_AVAILABLE = False
|
|
17
|
+
fitz = None
|
|
18
|
+
|
|
19
|
+
from ..base import BaseMediaHandler, MediaProcessingError
|
|
20
|
+
from ..types import MediaContent, MediaType, ContentFormat
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class GlyphPDFProcessor(BaseMediaHandler):
    """
    Glyph-optimized PDF processor that preserves mathematical notation and tables.

    Designed specifically for visual compression where maintaining compact
    mathematical expressions and table layouts is crucial for compression ratio.
    """

    def __init__(self, **kwargs):
        """Initialize the Glyph PDF processor.

        Keyword Args:
            preserve_math_notation (bool): Keep mathematical symbols compact
                instead of expanding them (default True).
            preserve_table_layout (bool): Emit detected tables in compact
                pipe-separated form (default True).
            compact_whitespace (bool): Apply final whitespace compaction
                (default True).

        Raises:
            MediaProcessingError: If PyMuPDF is not installed.
        """
        super().__init__(**kwargs)

        if not PYMUPDF_AVAILABLE:
            raise MediaProcessingError("PyMuPDF is required for GlyphPDFProcessor")

        # Glyph-specific settings
        self.preserve_math_notation = kwargs.get('preserve_math_notation', True)
        self.preserve_table_layout = kwargs.get('preserve_table_layout', True)
        self.compact_whitespace = kwargs.get('compact_whitespace', True)

        self.logger.debug("GlyphPDFProcessor initialized for visual compression")

    def _process_internal(self, file_path: Path, media_type: MediaType, **kwargs) -> MediaContent:
        """Process a PDF with Glyph optimization.

        Raises:
            MediaProcessingError: If media_type is not DOCUMENT or extraction fails.
        """
        if media_type != MediaType.DOCUMENT:
            raise MediaProcessingError(f"GlyphPDFProcessor only handles documents, got {media_type}")

        try:
            # Extract content with Glyph optimizations
            content, metadata = self._extract_glyph_optimized_content(file_path)

            return self._create_media_content(
                content=content,
                file_path=file_path,
                media_type=media_type,
                content_format=ContentFormat.TEXT,
                mime_type="application/pdf",
                **metadata
            )

        except Exception as e:
            raise MediaProcessingError(f"Failed to process PDF for Glyph: {str(e)}") from e

    def _extract_glyph_optimized_content(self, file_path: Path) -> Tuple[str, Dict[str, Any]]:
        """Extract PDF content optimized for Glyph visual compression.

        Returns:
            Tuple of (optimized text content, metadata dict).
        """
        doc = fitz.open(str(file_path))

        content_parts = []
        page_count = 0

        try:
            for page_num in range(len(doc)):
                page = doc[page_num]
                page_count += 1

                # Text blocks with position information (needed for table detection)
                blocks = page.get_text("dict")

                # Preserve mathematical notation and tables while extracting
                page_content = self._process_page_blocks(blocks, page_num)

                if page_content.strip():
                    content_parts.append(page_content)

            # Combine all pages
            full_content = "\n\n".join(content_parts)

            # Apply Glyph-specific optimizations
            optimized_content = self._apply_glyph_optimizations(full_content)

            metadata = {
                'page_count': page_count,
                'character_count': len(optimized_content),
                'processing_method': 'glyph_optimized',
                'math_notation_preserved': self.preserve_math_notation,
                'table_layout_preserved': self.preserve_table_layout
            }

            return optimized_content, metadata

        finally:
            doc.close()

    def _process_page_blocks(self, blocks: Dict, page_num: int) -> str:
        """Process page blocks while preserving mathematical and tabular content.

        Args:
            blocks: PyMuPDF "dict" text structure for one page.
            page_num: Zero-based page index (kept for interface stability).
        """
        page_lines = []

        for block in blocks.get("blocks", []):
            if "lines" not in block:
                continue

            # Emit detected tables compactly only when table preservation is
            # enabled (previously the flag was stored but never consulted)
            if self.preserve_table_layout and self._is_table_block(block):
                table_content = self._extract_table_content(block)
                if table_content:
                    page_lines.append(table_content)
            else:
                # Process as regular text, preserving math notation
                block_text = self._extract_block_text(block)
                if block_text.strip():
                    page_lines.append(block_text)

        return "\n".join(page_lines)

    def _is_table_block(self, block: Dict) -> bool:
        """Heuristically detect whether a block represents tabular data.

        Signals used: multiple lines whose spans start at several distinct
        x positions, suggesting aligned columns.
        """
        lines = block.get("lines", [])
        if len(lines) < 2:
            return False

        # Collect the left x coordinate of every span on the block's lines
        x_positions = []
        for line in lines:
            for span in line.get("spans", []):
                bbox = span.get("bbox", [])
                if len(bbox) >= 4:
                    x_positions.append(bbox[0])  # Left x coordinate

        if len(x_positions) < 4:
            return False

        # At least 3 distinct column starts (rounded to 0.1pt) suggests a table
        unique_x = sorted(set(round(x, 1) for x in x_positions))
        return len(unique_x) >= 3

    def _extract_table_content(self, block: Dict) -> str:
        """Extract a table block as compact pipe-separated rows."""
        lines = block.get("lines", [])
        table_rows = []

        for line in lines:
            row_parts = []
            # Left-to-right span order reconstructs the column order
            spans = sorted(line.get("spans", []), key=lambda s: s.get("bbox", [0])[0])

            for span in spans:
                text = span.get("text", "").strip()
                if text:
                    row_parts.append(text)

            if row_parts:
                # Compact table format (pipe-separated)
                table_rows.append(" | ".join(row_parts))

        if table_rows:
            return "\n".join(table_rows)
        return ""

    def _extract_block_text(self, block: Dict) -> str:
        """Extract text from a block, preserving mathematical notation."""
        lines = block.get("lines", [])
        block_lines = []

        for line in lines:
            line_text = ""
            for span in line.get("spans", []):
                text = span.get("text", "")

                # Preserve mathematical symbols and notation
                if self.preserve_math_notation:
                    text = self._preserve_math_symbols(text)

                line_text += text

            if line_text.strip():
                block_lines.append(line_text.strip())

        return "\n".join(block_lines)

    def _preserve_math_symbols(self, text: str) -> str:
        """Preserve mathematical symbols and compact notation.

        Symbols are kept as-is (no expansion of "α" to "alpha", "∑" to
        "sum", etc.); only whitespace around operators is tightened.
        """
        # Remove whitespace around mathematical operators for compactness
        text = re.sub(r'\s*([+\-×÷=<>≤≥≠∈∉∪∩∀∃∑∏∫])\s*', r'\1', text)

        # Subscripts/superscripts and Unicode math symbols pass through intact
        return text

    def _apply_glyph_optimizations(self, content: str) -> str:
        """Apply final whitespace optimizations for Glyph visual compression."""
        if not self.compact_whitespace:
            return content

        # Remove excessive blank lines (keep max 1)
        content = re.sub(r'\n\s*\n\s*\n+', '\n\n', content)

        # Remove trailing whitespace
        lines = [line.rstrip() for line in content.split('\n')]

        # Collapse runs of 3+ spaces down to exactly 2. The previous pattern
        # (' +' -> ' ') collapsed everything to a single space, contradicting
        # the stated "keep max 2 consecutive spaces" intent and destroying
        # the minimal column alignment this class tries to preserve.
        optimized_lines = [re.sub(r' {3,}', '  ', line) for line in lines]

        return '\n'.join(optimized_lines)
|
|
@@ -5,6 +5,8 @@ This module provides comprehensive image processing capabilities using PIL,
|
|
|
5
5
|
optimized for vision model inputs across different providers.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
+
from __future__ import annotations # PEP 563 - deferred type hint evaluation for optional dependencies
|
|
9
|
+
|
|
8
10
|
import base64
|
|
9
11
|
import io
|
|
10
12
|
import mimetypes
|
|
@@ -99,7 +101,11 @@ class ImageProcessor(BaseMediaHandler):
|
|
|
99
101
|
|
|
100
102
|
try:
|
|
101
103
|
# Override defaults with kwargs
|
|
102
|
-
|
|
104
|
+
# Preserve original format unless explicitly specified
|
|
105
|
+
original_format = file_path.suffix.lower().lstrip('.')
|
|
106
|
+
if original_format == 'jpg':
|
|
107
|
+
original_format = 'jpeg'
|
|
108
|
+
target_format = kwargs.get('target_format', original_format if original_format in ['png', 'jpeg', 'webp', 'gif'] else 'jpeg')
|
|
103
109
|
model_name = kwargs.get('model_name', None)
|
|
104
110
|
|
|
105
111
|
# Use model-specific maximum resolution if available
|
|
@@ -6,13 +6,13 @@ This module provides comprehensive processing capabilities for Microsoft Office
|
|
|
6
6
|
document processing in 2025.
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
-
import logging
|
|
10
9
|
from pathlib import Path
|
|
11
10
|
from typing import Optional, Dict, Any, List, Union, Tuple
|
|
12
11
|
import json
|
|
13
12
|
|
|
14
13
|
from ..base import BaseMediaHandler, MediaProcessingError
|
|
15
14
|
from ..types import MediaContent, MediaType, ContentFormat, MediaProcessingResult
|
|
15
|
+
from ...utils.structured_logging import get_logger
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
class OfficeProcessor(BaseMediaHandler):
|
|
@@ -36,7 +36,7 @@ class OfficeProcessor(BaseMediaHandler):
|
|
|
36
36
|
**kwargs: Additional configuration options
|
|
37
37
|
"""
|
|
38
38
|
super().__init__(**kwargs)
|
|
39
|
-
self.logger =
|
|
39
|
+
self.logger = get_logger(__name__)
|
|
40
40
|
|
|
41
41
|
# Configuration options
|
|
42
42
|
self.extract_tables = kwargs.get('extract_tables', True)
|
|
@@ -50,12 +50,25 @@ class TextProcessor(BaseMediaHandler):
|
|
|
50
50
|
|
|
51
51
|
# Set capabilities for text processing
|
|
52
52
|
from ..types import MediaCapabilities
|
|
53
|
+
# TextProcessor can handle any text file through its plain text fallback
|
|
54
|
+
# We list common formats but the processor is not limited to these
|
|
53
55
|
self.capabilities = MediaCapabilities(
|
|
54
56
|
vision_support=False,
|
|
55
57
|
audio_support=False,
|
|
56
58
|
video_support=False,
|
|
57
59
|
document_support=True,
|
|
58
|
-
supported_document_formats=[
|
|
60
|
+
supported_document_formats=[
|
|
61
|
+
# Core text formats
|
|
62
|
+
'txt', 'md', 'markdown', 'csv', 'tsv',
|
|
63
|
+
'json', 'jsonl', 'xml', 'html', 'htm',
|
|
64
|
+
'yaml', 'yml', 'toml', 'ini', 'cfg', 'conf',
|
|
65
|
+
# Programming languages (common examples)
|
|
66
|
+
'py', 'js', 'java', 'c', 'cpp', 'go', 'rs', 'rb', 'php',
|
|
67
|
+
'r', 'R', 'rmd', 'Rmd', 'sql', 'sh',
|
|
68
|
+
# Notebooks and documentation
|
|
69
|
+
'ipynb', 'qmd', 'rst', 'tex', 'bib',
|
|
70
|
+
# Any other text-based format through fallback processing
|
|
71
|
+
],
|
|
59
72
|
max_file_size=self.max_file_size
|
|
60
73
|
)
|
|
61
74
|
|
|
@@ -541,7 +554,8 @@ class TextProcessor(BaseMediaHandler):
|
|
|
541
554
|
"""
|
|
542
555
|
return {
|
|
543
556
|
'processor_type': 'TextProcessor',
|
|
544
|
-
'supported_formats':
|
|
557
|
+
'supported_formats': self.capabilities.supported_document_formats,
|
|
558
|
+
'supports_any_text_file': True, # Through plain text fallback
|
|
545
559
|
'capabilities': {
|
|
546
560
|
'default_encoding': self.default_encoding,
|
|
547
561
|
'csv_delimiter': self.csv_delimiter,
|
|
@@ -549,7 +563,8 @@ class TextProcessor(BaseMediaHandler):
|
|
|
549
563
|
'preserve_structure': self.preserve_structure,
|
|
550
564
|
'pandas_integration': PANDAS_AVAILABLE,
|
|
551
565
|
'structured_formatting': True,
|
|
552
|
-
'metadata_extraction': True
|
|
566
|
+
'metadata_extraction': True,
|
|
567
|
+
'plain_text_fallback': True # Can handle any text file
|
|
553
568
|
},
|
|
554
569
|
'dependencies': {
|
|
555
570
|
'pandas': PANDAS_AVAILABLE
|