abstractcore-2.9.1-py3-none-any.whl → abstractcore-2.11.4-py3-none-any.whl
- abstractcore/__init__.py +7 -27
- abstractcore/apps/deepsearch.py +9 -4
- abstractcore/apps/extractor.py +33 -100
- abstractcore/apps/intent.py +19 -0
- abstractcore/apps/judge.py +20 -1
- abstractcore/apps/summarizer.py +20 -1
- abstractcore/architectures/detection.py +34 -1
- abstractcore/architectures/response_postprocessing.py +313 -0
- abstractcore/assets/architecture_formats.json +38 -8
- abstractcore/assets/model_capabilities.json +882 -160
- abstractcore/compression/__init__.py +1 -2
- abstractcore/compression/glyph_processor.py +6 -4
- abstractcore/config/main.py +52 -20
- abstractcore/config/manager.py +390 -12
- abstractcore/config/vision_config.py +5 -5
- abstractcore/core/interface.py +151 -3
- abstractcore/core/session.py +16 -10
- abstractcore/download.py +1 -1
- abstractcore/embeddings/manager.py +20 -6
- abstractcore/endpoint/__init__.py +2 -0
- abstractcore/endpoint/app.py +458 -0
- abstractcore/mcp/client.py +3 -1
- abstractcore/media/__init__.py +52 -17
- abstractcore/media/auto_handler.py +42 -22
- abstractcore/media/base.py +44 -1
- abstractcore/media/capabilities.py +12 -33
- abstractcore/media/enrichment.py +105 -0
- abstractcore/media/handlers/anthropic_handler.py +19 -28
- abstractcore/media/handlers/local_handler.py +124 -70
- abstractcore/media/handlers/openai_handler.py +19 -31
- abstractcore/media/processors/__init__.py +4 -2
- abstractcore/media/processors/audio_processor.py +57 -0
- abstractcore/media/processors/office_processor.py +8 -3
- abstractcore/media/processors/pdf_processor.py +46 -3
- abstractcore/media/processors/text_processor.py +22 -24
- abstractcore/media/processors/video_processor.py +58 -0
- abstractcore/media/types.py +97 -4
- abstractcore/media/utils/image_scaler.py +20 -2
- abstractcore/media/utils/video_frames.py +219 -0
- abstractcore/media/vision_fallback.py +136 -22
- abstractcore/processing/__init__.py +32 -3
- abstractcore/processing/basic_deepsearch.py +15 -10
- abstractcore/processing/basic_intent.py +3 -2
- abstractcore/processing/basic_judge.py +3 -2
- abstractcore/processing/basic_summarizer.py +1 -1
- abstractcore/providers/__init__.py +3 -1
- abstractcore/providers/anthropic_provider.py +95 -8
- abstractcore/providers/base.py +1516 -81
- abstractcore/providers/huggingface_provider.py +546 -69
- abstractcore/providers/lmstudio_provider.py +30 -916
- abstractcore/providers/mlx_provider.py +382 -35
- abstractcore/providers/model_capabilities.py +5 -1
- abstractcore/providers/ollama_provider.py +99 -15
- abstractcore/providers/openai_compatible_provider.py +406 -180
- abstractcore/providers/openai_provider.py +188 -44
- abstractcore/providers/openrouter_provider.py +76 -0
- abstractcore/providers/registry.py +61 -5
- abstractcore/providers/streaming.py +138 -33
- abstractcore/providers/vllm_provider.py +92 -817
- abstractcore/server/app.py +478 -28
- abstractcore/server/audio_endpoints.py +139 -0
- abstractcore/server/vision_endpoints.py +1319 -0
- abstractcore/structured/handler.py +316 -41
- abstractcore/tools/common_tools.py +5501 -2012
- abstractcore/tools/comms_tools.py +1641 -0
- abstractcore/tools/core.py +37 -7
- abstractcore/tools/handler.py +4 -9
- abstractcore/tools/parser.py +49 -2
- abstractcore/tools/tag_rewriter.py +2 -1
- abstractcore/tools/telegram_tdlib.py +407 -0
- abstractcore/tools/telegram_tools.py +261 -0
- abstractcore/utils/cli.py +1085 -72
- abstractcore/utils/structured_logging.py +29 -8
- abstractcore/utils/token_utils.py +2 -0
- abstractcore/utils/truncation.py +29 -0
- abstractcore/utils/version.py +3 -4
- abstractcore/utils/vlm_token_calculator.py +12 -2
- abstractcore-2.11.4.dist-info/METADATA +562 -0
- abstractcore-2.11.4.dist-info/RECORD +133 -0
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/WHEEL +1 -1
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/entry_points.txt +1 -0
- abstractcore-2.9.1.dist-info/METADATA +0 -1190
- abstractcore-2.9.1.dist-info/RECORD +0 -119
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/top_level.txt +0 -0
--- a/abstractcore/media/handlers/local_handler.py
+++ b/abstractcore/media/handlers/local_handler.py
@@ -10,6 +10,7 @@ from typing import Dict, Any, List, Optional, Union

 from ..base import BaseProviderMediaHandler, MediaProcessingError
 from ..types import MediaContent, MediaType, ContentFormat
+from ..enrichment import build_enrichment_item

 # Import vision detection from existing architecture system
 try:
@@ -48,6 +49,10 @@ class LocalMediaHandler(BaseProviderMediaHandler):
         self.prefer_text_extraction = kwargs.get('prefer_text_extraction', True)
         self.embed_images_in_text = kwargs.get('embed_images_in_text', False)

+        # Collected "media enrichment" entries (input fallback transparency).
+        # Populated when a modality is converted into text context (e.g. image caption).
+        self.media_enrichment: List[Dict[str, Any]] = []
+
         self.logger.debug(f"Initialized {provider_name} local media handler with model={self.model_name}, capabilities: {self.capabilities}")

     def _process_internal(self, file_path: Path, media_type: MediaType, **kwargs) -> MediaContent:
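The new `media_enrichment` list is the transparency channel for input fallbacks: whenever a non-text modality is flattened into text context (an image caption, for instance), the handler records what happened. A minimal sketch of how calling code might inspect it after a format call; the `handler` setup is assumed, since the real pipeline constructs the handler through the provider:

    # Hedged sketch: inspect enrichment entries after the handler formatted a message.
    # The fields used below (status, input_name, error) are taken from the
    # build_enrichment_item() call sites in this diff; other keys are not shown here.
    for item in handler.media_enrichment:
        if item.get("status") != "used":
            print(f"{item.get('input_name')}: {item.get('status')} ({item.get('error')})")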
@@ -232,6 +237,9 @@
         Returns:
             Formatted message (structured dict for vision models, string for text-only)
         """
+        # Reset per-call enrichment collection.
+        self.media_enrichment = []
+
         # Check if we have images in the media contents
         has_images = any(mc.media_type == MediaType.IMAGE for mc in media_contents)

@@ -295,62 +303,127 @@
         This is often more reliable for local providers that don't have
         robust multimodal support. For images on text-only models, uses vision fallback.
         """
-        … (3 removed lines not captured in this extract)
-        if text.strip():
-            message_parts.append(text)
+        user_text = text.strip() if text else ""
+        image_context_parts: List[str] = []
+        other_parts: List[str] = []

         # Add processed content from media
         for i, media_content in enumerate(media_contents):
             if media_content.media_type == MediaType.IMAGE:
-                … (42 removed lines not captured in this extract)
+                file_name = media_content.metadata.get('file_name', 'image')
+                # In text-embedded mode, images are not passed natively.
+                # Always prefer the vision fallback (caption → text context) when configured.
+                try:
+                    from ..vision_fallback import VisionFallbackHandler, VisionNotConfiguredError
+
+                    fallback_handler = VisionFallbackHandler()
+
+                    # Get the actual file path from media_content object
+                    file_path = (
+                        media_content.file_path
+                        or media_content.metadata.get('file_path')
+                        or media_content.metadata.get('file_name', 'image')
+                    )
+
+                    # Generate description using vision fallback
+                    description, trace = fallback_handler.create_description_with_trace(
+                        str(file_path), user_text or None
+                    )
+                    description = str(description or "").strip()
+
+                    if description:
+                        image_context_parts.append(f"Image {i+1} ({file_name}): {description}")
+                        self.media_enrichment.append(
+                            build_enrichment_item(
+                                status="used",
+                                input_modality="image",
+                                summary_kind="caption",
+                                policy=str(trace.get("strategy") or ""),
+                                backend=trace.get("backend") if isinstance(trace, dict) else None,
+                                input_index=i + 1,
+                                input_name=str(file_name),
+                                injected_text=description,
+                            )
+                        )
+                    else:
+                        other_parts.append(f"[Image {i+1}: {file_name} - no description returned]")
+                        self.media_enrichment.append(
+                            build_enrichment_item(
+                                status="error",
+                                input_modality="image",
+                                summary_kind="caption",
+                                policy=str(getattr(fallback_handler.vision_config, "strategy", "") or ""),
+                                input_index=i + 1,
+                                input_name=str(file_name),
+                                error="Vision fallback returned empty description",
+                            )
+                        )
+
+                except VisionNotConfiguredError as e:
+                    # Vision not configured - show warning to USER, not model
+                    self.logger.warning("Vision capability not configured for text-only models")
+                    self.logger.warning("To enable image analysis with text-only models:")
+                    self.logger.warning("🔸 EASIEST: Download BLIP vision model (990MB): abstractcore --download-vision-model")
+                    self.logger.warning("🔸 Use existing Ollama model: abstractcore --set-vision-caption qwen2.5vl:7b")
+                    self.logger.warning("🔸 Use cloud API: abstractcore --set-vision-provider openai --model gpt-4o")
+                    self.logger.warning("🔸 Interactive setup: abstractcore --configure")
+                    self.logger.warning("Current status: abstractcore --status")
+
+                    self.media_enrichment.append(
+                        build_enrichment_item(
+                            status="skipped",
+                            input_modality="image",
+                            summary_kind="caption",
+                            policy="disabled",
+                            input_index=i + 1,
+                            input_name=str(file_name),
+                            error=str(e),
+                        )
+                    )
+
+                    # Provide minimal placeholder to model (not configuration instructions!)
+                    other_parts.append(f"[Image {i+1}: {file_name}]")
+
+                except Exception as e:
+                    self.logger.warning(f"Vision fallback failed: {e}")
+                    self.media_enrichment.append(
+                        build_enrichment_item(
+                            status="error",
+                            input_modality="image",
+                            summary_kind="caption",
+                            policy="unknown",
+                            input_index=i + 1,
+                            input_name=str(file_name),
+                            error=str(e),
+                        )
+                    )
+                    # Fallback to basic placeholder
+                    other_parts.append(f"[Image {i+1}: {file_name} - vision processing unavailable]")
             else:
                 # Embed text/document content directly
                 content = str(media_content.content)
                 file_name = media_content.metadata.get('file_name', f'document_{i+1}')
-                … (1 removed line not captured in this extract)
+                other_parts.append(f"\n\n--- Content from {file_name} ---\n{content}\n--- End of {file_name} ---")
+
+        message_parts: List[str] = []
+
+        if image_context_parts:
+            message_parts.append(
+                "Visual context from attached image(s) "
+                "(treat as directly observed; do not mention this section):"
+            )
+            message_parts.extend(image_context_parts)
+
+        # Preserve prior behavior when we don't have image context.
+        if user_text and not image_context_parts:
+            message_parts.append(user_text)
+
+        message_parts.extend(other_parts)
+
+        # When we do have image context, place the user request last for recency.
+        if user_text and image_context_parts:
+            message_parts.append("Now answer the user's request:")
+            message_parts.append(user_text)

         return "\n\n".join(message_parts)

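Note the assembly order: captions come first as observed visual context, embedded document content sits in the middle, and the user's request is placed last for recency. A standalone illustration of the string a text-only model would receive for one captioned image plus a question (the caption itself is invented):

    # Illustrative only; mirrors the join logic above.
    parts = [
        "Visual context from attached image(s) (treat as directly observed; do not mention this section):",
        "Image 1 (photo.jpg): A red bicycle leaning against a brick wall.",  # hypothetical caption
        "Now answer the user's request:",
        "What color is the bicycle?",
    ]
    prompt = "\n\n".join(parts)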
@@ -502,27 +575,8 @@ class LocalMediaHandler(BaseProviderMediaHandler):

         return False

-    def estimate_tokens_for_media(self, media_content: MediaContent) -> int:
-        """
-        Estimate token usage for media content with local models.
-
-        Args:
-            media_content: MediaContent to estimate
-
-        Returns:
-            Estimated token count
-        """
-        if media_content.media_type == MediaType.IMAGE:
-            # Local vision models typically use fewer tokens than cloud models
-            # but this varies significantly by model architecture
-            return 512  # Conservative estimate
-
-        elif media_content.media_type in [MediaType.TEXT, MediaType.DOCUMENT]:
-            # Local models typically use similar tokenization to their base models
-            content_length = len(str(media_content.content))
-            return content_length // 4  # Rough estimate
-
-        return 0
+    # Note: Uses base class estimate_tokens_for_media() with default _estimate_image_tokens()
+    # Local models use ~512 tokens per image (conservative estimate), which matches base default

     def get_model_media_limits(self, model: str) -> Dict[str, Any]:
         """
@@ -550,4 +604,4 @@ class LocalMediaHandler(BaseProviderMediaHandler):
             'supported_image_formats': ['png', 'jpeg', 'jpg', 'gif', 'bmp']
         })

-        return limits
\ No newline at end of file
+        return limits
--- a/abstractcore/media/handlers/openai_handler.py
+++ b/abstractcore/media/handlers/openai_handler.py
@@ -271,44 +271,32 @@ class OpenAIMediaHandler(BaseProviderMediaHandler):

         return False

-    def … (old signature truncated in this extract)
+    def _estimate_image_tokens(self, media_content: MediaContent) -> int:
         """
-        … (1 removed line not captured in this extract)
+        OpenAI-specific image token estimation.

-        … (3 removed lines not captured in this extract)
-        Returns:
-            Estimated token count
+        Uses tile-based calculation for high detail images, with special
+        handling for Qwen models via OpenAI-compatible API.
         """
-
-        # Image token estimation varies by model
-        detail_level = media_content.metadata.get('detail_level', 'auto')
+        detail_level = media_content.metadata.get('detail_level', 'auto')

-        … (4 removed lines not captured in this extract)
-            else:
-                return 85  # OpenAI low detail token count
+        if detail_level == 'low':
+            # Qwen models use 256 tokens for low detail, OpenAI uses 85
+            if self._is_qwen_model():
+                return 256
             else:
-        … (5 removed lines not captured in this extract)
-            tiles_width = (width + 511) // 512
-            tiles_height = (height + 511) // 512
-            total_tiles = tiles_width * tiles_height
-
-            return 85 + (170 * total_tiles)
+                return 85
+        else:
+            # High detail: tile-based calculation
+            width = media_content.metadata.get('final_size', [512, 512])[0]
+            height = media_content.metadata.get('final_size', [512, 512])[1]

-        … (4 removed lines not captured in this extract)
+            # OpenAI's tile-based calculation (simplified)
+            tiles_width = (width + 511) // 512
+            tiles_height = (height + 511) // 512
+            total_tiles = tiles_width * tiles_height

-        … (1 removed line not captured in this extract)
+            return 85 + (170 * total_tiles)

     def get_model_media_limits(self, model: str) -> Dict[str, Any]:
         """
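The tile arithmetic is easy to check by hand: a 1024×768 high-detail image needs (1024+511)//512 = 2 by (768+511)//512 = 2 tiles, so 85 + 170*4 = 765 tokens. A standalone sketch of the same calculation (the function name is ours, not the library's):

    def openai_image_tokens(width: int, height: int, detail: str = "high", qwen: bool = False) -> int:
        # Mirrors the diff: flat cost for low detail, 512px tiles plus a base cost for high.
        if detail == "low":
            return 256 if qwen else 85
        tiles = ((width + 511) // 512) * ((height + 511) // 512)
        return 85 + 170 * tiles

    assert openai_image_tokens(1024, 768) == 765
    assert openai_image_tokens(512, 512) == 255  # single tile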
--- a/abstractcore/media/processors/__init__.py
+++ b/abstractcore/media/processors/__init__.py
@@ -9,6 +9,8 @@ from .image_processor import ImageProcessor
 from .text_processor import TextProcessor
 from .pdf_processor import PDFProcessor
 from .office_processor import OfficeProcessor
+from .audio_processor import AudioProcessor
+from .video_processor import VideoProcessor

 # Import Glyph processor if available
 try:
@@ -18,6 +20,6 @@ except ImportError:
     GlyphProcessor = None
     GLYPH_AVAILABLE = False

-__all__ = ['ImageProcessor', 'TextProcessor', 'PDFProcessor', 'OfficeProcessor']
+__all__ = ['ImageProcessor', 'TextProcessor', 'PDFProcessor', 'OfficeProcessor', 'AudioProcessor', 'VideoProcessor']
 if GLYPH_AVAILABLE:
-    __all__.append('GlyphProcessor')
\ No newline at end of file
+    __all__.append('GlyphProcessor')
--- /dev/null
+++ b/abstractcore/media/processors/audio_processor.py
@@ -0,0 +1,57 @@
+"""
+Audio processor for AbstractCore media handling.
+
+v0 goals:
+- Treat audio as a first-class media type (MediaType.AUDIO) in the media pipeline.
+- Keep processing lightweight and dependency-free (store as a file ref by default).
+
+Higher-level semantic handling (STT, captioning, music/signal analysis) is handled
+by policy and capability layers (see planned audio policy backlog).
+"""
+
+from __future__ import annotations
+
+import mimetypes
+from pathlib import Path
+
+from ..base import BaseMediaHandler, MediaProcessingError
+from ..types import ContentFormat, MediaCapabilities, MediaContent, MediaType
+
+
+class AudioProcessor(BaseMediaHandler):
+    """Lightweight audio processor that stores an audio file reference."""
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.capabilities = MediaCapabilities(
+            vision_support=False,
+            audio_support=True,
+            video_support=False,
+            document_support=False,
+            max_file_size=self.max_file_size,
+        )
+
+    def _process_internal(self, file_path: Path, media_type: MediaType, **kwargs) -> MediaContent:
+        if media_type != MediaType.AUDIO:
+            raise MediaProcessingError(f"AudioProcessor only handles audio, got {media_type}")
+
+        mime_type, _enc = mimetypes.guess_type(str(file_path))
+        mime_type = mime_type or "application/octet-stream"
+
+        metadata = {
+            "file_name": file_path.name,
+            "file_path": str(file_path),
+            "file_size": file_path.stat().st_size if file_path.exists() else None,
+            "processor": self.__class__.__name__,
+        }
+        metadata.update(kwargs.get("metadata", {}) if isinstance(kwargs.get("metadata"), dict) else {})
+
+        return MediaContent(
+            media_type=MediaType.AUDIO,
+            content=str(file_path),
+            content_format=ContentFormat.FILE_PATH,
+            mime_type=mime_type,
+            file_path=str(file_path),
+            metadata=metadata,
+        )
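A quick usage sketch for the new processor. Hedged: `_process_internal` is called directly here for illustration only; in practice the media pipeline dispatches to it, and that public entry point is not shown in this diff.

    from pathlib import Path

    processor = AudioProcessor()
    mc = processor._process_internal(Path("clip.wav"), MediaType.AUDIO)
    # The audio is stored as a file reference, not decoded:
    # mc.content_format == ContentFormat.FILE_PATH
    # mc.mime_type is e.g. "audio/x-wav" (from mimetypes) or the octet-stream fallback
    print(mc.media_type, mc.content)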
--- a/abstractcore/media/processors/office_processor.py
+++ b/abstractcore/media/processors/office_processor.py
@@ -13,6 +13,7 @@ import json
 from ..base import BaseMediaHandler, MediaProcessingError
 from ..types import MediaContent, MediaType, ContentFormat, MediaProcessingResult
 from ...utils.structured_logging import get_logger
+from ...utils.token_utils import estimate_tokens


 class OfficeProcessor(BaseMediaHandler):
@@ -129,14 +130,18 @@
             else:
                 raise MediaProcessingError(f"Unsupported Office file type: {file_extension}")

+            # Add token estimation to metadata (no truncation, just informational)
+            metadata['estimated_tokens'] = estimate_tokens(content)
+            metadata['content_length'] = len(content)
+
             # Create MediaContent object
             return self._create_media_content(
                 content=content,
+                file_path=file_path,
                 media_type=MediaType.DOCUMENT,
                 content_format=ContentFormat.TEXT,
                 mime_type=self._get_mime_type(file_extension),
-                … (1 removed line not captured in this extract)
-                metadata=metadata
+                **metadata
             )

         except Exception as e:
@@ -487,4 +492,4 @@
             'metadata_extraction': self.include_metadata,
             'chunking_support': self.supports_chunking()
         }
-        }
\ No newline at end of file
+        }
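The `estimated_tokens` / `content_length` fields (also added to the PDF and text processors below) let callers budget context before shipping extracted text to a model. A hedged sketch of such a pre-flight check; `office_processor.process(...)` is an assumed public entry point, since only `_create_media_content` appears in this diff:

    budget = 8000
    mc = office_processor.process("report.docx")  # hypothetical public call
    if mc.metadata.get("estimated_tokens", 0) > budget:
        print("Document likely exceeds the context budget; chunk or summarize first.")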
--- a/abstractcore/media/processors/pdf_processor.py
+++ b/abstractcore/media/processors/pdf_processor.py
@@ -22,8 +22,39 @@ except ImportError:
     PYMUPDF_AVAILABLE = False
     fitz = None

+import re
+
 from ..base import BaseMediaHandler, MediaProcessingError
 from ..types import MediaContent, MediaType, ContentFormat
+from ...utils.token_utils import estimate_tokens
+
+
+def _safe_pdf_version(doc: Any) -> Optional[str]:
+    """Best-effort PDF version across PyMuPDF variants (callable/property/absent)."""
+    try:
+        pv = getattr(doc, "pdf_version", None)
+        if pv is not None:
+            out = pv() if callable(pv) else pv
+            if out is not None:
+                s = str(out).strip()
+                if s and s.lower() != "none":
+                    return s
+    except Exception:
+        pass
+
+    # PyMuPDF 1.26+ exposes the PDF version via `doc.metadata["format"]` (e.g. "PDF 1.5").
+    try:
+        md = getattr(doc, "metadata", None)
+        if isinstance(md, dict):
+            fmt = md.get("format")
+            if isinstance(fmt, str) and fmt.strip():
+                m = re.search(r"(?i)pdf\s*[- ]?\s*([0-9]+(?:\.[0-9]+)?)", fmt.strip())
+                if m:
+                    return m.group(1)
+    except Exception:
+        pass
+
+    return None


 class PDFProcessor(BaseMediaHandler):
@@ -119,6 +150,10 @@ class PDFProcessor(BaseMediaHandler):
             else:
                 mime_type = 'text/plain'

+            # Add token estimation to metadata (no truncation, just informational)
+            metadata['estimated_tokens'] = estimate_tokens(content)
+            metadata['content_length'] = len(content)
+
             return self._create_media_content(
                 content=content,
                 file_path=file_path,
@@ -315,12 +350,15 @@
                 'subject': pdf_metadata.get('subject', ''),
                 'creator': pdf_metadata.get('creator', ''),
                 'producer': pdf_metadata.get('producer', ''),
+                'format': pdf_metadata.get('format', ''),
                 'creation_date': pdf_metadata.get('creationDate', ''),
                 'modification_date': pdf_metadata.get('modDate', ''),
                 'page_count': doc.page_count,
                 'encrypted': doc.needs_pass,
-                'pdf_version': doc.pdf_version()
             })
+            pdf_version = _safe_pdf_version(doc)
+            if pdf_version is not None:
+                metadata["pdf_version"] = pdf_version

             # Clean up empty values
             metadata = {k: v for k, v in metadata.items() if v}
@@ -391,9 +429,14 @@
                 'file_size': file_path.stat().st_size,
                 'page_count': doc.page_count,
                 'encrypted': doc.needs_pass,
-                'pdf_version': doc.pdf_version(),
                 'metadata': doc.metadata
             }
+            fmt = doc.metadata.get("format") if isinstance(doc.metadata, dict) else None
+            if isinstance(fmt, str) and fmt.strip():
+                info["format"] = fmt.strip()
+            pdf_version = _safe_pdf_version(doc)
+            if pdf_version is not None:
+                info["pdf_version"] = pdf_version

             # Get first page info
             if doc.page_count > 0:
@@ -482,4 +525,4 @@
             'pymupdf4llm': PYMUPDF4LLM_AVAILABLE,
             'pymupdf': PYMUPDF_AVAILABLE
         }
-        }
\ No newline at end of file
+        }
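The metadata fallback in `_safe_pdf_version` hinges on one regex pulling the version number out of strings like "PDF 1.5". A quick standalone check of just that step:

    import re

    pattern = r"(?i)pdf\s*[- ]?\s*([0-9]+(?:\.[0-9]+)?)"
    for fmt in ("PDF 1.5", "PDF-1.7", "pdf 2.0"):
        m = re.search(pattern, fmt)
        print(fmt, "->", m.group(1) if m else None)
    # PDF 1.5 -> 1.5
    # PDF-1.7 -> 1.7
    # pdf 2.0 -> 2.0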
--- a/abstractcore/media/processors/text_processor.py
+++ b/abstractcore/media/processors/text_processor.py
@@ -19,6 +19,8 @@ except ImportError:

 from ..base import BaseMediaHandler, MediaProcessingError
 from ..types import MediaContent, MediaType, ContentFormat
+from ...utils.token_utils import estimate_tokens
+from ...utils.truncation import preview_text


 class TextProcessor(BaseMediaHandler):
@@ -129,6 +131,10 @@ class TextProcessor(BaseMediaHandler):
         # Determine appropriate MIME type
         mime_type = self._get_mime_type_for_extension(extension)

+        # Add token estimation to metadata (no truncation, just informational)
+        metadata['estimated_tokens'] = estimate_tokens(content)
+        metadata['content_length'] = len(content)
+
         return self._create_media_content(
             content=content,
             file_path=file_path,
@@ -181,11 +187,9 @@
                 null_count = df[col].isnull().sum()
                 content_parts.append(f"- {col} ({dtype}, {null_count} null values)")

-            … (1 removed line not captured in this extract)
-            content_parts.append(… (rest of line not captured)
-            … (1 removed line not captured in this extract)
-            if len(df) > 10:
-                content_parts.append(f"\n... and {len(df) - 10} more rows")
+            # Always include full data - no truncation
+            content_parts.append("\n## Data:")
+            content_parts.append(df.to_csv(index=False, sep=delimiter))

             content = "\n".join(content_parts)

@@ -196,7 +200,7 @@
                 'data_types': {col: str(dtype) for col, dtype in df.dtypes.items()},
                 'delimiter': delimiter,
                 'has_header': True,
-                'null_values': df.isnull().sum().to_dict()
+                'null_values': df.isnull().sum().to_dict(),
             }

         else:
@@ -221,12 +225,10 @@
             for col in header:
                 content_parts.append(f"- {col}")

-            … (4 removed lines not captured in this extract)
-            if len(data_rows) > 10:
-                content_parts.append(f"... and {len(data_rows) - 10} more rows")
+            # Always include full data - no truncation
+            content_parts.append("\n## Data:")
+            for row in data_rows:
+                content_parts.append(delimiter.join(row))

             content = "\n".join(content_parts)

@@ -235,7 +237,7 @@
             'column_count': len(header),
             'columns': header,
             'delimiter': delimiter,
-            'has_header': True
+            'has_header': True,
         }

         return content, metadata
@@ -273,20 +275,16 @@
         content_parts = []
         content_parts.append(f"# {file_path.name}")

+        # Always include full JSON content - no truncation
         if isinstance(data, dict):
             content_parts.append(f"JSON object with {len(data)} keys\n")
-            content_parts.append("## Structure:")
-            content_parts.append(json.dumps(data, indent=2, ensure_ascii=False))
         elif isinstance(data, list):
             content_parts.append(f"JSON array with {len(data)} items\n")
-            content_parts.append("## Sample items:")
-            for i, item in enumerate(data[:5]):
-                content_parts.append(f"Item {i+1}: {json.dumps(item, ensure_ascii=False)}")
-            if len(data) > 5:
-                content_parts.append(f"... and {len(data) - 5} more items")
         else:
-            content_parts.append("JSON primitive value… (rest of line not captured)
-            … (1 removed line not captured in this extract)
+            content_parts.append("JSON primitive value\n")
+
+        content_parts.append("## Content:")
+        content_parts.append(json.dumps(data, indent=2, ensure_ascii=False))

         content = "\n".join(content_parts)

@@ -504,7 +502,7 @@
         summary_parts.append(f"Text document with {metadata.get('word_count', 0)} words and {metadata.get('line_count', 0)} lines")

         # Add content preview
-        preview = … (rest of line not captured)
+        preview = preview_text(content, max_chars=500)
         summary_parts.append(f"\nContent preview:\n{preview}")

         return "\n".join(summary_parts)
@@ -569,4 +567,4 @@
             'dependencies': {
                 'pandas': PANDAS_AVAILABLE
             }
-        }
\ No newline at end of file
+        }
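`preview_text` comes from the new `abstractcore/utils/truncation.py` (+29 lines in the file list above); only its name and the `max_chars=500` call site are visible in this diff. A minimal stand-in under that assumption, truncating long text and marking the cut:

    def preview_text(text: str, max_chars: int = 500) -> str:
        # Assumed behavior, inferred from the name and the call site only.
        if len(text) <= max_chars:
            return text
        return text[:max_chars].rstrip() + "…"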