abstractcore-2.9.1-py3-none-any.whl → abstractcore-2.11.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. abstractcore/__init__.py +7 -27
  2. abstractcore/apps/deepsearch.py +9 -4
  3. abstractcore/apps/extractor.py +33 -100
  4. abstractcore/apps/intent.py +19 -0
  5. abstractcore/apps/judge.py +20 -1
  6. abstractcore/apps/summarizer.py +20 -1
  7. abstractcore/architectures/detection.py +34 -1
  8. abstractcore/architectures/response_postprocessing.py +313 -0
  9. abstractcore/assets/architecture_formats.json +38 -8
  10. abstractcore/assets/model_capabilities.json +882 -160
  11. abstractcore/compression/__init__.py +1 -2
  12. abstractcore/compression/glyph_processor.py +6 -4
  13. abstractcore/config/main.py +52 -20
  14. abstractcore/config/manager.py +390 -12
  15. abstractcore/config/vision_config.py +5 -5
  16. abstractcore/core/interface.py +151 -3
  17. abstractcore/core/session.py +16 -10
  18. abstractcore/download.py +1 -1
  19. abstractcore/embeddings/manager.py +20 -6
  20. abstractcore/endpoint/__init__.py +2 -0
  21. abstractcore/endpoint/app.py +458 -0
  22. abstractcore/mcp/client.py +3 -1
  23. abstractcore/media/__init__.py +52 -17
  24. abstractcore/media/auto_handler.py +42 -22
  25. abstractcore/media/base.py +44 -1
  26. abstractcore/media/capabilities.py +12 -33
  27. abstractcore/media/enrichment.py +105 -0
  28. abstractcore/media/handlers/anthropic_handler.py +19 -28
  29. abstractcore/media/handlers/local_handler.py +124 -70
  30. abstractcore/media/handlers/openai_handler.py +19 -31
  31. abstractcore/media/processors/__init__.py +4 -2
  32. abstractcore/media/processors/audio_processor.py +57 -0
  33. abstractcore/media/processors/office_processor.py +8 -3
  34. abstractcore/media/processors/pdf_processor.py +46 -3
  35. abstractcore/media/processors/text_processor.py +22 -24
  36. abstractcore/media/processors/video_processor.py +58 -0
  37. abstractcore/media/types.py +97 -4
  38. abstractcore/media/utils/image_scaler.py +20 -2
  39. abstractcore/media/utils/video_frames.py +219 -0
  40. abstractcore/media/vision_fallback.py +136 -22
  41. abstractcore/processing/__init__.py +32 -3
  42. abstractcore/processing/basic_deepsearch.py +15 -10
  43. abstractcore/processing/basic_intent.py +3 -2
  44. abstractcore/processing/basic_judge.py +3 -2
  45. abstractcore/processing/basic_summarizer.py +1 -1
  46. abstractcore/providers/__init__.py +3 -1
  47. abstractcore/providers/anthropic_provider.py +95 -8
  48. abstractcore/providers/base.py +1516 -81
  49. abstractcore/providers/huggingface_provider.py +546 -69
  50. abstractcore/providers/lmstudio_provider.py +30 -916
  51. abstractcore/providers/mlx_provider.py +382 -35
  52. abstractcore/providers/model_capabilities.py +5 -1
  53. abstractcore/providers/ollama_provider.py +99 -15
  54. abstractcore/providers/openai_compatible_provider.py +406 -180
  55. abstractcore/providers/openai_provider.py +188 -44
  56. abstractcore/providers/openrouter_provider.py +76 -0
  57. abstractcore/providers/registry.py +61 -5
  58. abstractcore/providers/streaming.py +138 -33
  59. abstractcore/providers/vllm_provider.py +92 -817
  60. abstractcore/server/app.py +478 -28
  61. abstractcore/server/audio_endpoints.py +139 -0
  62. abstractcore/server/vision_endpoints.py +1319 -0
  63. abstractcore/structured/handler.py +316 -41
  64. abstractcore/tools/common_tools.py +5501 -2012
  65. abstractcore/tools/comms_tools.py +1641 -0
  66. abstractcore/tools/core.py +37 -7
  67. abstractcore/tools/handler.py +4 -9
  68. abstractcore/tools/parser.py +49 -2
  69. abstractcore/tools/tag_rewriter.py +2 -1
  70. abstractcore/tools/telegram_tdlib.py +407 -0
  71. abstractcore/tools/telegram_tools.py +261 -0
  72. abstractcore/utils/cli.py +1085 -72
  73. abstractcore/utils/structured_logging.py +29 -8
  74. abstractcore/utils/token_utils.py +2 -0
  75. abstractcore/utils/truncation.py +29 -0
  76. abstractcore/utils/version.py +3 -4
  77. abstractcore/utils/vlm_token_calculator.py +12 -2
  78. abstractcore-2.11.4.dist-info/METADATA +562 -0
  79. abstractcore-2.11.4.dist-info/RECORD +133 -0
  80. {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/WHEEL +1 -1
  81. {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/entry_points.txt +1 -0
  82. abstractcore-2.9.1.dist-info/METADATA +0 -1190
  83. abstractcore-2.9.1.dist-info/RECORD +0 -119
  84. {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/licenses/LICENSE +0 -0
  85. {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/top_level.txt +0 -0
@@ -10,6 +10,7 @@ from typing import Dict, Any, List, Optional, Union

  from ..base import BaseProviderMediaHandler, MediaProcessingError
  from ..types import MediaContent, MediaType, ContentFormat
+ from ..enrichment import build_enrichment_item

  # Import vision detection from existing architecture system
  try:
@@ -48,6 +49,10 @@ class LocalMediaHandler(BaseProviderMediaHandler):
  self.prefer_text_extraction = kwargs.get('prefer_text_extraction', True)
  self.embed_images_in_text = kwargs.get('embed_images_in_text', False)

+ # Collected "media enrichment" entries (input fallback transparency).
+ # Populated when a modality is converted into text context (e.g. image caption).
+ self.media_enrichment: List[Dict[str, Any]] = []
+
  self.logger.debug(f"Initialized {provider_name} local media handler with model={self.model_name}, capabilities: {self.capabilities}")

  def _process_internal(self, file_path: Path, media_type: MediaType, **kwargs) -> MediaContent:
@@ -232,6 +237,9 @@ class LocalMediaHandler(BaseProviderMediaHandler):
  Returns:
  Formatted message (structured dict for vision models, string for text-only)
  """
+ # Reset per-call enrichment collection.
+ self.media_enrichment = []
+
  # Check if we have images in the media contents
  has_images = any(mc.media_type == MediaType.IMAGE for mc in media_contents)

@@ -295,62 +303,127 @@ class LocalMediaHandler(BaseProviderMediaHandler):
  This is often more reliable for local providers that don't have
  robust multimodal support. For images on text-only models, uses vision fallback.
  """
- message_parts = []
-
- # Add main text
- if text.strip():
- message_parts.append(text)
+ user_text = text.strip() if text else ""
+ image_context_parts: List[str] = []
+ other_parts: List[str] = []

  # Add processed content from media
  for i, media_content in enumerate(media_contents):
  if media_content.media_type == MediaType.IMAGE:
- if self.capabilities.vision_support:
- # For vision models, we'll still need to handle images specially
- # This will be handled by the provider's generate method
- message_parts.append(f"[Image {i+1}: {media_content.metadata.get('file_name', 'image')}]")
- else:
- # Use vision fallback for text-only models
- try:
- from ..vision_fallback import VisionFallbackHandler, VisionNotConfiguredError
- fallback_handler = VisionFallbackHandler()
-
- # Get the actual file path from media_content object
- file_path = media_content.file_path or media_content.metadata.get('file_path') or media_content.metadata.get('file_name', 'image')
-
- # Generate description using vision fallback
- description = fallback_handler.create_description(str(file_path), text)
- # Remove the original question from message_parts if it exists
- if message_parts and text.strip() in message_parts[0]:
- message_parts.clear()
- # Completely different approach: make model think it's continuing its own observation
- # No questions, no external framing - just natural continuation
- simple_prompt = f"{description}"
- message_parts.append(simple_prompt)
-
- except VisionNotConfiguredError as e:
- # Vision not configured - show warning to USER, not model
- self.logger.warning("Vision capability not configured for text-only models")
- self.logger.warning("To enable image analysis with text-only models:")
- self.logger.warning("🔸 EASIEST: Download BLIP vision model (990MB): abstractcore --download-vision-model")
- self.logger.warning("🔸 Use existing Ollama model: abstractcore --set-vision-caption qwen2.5vl:7b")
- self.logger.warning("🔸 Use cloud API: abstractcore --set-vision-provider openai --model gpt-4o")
- self.logger.warning("🔸 Interactive setup: abstractcore --configure")
- self.logger.warning("Current status: abstractcore --status")
-
- # Provide minimal placeholder to model (not configuration instructions!)
- file_name = media_content.metadata.get('file_name', 'image')
- message_parts.append(f"[Image {i+1}: {file_name}]")
-
- except Exception as e:
- self.logger.warning(f"Vision fallback failed: {e}")
- # Fallback to basic placeholder
- file_name = media_content.metadata.get('file_name', 'image')
- message_parts.append(f"[Image {i+1}: {file_name} - vision processing unavailable]")
+ file_name = media_content.metadata.get('file_name', 'image')
+ # In text-embedded mode, images are not passed natively.
+ # Always prefer the vision fallback (caption text context) when configured.
+ try:
+ from ..vision_fallback import VisionFallbackHandler, VisionNotConfiguredError
+
+ fallback_handler = VisionFallbackHandler()
+
+ # Get the actual file path from media_content object
+ file_path = (
+ media_content.file_path
+ or media_content.metadata.get('file_path')
+ or media_content.metadata.get('file_name', 'image')
+ )
+
+ # Generate description using vision fallback
+ description, trace = fallback_handler.create_description_with_trace(
+ str(file_path), user_text or None
+ )
+ description = str(description or "").strip()
+
+ if description:
+ image_context_parts.append(f"Image {i+1} ({file_name}): {description}")
+ self.media_enrichment.append(
+ build_enrichment_item(
+ status="used",
+ input_modality="image",
+ summary_kind="caption",
+ policy=str(trace.get("strategy") or ""),
+ backend=trace.get("backend") if isinstance(trace, dict) else None,
+ input_index=i + 1,
+ input_name=str(file_name),
+ injected_text=description,
+ )
+ )
+ else:
+ other_parts.append(f"[Image {i+1}: {file_name} - no description returned]")
+ self.media_enrichment.append(
+ build_enrichment_item(
+ status="error",
+ input_modality="image",
+ summary_kind="caption",
+ policy=str(getattr(fallback_handler.vision_config, "strategy", "") or ""),
+ input_index=i + 1,
+ input_name=str(file_name),
+ error="Vision fallback returned empty description",
+ )
+ )
+
+ except VisionNotConfiguredError as e:
+ # Vision not configured - show warning to USER, not model
+ self.logger.warning("Vision capability not configured for text-only models")
+ self.logger.warning("To enable image analysis with text-only models:")
+ self.logger.warning("🔸 EASIEST: Download BLIP vision model (990MB): abstractcore --download-vision-model")
+ self.logger.warning("🔸 Use existing Ollama model: abstractcore --set-vision-caption qwen2.5vl:7b")
+ self.logger.warning("🔸 Use cloud API: abstractcore --set-vision-provider openai --model gpt-4o")
+ self.logger.warning("🔸 Interactive setup: abstractcore --configure")
+ self.logger.warning("Current status: abstractcore --status")
+
+ self.media_enrichment.append(
+ build_enrichment_item(
+ status="skipped",
+ input_modality="image",
+ summary_kind="caption",
+ policy="disabled",
+ input_index=i + 1,
+ input_name=str(file_name),
+ error=str(e),
+ )
+ )
+
+ # Provide minimal placeholder to model (not configuration instructions!)
+ other_parts.append(f"[Image {i+1}: {file_name}]")
+
+ except Exception as e:
+ self.logger.warning(f"Vision fallback failed: {e}")
+ self.media_enrichment.append(
+ build_enrichment_item(
+ status="error",
+ input_modality="image",
+ summary_kind="caption",
+ policy="unknown",
+ input_index=i + 1,
+ input_name=str(file_name),
+ error=str(e),
+ )
+ )
+ # Fallback to basic placeholder
+ other_parts.append(f"[Image {i+1}: {file_name} - vision processing unavailable]")
  else:
  # Embed text/document content directly
  content = str(media_content.content)
  file_name = media_content.metadata.get('file_name', f'document_{i+1}')
- message_parts.append(f"\n\n--- Content from {file_name} ---\n{content}\n--- End of {file_name} ---")
+ other_parts.append(f"\n\n--- Content from {file_name} ---\n{content}\n--- End of {file_name} ---")
+
+ message_parts: List[str] = []
+
+ if image_context_parts:
+ message_parts.append(
+ "Visual context from attached image(s) "
+ "(treat as directly observed; do not mention this section):"
+ )
+ message_parts.extend(image_context_parts)
+
+ # Preserve prior behavior when we don't have image context.
+ if user_text and not image_context_parts:
+ message_parts.append(user_text)
+
+ message_parts.extend(other_parts)
+
+ # When we do have image context, place the user request last for recency.
+ if user_text and image_context_parts:
+ message_parts.append("Now answer the user's request:")
+ message_parts.append(user_text)

  return "\n\n".join(message_parts)
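For readers of this hunk, a minimal sketch (not part of the package) of how the collected handler.media_enrichment entries might be consumed downstream. The field names are assumed from the build_enrichment_item keyword arguments above; the actual dict shape is defined in abstractcore/media/enrichment.py, which is not shown in this excerpt.

    from typing import Any, Dict, List

    def summarize_enrichment(entries: List[Dict[str, Any]]) -> List[str]:
        # Field names assumed from the call sites above (status / input_index / injected_text / error).
        lines = []
        for item in entries:
            if item.get("status") == "used":
                injected = item.get("injected_text") or ""
                lines.append(f"image {item.get('input_index')}: caption injected ({len(injected)} chars)")
            else:
                lines.append(f"image {item.get('input_index')}: {item.get('status')} - {item.get('error')}")
        return lines

    # e.g. print("\n".join(summarize_enrichment(handler.media_enrichment))) after formatting a message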
@@ -502,27 +575,8 @@ class LocalMediaHandler(BaseProviderMediaHandler):

  return False

- def estimate_tokens_for_media(self, media_content: MediaContent) -> int:
- """
- Estimate token usage for media content with local models.
-
- Args:
- media_content: MediaContent to estimate
-
- Returns:
- Estimated token count
- """
- if media_content.media_type == MediaType.IMAGE:
- # Local vision models typically use fewer tokens than cloud models
- # but this varies significantly by model architecture
- return 512 # Conservative estimate
-
- elif media_content.media_type in [MediaType.TEXT, MediaType.DOCUMENT]:
- # Local models typically use similar tokenization to their base models
- content_length = len(str(media_content.content))
- return content_length // 4 # Rough estimate
-
- return 0
+ # Note: Uses base class estimate_tokens_for_media() with default _estimate_image_tokens()
+ # Local models use ~512 tokens per image (conservative estimate), which matches base default

  def get_model_media_limits(self, model: str) -> Dict[str, Any]:
  """
@@ -550,4 +604,4 @@ class LocalMediaHandler(BaseProviderMediaHandler):
  'supported_image_formats': ['png', 'jpeg', 'jpg', 'gif', 'bmp']
  })

- return limits
+ return limits
@@ -271,44 +271,32 @@ class OpenAIMediaHandler(BaseProviderMediaHandler):

  return False

- def estimate_tokens_for_media(self, media_content: MediaContent) -> int:
+ def _estimate_image_tokens(self, media_content: MediaContent) -> int:
  """
- Estimate token usage for media content.
+ OpenAI-specific image token estimation.

- Args:
- media_content: MediaContent to estimate
-
- Returns:
- Estimated token count
+ Uses tile-based calculation for high detail images, with special
+ handling for Qwen models via OpenAI-compatible API.
  """
- if media_content.media_type == MediaType.IMAGE:
- # Image token estimation varies by model
- detail_level = media_content.metadata.get('detail_level', 'auto')
+ detail_level = media_content.metadata.get('detail_level', 'auto')

- if detail_level == 'low':
- # Qwen models use 256 tokens for low detail, OpenAI uses 85
- if self._is_qwen_model():
- return 256 # Qwen low detail token count
- else:
- return 85 # OpenAI low detail token count
+ if detail_level == 'low':
+ # Qwen models use 256 tokens for low detail, OpenAI uses 85
+ if self._is_qwen_model():
+ return 256
  else:
- # High detail calculation based on image dimensions
- width = media_content.metadata.get('final_size', [512, 512])[0]
- height = media_content.metadata.get('final_size', [512, 512])[1]
-
- # OpenAI's tile-based calculation (simplified)
- tiles_width = (width + 511) // 512
- tiles_height = (height + 511) // 512
- total_tiles = tiles_width * tiles_height
-
- return 85 + (170 * total_tiles)
+ return 85
+ else:
+ # High detail: tile-based calculation
+ width = media_content.metadata.get('final_size', [512, 512])[0]
+ height = media_content.metadata.get('final_size', [512, 512])[1]

- elif media_content.media_type in [MediaType.TEXT, MediaType.DOCUMENT]:
- # Rough estimation: 4 characters per token
- content_length = len(str(media_content.content))
- return content_length // 4
+ # OpenAI's tile-based calculation (simplified)
+ tiles_width = (width + 511) // 512
+ tiles_height = (height + 511) // 512
+ total_tiles = tiles_width * tiles_height

- return 0
+ return 85 + (170 * total_tiles)

  def get_model_media_limits(self, model: str) -> Dict[str, Any]:
  """
@@ -9,6 +9,8 @@ from .image_processor import ImageProcessor
  from .text_processor import TextProcessor
  from .pdf_processor import PDFProcessor
  from .office_processor import OfficeProcessor
+ from .audio_processor import AudioProcessor
+ from .video_processor import VideoProcessor

  # Import Glyph processor if available
  try:
@@ -18,6 +20,6 @@ except ImportError:
  GlyphProcessor = None
  GLYPH_AVAILABLE = False

- __all__ = ['ImageProcessor', 'TextProcessor', 'PDFProcessor', 'OfficeProcessor']
+ __all__ = ['ImageProcessor', 'TextProcessor', 'PDFProcessor', 'OfficeProcessor', 'AudioProcessor', 'VideoProcessor']
  if GLYPH_AVAILABLE:
- __all__.append('GlyphProcessor')
+ __all__.append('GlyphProcessor')
@@ -0,0 +1,57 @@
+ """
+ Audio processor for AbstractCore media handling.
+
+ v0 goals:
+ - Treat audio as a first-class media type (MediaType.AUDIO) in the media pipeline.
+ - Keep processing lightweight and dependency-free (store as a file ref by default).
+
+ Higher-level semantic handling (STT, captioning, music/signal analysis) is handled
+ by policy and capability layers (see planned audio policy backlog).
+ """
+
+ from __future__ import annotations
+
+ import mimetypes
+ from pathlib import Path
+
+ from ..base import BaseMediaHandler, MediaProcessingError
+ from ..types import ContentFormat, MediaCapabilities, MediaContent, MediaType
+
+
+ class AudioProcessor(BaseMediaHandler):
+ """Lightweight audio processor that stores an audio file reference."""
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ self.capabilities = MediaCapabilities(
+ vision_support=False,
+ audio_support=True,
+ video_support=False,
+ document_support=False,
+ max_file_size=self.max_file_size,
+ )
+
+ def _process_internal(self, file_path: Path, media_type: MediaType, **kwargs) -> MediaContent:
+ if media_type != MediaType.AUDIO:
+ raise MediaProcessingError(f"AudioProcessor only handles audio, got {media_type}")
+
+ mime_type, _enc = mimetypes.guess_type(str(file_path))
+ mime_type = mime_type or "application/octet-stream"
+
+ metadata = {
+ "file_name": file_path.name,
+ "file_path": str(file_path),
+ "file_size": file_path.stat().st_size if file_path.exists() else None,
+ "processor": self.__class__.__name__,
+ }
+ metadata.update(kwargs.get("metadata", {}) if isinstance(kwargs.get("metadata"), dict) else {})
+
+ return MediaContent(
+ media_type=MediaType.AUDIO,
+ content=str(file_path),
+ content_format=ContentFormat.FILE_PATH,
+ mime_type=mime_type,
+ file_path=str(file_path),
+ metadata=metadata,
+ )
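A hypothetical usage sketch for the new processor (not from the diff): real callers go through the media pipeline and the base-class entry point, which are not shown here, and the no-argument constructor is an assumption about BaseMediaHandler defaults.

    from pathlib import Path
    from abstractcore.media.processors import AudioProcessor
    from abstractcore.media.types import MediaType

    processor = AudioProcessor()  # assumes base handler defaults suffice
    media = processor._process_internal(Path("podcast.mp3"), MediaType.AUDIO)
    print(media.mime_type)        # e.g. "audio/mpeg", guessed via mimetypes
    print(media.content_format)   # ContentFormat.FILE_PATH - v0 stores only a file reference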
@@ -13,6 +13,7 @@ import json
  from ..base import BaseMediaHandler, MediaProcessingError
  from ..types import MediaContent, MediaType, ContentFormat, MediaProcessingResult
  from ...utils.structured_logging import get_logger
+ from ...utils.token_utils import estimate_tokens


  class OfficeProcessor(BaseMediaHandler):
@@ -129,14 +130,18 @@ class OfficeProcessor(BaseMediaHandler):
  else:
  raise MediaProcessingError(f"Unsupported Office file type: {file_extension}")

+ # Add token estimation to metadata (no truncation, just informational)
+ metadata['estimated_tokens'] = estimate_tokens(content)
+ metadata['content_length'] = len(content)
+
  # Create MediaContent object
  return self._create_media_content(
  content=content,
+ file_path=file_path,
  media_type=MediaType.DOCUMENT,
  content_format=ContentFormat.TEXT,
  mime_type=self._get_mime_type(file_extension),
- file_path=file_path,
- metadata=metadata
+ **metadata
  )

  except Exception as e:
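The new estimated_tokens / content_length fields are informational only; a hedged consumer-side sketch (not part of the package) of how such metadata might feed a context budget, assuming estimate_tokens returns an int:

    def fits_in_context(media_metadata: dict, context_window: int, reserved: int = 1024) -> bool:
        # media_metadata is the dict built above (estimated_tokens / content_length keys).
        estimated = media_metadata.get("estimated_tokens") or 0
        return estimated <= max(context_window - reserved, 0)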
@@ -487,4 +492,4 @@ class OfficeProcessor(BaseMediaHandler):
  'metadata_extraction': self.include_metadata,
  'chunking_support': self.supports_chunking()
  }
- }
+ }
@@ -22,8 +22,39 @@ except ImportError:
  PYMUPDF_AVAILABLE = False
  fitz = None

+ import re
+
  from ..base import BaseMediaHandler, MediaProcessingError
  from ..types import MediaContent, MediaType, ContentFormat
+ from ...utils.token_utils import estimate_tokens
+
+
+ def _safe_pdf_version(doc: Any) -> Optional[str]:
+ """Best-effort PDF version across PyMuPDF variants (callable/property/absent)."""
+ try:
+ pv = getattr(doc, "pdf_version", None)
+ if pv is not None:
+ out = pv() if callable(pv) else pv
+ if out is not None:
+ s = str(out).strip()
+ if s and s.lower() != "none":
+ return s
+ except Exception:
+ pass
+
+ # PyMuPDF 1.26+ exposes the PDF version via `doc.metadata["format"]` (e.g. "PDF 1.5").
+ try:
+ md = getattr(doc, "metadata", None)
+ if isinstance(md, dict):
+ fmt = md.get("format")
+ if isinstance(fmt, str) and fmt.strip():
+ m = re.search(r"(?i)pdf\s*[- ]?\s*([0-9]+(?:\.[0-9]+)?)", fmt.strip())
+ if m:
+ return m.group(1)
+ except Exception:
+ pass
+
+ return None


  class PDFProcessor(BaseMediaHandler):
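A quick illustration (not from the diff) of the metadata fallback path, using a stand-in object rather than a real PyMuPDF document and assuming _safe_pdf_version is in scope:

    class _FakeDoc:
        # Mimics PyMuPDF 1.26+: no usable pdf_version attribute, version only in metadata["format"].
        metadata = {"format": "PDF 1.5"}

    assert _safe_pdf_version(_FakeDoc()) == "1.5"  # extracted by the regex above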
@@ -119,6 +150,10 @@ class PDFProcessor(BaseMediaHandler):
  else:
  mime_type = 'text/plain'

+ # Add token estimation to metadata (no truncation, just informational)
+ metadata['estimated_tokens'] = estimate_tokens(content)
+ metadata['content_length'] = len(content)
+
  return self._create_media_content(
  content=content,
  file_path=file_path,
@@ -315,12 +350,15 @@ class PDFProcessor(BaseMediaHandler):
  'subject': pdf_metadata.get('subject', ''),
  'creator': pdf_metadata.get('creator', ''),
  'producer': pdf_metadata.get('producer', ''),
+ 'format': pdf_metadata.get('format', ''),
  'creation_date': pdf_metadata.get('creationDate', ''),
  'modification_date': pdf_metadata.get('modDate', ''),
  'page_count': doc.page_count,
  'encrypted': doc.needs_pass,
- 'pdf_version': doc.pdf_version()
  })
+ pdf_version = _safe_pdf_version(doc)
+ if pdf_version is not None:
+ metadata["pdf_version"] = pdf_version

  # Clean up empty values
  metadata = {k: v for k, v in metadata.items() if v}
@@ -391,9 +429,14 @@ class PDFProcessor(BaseMediaHandler):
  'file_size': file_path.stat().st_size,
  'page_count': doc.page_count,
  'encrypted': doc.needs_pass,
- 'pdf_version': doc.pdf_version(),
  'metadata': doc.metadata
  }
+ fmt = doc.metadata.get("format") if isinstance(doc.metadata, dict) else None
+ if isinstance(fmt, str) and fmt.strip():
+ info["format"] = fmt.strip()
+ pdf_version = _safe_pdf_version(doc)
+ if pdf_version is not None:
+ info["pdf_version"] = pdf_version

  # Get first page info
  if doc.page_count > 0:
@@ -482,4 +525,4 @@ class PDFProcessor(BaseMediaHandler):
  'pymupdf4llm': PYMUPDF4LLM_AVAILABLE,
  'pymupdf': PYMUPDF_AVAILABLE
  }
- }
+ }
@@ -19,6 +19,8 @@ except ImportError:

  from ..base import BaseMediaHandler, MediaProcessingError
  from ..types import MediaContent, MediaType, ContentFormat
+ from ...utils.token_utils import estimate_tokens
+ from ...utils.truncation import preview_text


  class TextProcessor(BaseMediaHandler):
@@ -129,6 +131,10 @@ class TextProcessor(BaseMediaHandler):
  # Determine appropriate MIME type
  mime_type = self._get_mime_type_for_extension(extension)

+ # Add token estimation to metadata (no truncation, just informational)
+ metadata['estimated_tokens'] = estimate_tokens(content)
+ metadata['content_length'] = len(content)
+
  return self._create_media_content(
  content=content,
  file_path=file_path,
@@ -181,11 +187,9 @@ class TextProcessor(BaseMediaHandler):
  null_count = df[col].isnull().sum()
  content_parts.append(f"- {col} ({dtype}, {null_count} null values)")

- content_parts.append("\n## Sample Data:")
- content_parts.append(df.head(10).to_string(index=False))
-
- if len(df) > 10:
- content_parts.append(f"\n... and {len(df) - 10} more rows")
+ # Always include full data - no truncation
+ content_parts.append("\n## Data:")
+ content_parts.append(df.to_csv(index=False, sep=delimiter))

  content = "\n".join(content_parts)

@@ -196,7 +200,7 @@ class TextProcessor(BaseMediaHandler):
  'data_types': {col: str(dtype) for col, dtype in df.dtypes.items()},
  'delimiter': delimiter,
  'has_header': True,
- 'null_values': df.isnull().sum().to_dict()
+ 'null_values': df.isnull().sum().to_dict(),
  }

  else:
@@ -221,12 +225,10 @@ class TextProcessor(BaseMediaHandler):
  for col in header:
  content_parts.append(f"- {col}")

- content_parts.append("\n## Sample Data:")
- for i, row in enumerate(data_rows[:10]):
- content_parts.append(f"Row {i+1}: {', '.join(row)}")
-
- if len(data_rows) > 10:
- content_parts.append(f"... and {len(data_rows) - 10} more rows")
+ # Always include full data - no truncation
+ content_parts.append("\n## Data:")
+ for row in data_rows:
+ content_parts.append(delimiter.join(row))

  content = "\n".join(content_parts)
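For concreteness, a tiny standalone sketch (not from the package) of what the new non-truncating fallback path embeds for a two-row CSV:

    data_rows = [["alice", "10"], ["bob", "7"]]
    delimiter = ","
    content_parts = ["\n## Data:"]
    for row in data_rows:
        content_parts.append(delimiter.join(row))
    print("\n".join(content_parts))   # "\n## Data:\nalice,10\nbob,7" - every row, no "... and N more rows" marker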
@@ -235,7 +237,7 @@ class TextProcessor(BaseMediaHandler):
  'column_count': len(header),
  'columns': header,
  'delimiter': delimiter,
- 'has_header': True
+ 'has_header': True,
  }

  return content, metadata
@@ -273,20 +275,16 @@ class TextProcessor(BaseMediaHandler):
  content_parts = []
  content_parts.append(f"# {file_path.name}")

+ # Always include full JSON content - no truncation
  if isinstance(data, dict):
  content_parts.append(f"JSON object with {len(data)} keys\n")
- content_parts.append("## Structure:")
- content_parts.append(json.dumps(data, indent=2, ensure_ascii=False))
  elif isinstance(data, list):
  content_parts.append(f"JSON array with {len(data)} items\n")
- content_parts.append("## Sample items:")
- for i, item in enumerate(data[:5]):
- content_parts.append(f"Item {i+1}: {json.dumps(item, ensure_ascii=False)}")
- if len(data) > 5:
- content_parts.append(f"... and {len(data) - 5} more items")
  else:
- content_parts.append("JSON primitive value:")
- content_parts.append(json.dumps(data, indent=2, ensure_ascii=False))
+ content_parts.append("JSON primitive value\n")
+
+ content_parts.append("## Content:")
+ content_parts.append(json.dumps(data, indent=2, ensure_ascii=False))

  content = "\n".join(content_parts)

@@ -504,7 +502,7 @@ class TextProcessor(BaseMediaHandler):
  summary_parts.append(f"Text document with {metadata.get('word_count', 0)} words and {metadata.get('line_count', 0)} lines")

  # Add content preview
- preview = content[:500] + "..." if len(content) > 500 else content
+ preview = preview_text(content, max_chars=500)
  summary_parts.append(f"\nContent preview:\n{preview}")

  return "\n".join(summary_parts)
@@ -569,4 +567,4 @@ class TextProcessor(BaseMediaHandler):
  'dependencies': {
  'pandas': PANDAS_AVAILABLE
  }
- }
+ }