abstractcore-2.9.1-py3-none-any.whl → abstractcore-2.11.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. abstractcore/__init__.py +7 -27
  2. abstractcore/apps/deepsearch.py +9 -4
  3. abstractcore/apps/extractor.py +33 -100
  4. abstractcore/apps/intent.py +19 -0
  5. abstractcore/apps/judge.py +20 -1
  6. abstractcore/apps/summarizer.py +20 -1
  7. abstractcore/architectures/detection.py +34 -1
  8. abstractcore/architectures/response_postprocessing.py +313 -0
  9. abstractcore/assets/architecture_formats.json +38 -8
  10. abstractcore/assets/model_capabilities.json +882 -160
  11. abstractcore/compression/__init__.py +1 -2
  12. abstractcore/compression/glyph_processor.py +6 -4
  13. abstractcore/config/main.py +52 -20
  14. abstractcore/config/manager.py +390 -12
  15. abstractcore/config/vision_config.py +5 -5
  16. abstractcore/core/interface.py +151 -3
  17. abstractcore/core/session.py +16 -10
  18. abstractcore/download.py +1 -1
  19. abstractcore/embeddings/manager.py +20 -6
  20. abstractcore/endpoint/__init__.py +2 -0
  21. abstractcore/endpoint/app.py +458 -0
  22. abstractcore/mcp/client.py +3 -1
  23. abstractcore/media/__init__.py +52 -17
  24. abstractcore/media/auto_handler.py +42 -22
  25. abstractcore/media/base.py +44 -1
  26. abstractcore/media/capabilities.py +12 -33
  27. abstractcore/media/enrichment.py +105 -0
  28. abstractcore/media/handlers/anthropic_handler.py +19 -28
  29. abstractcore/media/handlers/local_handler.py +124 -70
  30. abstractcore/media/handlers/openai_handler.py +19 -31
  31. abstractcore/media/processors/__init__.py +4 -2
  32. abstractcore/media/processors/audio_processor.py +57 -0
  33. abstractcore/media/processors/office_processor.py +8 -3
  34. abstractcore/media/processors/pdf_processor.py +46 -3
  35. abstractcore/media/processors/text_processor.py +22 -24
  36. abstractcore/media/processors/video_processor.py +58 -0
  37. abstractcore/media/types.py +97 -4
  38. abstractcore/media/utils/image_scaler.py +20 -2
  39. abstractcore/media/utils/video_frames.py +219 -0
  40. abstractcore/media/vision_fallback.py +136 -22
  41. abstractcore/processing/__init__.py +32 -3
  42. abstractcore/processing/basic_deepsearch.py +15 -10
  43. abstractcore/processing/basic_intent.py +3 -2
  44. abstractcore/processing/basic_judge.py +3 -2
  45. abstractcore/processing/basic_summarizer.py +1 -1
  46. abstractcore/providers/__init__.py +3 -1
  47. abstractcore/providers/anthropic_provider.py +95 -8
  48. abstractcore/providers/base.py +1516 -81
  49. abstractcore/providers/huggingface_provider.py +546 -69
  50. abstractcore/providers/lmstudio_provider.py +30 -916
  51. abstractcore/providers/mlx_provider.py +382 -35
  52. abstractcore/providers/model_capabilities.py +5 -1
  53. abstractcore/providers/ollama_provider.py +99 -15
  54. abstractcore/providers/openai_compatible_provider.py +406 -180
  55. abstractcore/providers/openai_provider.py +188 -44
  56. abstractcore/providers/openrouter_provider.py +76 -0
  57. abstractcore/providers/registry.py +61 -5
  58. abstractcore/providers/streaming.py +138 -33
  59. abstractcore/providers/vllm_provider.py +92 -817
  60. abstractcore/server/app.py +478 -28
  61. abstractcore/server/audio_endpoints.py +139 -0
  62. abstractcore/server/vision_endpoints.py +1319 -0
  63. abstractcore/structured/handler.py +316 -41
  64. abstractcore/tools/common_tools.py +5501 -2012
  65. abstractcore/tools/comms_tools.py +1641 -0
  66. abstractcore/tools/core.py +37 -7
  67. abstractcore/tools/handler.py +4 -9
  68. abstractcore/tools/parser.py +49 -2
  69. abstractcore/tools/tag_rewriter.py +2 -1
  70. abstractcore/tools/telegram_tdlib.py +407 -0
  71. abstractcore/tools/telegram_tools.py +261 -0
  72. abstractcore/utils/cli.py +1085 -72
  73. abstractcore/utils/structured_logging.py +29 -8
  74. abstractcore/utils/token_utils.py +2 -0
  75. abstractcore/utils/truncation.py +29 -0
  76. abstractcore/utils/version.py +3 -4
  77. abstractcore/utils/vlm_token_calculator.py +12 -2
  78. abstractcore-2.11.4.dist-info/METADATA +562 -0
  79. abstractcore-2.11.4.dist-info/RECORD +133 -0
  80. {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/WHEEL +1 -1
  81. {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/entry_points.txt +1 -0
  82. abstractcore-2.9.1.dist-info/METADATA +0 -1190
  83. abstractcore-2.9.1.dist-info/RECORD +0 -119
  84. {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/licenses/LICENSE +0 -0
  85. {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/top_level.txt +0 -0
@@ -10,6 +10,7 @@ from typing import Dict, Any, List, Optional, Union

  from ..base import BaseProviderMediaHandler, MediaProcessingError
  from ..types import MediaContent, MediaType, ContentFormat
+ from ..enrichment import build_enrichment_item

  # Import vision detection from existing architecture system
  try:
@@ -48,6 +49,10 @@ class LocalMediaHandler(BaseProviderMediaHandler):
  self.prefer_text_extraction = kwargs.get('prefer_text_extraction', True)
  self.embed_images_in_text = kwargs.get('embed_images_in_text', False)

+ # Collected "media enrichment" entries (input fallback transparency).
+ # Populated when a modality is converted into text context (e.g. image caption).
+ self.media_enrichment: List[Dict[str, Any]] = []
+
  self.logger.debug(f"Initialized {provider_name} local media handler with model={self.model_name}, capabilities: {self.capabilities}")

  def _process_internal(self, file_path: Path, media_type: MediaType, **kwargs) -> MediaContent:
@@ -232,6 +237,9 @@ class LocalMediaHandler(BaseProviderMediaHandler):
  Returns:
  Formatted message (structured dict for vision models, string for text-only)
  """
+ # Reset per-call enrichment collection.
+ self.media_enrichment = []
+
  # Check if we have images in the media contents
  has_images = any(mc.media_type == MediaType.IMAGE for mc in media_contents)

@@ -295,62 +303,127 @@ class LocalMediaHandler(BaseProviderMediaHandler):
  This is often more reliable for local providers that don't have
  robust multimodal support. For images on text-only models, uses vision fallback.
  """
- message_parts = []
-
- # Add main text
- if text.strip():
- message_parts.append(text)
+ user_text = text.strip() if text else ""
+ image_context_parts: List[str] = []
+ other_parts: List[str] = []

  # Add processed content from media
  for i, media_content in enumerate(media_contents):
  if media_content.media_type == MediaType.IMAGE:
- if self.capabilities.vision_support:
- # For vision models, we'll still need to handle images specially
- # This will be handled by the provider's generate method
- message_parts.append(f"[Image {i+1}: {media_content.metadata.get('file_name', 'image')}]")
- else:
- # Use vision fallback for text-only models
- try:
- from ..vision_fallback import VisionFallbackHandler, VisionNotConfiguredError
- fallback_handler = VisionFallbackHandler()
-
- # Get the actual file path from media_content object
- file_path = media_content.file_path or media_content.metadata.get('file_path') or media_content.metadata.get('file_name', 'image')
-
- # Generate description using vision fallback
- description = fallback_handler.create_description(str(file_path), text)
- # Remove the original question from message_parts if it exists
- if message_parts and text.strip() in message_parts[0]:
- message_parts.clear()
- # Completely different approach: make model think it's continuing its own observation
- # No questions, no external framing - just natural continuation
- simple_prompt = f"{description}"
- message_parts.append(simple_prompt)
-
- except VisionNotConfiguredError as e:
- # Vision not configured - show warning to USER, not model
- self.logger.warning("Vision capability not configured for text-only models")
- self.logger.warning("To enable image analysis with text-only models:")
- self.logger.warning("🔸 EASIEST: Download BLIP vision model (990MB): abstractcore --download-vision-model")
- self.logger.warning("🔸 Use existing Ollama model: abstractcore --set-vision-caption qwen2.5vl:7b")
- self.logger.warning("🔸 Use cloud API: abstractcore --set-vision-provider openai --model gpt-4o")
- self.logger.warning("🔸 Interactive setup: abstractcore --configure")
- self.logger.warning("Current status: abstractcore --status")
-
- # Provide minimal placeholder to model (not configuration instructions!)
- file_name = media_content.metadata.get('file_name', 'image')
- message_parts.append(f"[Image {i+1}: {file_name}]")
-
- except Exception as e:
- self.logger.warning(f"Vision fallback failed: {e}")
- # Fallback to basic placeholder
- file_name = media_content.metadata.get('file_name', 'image')
- message_parts.append(f"[Image {i+1}: {file_name} - vision processing unavailable]")
+ file_name = media_content.metadata.get('file_name', 'image')
+ # In text-embedded mode, images are not passed natively.
+ # Always prefer the vision fallback (caption text context) when configured.
+ try:
+ from ..vision_fallback import VisionFallbackHandler, VisionNotConfiguredError
+
+ fallback_handler = VisionFallbackHandler()
+
+ # Get the actual file path from media_content object
+ file_path = (
+ media_content.file_path
+ or media_content.metadata.get('file_path')
+ or media_content.metadata.get('file_name', 'image')
+ )
+
+ # Generate description using vision fallback
+ description, trace = fallback_handler.create_description_with_trace(
+ str(file_path), user_text or None
+ )
+ description = str(description or "").strip()
+
+ if description:
+ image_context_parts.append(f"Image {i+1} ({file_name}): {description}")
+ self.media_enrichment.append(
+ build_enrichment_item(
+ status="used",
+ input_modality="image",
+ summary_kind="caption",
+ policy=str(trace.get("strategy") or ""),
+ backend=trace.get("backend") if isinstance(trace, dict) else None,
+ input_index=i + 1,
+ input_name=str(file_name),
+ injected_text=description,
+ )
+ )
+ else:
+ other_parts.append(f"[Image {i+1}: {file_name} - no description returned]")
+ self.media_enrichment.append(
+ build_enrichment_item(
+ status="error",
+ input_modality="image",
+ summary_kind="caption",
+ policy=str(getattr(fallback_handler.vision_config, "strategy", "") or ""),
+ input_index=i + 1,
+ input_name=str(file_name),
+ error="Vision fallback returned empty description",
+ )
+ )
+
+ except VisionNotConfiguredError as e:
+ # Vision not configured - show warning to USER, not model
+ self.logger.warning("Vision capability not configured for text-only models")
+ self.logger.warning("To enable image analysis with text-only models:")
+ self.logger.warning("🔸 EASIEST: Download BLIP vision model (990MB): abstractcore --download-vision-model")
+ self.logger.warning("🔸 Use existing Ollama model: abstractcore --set-vision-caption qwen2.5vl:7b")
+ self.logger.warning("🔸 Use cloud API: abstractcore --set-vision-provider openai --model gpt-4o")
+ self.logger.warning("🔸 Interactive setup: abstractcore --configure")
+ self.logger.warning("Current status: abstractcore --status")
+
+ self.media_enrichment.append(
+ build_enrichment_item(
+ status="skipped",
+ input_modality="image",
+ summary_kind="caption",
+ policy="disabled",
+ input_index=i + 1,
+ input_name=str(file_name),
+ error=str(e),
+ )
+ )
+
+ # Provide minimal placeholder to model (not configuration instructions!)
+ other_parts.append(f"[Image {i+1}: {file_name}]")
+
+ except Exception as e:
+ self.logger.warning(f"Vision fallback failed: {e}")
+ self.media_enrichment.append(
+ build_enrichment_item(
+ status="error",
+ input_modality="image",
+ summary_kind="caption",
+ policy="unknown",
+ input_index=i + 1,
+ input_name=str(file_name),
+ error=str(e),
+ )
+ )
+ # Fallback to basic placeholder
+ other_parts.append(f"[Image {i+1}: {file_name} - vision processing unavailable]")
  else:
  # Embed text/document content directly
  content = str(media_content.content)
  file_name = media_content.metadata.get('file_name', f'document_{i+1}')
- message_parts.append(f"\n\n--- Content from {file_name} ---\n{content}\n--- End of {file_name} ---")
+ other_parts.append(f"\n\n--- Content from {file_name} ---\n{content}\n--- End of {file_name} ---")
+
+ message_parts: List[str] = []
+
+ if image_context_parts:
+ message_parts.append(
+ "Visual context from attached image(s) "
+ "(treat as directly observed; do not mention this section):"
+ )
+ message_parts.extend(image_context_parts)
+
+ # Preserve prior behavior when we don't have image context.
+ if user_text and not image_context_parts:
+ message_parts.append(user_text)
+
+ message_parts.extend(other_parts)
+
+ # When we do have image context, place the user request last for recency.
+ if user_text and image_context_parts:
+ message_parts.append("Now answer the user's request:")
+ message_parts.append(user_text)

  return "\n\n".join(message_parts)
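For readers of this hunk, a minimal sketch (not part of the package) of how the collected handler.media_enrichment entries might be consumed downstream. The field names are assumed from the build_enrichment_item keyword arguments above; the actual dict shape is defined in abstractcore/media/enrichment.py, which is not shown in this excerpt.

    from typing import Any, Dict, List

    def summarize_enrichment(entries: List[Dict[str, Any]]) -> List[str]:
        # Field names assumed from the call sites above (status / input_index / injected_text / error).
        lines = []
        for item in entries:
            if item.get("status") == "used":
                injected = item.get("injected_text") or ""
                lines.append(f"image {item.get('input_index')}: caption injected ({len(injected)} chars)")
            else:
                lines.append(f"image {item.get('input_index')}: {item.get('status')} - {item.get('error')}")
        return lines

    # e.g. print("\n".join(summarize_enrichment(handler.media_enrichment))) after formatting a message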
@@ -502,27 +575,8 @@ class LocalMediaHandler(BaseProviderMediaHandler):

  return False

- def estimate_tokens_for_media(self, media_content: MediaContent) -> int:
- """
- Estimate token usage for media content with local models.
-
- Args:
- media_content: MediaContent to estimate
-
- Returns:
- Estimated token count
- """
- if media_content.media_type == MediaType.IMAGE:
- # Local vision models typically use fewer tokens than cloud models
- # but this varies significantly by model architecture
- return 512 # Conservative estimate
-
- elif media_content.media_type in [MediaType.TEXT, MediaType.DOCUMENT]:
- # Local models typically use similar tokenization to their base models
- content_length = len(str(media_content.content))
- return content_length // 4 # Rough estimate
-
- return 0
+ # Note: Uses base class estimate_tokens_for_media() with default _estimate_image_tokens()
+ # Local models use ~512 tokens per image (conservative estimate), which matches base default

  def get_model_media_limits(self, model: str) -> Dict[str, Any]:
  """
@@ -550,4 +604,4 @@ class LocalMediaHandler(BaseProviderMediaHandler):
  'supported_image_formats': ['png', 'jpeg', 'jpg', 'gif', 'bmp']
  })

- return limits
+ return limits
@@ -271,44 +271,32 @@ class OpenAIMediaHandler(BaseProviderMediaHandler):

  return False

- def estimate_tokens_for_media(self, media_content: MediaContent) -> int:
+ def _estimate_image_tokens(self, media_content: MediaContent) -> int:
  """
- Estimate token usage for media content.
+ OpenAI-specific image token estimation.

- Args:
- media_content: MediaContent to estimate
-
- Returns:
- Estimated token count
+ Uses tile-based calculation for high detail images, with special
+ handling for Qwen models via OpenAI-compatible API.
  """
- if media_content.media_type == MediaType.IMAGE:
- # Image token estimation varies by model
- detail_level = media_content.metadata.get('detail_level', 'auto')
+ detail_level = media_content.metadata.get('detail_level', 'auto')

- if detail_level == 'low':
- # Qwen models use 256 tokens for low detail, OpenAI uses 85
- if self._is_qwen_model():
- return 256 # Qwen low detail token count
- else:
- return 85 # OpenAI low detail token count
+ if detail_level == 'low':
+ # Qwen models use 256 tokens for low detail, OpenAI uses 85
+ if self._is_qwen_model():
+ return 256
  else:
- # High detail calculation based on image dimensions
- width = media_content.metadata.get('final_size', [512, 512])[0]
- height = media_content.metadata.get('final_size', [512, 512])[1]
-
- # OpenAI's tile-based calculation (simplified)
- tiles_width = (width + 511) // 512
- tiles_height = (height + 511) // 512
- total_tiles = tiles_width * tiles_height
-
- return 85 + (170 * total_tiles)
+ return 85
+ else:
+ # High detail: tile-based calculation
+ width = media_content.metadata.get('final_size', [512, 512])[0]
+ height = media_content.metadata.get('final_size', [512, 512])[1]

- elif media_content.media_type in [MediaType.TEXT, MediaType.DOCUMENT]:
- # Rough estimation: 4 characters per token
- content_length = len(str(media_content.content))
- return content_length // 4
+ # OpenAI's tile-based calculation (simplified)
+ tiles_width = (width + 511) // 512
+ tiles_height = (height + 511) // 512
+ total_tiles = tiles_width * tiles_height

- return 0
+ return 85 + (170 * total_tiles)

  def get_model_media_limits(self, model: str) -> Dict[str, Any]:
  """
@@ -9,6 +9,8 @@ from .image_processor import ImageProcessor
  from .text_processor import TextProcessor
  from .pdf_processor import PDFProcessor
  from .office_processor import OfficeProcessor
+ from .audio_processor import AudioProcessor
+ from .video_processor import VideoProcessor

  # Import Glyph processor if available
  try:
@@ -18,6 +20,6 @@ except ImportError:
  GlyphProcessor = None
  GLYPH_AVAILABLE = False

- __all__ = ['ImageProcessor', 'TextProcessor', 'PDFProcessor', 'OfficeProcessor']
+ __all__ = ['ImageProcessor', 'TextProcessor', 'PDFProcessor', 'OfficeProcessor', 'AudioProcessor', 'VideoProcessor']
  if GLYPH_AVAILABLE:
- __all__.append('GlyphProcessor')
+ __all__.append('GlyphProcessor')
@@ -0,0 +1,57 @@
+ """
+ Audio processor for AbstractCore media handling.
+
+ v0 goals:
+ - Treat audio as a first-class media type (MediaType.AUDIO) in the media pipeline.
+ - Keep processing lightweight and dependency-free (store as a file ref by default).
+
+ Higher-level semantic handling (STT, captioning, music/signal analysis) is handled
+ by policy and capability layers (see planned audio policy backlog).
+ """
+
+ from __future__ import annotations
+
+ import mimetypes
+ from pathlib import Path
+
+ from ..base import BaseMediaHandler, MediaProcessingError
+ from ..types import ContentFormat, MediaCapabilities, MediaContent, MediaType
+
+
+ class AudioProcessor(BaseMediaHandler):
+ """Lightweight audio processor that stores an audio file reference."""
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ self.capabilities = MediaCapabilities(
+ vision_support=False,
+ audio_support=True,
+ video_support=False,
+ document_support=False,
+ max_file_size=self.max_file_size,
+ )
+
+ def _process_internal(self, file_path: Path, media_type: MediaType, **kwargs) -> MediaContent:
+ if media_type != MediaType.AUDIO:
+ raise MediaProcessingError(f"AudioProcessor only handles audio, got {media_type}")
+
+ mime_type, _enc = mimetypes.guess_type(str(file_path))
+ mime_type = mime_type or "application/octet-stream"
+
+ metadata = {
+ "file_name": file_path.name,
+ "file_path": str(file_path),
+ "file_size": file_path.stat().st_size if file_path.exists() else None,
+ "processor": self.__class__.__name__,
+ }
+ metadata.update(kwargs.get("metadata", {}) if isinstance(kwargs.get("metadata"), dict) else {})
+
+ return MediaContent(
+ media_type=MediaType.AUDIO,
+ content=str(file_path),
+ content_format=ContentFormat.FILE_PATH,
+ mime_type=mime_type,
+ file_path=str(file_path),
+ metadata=metadata,
+ )
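A hypothetical usage sketch for the new processor (not from the diff): real callers go through the media pipeline and the base-class entry point, which are not shown here, and the no-argument constructor is an assumption about BaseMediaHandler defaults.

    from pathlib import Path
    from abstractcore.media.processors import AudioProcessor
    from abstractcore.media.types import MediaType

    processor = AudioProcessor()  # assumes base handler defaults suffice
    media = processor._process_internal(Path("podcast.mp3"), MediaType.AUDIO)
    print(media.mime_type)        # e.g. "audio/mpeg", guessed via mimetypes
    print(media.content_format)   # ContentFormat.FILE_PATH - v0 stores only a file reference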
@@ -13,6 +13,7 @@ import json
  from ..base import BaseMediaHandler, MediaProcessingError
  from ..types import MediaContent, MediaType, ContentFormat, MediaProcessingResult
  from ...utils.structured_logging import get_logger
+ from ...utils.token_utils import estimate_tokens


  class OfficeProcessor(BaseMediaHandler):
@@ -129,14 +130,18 @@ class OfficeProcessor(BaseMediaHandler):
  else:
  raise MediaProcessingError(f"Unsupported Office file type: {file_extension}")

+ # Add token estimation to metadata (no truncation, just informational)
+ metadata['estimated_tokens'] = estimate_tokens(content)
+ metadata['content_length'] = len(content)
+
  # Create MediaContent object
  return self._create_media_content(
  content=content,
+ file_path=file_path,
  media_type=MediaType.DOCUMENT,
  content_format=ContentFormat.TEXT,
  mime_type=self._get_mime_type(file_extension),
- file_path=file_path,
- metadata=metadata
+ **metadata
  )

  except Exception as e:
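The new estimated_tokens / content_length fields are informational only; a hedged consumer-side sketch (not part of the package) of how such metadata might feed a context budget, assuming estimate_tokens returns an int:

    def fits_in_context(media_metadata: dict, context_window: int, reserved: int = 1024) -> bool:
        # media_metadata is the dict built above (estimated_tokens / content_length keys).
        estimated = media_metadata.get("estimated_tokens") or 0
        return estimated <= max(context_window - reserved, 0)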
@@ -487,4 +492,4 @@ class OfficeProcessor(BaseMediaHandler):
  'metadata_extraction': self.include_metadata,
  'chunking_support': self.supports_chunking()
  }
- }
+ }
@@ -22,8 +22,39 @@ except ImportError:
  PYMUPDF_AVAILABLE = False
  fitz = None

+ import re
+
  from ..base import BaseMediaHandler, MediaProcessingError
  from ..types import MediaContent, MediaType, ContentFormat
+ from ...utils.token_utils import estimate_tokens
+
+
+ def _safe_pdf_version(doc: Any) -> Optional[str]:
+ """Best-effort PDF version across PyMuPDF variants (callable/property/absent)."""
+ try:
+ pv = getattr(doc, "pdf_version", None)
+ if pv is not None:
+ out = pv() if callable(pv) else pv
+ if out is not None:
+ s = str(out).strip()
+ if s and s.lower() != "none":
+ return s
+ except Exception:
+ pass
+
+ # PyMuPDF 1.26+ exposes the PDF version via `doc.metadata["format"]` (e.g. "PDF 1.5").
+ try:
+ md = getattr(doc, "metadata", None)
+ if isinstance(md, dict):
+ fmt = md.get("format")
+ if isinstance(fmt, str) and fmt.strip():
+ m = re.search(r"(?i)pdf\s*[- ]?\s*([0-9]+(?:\.[0-9]+)?)", fmt.strip())
+ if m:
+ return m.group(1)
+ except Exception:
+ pass
+
+ return None


  class PDFProcessor(BaseMediaHandler):
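A quick illustration (not from the diff) of the metadata fallback path, using a stand-in object rather than a real PyMuPDF document and assuming _safe_pdf_version is in scope:

    class _FakeDoc:
        # Mimics PyMuPDF 1.26+: no usable pdf_version attribute, version only in metadata["format"].
        metadata = {"format": "PDF 1.5"}

    assert _safe_pdf_version(_FakeDoc()) == "1.5"  # extracted by the regex above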
@@ -119,6 +150,10 @@ class PDFProcessor(BaseMediaHandler):
  else:
  mime_type = 'text/plain'

+ # Add token estimation to metadata (no truncation, just informational)
+ metadata['estimated_tokens'] = estimate_tokens(content)
+ metadata['content_length'] = len(content)
+
  return self._create_media_content(
  content=content,
  file_path=file_path,
@@ -315,12 +350,15 @@ class PDFProcessor(BaseMediaHandler):
  'subject': pdf_metadata.get('subject', ''),
  'creator': pdf_metadata.get('creator', ''),
  'producer': pdf_metadata.get('producer', ''),
+ 'format': pdf_metadata.get('format', ''),
  'creation_date': pdf_metadata.get('creationDate', ''),
  'modification_date': pdf_metadata.get('modDate', ''),
  'page_count': doc.page_count,
  'encrypted': doc.needs_pass,
- 'pdf_version': doc.pdf_version()
  })
+ pdf_version = _safe_pdf_version(doc)
+ if pdf_version is not None:
+ metadata["pdf_version"] = pdf_version

  # Clean up empty values
  metadata = {k: v for k, v in metadata.items() if v}
@@ -391,9 +429,14 @@ class PDFProcessor(BaseMediaHandler):
  'file_size': file_path.stat().st_size,
  'page_count': doc.page_count,
  'encrypted': doc.needs_pass,
- 'pdf_version': doc.pdf_version(),
  'metadata': doc.metadata
  }
+ fmt = doc.metadata.get("format") if isinstance(doc.metadata, dict) else None
+ if isinstance(fmt, str) and fmt.strip():
+ info["format"] = fmt.strip()
+ pdf_version = _safe_pdf_version(doc)
+ if pdf_version is not None:
+ info["pdf_version"] = pdf_version

  # Get first page info
  if doc.page_count > 0:
@@ -482,4 +525,4 @@ class PDFProcessor(BaseMediaHandler):
  'pymupdf4llm': PYMUPDF4LLM_AVAILABLE,
  'pymupdf': PYMUPDF_AVAILABLE
  }
- }
+ }
@@ -19,6 +19,8 @@ except ImportError:

  from ..base import BaseMediaHandler, MediaProcessingError
  from ..types import MediaContent, MediaType, ContentFormat
+ from ...utils.token_utils import estimate_tokens
+ from ...utils.truncation import preview_text


  class TextProcessor(BaseMediaHandler):
@@ -129,6 +131,10 @@ class TextProcessor(BaseMediaHandler):
  # Determine appropriate MIME type
  mime_type = self._get_mime_type_for_extension(extension)

+ # Add token estimation to metadata (no truncation, just informational)
+ metadata['estimated_tokens'] = estimate_tokens(content)
+ metadata['content_length'] = len(content)
+
  return self._create_media_content(
  content=content,
  file_path=file_path,
@@ -181,11 +187,9 @@ class TextProcessor(BaseMediaHandler):
  null_count = df[col].isnull().sum()
  content_parts.append(f"- {col} ({dtype}, {null_count} null values)")

- content_parts.append("\n## Sample Data:")
- content_parts.append(df.head(10).to_string(index=False))
-
- if len(df) > 10:
- content_parts.append(f"\n... and {len(df) - 10} more rows")
+ # Always include full data - no truncation
+ content_parts.append("\n## Data:")
+ content_parts.append(df.to_csv(index=False, sep=delimiter))

  content = "\n".join(content_parts)

@@ -196,7 +200,7 @@ class TextProcessor(BaseMediaHandler):
  'data_types': {col: str(dtype) for col, dtype in df.dtypes.items()},
  'delimiter': delimiter,
  'has_header': True,
- 'null_values': df.isnull().sum().to_dict()
+ 'null_values': df.isnull().sum().to_dict(),
  }

  else:
@@ -221,12 +225,10 @@ class TextProcessor(BaseMediaHandler):
  for col in header:
  content_parts.append(f"- {col}")

- content_parts.append("\n## Sample Data:")
- for i, row in enumerate(data_rows[:10]):
- content_parts.append(f"Row {i+1}: {', '.join(row)}")
-
- if len(data_rows) > 10:
- content_parts.append(f"... and {len(data_rows) - 10} more rows")
+ # Always include full data - no truncation
+ content_parts.append("\n## Data:")
+ for row in data_rows:
+ content_parts.append(delimiter.join(row))

  content = "\n".join(content_parts)
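For concreteness, a tiny standalone sketch (not from the package) of what the new non-truncating fallback path embeds for a two-row CSV:

    data_rows = [["alice", "10"], ["bob", "7"]]
    delimiter = ","
    content_parts = ["\n## Data:"]
    for row in data_rows:
        content_parts.append(delimiter.join(row))
    print("\n".join(content_parts))   # "\n## Data:\nalice,10\nbob,7" - every row, no "... and N more rows" marker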
@@ -235,7 +237,7 @@ class TextProcessor(BaseMediaHandler):
  'column_count': len(header),
  'columns': header,
  'delimiter': delimiter,
- 'has_header': True
+ 'has_header': True,
  }

  return content, metadata
@@ -273,20 +275,16 @@ class TextProcessor(BaseMediaHandler):
  content_parts = []
  content_parts.append(f"# {file_path.name}")

+ # Always include full JSON content - no truncation
  if isinstance(data, dict):
  content_parts.append(f"JSON object with {len(data)} keys\n")
- content_parts.append("## Structure:")
- content_parts.append(json.dumps(data, indent=2, ensure_ascii=False))
  elif isinstance(data, list):
  content_parts.append(f"JSON array with {len(data)} items\n")
- content_parts.append("## Sample items:")
- for i, item in enumerate(data[:5]):
- content_parts.append(f"Item {i+1}: {json.dumps(item, ensure_ascii=False)}")
- if len(data) > 5:
- content_parts.append(f"... and {len(data) - 5} more items")
  else:
- content_parts.append("JSON primitive value:")
- content_parts.append(json.dumps(data, indent=2, ensure_ascii=False))
+ content_parts.append("JSON primitive value\n")
+
+ content_parts.append("## Content:")
+ content_parts.append(json.dumps(data, indent=2, ensure_ascii=False))

  content = "\n".join(content_parts)

@@ -504,7 +502,7 @@ class TextProcessor(BaseMediaHandler):
  summary_parts.append(f"Text document with {metadata.get('word_count', 0)} words and {metadata.get('line_count', 0)} lines")

  # Add content preview
- preview = content[:500] + "..." if len(content) > 500 else content
+ preview = preview_text(content, max_chars=500)
  summary_parts.append(f"\nContent preview:\n{preview}")

  return "\n".join(summary_parts)
@@ -569,4 +567,4 @@ class TextProcessor(BaseMediaHandler):
  'dependencies': {
  'pandas': PANDAS_AVAILABLE
  }
- }
+ }