abstractcore 2.9.1__py3-none-any.whl → 2.11.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. abstractcore/__init__.py +7 -27
  2. abstractcore/apps/extractor.py +33 -100
  3. abstractcore/apps/intent.py +19 -0
  4. abstractcore/apps/judge.py +20 -1
  5. abstractcore/apps/summarizer.py +20 -1
  6. abstractcore/architectures/detection.py +34 -1
  7. abstractcore/architectures/response_postprocessing.py +313 -0
  8. abstractcore/assets/architecture_formats.json +38 -8
  9. abstractcore/assets/model_capabilities.json +781 -160
  10. abstractcore/compression/__init__.py +1 -2
  11. abstractcore/compression/glyph_processor.py +6 -4
  12. abstractcore/config/main.py +31 -19
  13. abstractcore/config/manager.py +389 -11
  14. abstractcore/config/vision_config.py +5 -5
  15. abstractcore/core/interface.py +151 -3
  16. abstractcore/core/session.py +16 -10
  17. abstractcore/download.py +1 -1
  18. abstractcore/embeddings/manager.py +20 -6
  19. abstractcore/endpoint/__init__.py +2 -0
  20. abstractcore/endpoint/app.py +458 -0
  21. abstractcore/mcp/client.py +3 -1
  22. abstractcore/media/__init__.py +52 -17
  23. abstractcore/media/auto_handler.py +42 -22
  24. abstractcore/media/base.py +44 -1
  25. abstractcore/media/capabilities.py +12 -33
  26. abstractcore/media/enrichment.py +105 -0
  27. abstractcore/media/handlers/anthropic_handler.py +19 -28
  28. abstractcore/media/handlers/local_handler.py +124 -70
  29. abstractcore/media/handlers/openai_handler.py +19 -31
  30. abstractcore/media/processors/__init__.py +4 -2
  31. abstractcore/media/processors/audio_processor.py +57 -0
  32. abstractcore/media/processors/office_processor.py +8 -3
  33. abstractcore/media/processors/pdf_processor.py +46 -3
  34. abstractcore/media/processors/text_processor.py +22 -24
  35. abstractcore/media/processors/video_processor.py +58 -0
  36. abstractcore/media/types.py +97 -4
  37. abstractcore/media/utils/image_scaler.py +20 -2
  38. abstractcore/media/utils/video_frames.py +219 -0
  39. abstractcore/media/vision_fallback.py +136 -22
  40. abstractcore/processing/__init__.py +32 -3
  41. abstractcore/processing/basic_deepsearch.py +15 -10
  42. abstractcore/processing/basic_intent.py +3 -2
  43. abstractcore/processing/basic_judge.py +3 -2
  44. abstractcore/processing/basic_summarizer.py +1 -1
  45. abstractcore/providers/__init__.py +3 -1
  46. abstractcore/providers/anthropic_provider.py +95 -8
  47. abstractcore/providers/base.py +1516 -81
  48. abstractcore/providers/huggingface_provider.py +546 -69
  49. abstractcore/providers/lmstudio_provider.py +35 -923
  50. abstractcore/providers/mlx_provider.py +382 -35
  51. abstractcore/providers/model_capabilities.py +5 -1
  52. abstractcore/providers/ollama_provider.py +99 -15
  53. abstractcore/providers/openai_compatible_provider.py +406 -180
  54. abstractcore/providers/openai_provider.py +188 -44
  55. abstractcore/providers/openrouter_provider.py +76 -0
  56. abstractcore/providers/registry.py +61 -5
  57. abstractcore/providers/streaming.py +138 -33
  58. abstractcore/providers/vllm_provider.py +92 -817
  59. abstractcore/server/app.py +461 -13
  60. abstractcore/server/audio_endpoints.py +139 -0
  61. abstractcore/server/vision_endpoints.py +1319 -0
  62. abstractcore/structured/handler.py +316 -41
  63. abstractcore/tools/common_tools.py +5501 -2012
  64. abstractcore/tools/comms_tools.py +1641 -0
  65. abstractcore/tools/core.py +37 -7
  66. abstractcore/tools/handler.py +4 -9
  67. abstractcore/tools/parser.py +49 -2
  68. abstractcore/tools/tag_rewriter.py +2 -1
  69. abstractcore/tools/telegram_tdlib.py +407 -0
  70. abstractcore/tools/telegram_tools.py +261 -0
  71. abstractcore/utils/cli.py +1085 -72
  72. abstractcore/utils/token_utils.py +2 -0
  73. abstractcore/utils/truncation.py +29 -0
  74. abstractcore/utils/version.py +3 -4
  75. abstractcore/utils/vlm_token_calculator.py +12 -2
  76. abstractcore-2.11.2.dist-info/METADATA +562 -0
  77. abstractcore-2.11.2.dist-info/RECORD +133 -0
  78. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/WHEEL +1 -1
  79. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/entry_points.txt +1 -0
  80. abstractcore-2.9.1.dist-info/METADATA +0 -1190
  81. abstractcore-2.9.1.dist-info/RECORD +0 -119
  82. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/licenses/LICENSE +0 -0
  83. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/top_level.txt +0 -0
abstractcore/providers/huggingface_provider.py
@@ -5,6 +5,7 @@ Supports both transformers models and GGUF models via llama-cpp-python.
 
 import os
 import json
+import threading
 from pathlib import Path
 from typing import List, Dict, Any, Optional, Union, Iterator, Type
 
@@ -39,6 +40,9 @@ from ..exceptions import ModelNotFoundError, format_model_error
 from ..tools import UniversalToolHandler, execute_tools
 from ..events import EventType
 
+
+_MPS_GENERATION_LOCK = threading.Lock()
+
 # Try to import transformers (standard HuggingFace support)
 try:
     from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer, AutoProcessor, AutoModelForImageTextToText, pipeline
@@ -84,6 +88,22 @@ def _get_local_model_path(model_name: str) -> Optional[str]:
 class HuggingFaceProvider(BaseProvider):
     """HuggingFace provider with dual support for transformers and GGUF models"""
 
+    @staticmethod
+    def _resolve_requested_device(device: Optional[str]) -> Optional[str]:
+        """Resolve the requested device from explicit arg or env override.
+
+        Supported env var: ABSTRACTCORE_HF_DEVICE=cpu|mps|cuda|auto
+        """
+        if isinstance(device, str) and device.strip():
+            return device.strip().lower()
+
+        env_device = os.environ.get("ABSTRACTCORE_HF_DEVICE")
+        if isinstance(env_device, str) and env_device.strip():
+            val = env_device.strip().lower()
+            if val in {"auto", "cpu", "mps", "cuda"}:
+                return val
+        return None
+
     def __init__(self, model: str = "unsloth/Qwen3-4B-Instruct-2507-GGUF",
                  device: Optional[str] = None,
                  n_gpu_layers: Optional[int] = None,
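The device override added here can be driven either by the constructor's device argument or by the ABSTRACTCORE_HF_DEVICE environment variable, with the explicit argument taking precedence. A minimal sketch, assuming the public import path matches this module (the values are only examples):

    import os

    from abstractcore.providers.huggingface_provider import HuggingFaceProvider

    # Force CPU for every HuggingFaceProvider in this process, e.g. when running
    # sandboxed where Metal/MPS devices are not visible.
    os.environ["ABSTRACTCORE_HF_DEVICE"] = "cpu"

    print(HuggingFaceProvider._resolve_requested_device(None))   # -> "cpu"  (taken from the env var)
    print(HuggingFaceProvider._resolve_requested_device("MPS"))  # -> "mps"  (explicit argument wins)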
@@ -122,7 +142,7 @@ class HuggingFaceProvider(BaseProvider):
         # Store provider-specific configuration
         self.n_gpu_layers = n_gpu_layers
         self.model_type = None  # Will be "transformers" or "gguf"
-        self.device = device
+        self.device = self._resolve_requested_device(device)
 
         # Store transformers-specific parameters
         self.transformers_kwargs = {
@@ -154,7 +174,7 @@ class HuggingFaceProvider(BaseProvider):
             self._setup_device_transformers()
             self._load_transformers_model()
 
-    def unload(self) -> None:
+    def unload_model(self, model_name: str) -> None:
         """
         Unload the model from memory.
 
@@ -187,14 +207,61 @@ class HuggingFaceProvider(BaseProvider):
             if hasattr(self, 'logger'):
                 self.logger.warning(f"Error during unload: {e}")
 
-    def __del__(self):
-        """Properly clean up resources to minimize garbage collection issues"""
+    def supports_prompt_cache(self) -> bool:
+        """GGUF backends can use llama.cpp prompt caching (prefix state cache)."""
+        return getattr(self, "model_type", None) == "gguf"
+
+    def prompt_cache_set(
+        self,
+        key: str,
+        *,
+        make_default: bool = True,
+        ttl_s: Optional[float] = None,
+        capacity_bytes: Optional[int] = None,
+        **kwargs,
+    ) -> bool:
+        """Create/reset a llama.cpp prompt cache for the given key (GGUF only)."""
+        _ = kwargs
+        normalized = self._normalize_prompt_cache_key(key)
+        if normalized is None:
+            return False
+        if not self.supports_prompt_cache():
+            return False
+        if not super().prompt_cache_set(normalized, make_default=make_default):
+            return False
+
         try:
-            self.unload()
+            from llama_cpp.llama_cache import LlamaRAMCache
+        except Exception:
+            return False
+
+        cap = int(capacity_bytes) if isinstance(capacity_bytes, int) and capacity_bytes > 0 else (512 << 20)
+        cache_obj = LlamaRAMCache(capacity_bytes=cap)
+
+        try:
+            self._prompt_cache_store.set(normalized, cache_obj, ttl_s=ttl_s, meta={"backend": "llama_cpp"})
+        except Exception:
+            return False
+
+        # Best-effort: activate this cache on the shared llama instance.
+        try:
+            if getattr(self, "llm", None) is not None and hasattr(self.llm, "set_cache"):
+                self.llm.set_cache(cache_obj)
         except Exception:
-            # Silently handle any cleanup errors - this is expected during shutdown
             pass
 
+        return True
+
+    def prompt_cache_clear(self, key: Optional[str] = None) -> bool:
+        """Clear llama.cpp prompt caches (GGUF only; best-effort)."""
+        cleared = super().prompt_cache_clear(key)
+        try:
+            if getattr(self, "llm", None) is not None and hasattr(self.llm, "set_cache"):
+                self.llm.set_cache(None)
+        except Exception:
+            pass
+        return cleared
+
     def _is_gguf_model(self, model: str) -> bool:
         """Detect if the model is a GGUF model"""
         # Check if it's a .gguf file path
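A hedged sketch of how these prompt-cache hooks could be exercised for a GGUF model. The model name is illustrative, and it assumes the provider's standard generate() entry point forwards extra keyword arguments, since the GGUF path later in this diff reads prompt_cache_key from kwargs:

    from abstractcore.providers.huggingface_provider import HuggingFaceProvider

    provider = HuggingFaceProvider(model="unsloth/Qwen3-4B-Instruct-2507-GGUF")

    if provider.supports_prompt_cache():
        # Pre-create a named llama.cpp RAM cache (default capacity is 512 MiB).
        provider.prompt_cache_set("support-session", capacity_bytes=256 << 20)

    # Requests that pass the same key reuse the cached prefix state.
    response = provider.generate("Summarize the ticket above.", prompt_cache_key="support-session")

    # Drop the cache when the session ends.
    provider.prompt_cache_clear("support-session")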
@@ -238,19 +305,75 @@ class HuggingFaceProvider(BaseProvider):
         return any(vision_keyword in model_lower for vision_keyword in vision_models)
 
     def _setup_device_transformers(self):
-        """Setup device for transformers models"""
+        """Setup device for transformers models (best-effort).
+
+        We validate explicit device requests even when Transformers isn't available,
+        since Torch availability (MPS/CUDA) may still matter for downstream behavior.
+        """
+        try:
+            import torch  # type: ignore
+        except Exception:
+            self.device = "cpu"
+            return
+
+        requested = str(self.device or "").strip().lower() if isinstance(self.device, str) else ""
+        if requested and requested != "auto":
+            # Respect explicit user/env request, but fall back safely if unavailable.
+            if requested == "mps":
+                if hasattr(torch.backends, "mps") and torch.backends.mps.is_built() and not torch.backends.mps.is_available():
+                    self.logger.warning(
+                        "HuggingFaceProvider requested device=mps but MPS is not available. "
+                        "This usually means the process cannot see Metal devices (sandboxed execution). "
+                        "Falling back to CPU. To silence this, set ABSTRACTCORE_HF_DEVICE=cpu."
+                    )
+                    self.device = "cpu"
+                else:
+                    self.device = "mps"
+            elif requested == "cuda":
+                if torch.cuda.is_available():
+                    self.device = "cuda"
+                else:
+                    self.logger.warning(
+                        "HuggingFaceProvider requested device=cuda but CUDA is not available; falling back to CPU."
+                    )
+                    self.device = "cpu"
+            else:
+                self.device = "cpu"
+            return
+
         if not TRANSFORMERS_AVAILABLE:
+            # Without transformers, default to CPU for safety.
+            self.device = "cpu"
             return
 
-        if self.device:
-            self.device = self.device
-        elif torch.backends.mps.is_available():
+        # Auto device selection.
+        if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
             self.device = "mps"
         elif torch.cuda.is_available():
             self.device = "cuda"
         else:
             self.device = "cpu"
 
+        # Apple Silicon: MPS built but unavailable is usually a sandbox / Metal visibility issue.
+        try:
+            import platform
+
+            if (
+                self.device == "cpu"
+                and platform.system() == "Darwin"
+                and platform.machine() == "arm64"
+                and hasattr(torch.backends, "mps")
+                and torch.backends.mps.is_built()
+                and not torch.backends.mps.is_available()
+            ):
+                self.logger.warning(
+                    "PyTorch was built with MPS support, but MPS is not available. "
+                    "This often indicates the process cannot access Metal devices (sandboxed execution). "
+                    "Run outside the sandbox or force CPU via ABSTRACTCORE_HF_DEVICE=cpu."
+                )
+        except Exception:
+            pass
+
     def _setup_device_gguf(self):
         """Setup device for GGUF models"""
         # Auto-detect GPU layers if not specified
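The sandbox diagnosis above relies on the difference between torch.backends.mps.is_built() and torch.backends.mps.is_available(); a small standalone probe using only public PyTorch APIs shows the condition the warning targets:

    import platform

    import torch

    built = hasattr(torch.backends, "mps") and torch.backends.mps.is_built()
    available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()

    if platform.system() == "Darwin" and platform.machine() == "arm64" and built and not available:
        # Same situation the provider logs: MPS support is compiled into the wheel,
        # but this process cannot see a Metal device (typical of sandboxed execution).
        print("MPS built but unavailable; set ABSTRACTCORE_HF_DEVICE=cpu to force CPU.")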
@@ -396,6 +519,15 @@ class HuggingFaceProvider(BaseProvider):
         # Respect offline-first configuration
         if _config.should_force_local_files_only():
             vision_kwargs['local_files_only'] = True
+
+        # Safer defaults on GPU backends: float16 unless caller provided torch_dtype.
+        try:
+            if self.device in {"mps", "cuda"} and "torch_dtype" not in vision_kwargs:
+                import torch as _torch
+
+                vision_kwargs["torch_dtype"] = _torch.float16
+        except Exception:
+            pass
 
         # Use local cache path if offline mode is enabled and model is cached
         model_path = self.model
@@ -419,6 +551,11 @@ class HuggingFaceProvider(BaseProvider):
         # Move to device (only if not using device_map)
         if self.device in ["cuda", "mps"] and 'device_map' not in self.transformers_kwargs:
             self.model_instance = self.model_instance.to(self.device)
+
+        try:
+            self.model_instance.eval()
+        except Exception:
+            pass
 
         # For vision models, we don't use the standard pipeline
         self.pipeline = None
@@ -737,7 +874,7 @@ class HuggingFaceProvider(BaseProvider):
         # Check if Outlines is required but unavailable
         if self.structured_output_method == "native_outlines" and not OUTLINES_AVAILABLE:
             return GenerateResponse(
-                content="Error: structured_output_method='native_outlines' requires Outlines library. Install with: pip install abstractcore[huggingface]",
+                content="Error: structured_output_method='native_outlines' requires Outlines library. Install with: pip install \"abstractcore[huggingface]\"",
                 model=self.model,
                 finish_reason="error"
             )
@@ -787,6 +924,7 @@ class HuggingFaceProvider(BaseProvider):
 
         # Build input text with tool and media support
         # Handle media content first if present
+        media_enrichment = None
         if media:
             try:
                 from ..media.handlers import LocalMediaHandler
@@ -794,6 +932,7 @@ class HuggingFaceProvider(BaseProvider):
 
                 # Create multimodal message combining text and media
                 multimodal_message = media_handler.create_multimodal_message(prompt, media)
+                media_enrichment = getattr(media_handler, "media_enrichment", None)
 
                 # For local providers, we get text-embedded content
                 if isinstance(multimodal_message, str):
@@ -812,7 +951,7 @@ class HuggingFaceProvider(BaseProvider):
                 else:
                     prompt = str(multimodal_message["content"])
             except ImportError:
-                self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
+                self.logger.warning("Media processing not available. Install with: pip install \"abstractcore[media]\"")
             except Exception as e:
                 self.logger.warning(f"Failed to process media content: {e}")
 
@@ -821,15 +960,19 @@ class HuggingFaceProvider(BaseProvider):
         # Generation parameters using unified system
         generation_kwargs = self._prepare_generation_kwargs(**kwargs)
         max_new_tokens = self._get_provider_max_tokens_param(generation_kwargs)
-        temperature = kwargs.get("temperature", self.temperature)
+        temperature = generation_kwargs.get("temperature", self.temperature)
         top_p = kwargs.get("top_p", 0.9)
-        seed_value = kwargs.get("seed", self.seed)
+        seed_value = generation_kwargs.get("seed")
 
         try:
             if stream:
                 return self._stream_generate_transformers_with_tools(input_text, max_new_tokens, temperature, top_p, tools, kwargs.get('tool_call_tags'), seed_value)
             else:
                 response = self._single_generate_transformers(input_text, max_new_tokens, temperature, top_p, seed_value)
+                if media_enrichment:
+                    from ..media.enrichment import merge_enrichment_metadata
+
+                    response.metadata = merge_enrichment_metadata(response.metadata, media_enrichment)
 
                 # Handle tool execution for prompted models
                 if tools and self.tool_handler.supports_prompted and response.content:
@@ -982,41 +1125,101 @@ class HuggingFaceProvider(BaseProvider):
             )
 
         try:
+            # Server/gateway sometimes call providers with prompt="" + messages=[...] + media=[...].
+            # For multimodal models, the user text and the media must live in the SAME user turn.
+            # Best-effort: if prompt is empty, lift the last user message text into the prompt and
+            # remove that message from the history to avoid duplication.
+            prompt_text = prompt
+            messages_for_context = list(messages) if isinstance(messages, list) else None
+            if (not isinstance(prompt_text, str) or not prompt_text.strip()) and media and messages_for_context:
+                for i in range(len(messages_for_context) - 1, -1, -1):
+                    msg = messages_for_context[i] or {}
+                    role = str(msg.get("role", "") or "").strip().lower()
+                    if role != "user":
+                        continue
+                    content = msg.get("content", "")
+                    lifted = None
+                    if isinstance(content, str) and content.strip():
+                        lifted = content.strip()
+                    elif isinstance(content, list):
+                        # OpenAI-style list content: [{"type":"text","text":"..."}, ...]
+                        for item in content:
+                            if not isinstance(item, dict):
+                                continue
+                            if str(item.get("type", "") or "").strip().lower() == "text":
+                                text_val = item.get("text")
+                                if isinstance(text_val, str) and text_val.strip():
+                                    lifted = text_val.strip()
+                                break
+                    if lifted:
+                        prompt_text = lifted
+                        del messages_for_context[i]
+                        break
+
             # Build messages for vision model
             chat_messages = []
 
             if system_prompt:
                 chat_messages.append({"role": "system", "content": system_prompt})
 
-            if messages:
-                chat_messages.extend(messages)
+            if messages_for_context:
+                chat_messages.extend(messages_for_context)
 
             # Build user message with media content
             user_content = []
 
             # Add text content
-            if prompt:
-                user_content.append({"type": "text", "text": prompt})
+            if isinstance(prompt_text, str) and prompt_text.strip():
+                user_content.append({"type": "text", "text": prompt_text.strip()})
 
-            # Add media content (images)
+            # Add media content (images, video)
+            has_video = False
+            try:
+                from ..media.types import MediaType, ContentFormat
+            except Exception:
+                MediaType = None  # type: ignore[assignment]
+                ContentFormat = None  # type: ignore[assignment]
+
             if media:
                 for media_item in media:
-                    if hasattr(media_item, 'file_path') and media_item.file_path:
-                        # Use file path directly
-                        user_content.append({
-                            "type": "image",
-                            "url": str(media_item.file_path)
-                        })
-                    elif hasattr(media_item, 'content') and media_item.content:
-                        # Handle base64 content
-                        if media_item.content_format == 'BASE64':
-                            # Create data URL for base64 content
-                            mime_type = getattr(media_item, 'mime_type', 'image/png')
-                            data_url = f"data:{mime_type};base64,{media_item.content}"
-                            user_content.append({
-                                "type": "image",
-                                "url": data_url
-                            })
+                    media_type = getattr(media_item, "media_type", None)
+
+                    # Text markers (e.g. provenance / policy annotations) should be preserved for the model.
+                    if MediaType is not None and media_type == MediaType.TEXT:
+                        txt = getattr(media_item, "content", None)
+                        if isinstance(txt, str) and txt.strip():
+                            user_content.append({"type": "text", "text": txt.strip()})
+                        continue
+
+                    # Video inputs
+                    if MediaType is not None and media_type == MediaType.VIDEO:
+                        has_video = True
+                        # The actual video content is provided to the processor via `videos=...`;
+                        # the chat template only needs a `<video>` placeholder token.
+                        user_content.append({"type": "video"})
+                        continue
+
+                    # Image inputs
+                    if MediaType is None or media_type == MediaType.IMAGE:
+                        if getattr(media_item, "file_path", None):
+                            user_content.append({"type": "image", "url": str(media_item.file_path)})
+                            continue
+
+                        content = getattr(media_item, "content", None)
+                        if not content:
+                            continue
+
+                        content_format = getattr(media_item, "content_format", None)
+                        is_base64 = False
+                        if ContentFormat is not None and content_format == ContentFormat.BASE64:
+                            is_base64 = True
+                        elif isinstance(content_format, str) and content_format.strip().lower() == "base64":
+                            is_base64 = True
+
+                        if is_base64:
+                            mime_type = getattr(media_item, "mime_type", "image/png")
+                            data_url = f"data:{mime_type};base64,{content}"
+                            user_content.append({"type": "image", "url": data_url})
 
             # Add user message
             chat_messages.append({
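For reference, the user turn assembled above takes roughly the following shape by the time it reaches apply_chat_template; all values below are placeholders, and a video attachment contributes only the bare {"type": "video"} entry because the frames are handed to the processor separately:

    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is shown in this clip?"},
                {"type": "image", "url": "data:image/png;base64,iVBORw0KGgo..."},
                {"type": "video"},  # placeholder token only; frames go through videos=...
            ],
        },
    ]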
@@ -1024,48 +1227,285 @@ class HuggingFaceProvider(BaseProvider):
                 "content": user_content
             })
 
-            # Process messages using the processor
-            inputs = self.processor.apply_chat_template(
-                chat_messages,
-                tokenize=True,
-                add_generation_prompt=True,
-                return_dict=True,
-                return_tensors="pt"
-            ).to(self.model_instance.device)
+            # Process messages using the processor.
+            #
+            # Some multimodal processors (e.g. LlavaNextVideoProcessor) return a *string*
+            # from apply_chat_template; for those we must call the processor separately
+            # with explicit images/videos tensors and keep video frame counts bounded.
+            if has_video:
+                # Resolve max frames for video sampling (keep small to avoid huge context).
+                max_frames_raw = kwargs.get("video_max_frames", None)
+                if max_frames_raw is None:
+                    try:
+                        from ..config.manager import get_config_manager
+
+                        cfg_video = getattr(get_config_manager().config, "video", None)
+                        max_frames_raw = getattr(cfg_video, "max_frames_native", None) if cfg_video is not None else None
+                        if max_frames_raw is None:
+                            max_frames_raw = getattr(cfg_video, "max_frames", None) if cfg_video is not None else None
+                    except Exception:
+                        max_frames_raw = 3
+                try:
+                    max_video_frames = max(1, int(max_frames_raw))
+                except Exception:
+                    max_video_frames = 3
+
+                sampling_strategy_raw = kwargs.get("video_sampling_strategy", None)
+                if sampling_strategy_raw is None:
+                    try:
+                        from ..config.manager import get_config_manager
+
+                        sampling_strategy_raw = getattr(get_config_manager().config, "video", None).sampling_strategy  # type: ignore[union-attr]
+                    except Exception:
+                        sampling_strategy_raw = "uniform"
+                sampling_strategy = str(sampling_strategy_raw or "uniform").strip().lower()
+                if sampling_strategy not in {"uniform", "keyframes"}:
+                    sampling_strategy = "uniform"
+
+                max_frame_side_raw = kwargs.get("video_max_frame_side", None)
+                if max_frame_side_raw is None:
+                    try:
+                        from ..config.manager import get_config_manager
+
+                        max_frame_side_raw = getattr(get_config_manager().config, "video", None).max_frame_side  # type: ignore[union-attr]
+                    except Exception:
+                        max_frame_side_raw = 1024
+                try:
+                    max_frame_side = int(max_frame_side_raw) if max_frame_side_raw is not None else None
+                except Exception:
+                    max_frame_side = 1024
+                if isinstance(max_frame_side, int) and max_frame_side <= 0:
+                    max_frame_side = None
+
+                # Build multimodal-typed messages for chat_template renderers that expect list content.
+                # NOTE: Many HF native-video VLMs are brittle in multi-turn mode if prior turns
+                # referenced media but we only retained text history (no `<video>` placeholders).
+                # This can cause follow-ups like "and this one?" to over-weight the previous
+                # text-only answer and ignore the newly attached video.
+                #
+                # To make follow-ups robust, collapse prior USER/ASSISTANT turns into a single
+                # text block inside the current user message, and keep exactly one `<video>`
+                # placeholder (the current attachment) in the chat template input.
+                history_lines = []
+                if messages_for_context:
+                    for msg in messages_for_context:
+                        role = str(msg.get("role", "user") or "").strip().lower()
+                        if role not in {"user", "assistant"}:
+                            continue
+                        content = msg.get("content", "")
+                        text = ""
+                        if isinstance(content, str):
+                            text = content
+                        elif isinstance(content, list):
+                            # OpenAI-style list content: [{"type":"text","text":"..."}, ...]
+                            for item in content:
+                                if not isinstance(item, dict):
+                                    continue
+                                if str(item.get("type", "") or "").strip().lower() != "text":
+                                    continue
+                                v = item.get("text")
+                                if isinstance(v, str) and v.strip():
+                                    text = v
+                                    break
+                        else:
+                            text = str(content)
+
+                        text = str(text or "").strip()
+                        if not text:
+                            continue
+                        prefix = "USER" if role == "user" else "ASSISTANT"
+                        history_lines.append(f"{prefix}: {text}")
+
+                if history_lines:
+                    history_block = "Prior chat context (text-only):\n" + "\n".join(history_lines) + "\n\n"
+                    # Cap to avoid pathological prompt growth; keep the most recent tail.
+                    if len(history_block) > 8_000:
+                        history_block = "Prior chat context (text-only; truncated):\n…\n" + history_block[-7_800:]
+                    user_content = [{"type": "text", "text": history_block}] + list(user_content)
+
+                mm_messages = []
+                if system_prompt:
+                    mm_messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
+                mm_messages.append({"role": "user", "content": user_content})
+
+                prompt_text = self.processor.apply_chat_template(mm_messages, add_generation_prompt=True)
+
+                # Prepare explicit video inputs for the processor.
+                #
+                # Prefer ffmpeg-sampled frames (our own extraction) over relying on torchvision/torchcodec
+                # decoding inside Transformers, which can vary by platform/codec support (notably for .mov).
+                video_paths = []
+                image_inputs = []
+                for media_item in (media or []):
+                    if MediaType is not None and getattr(media_item, "media_type", None) == MediaType.VIDEO:
+                        video_path = getattr(media_item, "file_path", None) or getattr(media_item, "content", None)
+                        if not isinstance(video_path, str) or not video_path.strip():
+                            raise ValueError("Video MediaContent must provide file_path for HuggingFace video models.")
+                        video_paths.append(video_path)
+                    elif MediaType is not None and getattr(media_item, "media_type", None) == MediaType.IMAGE:
+                        fp = getattr(media_item, "file_path", None)
+                        if isinstance(fp, str) and fp.strip():
+                            try:
+                                from PIL import Image as PILImage
+                            except ImportError as e:
+                                raise RuntimeError(f"PIL is required for HuggingFace image inputs: {e}")
+                            image_inputs.append(PILImage.open(fp).convert("RGB"))
+
+                processor_call: Dict[str, Any] = {"text": prompt_text, "return_tensors": "pt"}
+                if image_inputs:
+                    processor_call["images"] = image_inputs if len(image_inputs) > 1 else image_inputs[0]
+                if video_paths:
+                    # Try ffmpeg frame sampling first.
+                    video_frame_inputs = []
+                    temp_dirs = []
+                    try:
+                        from pathlib import Path
+                        import tempfile
+
+                        from ..media.utils.video_frames import extract_video_frames
+                        from PIL import Image as PILImage
+
+                        for vp in video_paths:
+                            out_dir = Path(tempfile.mkdtemp(prefix="abstractcore_hf_video_frames_"))
+                            temp_dirs.append(out_dir)
+                            frames, _timestamps_s = extract_video_frames(
+                                Path(vp),
+                                max_frames=max_video_frames,
+                                frame_format="jpg",
+                                sampling_strategy=sampling_strategy,
+                                max_side=max_frame_side,
+                                output_dir=out_dir,
+                            )
+                            if not frames:
+                                raise RuntimeError("No frames extracted")
+                            video_frame_inputs.append([PILImage.open(p).convert("RGB") for p in frames])
+
+                        # Single video -> pass list[PIL]; multiple videos -> list[list[PIL]]
+                        processor_call["videos"] = (
+                            video_frame_inputs[0]
+                            if len(video_frame_inputs) == 1
+                            else video_frame_inputs
+                        )
+                    except Exception:
+                        # If anything goes wrong with ffmpeg sampling, fall back to transformers decode.
+                        processor_call["videos"] = video_paths if len(video_paths) > 1 else video_paths[0]
+                        processor_call["videos_kwargs"] = {"do_sample_frames": True, "num_frames": max_video_frames}
+                    finally:
+                        # Cleanup extracted frames directories (frames are already loaded into memory as PIL).
+                        for d in temp_dirs:
+                            try:
+                                import shutil
+
+                                shutil.rmtree(d, ignore_errors=True)
+                            except Exception:
+                                pass
+
+                inputs = self.processor(**processor_call)
+                if hasattr(inputs, "to"):
+                    inputs = inputs.to(self.model_instance.device)
+            else:
+                templated = self.processor.apply_chat_template(
+                    chat_messages,
+                    tokenize=True,
+                    add_generation_prompt=True,
+                    return_dict=True,
+                    return_tensors="pt",
+                )
+                if isinstance(templated, str):
+                    # Processor returned a prompt string; fall back to explicit processor call.
+                    image_inputs = []
+                    for media_item in (media or []):
+                        if MediaType is not None and getattr(media_item, "media_type", None) == MediaType.IMAGE:
+                            fp = getattr(media_item, "file_path", None)
+                            if isinstance(fp, str) and fp.strip():
+                                try:
+                                    from PIL import Image as PILImage
+                                except ImportError as e:
+                                    raise RuntimeError(f"PIL is required for HuggingFace image inputs: {e}")
+                                image_inputs.append(PILImage.open(fp).convert("RGB"))
+
+                    processor_call: Dict[str, Any] = {"text": templated, "return_tensors": "pt"}
+                    if image_inputs:
+                        processor_call["images"] = image_inputs if len(image_inputs) > 1 else image_inputs[0]
+                    inputs = self.processor(**processor_call)
+                    if hasattr(inputs, "to"):
+                        inputs = inputs.to(self.model_instance.device)
+                else:
+                    inputs = templated.to(self.model_instance.device)
 
-            # Generation parameters
+            temperature_value = kwargs.get("temperature", self.temperature)
+            # For HF multimodal video models, default to greedy decoding unless the caller explicitly
+            # provided a temperature. This avoids premature EOS producing unusably short answers.
+            if has_video and ("temperature" in kwargs) and kwargs.get("temperature") is None:
+                temperature_value = 0.0
+            if temperature_value is None:
+                temperature_value = self.temperature
+
+            max_new_tokens_raw = kwargs.get("max_output_tokens", None)
+            if max_new_tokens_raw is None:
+                max_new_tokens_raw = kwargs.get("max_tokens", None)
+            if max_new_tokens_raw is None:
+                max_new_tokens_raw = self.max_output_tokens or 512
+            try:
+                max_new_tokens_value = max(1, int(max_new_tokens_raw))
+            except Exception:
+                max_new_tokens_value = int(self.max_output_tokens or 512)
+
+            do_sample = True
+            try:
+                if temperature_value is None or float(temperature_value) <= 0:
+                    do_sample = False
+                    temperature_value = 0.0
+            except Exception:
+                do_sample = True
+
             generation_kwargs = {
-                "max_new_tokens": kwargs.get("max_tokens", self.max_output_tokens or 512),
-                "temperature": kwargs.get("temperature", self.temperature),
-                "do_sample": True,
+                "max_new_tokens": max_new_tokens_value,
+                "temperature": temperature_value,
+                "do_sample": do_sample,
                 "pad_token_id": self.processor.tokenizer.eos_token_id,
             }
 
             # Add seed if provided
-            seed_value = kwargs.get("seed", self.seed)
+            seed_value = self._normalize_seed(kwargs.get("seed", self.seed))
             if seed_value is not None:
                 torch.manual_seed(seed_value)
                 if torch.cuda.is_available():
                     torch.cuda.manual_seed_all(seed_value)
 
             # Generate response
-            # For Apple Silicon, move inputs to CPU if MPS causes issues
-            if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
-                try:
-                    generated_ids = self.model_instance.generate(**inputs, **generation_kwargs)
-                except RuntimeError as e:
-                    if "MPS: Unsupported Border padding mode" in str(e):
-                        self.logger.warning("MPS Border padding mode error detected, falling back to CPU")
-                        # Move model and inputs to CPU
-                        cpu_model = self.model_instance.to('cpu')
-                        cpu_inputs = {k: v.to('cpu') if hasattr(v, 'to') else v for k, v in inputs.items()}
-                        generated_ids = cpu_model.generate(**cpu_inputs, **generation_kwargs)
-                        # Move model back to original device
-                        self.model_instance.to(self.model_instance.device)
+            generated_ids = None
+            try:
+                with torch.inference_mode():
+                    use_mps_lock = str(getattr(self, "device", "") or "").strip().lower() == "mps"
+                    if use_mps_lock:
+                        with _MPS_GENERATION_LOCK:
+                            generated_ids = self.model_instance.generate(**inputs, **generation_kwargs)
                     else:
-                        raise e
-            else:
-                generated_ids = self.model_instance.generate(**inputs, **generation_kwargs)
+                        generated_ids = self.model_instance.generate(**inputs, **generation_kwargs)
+            except RuntimeError as e:
+                if str(getattr(self, "device", "") or "").strip().lower() == "mps":
+                    raise RuntimeError(
+                        "HuggingFaceProvider vision/video generation failed on MPS. "
+                        "If this persists, force CPU via ABSTRACTCORE_HF_DEVICE=cpu."
+                    ) from e
+                raise
+            finally:
+                # Best-effort: keep MPS memory pressure low between calls.
+                try:
+                    if hasattr(torch, "mps") and hasattr(torch.mps, "empty_cache"):
+                        if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+                            if hasattr(torch.mps, "synchronize"):
+                                torch.mps.synchronize()
+                            torch.mps.empty_cache()
+                except Exception:
+                    pass
+                try:
+                    import gc
+
+                    gc.collect()
+                except Exception:
+                    pass
 
             # Decode response
             output_text = self.processor.decode(
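The ffmpeg-based sampling branch above delegates to the new abstractcore/media/utils/video_frames.py helper (file 38 in this diff). A hedged standalone sketch of the same call pattern, with example argument values mirroring the defaults used above:

    import tempfile
    from pathlib import Path

    from abstractcore.media.utils.video_frames import extract_video_frames

    out_dir = Path(tempfile.mkdtemp(prefix="abstractcore_hf_video_frames_"))
    frames, timestamps_s = extract_video_frames(
        Path("clip.mov"),                 # example input path
        max_frames=3,                     # mirrors the fallback cap used in the provider
        frame_format="jpg",
        sampling_strategy="uniform",      # or "keyframes"
        max_side=1024,
        output_dir=out_dir,
    )
    print(len(frames), timestamps_s)      # extracted frame paths plus their timestamps (seconds)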
@@ -1080,7 +1520,7 @@ class HuggingFaceProvider(BaseProvider):
             input_tokens = inputs["input_ids"].shape[1]
             output_tokens = len(generated_ids[0]) - input_tokens
 
-            return GenerateResponse(
+            response = GenerateResponse(
                 content=output_text.strip(),
                 model=self.model,
                 finish_reason="stop",
@@ -1093,15 +1533,25 @@ class HuggingFaceProvider(BaseProvider):
                 },
                 gen_time=gen_time
             )
+            if stream:
+                def _single_chunk_stream() -> Iterator[GenerateResponse]:
+                    yield response
+                return _single_chunk_stream()
+            return response
 
         except Exception as e:
             gen_time = (time.time() - start_time) * 1000 if 'start_time' in locals() else 0.0
-            return GenerateResponse(
+            error_resp = GenerateResponse(
                 content=f"Error in vision model generation: {str(e)}",
                 model=self.model,
                 finish_reason="error",
                 gen_time=gen_time
             )
+            if stream:
+                def _error_stream() -> Iterator[GenerateResponse]:
+                    yield error_resp
+                return _error_stream()
+            return error_resp
 
     def _patch_deepseek_for_mps(self):
         """Patch DeepSeek-OCR model to work with MPS instead of CUDA"""
@@ -1177,6 +1627,7 @@ class HuggingFaceProvider(BaseProvider):
             chat_messages.extend(messages)
 
         # Handle media content for the user message - use proper vision format for GGUF models
+        media_enrichment = None
         if media:
             try:
                 from ..architectures.detection import supports_vision
@@ -1222,10 +1673,11 @@ class HuggingFaceProvider(BaseProvider):
                 from ..media.handlers import LocalMediaHandler
                 media_handler = LocalMediaHandler("huggingface", self.model_capabilities, model_name=self.model)
                 multimodal_message = media_handler.create_multimodal_message(prompt, media)
+                media_enrichment = getattr(media_handler, "media_enrichment", None)
                 user_message_content = multimodal_message if isinstance(multimodal_message, str) else prompt
 
             except ImportError:
-                self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
+                self.logger.warning("Media processing not available. Install with: pip install \"abstractcore[media]\"")
                 user_message_content = prompt
             except Exception as e:
                 self.logger.warning(f"Failed to process media content: {e}")
@@ -1235,6 +1687,27 @@ class HuggingFaceProvider(BaseProvider):
 
         chat_messages.append({"role": "user", "content": user_message_content})
 
+        # Prompt caching (GGUF/llama.cpp): best-effort per-key cache selection.
+        prompt_cache_key = kwargs.get("prompt_cache_key")
+        if isinstance(prompt_cache_key, str) and prompt_cache_key.strip():
+            key = prompt_cache_key.strip()
+            cache_obj = self._prompt_cache_store.get(key)
+            if cache_obj is None:
+                self.prompt_cache_set(key, make_default=False)
+                cache_obj = self._prompt_cache_store.get(key)
+            try:
+                if cache_obj is not None and hasattr(self.llm, "set_cache"):
+                    self.llm.set_cache(cache_obj)
+            except Exception:
+                pass
+        else:
+            # Disable cache for this request when no key is provided.
+            try:
+                if hasattr(self.llm, "set_cache"):
+                    self.llm.set_cache(None)
+            except Exception:
+                pass
+
         # Prepare parameters using unified system
         unified_kwargs = self._prepare_generation_kwargs(**kwargs)
         max_output_tokens = self._get_provider_max_tokens_param(unified_kwargs)
@@ -1242,13 +1715,13 @@ class HuggingFaceProvider(BaseProvider):
         generation_kwargs = {
             "messages": chat_messages,
             "max_tokens": max_output_tokens,  # This is max_output_tokens for llama-cpp
-            "temperature": kwargs.get("temperature", self.temperature),
+            "temperature": unified_kwargs.get("temperature", self.temperature),
             "top_p": kwargs.get("top_p", 0.9),
             "stream": stream
         }
 
         # Add seed if provided (GGUF/llama-cpp supports seed)
-        seed_value = kwargs.get("seed", self.seed)
+        seed_value = unified_kwargs.get("seed")
         if seed_value is not None:
             generation_kwargs["seed"] = seed_value
 
@@ -1305,6 +1778,10 @@ class HuggingFaceProvider(BaseProvider):
             return self._stream_generate_gguf_with_tools(generation_kwargs, tools, has_native_tools, kwargs.get('tool_call_tags'))
         else:
             response = self._single_generate_gguf(generation_kwargs)
+            if media_enrichment:
+                from ..media.enrichment import merge_enrichment_metadata
+
+                response.metadata = merge_enrichment_metadata(response.metadata, media_enrichment)
 
             # Handle tool execution for both native and prompted responses
             if tools and (response.has_tool_calls() or