abstractcore-2.9.1-py3-none-any.whl → abstractcore-2.11.2-py3-none-any.whl

This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (83)
  1. abstractcore/__init__.py +7 -27
  2. abstractcore/apps/extractor.py +33 -100
  3. abstractcore/apps/intent.py +19 -0
  4. abstractcore/apps/judge.py +20 -1
  5. abstractcore/apps/summarizer.py +20 -1
  6. abstractcore/architectures/detection.py +34 -1
  7. abstractcore/architectures/response_postprocessing.py +313 -0
  8. abstractcore/assets/architecture_formats.json +38 -8
  9. abstractcore/assets/model_capabilities.json +781 -160
  10. abstractcore/compression/__init__.py +1 -2
  11. abstractcore/compression/glyph_processor.py +6 -4
  12. abstractcore/config/main.py +31 -19
  13. abstractcore/config/manager.py +389 -11
  14. abstractcore/config/vision_config.py +5 -5
  15. abstractcore/core/interface.py +151 -3
  16. abstractcore/core/session.py +16 -10
  17. abstractcore/download.py +1 -1
  18. abstractcore/embeddings/manager.py +20 -6
  19. abstractcore/endpoint/__init__.py +2 -0
  20. abstractcore/endpoint/app.py +458 -0
  21. abstractcore/mcp/client.py +3 -1
  22. abstractcore/media/__init__.py +52 -17
  23. abstractcore/media/auto_handler.py +42 -22
  24. abstractcore/media/base.py +44 -1
  25. abstractcore/media/capabilities.py +12 -33
  26. abstractcore/media/enrichment.py +105 -0
  27. abstractcore/media/handlers/anthropic_handler.py +19 -28
  28. abstractcore/media/handlers/local_handler.py +124 -70
  29. abstractcore/media/handlers/openai_handler.py +19 -31
  30. abstractcore/media/processors/__init__.py +4 -2
  31. abstractcore/media/processors/audio_processor.py +57 -0
  32. abstractcore/media/processors/office_processor.py +8 -3
  33. abstractcore/media/processors/pdf_processor.py +46 -3
  34. abstractcore/media/processors/text_processor.py +22 -24
  35. abstractcore/media/processors/video_processor.py +58 -0
  36. abstractcore/media/types.py +97 -4
  37. abstractcore/media/utils/image_scaler.py +20 -2
  38. abstractcore/media/utils/video_frames.py +219 -0
  39. abstractcore/media/vision_fallback.py +136 -22
  40. abstractcore/processing/__init__.py +32 -3
  41. abstractcore/processing/basic_deepsearch.py +15 -10
  42. abstractcore/processing/basic_intent.py +3 -2
  43. abstractcore/processing/basic_judge.py +3 -2
  44. abstractcore/processing/basic_summarizer.py +1 -1
  45. abstractcore/providers/__init__.py +3 -1
  46. abstractcore/providers/anthropic_provider.py +95 -8
  47. abstractcore/providers/base.py +1516 -81
  48. abstractcore/providers/huggingface_provider.py +546 -69
  49. abstractcore/providers/lmstudio_provider.py +35 -923
  50. abstractcore/providers/mlx_provider.py +382 -35
  51. abstractcore/providers/model_capabilities.py +5 -1
  52. abstractcore/providers/ollama_provider.py +99 -15
  53. abstractcore/providers/openai_compatible_provider.py +406 -180
  54. abstractcore/providers/openai_provider.py +188 -44
  55. abstractcore/providers/openrouter_provider.py +76 -0
  56. abstractcore/providers/registry.py +61 -5
  57. abstractcore/providers/streaming.py +138 -33
  58. abstractcore/providers/vllm_provider.py +92 -817
  59. abstractcore/server/app.py +461 -13
  60. abstractcore/server/audio_endpoints.py +139 -0
  61. abstractcore/server/vision_endpoints.py +1319 -0
  62. abstractcore/structured/handler.py +316 -41
  63. abstractcore/tools/common_tools.py +5501 -2012
  64. abstractcore/tools/comms_tools.py +1641 -0
  65. abstractcore/tools/core.py +37 -7
  66. abstractcore/tools/handler.py +4 -9
  67. abstractcore/tools/parser.py +49 -2
  68. abstractcore/tools/tag_rewriter.py +2 -1
  69. abstractcore/tools/telegram_tdlib.py +407 -0
  70. abstractcore/tools/telegram_tools.py +261 -0
  71. abstractcore/utils/cli.py +1085 -72
  72. abstractcore/utils/token_utils.py +2 -0
  73. abstractcore/utils/truncation.py +29 -0
  74. abstractcore/utils/version.py +3 -4
  75. abstractcore/utils/vlm_token_calculator.py +12 -2
  76. abstractcore-2.11.2.dist-info/METADATA +562 -0
  77. abstractcore-2.11.2.dist-info/RECORD +133 -0
  78. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/WHEEL +1 -1
  79. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/entry_points.txt +1 -0
  80. abstractcore-2.9.1.dist-info/METADATA +0 -1190
  81. abstractcore-2.9.1.dist-info/RECORD +0 -119
  82. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/licenses/LICENSE +0 -0
  83. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/top_level.txt +0 -0
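The hunks that follow all come from abstractcore/providers/base.py (entry 47 above, +1516 -81). Together they add a unified thinking= control, an in-process prompt cache, audio/video input policies, and seed/temperature normalization to BaseProvider. A minimal sketch of how the new request-level knobs are meant to be reached from a concrete provider; create_llm() and the provider/model names are assumptions not shown in this diff, while thinking= and prompt_cache_key are the parameters added below:

    # Sketch only: create_llm() and the provider/model names are illustrative assumptions;
    # thinking= and prompt_cache_key are the new knobs wired into BaseProvider in this diff.
    from abstractcore import create_llm  # assumed factory entry point

    llm = create_llm("ollama", model="qwen3:4b")
    resp = llm.generate(
        "Summarize the attached report in three bullet points.",
        thinking="low",                 # None/"auto", "on"/"off", or "low"/"medium"/"high"
        prompt_cache_key="session-42",  # OpenAI pass-through or local KV/prefix cache reuse
    )
    print(resp.content)                            # normalized answer text
    print((resp.metadata or {}).get("reasoning"))  # extracted reasoning trace, when present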
@@ -9,8 +9,10 @@ import warnings
9
9
  import json
10
10
  import re
11
11
  import socket
12
- from collections import deque
13
- from typing import List, Dict, Any, Optional, Union, Iterator, AsyncIterator, Type, TYPE_CHECKING
12
+ import hashlib
13
+ from collections import deque, OrderedDict
14
+ from dataclasses import dataclass, field
15
+ from typing import List, Dict, Any, Optional, Union, Iterator, AsyncIterator, Type, TYPE_CHECKING, Tuple
14
16
  from abc import ABC, abstractmethod
15
17
 
16
18
  try:
@@ -26,14 +28,20 @@ from ..events import EventType, Event
26
28
  from datetime import datetime
27
29
  from ..utils.structured_logging import get_logger
28
30
  from ..utils.jsonish import loads_dict_like
31
+ from ..utils.truncation import preview_text
29
32
  from ..exceptions import (
30
33
  ProviderAPIError,
31
34
  AuthenticationError,
32
35
  RateLimitError,
33
36
  InvalidRequestError,
37
+ UnsupportedFeatureError,
34
38
  ModelNotFoundError
35
39
  )
36
40
  from ..architectures import detect_architecture, get_architecture_format, get_model_capabilities
41
+ from ..architectures.response_postprocessing import (
42
+ normalize_assistant_text,
43
+ strip_output_wrappers,
44
+ )
37
45
  from ..tools import execute_tools
38
46
  from ..core.retry import RetryManager, RetryConfig
39
47
 
@@ -42,6 +50,178 @@ if TYPE_CHECKING: # pragma: no cover
42
50
  from ..media.types import MediaContent
43
51
 
44
52
 
53
+ @dataclass
54
+ class _PromptCacheEntry:
55
+ value: Any
56
+ created_at_s: float
57
+ last_accessed_at_s: float
58
+ ttl_s: Optional[float] = None
59
+ meta: Dict[str, Any] = field(default_factory=dict)
60
+
61
+
62
+ class PromptCacheStore:
63
+ """Best-effort in-process prompt cache store (LRU + optional TTL).
64
+
65
+ Providers can store arbitrary backend-specific cache objects keyed by a caller-provided string
66
+ (`prompt_cache_key`). This is primarily useful for local inference backends (MLX, llama.cpp).
67
+
68
+ Notes:
69
+ - This store is intentionally simple and in-process only.
70
+ - Callers should treat prompt caches as potentially sensitive (they contain user prompt state).
71
+ """
72
+
73
+ def __init__(self, *, max_entries: int = 32, default_ttl_s: Optional[float] = None):
74
+ self._max_entries = int(max_entries) if max_entries and int(max_entries) > 0 else 32
75
+ self._default_ttl_s = default_ttl_s if default_ttl_s is None else float(default_ttl_s)
76
+ self._entries: "OrderedDict[str, _PromptCacheEntry]" = OrderedDict()
77
+
78
+ def _is_expired(self, entry: _PromptCacheEntry) -> bool:
79
+ ttl_s = entry.ttl_s if entry.ttl_s is not None else self._default_ttl_s
80
+ if ttl_s is None:
81
+ return False
82
+ return (time.time() - entry.last_accessed_at_s) > float(ttl_s)
83
+
84
+ def get(self, key: str) -> Optional[Any]:
85
+ if not isinstance(key, str) or not key.strip():
86
+ return None
87
+ key = key.strip()
88
+ entry = self._entries.get(key)
89
+ if entry is None:
90
+ return None
91
+ if self._is_expired(entry):
92
+ self.delete(key)
93
+ return None
94
+ entry.last_accessed_at_s = time.time()
95
+ self._entries.move_to_end(key)
96
+ return entry.value
97
+
98
+ def set(
99
+ self,
100
+ key: str,
101
+ value: Any,
102
+ *,
103
+ ttl_s: Optional[float] = None,
104
+ meta: Optional[Dict[str, Any]] = None,
105
+ ) -> None:
106
+ if not isinstance(key, str) or not key.strip():
107
+ raise ValueError("prompt cache key must be a non-empty string")
108
+ key = key.strip()
109
+ now = time.time()
110
+ self._entries[key] = _PromptCacheEntry(
111
+ value=value,
112
+ created_at_s=now,
113
+ last_accessed_at_s=now,
114
+ ttl_s=ttl_s,
115
+ meta=dict(meta or {}),
116
+ )
117
+ self._entries.move_to_end(key)
118
+ while len(self._entries) > self._max_entries:
119
+ self._entries.popitem(last=False)
120
+
121
+ def delete(self, key: str) -> bool:
122
+ if not isinstance(key, str) or not key.strip():
123
+ return False
124
+ key = key.strip()
125
+ return self._entries.pop(key, None) is not None
126
+
127
+ def clear(self) -> None:
128
+ self._entries.clear()
129
+
130
+ def stats(self) -> Dict[str, Any]:
131
+ # Opportunistically purge expired entries.
132
+ expired = []
133
+ for k, v in self._entries.items():
134
+ if self._is_expired(v):
135
+ expired.append(k)
136
+ for k in expired:
137
+ self.delete(k)
138
+
139
+ return {
140
+ "entries": len(self._entries),
141
+ "max_entries": self._max_entries,
142
+ "default_ttl_s": self._default_ttl_s,
143
+ }
144
+
145
+ def keys(self) -> List[str]:
146
+ return list(self._entries.keys())
147
+
148
+ def meta(self, key: str) -> Optional[Dict[str, Any]]:
149
+ if not isinstance(key, str) or not key.strip():
150
+ return None
151
+ entry = self._entries.get(key.strip())
152
+ if entry is None:
153
+ return None
154
+ return dict(entry.meta or {})
155
+
156
+
157
+ @dataclass(frozen=True)
158
+ class PromptCacheModule:
159
+ """A single cacheable module of prompt context.
160
+
161
+ This is intentionally generic and JSON-serializable so higher-level layers (runtime/agent/memory)
162
+ can express cache intent without hard-coding provider-specific prompt formats.
163
+ """
164
+
165
+ module_id: str
166
+ system_prompt: Optional[str] = None
167
+ prompt: Optional[str] = None
168
+ messages: Optional[List[Dict[str, Any]]] = None
169
+ tools: Optional[List[Dict[str, Any]]] = None
170
+ add_generation_prompt: bool = False
171
+ scope: str = "private" # "private" | "shared" (advisory; enforcement is host-dependent)
172
+ meta: Dict[str, Any] = field(default_factory=dict)
173
+
174
+ def normalized(self) -> "PromptCacheModule":
175
+ module_id = str(self.module_id or "").strip()
176
+ system_prompt = str(self.system_prompt).strip() if isinstance(self.system_prompt, str) and self.system_prompt else None
177
+ prompt = str(self.prompt).strip() if isinstance(self.prompt, str) and self.prompt else None
178
+ messages = None
179
+ if isinstance(self.messages, list) and self.messages:
180
+ out: List[Dict[str, Any]] = []
181
+ for m in self.messages:
182
+ if isinstance(m, dict):
183
+ out.append(dict(m))
184
+ messages = out or None
185
+ tools = None
186
+ if isinstance(self.tools, list) and self.tools:
187
+ out_tools: List[Dict[str, Any]] = []
188
+ for t in self.tools:
189
+ if isinstance(t, dict):
190
+ out_tools.append(dict(t))
191
+ tools = out_tools or None
192
+ add_generation_prompt = bool(self.add_generation_prompt)
193
+ scope = str(self.scope or "private").strip().lower() or "private"
194
+ if scope not in {"private", "shared"}:
195
+ scope = "private"
196
+ meta = dict(self.meta or {})
197
+ return PromptCacheModule(
198
+ module_id=module_id,
199
+ system_prompt=system_prompt,
200
+ prompt=prompt,
201
+ messages=messages,
202
+ tools=tools,
203
+ add_generation_prompt=add_generation_prompt,
204
+ scope=scope,
205
+ meta=meta,
206
+ )
207
+
208
+ def fingerprint(self, *, version: int = 1) -> str:
209
+ """Stable module fingerprint for hierarchical cache keys (hex sha256)."""
210
+ mod = self.normalized()
211
+ payload = {
212
+ "v": int(version),
213
+ "module_id": mod.module_id,
214
+ "system_prompt": mod.system_prompt,
215
+ "prompt": mod.prompt,
216
+ "messages": mod.messages,
217
+ "tools": mod.tools,
218
+ "add_generation_prompt": bool(mod.add_generation_prompt),
219
+ "scope": mod.scope,
220
+ }
221
+ raw = json.dumps(payload, sort_keys=True, ensure_ascii=False, separators=(",", ":"))
222
+ return hashlib.sha256(raw.encode("utf-8")).hexdigest()
223
+
224
+
45
225
  class BaseProvider(AbstractCoreInterface, ABC):
46
226
  """
47
227
  Base provider class with integrated telemetry and events.
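A minimal sketch of the two prompt-cache primitives added above: PromptCacheStore holds arbitrary backend objects under caller-chosen keys (LRU eviction, optional TTL), and PromptCacheModule.fingerprint() yields a stable sha256 key suitable for hierarchical caching. The cached values here are illustrative:

    # Both classes are defined above in this file; the cached values are illustrative.
    from abstractcore.providers.base import PromptCacheModule, PromptCacheStore

    store = PromptCacheStore(max_entries=8, default_ttl_s=600)
    store.set("session-42", {"kv": "opaque backend state"}, meta={"backend": "mlx"})
    assert store.get("session-42") is not None   # access refreshes LRU order and the TTL clock
    print(store.stats())                         # {'entries': 1, 'max_entries': 8, 'default_ttl_s': 600.0}

    module = PromptCacheModule(
        module_id="system-core",
        system_prompt="You are a helpful assistant.",
        scope="shared",
    )
    key = module.fingerprint()                   # stable hex sha256 over the normalized payload
    store.set(key, {"prefix_tokens": []})        # backend-specific payload goes here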
@@ -60,6 +240,7 @@ class BaseProvider(AbstractCoreInterface, ABC):
60
240
  self.architecture_config = get_architecture_format(self.architecture)
61
241
  self.model_capabilities = get_model_capabilities(model)
62
242
 
243
+ # #[WARNING:TIMEOUT]
63
244
  # Setup timeout configuration (centralized defaults).
64
245
  #
65
246
  # Semantics:
@@ -136,12 +317,33 @@ class BaseProvider(AbstractCoreInterface, ABC):
136
317
  self.enable_tracing = kwargs.get('enable_tracing', False)
137
318
  self._traces = deque(maxlen=kwargs.get('max_traces', 100)) # Ring buffer for memory efficiency
138
319
 
320
+ # Prompt caching (best-effort; provider-specific behavior).
321
+ #
322
+ # - Remote providers (OpenAI): supports `prompt_cache_key` pass-through (server-managed caching).
323
+ # - Local runtimes (MLX / llama.cpp): can store KV/prefix caches in-process keyed by `prompt_cache_key`.
324
+ self._default_prompt_cache_key: Optional[str] = None
325
+ prompt_cache_max_entries = kwargs.get("prompt_cache_max_entries", kwargs.get("prompt_cache_max_items", 32))
326
+ prompt_cache_ttl_s = kwargs.get("prompt_cache_ttl_s", None)
327
+ self._prompt_cache_store = PromptCacheStore(
328
+ max_entries=int(prompt_cache_max_entries) if prompt_cache_max_entries is not None else 32,
329
+ default_ttl_s=prompt_cache_ttl_s,
330
+ )
331
+
139
332
  # Provider created successfully - no event emission needed
140
333
  # (The simplified event system focuses on generation and tool events only)
141
334
 
142
335
  # Set default token limits if not provided
143
336
  self._initialize_token_limits()
144
337
 
338
+ def __init_subclass__(cls, **kwargs): # pragma: no cover
339
+ super().__init_subclass__(**kwargs)
340
+ # Enforce a single unload path: providers must implement `unload_model()` and must not define `unload()`.
341
+ if "unload" in cls.__dict__:
342
+ raise TypeError(
343
+ f"{cls.__name__} defines unload(). "
344
+ "Providers must implement unload_model(model_name) and must not provide any other unload entrypoint."
345
+ )
346
+
145
347
  def _track_generation(self, prompt: str, response: Optional[GenerateResponse],
146
348
  start_time: float, success: bool = True,
147
349
  error: Optional[Exception] = None, stream: bool = False):
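The __init_subclass__ hook above makes the single-unload-path rule fail at class-definition time rather than at call time. A sketch of what it enforces; the subclass names are hypothetical:

    # Hypothetical subclasses showing the rule enforced by __init_subclass__ above.
    from abstractcore.providers.base import BaseProvider

    class GoodProvider(BaseProvider):
        def unload_model(self, model_name: str) -> None:
            ...  # the single supported unload entrypoint

    try:
        class BadProvider(BaseProvider):
            def unload(self) -> None:  # forbidden alias
                ...
    except TypeError as exc:
        print(exc)  # "... must implement unload_model(model_name) and must not provide any other unload entrypoint."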
@@ -174,7 +376,7 @@ class BaseProvider(AbstractCoreInterface, ABC):
174
376
 
175
377
  # Emit comprehensive event with all data in one dict
176
378
  event_data = {
177
- "prompt": prompt[:100] + "..." if len(prompt) > 100 else prompt,
379
+ "prompt": preview_text(prompt, max_chars=100),
178
380
  "success": success,
179
381
  "error": str(error) if error else None,
180
382
  "response_length": len(response.content) if response and response.content else 0,
@@ -222,7 +424,7 @@ class BaseProvider(AbstractCoreInterface, ABC):
222
424
  event_data = {
223
425
  "tool_name": tool_name,
224
426
  "arguments": arguments,
225
- "result": str(result)[:100] if result else None,
427
+ "result": preview_text(result, max_chars=100) if result else None,
226
428
  "error": str(error) if error else None,
227
429
  "success": success
228
430
  }
@@ -268,9 +470,11 @@ class BaseProvider(AbstractCoreInterface, ABC):
268
470
 
269
471
  # Extract generation parameters
270
472
  temperature = kwargs.get('temperature', self.temperature)
473
+ if temperature is None:
474
+ temperature = self.temperature
271
475
  max_tokens = kwargs.get('max_tokens', self.max_tokens)
272
476
  max_output_tokens = kwargs.get('max_output_tokens', self.max_output_tokens)
273
- seed = kwargs.get('seed', self.seed)
477
+ seed = self._normalize_seed(kwargs.get('seed', self.seed))
274
478
  top_p = kwargs.get('top_p', getattr(self, 'top_p', None))
275
479
  top_k = kwargs.get('top_k', getattr(self, 'top_k', None))
276
480
 
@@ -393,7 +597,10 @@ class BaseProvider(AbstractCoreInterface, ABC):
393
597
  if _looks_like_timeout(error) and not _has_explicit_duration(msg):
394
598
  t = _configured_timeout_s()
395
599
  if t is not None:
396
- return ProviderAPIError(f"{_provider_label()} API error: timed out after {t}s")
600
+ return ProviderAPIError(
601
+ f"{_provider_label()} API error: timed out after {t}s "
602
+ "(configured timeout; set timeout=None or default_timeout=0 for unlimited)"
603
+ )
397
604
  return ProviderAPIError(f"{_provider_label()} API error: timed out")
398
605
  return error
399
606
 
@@ -404,7 +611,10 @@ class BaseProvider(AbstractCoreInterface, ABC):
404
611
  if _looks_like_timeout(error):
405
612
  t = _configured_timeout_s()
406
613
  if t is not None:
407
- return ProviderAPIError(f"{_provider_label()} API error: timed out after {t}s")
614
+ return ProviderAPIError(
615
+ f"{_provider_label()} API error: timed out after {t}s "
616
+ "(configured timeout; set timeout=None or default_timeout=0 for unlimited)"
617
+ )
408
618
  return ProviderAPIError(f"{_provider_label()} API error: timed out")
409
619
 
410
620
  error_str = str(error).lower()
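The clarified timeout message above points at the two ways to lift the limit. A sketch under the assumption that the timeout setting is accepted as a provider keyword argument; this diff only shows the error text, not where the value is configured:

    # Assumption: timeout/default_timeout are provider configuration values as the message implies.
    from abstractcore import create_llm  # assumed factory, as in the sketch after the file list

    llm = create_llm("ollama", model="qwen3:4b", timeout=None)  # no client-side limit
    llm = create_llm("ollama", model="qwen3:4b", timeout=600)   # or raise it to 10 minutes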
@@ -418,6 +628,233 @@ class BaseProvider(AbstractCoreInterface, ABC):
418
628
  else:
419
629
  return ProviderAPIError(f"API error: {error}")
420
630
 
631
+ @staticmethod
632
+ def _normalize_thinking_request(thinking: Optional[Union[bool, str]]) -> Tuple[Optional[bool], Optional[str]]:
633
+ """Normalize `thinking=` into (enabled, level).
634
+
635
+ - enabled: True/False/None (None == "auto")
636
+ - level: Optional[str] in {"low","medium","high"} when requested
637
+ """
638
+ if thinking is None:
639
+ return None, None
640
+
641
+ if isinstance(thinking, bool):
642
+ return thinking, None
643
+
644
+ if isinstance(thinking, str):
645
+ s = thinking.strip().lower()
646
+ if not s or s == "auto":
647
+ return None, None
648
+ if s in {"on", "true", "yes"}:
649
+ return True, None
650
+ if s in {"off", "false", "no"}:
651
+ return False, None
652
+ if s in {"low", "medium", "high"}:
653
+ return True, s
654
+
655
+ raise ValueError('thinking must be one of: None, bool, "auto", "on", "off", "low", "medium", "high"')
656
+
657
+ def _model_reasoning_levels(self) -> List[str]:
658
+ levels = None
659
+ for src in (self.model_capabilities, self.architecture_config):
660
+ if not isinstance(src, dict):
661
+ continue
662
+ value = src.get("reasoning_levels")
663
+ if isinstance(value, list) and value:
664
+ levels = value
665
+ break
666
+ if not isinstance(levels, list):
667
+ return []
668
+ out: List[str] = []
669
+ for x in levels:
670
+ if isinstance(x, str) and x.strip():
671
+ out.append(x.strip().lower())
672
+ # Deduplicate while preserving order.
673
+ seen: set[str] = set()
674
+ uniq: List[str] = []
675
+ for x in out:
676
+ if x in seen:
677
+ continue
678
+ seen.add(x)
679
+ uniq.append(x)
680
+ return uniq
681
+
682
+ def _model_supports_thinking_control(self) -> bool:
683
+ caps = self.model_capabilities if isinstance(self.model_capabilities, dict) else {}
684
+ arch = self.architecture_config if isinstance(self.architecture_config, dict) else {}
685
+
686
+ if caps.get("thinking_support") is True:
687
+ return True
688
+ if isinstance(caps.get("thinking_tags"), (list, tuple)) and len(caps.get("thinking_tags")) == 2:
689
+ return True
690
+ if isinstance(caps.get("thinking_output_field"), str) and caps.get("thinking_output_field").strip():
691
+ return True
692
+ if self._model_reasoning_levels():
693
+ return True
694
+
695
+ if isinstance(arch.get("thinking_tags"), (list, tuple)) and len(arch.get("thinking_tags")) == 2:
696
+ return True
697
+ if isinstance(arch.get("thinking_control"), str) and arch.get("thinking_control").strip():
698
+ return True
699
+ if arch.get("reasoning_support") is True:
700
+ return True
701
+ if isinstance(arch.get("reasoning_levels"), list) and arch.get("reasoning_levels"):
702
+ return True
703
+
704
+ return False
705
+
706
+ def _apply_thinking_request(
707
+ self,
708
+ *,
709
+ thinking: Optional[Union[bool, str]],
710
+ prompt: str,
711
+ messages: Optional[List[Dict[str, str]]],
712
+ system_prompt: Optional[str],
713
+ kwargs: Dict[str, Any],
714
+ ) -> Tuple[str, Optional[List[Dict[str, str]]], Optional[str], Dict[str, Any]]:
715
+ """Apply unified thinking controls to the request."""
716
+ enabled, level = self._normalize_thinking_request(thinking)
717
+ if enabled is None and level is None:
718
+ return prompt, messages, system_prompt, kwargs
719
+
720
+ supports_control = self._model_supports_thinking_control()
721
+ reasoning_levels = self._model_reasoning_levels()
722
+
723
+ if level is not None and reasoning_levels and level not in reasoning_levels:
724
+ warnings.warn(
725
+ f"thinking level '{level}' requested but not supported for model '{self.model}' "
726
+ f"(supported: {reasoning_levels}); falling back to thinking='on'.",
727
+ RuntimeWarning,
728
+ stacklevel=3,
729
+ )
730
+ level = None
731
+ enabled = True
732
+
733
+ if level is not None and not reasoning_levels:
734
+ warnings.warn(
735
+ f"thinking level '{level}' requested but model '{self.model}' has no configured reasoning_levels; "
736
+ "falling back to thinking='on'.",
737
+ RuntimeWarning,
738
+ stacklevel=3,
739
+ )
740
+ level = None
741
+ enabled = True
742
+
743
+ handled_by_model_prompt = False
744
+
745
+ # Harmony (GPT-OSS): control via system message `Reasoning: low|medium|high`.
746
+ msg_fmt = str((self.architecture_config or {}).get("message_format") or "").strip().lower()
747
+ resp_fmt = str((self.model_capabilities or {}).get("response_format") or "").strip().lower()
748
+ is_harmony = msg_fmt == "harmony" or resp_fmt == "harmony"
749
+ if is_harmony:
750
+ target_level: Optional[str] = None
751
+ if level is not None:
752
+ target_level = level
753
+ elif enabled is False:
754
+ warnings.warn(
755
+ f"thinking='off' requested for Harmony model '{self.model}', but GPT-OSS reasoning traces "
756
+ "cannot be fully disabled; using Reasoning: low.",
757
+ RuntimeWarning,
758
+ stacklevel=3,
759
+ )
760
+ target_level = "low"
761
+ elif enabled is True:
762
+ # Make the default explicit when the caller opts-in.
763
+ target_level = "medium"
764
+
765
+ if target_level:
766
+ line = f"Reasoning: {target_level}"
767
+ if isinstance(system_prompt, str) and system_prompt.strip():
768
+ # Replace any existing Reasoning line; otherwise prepend.
769
+ if re.search(r"(?mi)^\\s*Reasoning\\s*:\\s*(low|medium|high)\\s*$", system_prompt):
770
+ system_prompt = re.sub(
771
+ r"(?mi)^\\s*Reasoning\\s*:\\s*(low|medium|high)\\s*$",
772
+ line,
773
+ system_prompt,
774
+ count=1,
775
+ )
776
+ else:
777
+ system_prompt = f"{line}\n{system_prompt}"
778
+ else:
779
+ system_prompt = line
780
+ handled_by_model_prompt = True
781
+
782
+ # Model-level control token for disabling thinking (e.g., GLM `/nothink`).
783
+ thinking_control = None
784
+ for src in (self.model_capabilities, self.architecture_config):
785
+ if not isinstance(src, dict):
786
+ continue
787
+ token = src.get("thinking_control")
788
+ if isinstance(token, str) and token.strip():
789
+ thinking_control = token.strip()
790
+
791
+ if enabled is False and thinking_control:
792
+ handled_by_model_prompt = True
793
+
794
+ def _append_control(text: str) -> str:
795
+ if thinking_control in text:
796
+ return text
797
+ return f"{text.rstrip()}\n{thinking_control}".strip()
798
+
799
+ if isinstance(prompt, str) and prompt.strip():
800
+ prompt = _append_control(prompt)
801
+ elif isinstance(messages, list) and messages:
802
+ # Append to the most recent user turn, if possible.
803
+ new_messages: List[Dict[str, str]] = []
804
+ appended = False
805
+ for m in messages:
806
+ if not isinstance(m, dict):
807
+ continue
808
+ new_messages.append(dict(m))
809
+ for m in reversed(new_messages):
810
+ if m.get("role") == "user" and isinstance(m.get("content"), str) and m["content"].strip():
811
+ m["content"] = _append_control(m["content"])
812
+ appended = True
813
+ break
814
+ messages = new_messages
815
+ if not appended:
816
+ warnings.warn(
817
+ f"thinking='off' requested for model '{self.model}', but no user prompt was available "
818
+ f"to append thinking_control='{thinking_control}'.",
819
+ RuntimeWarning,
820
+ stacklevel=3,
821
+ )
822
+
823
+ kwargs, handled_by_provider = self._apply_provider_thinking_kwargs(
824
+ enabled=enabled,
825
+ level=level,
826
+ kwargs=kwargs,
827
+ )
828
+
829
+ if not supports_control and thinking is not None:
830
+ warnings.warn(
831
+ f"thinking={thinking!r} requested but model '{self.model}' is not marked as thinking-capable "
832
+ "in model_capabilities.json; the request may be ignored.",
833
+ RuntimeWarning,
834
+ stacklevel=3,
835
+ )
836
+
837
+ if not handled_by_model_prompt and not handled_by_provider and (enabled is False or level is not None):
838
+ warnings.warn(
839
+ f"thinking={thinking!r} requested but provider '{self.provider or self.__class__.__name__}' "
840
+ "does not implement a thinking control mapping for this model; the request may be ignored.",
841
+ RuntimeWarning,
842
+ stacklevel=3,
843
+ )
844
+
845
+ return prompt, messages, system_prompt, kwargs
846
+
847
+ def _apply_provider_thinking_kwargs(
848
+ self,
849
+ *,
850
+ enabled: Optional[bool],
851
+ level: Optional[str],
852
+ kwargs: Dict[str, Any],
853
+ ) -> Tuple[Dict[str, Any], bool]:
854
+ """Provider-specific thinking knob hook (default: unsupported)."""
855
+ _ = (enabled, level)
856
+ return kwargs, False
857
+
421
858
  def generate_with_telemetry(self,
422
859
  prompt: str,
423
860
  messages: Optional[List[Dict[str, str]]] = None,
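The mapping implemented by _normalize_thinking_request() above, shown on a few representative inputs; the expected tuples follow directly from the code:

    # The (enabled, level) pairs below follow directly from the implementation above.
    from abstractcore.providers.base import BaseProvider

    norm = BaseProvider._normalize_thinking_request
    assert norm(None) == (None, None)      # "auto": leave the decision to the model/provider
    assert norm("auto") == (None, None)
    assert norm(True) == (True, None)
    assert norm("off") == (False, None)
    assert norm("HIGH") == (True, "high")  # case-insensitive; a level implies thinking is on
    # Any other value (e.g. "maybe", 3) raises ValueError.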
@@ -430,6 +867,7 @@ class BaseProvider(AbstractCoreInterface, ABC):
430
867
  tool_call_tags: Optional[str] = None, # Tool call tag rewriting
431
868
  execute_tools: Optional[bool] = None, # Tool execution control
432
869
  glyph_compression: Optional[str] = None, # Glyph compression preference
870
+ thinking: Optional[Union[bool, str]] = None, # Unified reasoning/thinking control
433
871
  **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse], BaseModel]:
434
872
  """
435
873
  Generate with integrated telemetry and error handling.
@@ -447,6 +885,7 @@ class BaseProvider(AbstractCoreInterface, ABC):
447
885
  tool_call_tags: Optional tool call tag format for rewriting
448
886
  execute_tools: Whether to execute tools automatically (True) or let agent handle execution (False)
449
887
  glyph_compression: Glyph compression preference ("auto", "always", "never")
888
+ thinking: Unified reasoning/thinking control (auto/on/off or low/medium/high when supported)
450
889
  """
451
890
  # Normalize token limit naming at the provider boundary.
452
891
  #
@@ -458,6 +897,18 @@ class BaseProvider(AbstractCoreInterface, ABC):
458
897
  if "max_output_tokens" not in kwargs and "max_tokens" in kwargs and kwargs.get("max_tokens") is not None:
459
898
  kwargs["max_output_tokens"] = kwargs.pop("max_tokens")
460
899
 
900
+ # Prompt caching: apply a default `prompt_cache_key` if configured.
901
+ self._apply_default_prompt_cache_key(kwargs)
902
+
903
+ # Apply unified thinking controls (provider-agnostic + provider-specific mappings).
904
+ prompt, messages, system_prompt, kwargs = self._apply_thinking_request(
905
+ thinking=thinking,
906
+ prompt=prompt,
907
+ messages=messages,
908
+ system_prompt=system_prompt,
909
+ kwargs=kwargs,
910
+ )
911
+
461
912
  # Handle structured output request
462
913
  if response_model is not None:
463
914
  if not PYDANTIC_AVAILABLE:
@@ -466,8 +917,12 @@ class BaseProvider(AbstractCoreInterface, ABC):
466
917
  "Install with: pip install pydantic>=2.0.0"
467
918
  )
468
919
 
469
- # Handle hybrid case: tools + structured output
470
- if tools is not None:
920
+ # Handle hybrid case: tools + structured output.
921
+ #
922
+ # NOTE: `tools=[]` should behave like "no tools". Treating an empty list as
923
+ # "tools present" triggers the hybrid 2-pass flow (unstructured call + structured
924
+ # follow-up) which is both slower and can cause provider-side timeouts/unloads.
925
+ if isinstance(tools, list) and len(tools) > 0:
471
926
  return self._handle_tools_with_structured_output(
472
927
  prompt=prompt,
473
928
  messages=messages,
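A small sketch of the distinction drawn in the comment above: an empty tools list now takes the plain structured-output path, while a non-empty list still triggers the hybrid two-pass flow. The Pydantic model, the toy tool, and the llm handle are illustrative, not from this diff:

    # Illustrative only: Verdict, search_tool, and llm are assumptions for the sketch.
    from pydantic import BaseModel

    class Verdict(BaseModel):
        label: str
        confidence: float

    def search_tool(query: str) -> str:
        """Toy tool used only to trigger the hybrid flow."""
        return "no results"

    # tools=[] is now treated as "no tools": single structured-output pass.
    v1 = llm.generate("Classify this ticket.", response_model=Verdict, tools=[])

    # A non-empty tool list still routes through the hybrid two-pass flow
    # (unstructured tool call first, structured follow-up second).
    v2 = llm.generate("Classify this ticket.", response_model=Verdict, tools=[search_tool])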
@@ -500,6 +955,7 @@ class BaseProvider(AbstractCoreInterface, ABC):
500
955
  # Process media content if provided
501
956
  processed_media = None
502
957
  media_metadata = None
958
+ media_enrichment = None
503
959
  if media:
504
960
  compression_pref = glyph_compression or kwargs.get('glyph_compression', 'auto')
505
961
  processed_media = self._process_media_content(media, compression_pref)
@@ -511,6 +967,639 @@ class BaseProvider(AbstractCoreInterface, ABC):
511
967
  if hasattr(media_content, 'metadata') and media_content.metadata:
512
968
  media_metadata.append(media_content.metadata)
513
969
 
970
+ # Audio input policy (v0): avoid placeholder degradation and require explicit fallbacks.
971
+ if processed_media:
972
+ try:
973
+ from ..media.types import ContentFormat, MediaType
974
+ from ..media.enrichment import build_enrichment_item
975
+ from ..capabilities.errors import CapabilityUnavailableError
976
+ except Exception:
977
+ ContentFormat = None # type: ignore[assignment]
978
+ MediaType = None # type: ignore[assignment]
979
+ build_enrichment_item = None # type: ignore[assignment]
980
+ CapabilityUnavailableError = Exception # type: ignore[assignment]
981
+
982
+ if MediaType is not None:
983
+ audio_items = [mc for mc in processed_media if getattr(mc, "media_type", None) == MediaType.AUDIO]
984
+ else:
985
+ audio_items = []
986
+
987
+ if audio_items:
988
+ # Resolve policy: per-call kwarg > config default.
989
+ policy_raw = kwargs.pop("audio_policy", None)
990
+ if policy_raw is None:
991
+ policy_raw = kwargs.pop("audio_handling_policy", None)
992
+ if policy_raw is None:
993
+ try:
994
+ from ..config.manager import get_config_manager
995
+
996
+ policy_raw = getattr(get_config_manager().config, "audio", None).strategy # type: ignore[union-attr]
997
+ except Exception:
998
+ policy_raw = "native_only"
999
+
1000
+ policy = str(policy_raw or "native_only").strip().lower()
1001
+ model_supports_audio = bool(getattr(self, "model_capabilities", {}).get("audio_support", False))
1002
+
1003
+ if policy in ("native_only", "native", "disabled"):
1004
+ if not model_supports_audio:
1005
+ raise UnsupportedFeatureError(
1006
+ f"Audio input is not supported by model '{self.model}'. "
1007
+ "Choose an audio-capable model, or pass audio_policy='speech_to_text' "
1008
+ "(requires an STT capability plugin, e.g. install abstractvoice)."
1009
+ )
1010
+ # Keep audio media for provider-native handling (provider support may still vary).
1011
+
1012
+ elif policy in ("speech_to_text", "stt"):
1013
+ stt_language = kwargs.pop("audio_language", None)
1014
+ if stt_language is None:
1015
+ stt_language = kwargs.pop("stt_language", None)
1016
+ if stt_language is None:
1017
+ try:
1018
+ from ..config.manager import get_config_manager
1019
+
1020
+ stt_language = getattr(get_config_manager().config, "audio", None).stt_language # type: ignore[union-attr]
1021
+ except Exception:
1022
+ stt_language = None
1023
+
1024
+ audio_context_parts: List[str] = []
1025
+ enrichments: List[Dict[str, Any]] = []
1026
+
1027
+ # Resolve backend id (best-effort) for transparency metadata.
1028
+ backend_id = getattr(getattr(self, "audio", None), "backend_id", None)
1029
+ backend = {"kind": "plugin"}
1030
+ if isinstance(backend_id, str) and backend_id.strip():
1031
+ backend["backend_id"] = backend_id.strip()
1032
+
1033
+ for idx, mc in enumerate(audio_items):
1034
+ name = None
1035
+ try:
1036
+ name = mc.metadata.get("file_name") if hasattr(mc, "metadata") and isinstance(mc.metadata, dict) else None
1037
+ except Exception:
1038
+ name = None
1039
+ if not isinstance(name, str) or not name.strip():
1040
+ name = mc.file_path if getattr(mc, "file_path", None) else f"audio_{idx+1}"
1041
+
1042
+ # Prefer a file path when available.
1043
+ audio_input: Any = None
1044
+ try:
1045
+ if getattr(mc, "file_path", None):
1046
+ audio_input = str(mc.file_path)
1047
+ elif getattr(mc, "content_format", None) == ContentFormat.FILE_PATH and isinstance(getattr(mc, "content", None), str):
1048
+ audio_input = str(mc.content)
1049
+ elif isinstance(getattr(mc, "content", None), (bytes, bytearray)):
1050
+ audio_input = bytes(mc.content)
1051
+ except Exception:
1052
+ audio_input = None
1053
+
1054
+ if audio_input is None:
1055
+ raise UnsupportedFeatureError("Audio STT fallback requires a file path or raw bytes for the audio input.")
1056
+
1057
+ try:
1058
+ transcript = self.audio.transcribe(audio_input, language=stt_language)
1059
+ except CapabilityUnavailableError as e: # type: ignore[misc]
1060
+ raise UnsupportedFeatureError(str(e))
1061
+
1062
+ transcript = str(transcript or "").strip()
1063
+ audio_context_parts.append(f"Audio {idx+1} ({name}): {transcript}")
1064
+ if build_enrichment_item is not None:
1065
+ enrichments.append(
1066
+ build_enrichment_item(
1067
+ status="used",
1068
+ input_modality="audio",
1069
+ summary_kind="transcript",
1070
+ policy="speech_to_text",
1071
+ backend=backend,
1072
+ input_index=idx + 1,
1073
+ input_name=str(name),
1074
+ injected_text=transcript,
1075
+ )
1076
+ )
1077
+
1078
+ # Remove audio media from the provider call (we injected text context instead).
1079
+ processed_media = [mc for mc in processed_media if getattr(mc, "media_type", None) != MediaType.AUDIO]
1080
+
1081
+ # Inject audio context into the prompt (similar recency semantics as vision fallback).
1082
+ original_prompt = prompt.strip() if isinstance(prompt, str) else ""
1083
+ parts: List[str] = []
1084
+ parts.append(
1085
+ "Audio context from attached audio file(s) "
1086
+ "(treat as directly observed; do not mention this section):"
1087
+ )
1088
+ parts.extend(audio_context_parts)
1089
+ if original_prompt:
1090
+ parts.append("Now answer the user's request:")
1091
+ parts.append(original_prompt)
1092
+ prompt = "\n\n".join(parts) if parts else original_prompt
1093
+
1094
+ media_enrichment = enrichments
1095
+
1096
+ elif policy == "auto":
1097
+ if model_supports_audio:
1098
+ pass # provider-native path
1099
+ else:
1100
+ # Explicit "auto" allows fallback, but never silently for default policy.
1101
+ # Re-enter through the explicit STT path by recursion is risky; inline minimal.
1102
+ stt_language = kwargs.pop("audio_language", None) or kwargs.pop("stt_language", None)
1103
+ audio_context_parts: List[str] = []
1104
+ enrichments: List[Dict[str, Any]] = []
1105
+ backend_id = getattr(getattr(self, "audio", None), "backend_id", None)
1106
+ backend = {"kind": "plugin"}
1107
+ if isinstance(backend_id, str) and backend_id.strip():
1108
+ backend["backend_id"] = backend_id.strip()
1109
+ for idx, mc in enumerate(audio_items):
1110
+ name = None
1111
+ try:
1112
+ name = mc.metadata.get("file_name") if hasattr(mc, "metadata") and isinstance(mc.metadata, dict) else None
1113
+ except Exception:
1114
+ name = None
1115
+ if not isinstance(name, str) or not name.strip():
1116
+ name = mc.file_path if getattr(mc, "file_path", None) else f"audio_{idx+1}"
1117
+ audio_input: Any = None
1118
+ try:
1119
+ if getattr(mc, "file_path", None):
1120
+ audio_input = str(mc.file_path)
1121
+ elif getattr(mc, "content_format", None) == ContentFormat.FILE_PATH and isinstance(getattr(mc, "content", None), str):
1122
+ audio_input = str(mc.content)
1123
+ elif isinstance(getattr(mc, "content", None), (bytes, bytearray)):
1124
+ audio_input = bytes(mc.content)
1125
+ except Exception:
1126
+ audio_input = None
1127
+ if audio_input is None:
1128
+ raise UnsupportedFeatureError("Audio STT fallback requires a file path or raw bytes for the audio input.")
1129
+ try:
1130
+ transcript = self.audio.transcribe(audio_input, language=stt_language)
1131
+ except CapabilityUnavailableError as e: # type: ignore[misc]
1132
+ raise UnsupportedFeatureError(str(e))
1133
+ transcript = str(transcript or "").strip()
1134
+ audio_context_parts.append(f"Audio {idx+1} ({name}): {transcript}")
1135
+ if build_enrichment_item is not None:
1136
+ enrichments.append(
1137
+ build_enrichment_item(
1138
+ status="used",
1139
+ input_modality="audio",
1140
+ summary_kind="transcript",
1141
+ policy="auto",
1142
+ backend=backend,
1143
+ input_index=idx + 1,
1144
+ input_name=str(name),
1145
+ injected_text=transcript,
1146
+ )
1147
+ )
1148
+ processed_media = [mc for mc in processed_media if getattr(mc, "media_type", None) != MediaType.AUDIO]
1149
+ original_prompt = prompt.strip() if isinstance(prompt, str) else ""
1150
+ parts: List[str] = []
1151
+ parts.append(
1152
+ "Audio context from attached audio file(s) "
1153
+ "(treat as directly observed; do not mention this section):"
1154
+ )
1155
+ parts.extend(audio_context_parts)
1156
+ if original_prompt:
1157
+ parts.append("Now answer the user's request:")
1158
+ parts.append(original_prompt)
1159
+ prompt = "\n\n".join(parts) if parts else original_prompt
1160
+ media_enrichment = enrichments
1161
+
1162
+ elif policy == "caption":
1163
+ raise UnsupportedFeatureError(
1164
+ "audio_policy='caption' is not configured in v0. "
1165
+ "Use audio_policy='speech_to_text' for speech, or configure a future audio caption backend."
1166
+ )
1167
+ else:
1168
+ raise ValueError(f"Unknown audio_policy '{policy}'. Expected one of: native_only, speech_to_text, auto, caption.")
1169
+
1170
+ # Video input policy (v0): allow native video where supported; otherwise fall back to sampled frames.
1171
+ # Note: most providers do not accept native video inputs; frame sampling provides a portable path.
1172
+ if processed_media:
1173
+ try:
1174
+ from ..media.types import MediaType
1175
+ from ..media.enrichment import build_enrichment_item
1176
+ except Exception:
1177
+ MediaType = None # type: ignore[assignment]
1178
+ build_enrichment_item = None # type: ignore[assignment]
1179
+
1180
+ if MediaType is not None:
1181
+ video_items = [mc for mc in processed_media if getattr(mc, "media_type", None) == MediaType.VIDEO]
1182
+ else:
1183
+ video_items = []
1184
+
1185
+ if video_items:
1186
+ policy_raw = kwargs.pop("video_policy", None)
1187
+ if policy_raw is None:
1188
+ policy_raw = kwargs.pop("video_handling_policy", None)
1189
+ if policy_raw is None:
1190
+ try:
1191
+ from ..config.manager import get_config_manager
1192
+
1193
+ policy_raw = getattr(get_config_manager().config, "video", None).strategy # type: ignore[union-attr]
1194
+ except Exception:
1195
+ policy_raw = "native_only"
1196
+
1197
+ policy = str(policy_raw or "native_only").strip().lower()
1198
+
1199
+ provider_name = str(getattr(self, "provider", "") or "").strip().lower()
1200
+ model_supports_native_video = bool(
1201
+ provider_name == "huggingface"
1202
+ and isinstance(getattr(self, "model_capabilities", None), dict)
1203
+ and getattr(self, "model_capabilities", {}).get("video_support", False)
1204
+ )
1205
+
1206
+ cfg_video = None
1207
+ try:
1208
+ from ..config.manager import get_config_manager
1209
+
1210
+ cfg_video = getattr(get_config_manager().config, "video", None)
1211
+ except Exception:
1212
+ cfg_video = None
1213
+
1214
+ # Sampling controls (best-effort; keep small by default).
1215
+ # NOTE: do not `pop` here: native video backends may also need the resolved values.
1216
+ max_frames_raw = kwargs.get("video_max_frames", None)
1217
+ if max_frames_raw is None:
1218
+ max_frames_raw = kwargs.get("max_video_frames", None)
1219
+ if max_frames_raw is None:
1220
+ fallback_default = getattr(cfg_video, "max_frames", 3) if cfg_video is not None else 3
1221
+ native_default = getattr(cfg_video, "max_frames_native", None) if cfg_video is not None else None
1222
+ if native_default is None:
1223
+ native_default = fallback_default
1224
+
1225
+ use_native_default = bool(
1226
+ model_supports_native_video and policy in ("native_only", "native", "disabled", "auto")
1227
+ )
1228
+ max_frames_raw = native_default if use_native_default else fallback_default
1229
+ try:
1230
+ max_frames = max(1, int(max_frames_raw))
1231
+ except Exception:
1232
+ max_frames = 3
1233
+
1234
+ frame_format_raw = kwargs.get("video_frame_format", None)
1235
+ if frame_format_raw is None:
1236
+ try:
1237
+ from ..config.manager import get_config_manager
1238
+
1239
+ frame_format_raw = getattr(get_config_manager().config, "video", None).frame_format # type: ignore[union-attr]
1240
+ except Exception:
1241
+ frame_format_raw = "jpg"
1242
+ frame_format = str(frame_format_raw or "jpg").strip().lower()
1243
+ if frame_format not in {"jpg", "jpeg", "png"}:
1244
+ frame_format = "jpg"
1245
+ if frame_format == "jpeg":
1246
+ frame_format = "jpg"
1247
+
1248
+ sampling_strategy_raw = kwargs.get("video_sampling_strategy", None)
1249
+ if sampling_strategy_raw is None:
1250
+ try:
1251
+ from ..config.manager import get_config_manager
1252
+
1253
+ sampling_strategy_raw = getattr(get_config_manager().config, "video", None).sampling_strategy # type: ignore[union-attr]
1254
+ except Exception:
1255
+ sampling_strategy_raw = "uniform"
1256
+ sampling_strategy = str(sampling_strategy_raw or "uniform").strip().lower()
1257
+ if sampling_strategy not in {"uniform", "keyframes"}:
1258
+ sampling_strategy = "uniform"
1259
+
1260
+ max_frame_side_raw = kwargs.get("video_max_frame_side", None)
1261
+ if max_frame_side_raw is None:
1262
+ max_frame_side_raw = kwargs.get("video_frame_max_side", None)
1263
+ if max_frame_side_raw is None:
1264
+ max_frame_side_raw = getattr(cfg_video, "max_frame_side", 1024) if cfg_video is not None else 1024
1265
+ try:
1266
+ max_frame_side = int(max_frame_side_raw) if max_frame_side_raw is not None else None
1267
+ except Exception:
1268
+ max_frame_side = 1024
1269
+ if isinstance(max_frame_side, int) and max_frame_side <= 0:
1270
+ max_frame_side = None
1271
+
1272
+ # Expose normalized sampling values to provider-native implementations.
1273
+ kwargs["video_max_frames"] = max_frames
1274
+ kwargs["video_frame_format"] = frame_format
1275
+ kwargs["video_sampling_strategy"] = sampling_strategy
1276
+ kwargs["video_max_frame_side"] = max_frame_side
1277
+
1278
+ if policy in ("native_only", "native", "disabled"):
1279
+ if not model_supports_native_video:
1280
+ raise UnsupportedFeatureError(
1281
+ f"Video input is not supported by model '{self.model}'. "
1282
+ "Choose a video-capable model, or pass video_policy='frames_caption' "
1283
+ "(samples frames and uses vision/image handling)."
1284
+ )
1285
+ # Keep video media for provider-native handling.
1286
+ try:
1287
+ from pathlib import Path
1288
+
1289
+ from ..media.utils.video_frames import probe_duration_s
1290
+
1291
+ for idx, mc in enumerate(video_items):
1292
+ video_path_raw = getattr(mc, "file_path", None) or getattr(mc, "content", None)
1293
+ if not isinstance(video_path_raw, str) or not video_path_raw.strip():
1294
+ continue
1295
+ vp = Path(video_path_raw)
1296
+ duration_s = probe_duration_s(vp)
1297
+ file_bytes = None
1298
+ try:
1299
+ file_bytes = int(vp.stat().st_size)
1300
+ except Exception:
1301
+ file_bytes = None
1302
+
1303
+ avg_gap_s = None
1304
+ try:
1305
+ if isinstance(duration_s, (int, float)) and duration_s > 0 and max_frames > 0:
1306
+ avg_gap_s = float(duration_s) / float(max_frames + 1)
1307
+ except Exception:
1308
+ avg_gap_s = None
1309
+
1310
+ self.logger.info(
1311
+ "Video input policy: native video enabled (video will be sampled/budgeted for model input).",
1312
+ provider=provider_name,
1313
+ model=self.model,
1314
+ video_policy=policy,
1315
+ video_index=idx + 1,
1316
+ video_name=vp.name,
1317
+ video_duration_s=duration_s,
1318
+ video_bytes=file_bytes,
1319
+ video_max_frames=max_frames,
1320
+ video_sampling_strategy=sampling_strategy,
1321
+ video_max_frame_side=max_frame_side,
1322
+ video_avg_gap_s=avg_gap_s,
1323
+ )
1324
+ if isinstance(avg_gap_s, float) and avg_gap_s >= 10.0:
1325
+ self.logger.warning(
1326
+ "Video sampling is sparse; important events may be missed. "
1327
+ "Consider increasing video_max_frames/video.max_frames_native or using keyframes sampling.",
1328
+ provider=provider_name,
1329
+ model=self.model,
1330
+ video_policy=policy,
1331
+ video_name=vp.name,
1332
+ video_duration_s=duration_s,
1333
+ video_max_frames=max_frames,
1334
+ video_avg_gap_s=avg_gap_s,
1335
+ )
1336
+ except Exception:
1337
+ pass
1338
+
1339
+ # Insert a short marker to disambiguate native-video inputs across turns.
1340
+ #
1341
+ # Without this, follow-ups like "and this one?" can be brittle for native
1342
+ # video VLMs (they may over-weight the previous text-only answer and ignore
1343
+ # that a *new* video is attached in the current call).
1344
+ try:
1345
+ from ..media.types import MediaContent, ContentFormat
1346
+ except Exception:
1347
+ MediaContent = None # type: ignore[assignment]
1348
+ ContentFormat = None # type: ignore[assignment]
1349
+
1350
+ if MediaContent is not None and ContentFormat is not None:
1351
+ try:
1352
+ from pathlib import Path
1353
+
1354
+ from ..media.utils.video_frames import probe_duration_s
1355
+ except Exception:
1356
+ Path = None # type: ignore[assignment]
1357
+ probe_duration_s = None # type: ignore[assignment]
1358
+
1359
+ new_media: List[Any] = []
1360
+ video_group_index = 0
1361
+ for mc in processed_media:
1362
+ if getattr(mc, "media_type", None) != MediaType.VIDEO: # type: ignore[operator]
1363
+ new_media.append(mc)
1364
+ continue
1365
+
1366
+ video_group_index += 1
1367
+ video_path_raw = getattr(mc, "file_path", None) or getattr(mc, "content", None)
1368
+
1369
+ video_name = f"video_{video_group_index}"
1370
+ duration_s = None
1371
+ file_bytes = None
1372
+ try:
1373
+ if Path is not None and isinstance(video_path_raw, str) and video_path_raw.strip():
1374
+ vp = Path(video_path_raw)
1375
+ video_name = vp.name or video_name
1376
+ try:
1377
+ file_bytes = int(vp.stat().st_size)
1378
+ except Exception:
1379
+ file_bytes = None
1380
+ if callable(probe_duration_s):
1381
+ try:
1382
+ duration_s = probe_duration_s(vp)
1383
+ except Exception:
1384
+ duration_s = None
1385
+ except Exception:
1386
+ duration_s = None
1387
+ file_bytes = None
1388
+
1389
+ marker = MediaContent(
1390
+ media_type=MediaType.TEXT,
1391
+ content=(
1392
+ f"Video {video_group_index} ({video_name}) is attached below. "
1393
+ "This is the current video for this user message. "
1394
+ "Answer the user's question about this video as if you watched it. "
1395
+ "If earlier turns mention other videos, images, or audio, ignore them unless the user explicitly asks you to compare. "
1396
+ "Do not mention tool activity, attachments lists, sampling, frames, extraction, or this marker."
1397
+ ),
1398
+ content_format=ContentFormat.TEXT,
1399
+ mime_type="text/plain",
1400
+ file_path=None,
1401
+ metadata={
1402
+ "processor": "VideoNativeInputMarker",
1403
+ "source_video": video_name,
1404
+ "duration_s": duration_s,
1405
+ "bytes": file_bytes,
1406
+ "max_frames": max_frames,
1407
+ "sampling_strategy": sampling_strategy,
1408
+ "max_frame_side": max_frame_side,
1409
+ },
1410
+ )
1411
+ new_media.append(marker)
1412
+ new_media.append(mc)
1413
+
1414
+ processed_media = new_media
1415
+
1416
+ elif policy in ("frames_caption", "frames", "frame_caption"):
1417
+ # Convert each video into a small set of sampled frames (images).
1418
+ try:
1419
+ from pathlib import Path
1420
+ import tempfile
1421
+
1422
+ from ..media import AutoMediaHandler
1423
+ from ..media.utils.video_frames import extract_video_frames, probe_duration_s
1424
+ except Exception as e:
1425
+ raise UnsupportedFeatureError(f"Video frame fallback is not available: {e}")
1426
+
1427
+ enrichments: List[Dict[str, Any]] = []
1428
+ new_media: List[Any] = []
1429
+
1430
+ video_group_index = 0
1431
+ for idx, mc in enumerate(processed_media):
1432
+ if getattr(mc, "media_type", None) != MediaType.VIDEO: # type: ignore[operator]
1433
+ new_media.append(mc)
1434
+ continue
1435
+
1436
+ video_group_index += 1
1437
+ video_path_raw = getattr(mc, "file_path", None) or getattr(mc, "content", None)
1438
+ if not isinstance(video_path_raw, str) or not video_path_raw.strip():
1439
+ raise UnsupportedFeatureError("Video frame fallback requires a video file path.")
1440
+ video_path = Path(video_path_raw)
1441
+ if not video_path.exists():
1442
+ raise UnsupportedFeatureError(f"Video file not found: {video_path}")
1443
+
1444
+ out_dir = Path(tempfile.mkdtemp(prefix="abstractcore_video_frames_"))
1445
+ duration_s = probe_duration_s(video_path)
1446
+ file_bytes = None
1447
+ try:
1448
+ file_bytes = int(video_path.stat().st_size)
1449
+ except Exception:
1450
+ file_bytes = None
1451
+ frames, timestamps_s = extract_video_frames(
1452
+ video_path,
1453
+ max_frames=max_frames,
1454
+ frame_format=frame_format,
1455
+ sampling_strategy=sampling_strategy,
1456
+ max_side=max_frame_side,
1457
+ output_dir=out_dir,
1458
+ )
1459
+ if not frames:
1460
+ raise UnsupportedFeatureError("Video frame fallback failed: no frames extracted.")
1461
+
1462
+ handler = AutoMediaHandler(enable_glyph_compression=False)
1463
+ frame_media: List[Any] = []
1464
+ max_res = None
1465
+ if isinstance(max_frame_side, int) and max_frame_side > 0:
1466
+ max_res = (max_frame_side, max_frame_side)
1467
+ for fp in frames:
1468
+ res = handler.process_file(
1469
+ fp,
1470
+ provider=self.provider,
1471
+ model=self.model,
1472
+ glyph_compression="never",
1473
+ max_resolution=max_res,
1474
+ )
1475
+ if res and getattr(res, "success", False) and getattr(res, "media_content", None) is not None:
1476
+ frame_media.append(res.media_content)
1477
+
1478
+ if not frame_media:
1479
+ raise UnsupportedFeatureError("Video frame fallback failed: extracted frames could not be processed as images.")
1480
+
1481
+ avg_gap_s = None
1482
+ try:
1483
+ if isinstance(duration_s, (int, float)) and duration_s > 0 and max_frames > 0:
1484
+ avg_gap_s = float(duration_s) / float(max_frames + 1)
1485
+ except Exception:
1486
+ avg_gap_s = None
1487
+
1488
+ self.logger.info(
1489
+ "Video input policy: frames_caption (sampling frames for downstream vision handling).",
1490
+ provider=provider_name,
1491
+ model=self.model,
1492
+ video_policy="frames_caption",
1493
+ video_index=video_group_index,
1494
+ video_name=video_path.name,
1495
+ video_duration_s=duration_s,
1496
+ video_bytes=file_bytes,
1497
+ extracted_frames=len(frame_media),
1498
+ video_max_frames=max_frames,
1499
+ video_sampling_strategy=sampling_strategy,
1500
+ video_max_frame_side=max_frame_side,
1501
+ video_avg_gap_s=avg_gap_s,
1502
+ )
1503
+ if isinstance(avg_gap_s, float) and avg_gap_s >= 10.0:
1504
+ self.logger.warning(
1505
+ "Video sampling is sparse; important events may be missed. "
1506
+ "Consider increasing video_max_frames/video.max_frames or using keyframes sampling.",
1507
+ provider=provider_name,
1508
+ model=self.model,
1509
+ video_policy="frames_caption",
1510
+ video_name=video_path.name,
1511
+ video_duration_s=duration_s,
1512
+ extracted_frames=len(frame_media),
1513
+ video_max_frames=max_frames,
1514
+ video_avg_gap_s=avg_gap_s,
1515
+ )
1516
+
1517
+ # Insert a short text marker to avoid the model treating sampled frames as
1518
+ # unrelated standalone images (especially in follow-up prompts like "and this one?").
1519
+ try:
1520
+ from ..media.types import MediaContent, ContentFormat
1521
+ except Exception:
1522
+ MediaContent = None # type: ignore[assignment]
1523
+ ContentFormat = None # type: ignore[assignment]
1524
+
1525
+ if MediaContent is not None and ContentFormat is not None:
1526
+ marker = MediaContent(
1527
+ media_type=MediaType.TEXT,
1528
+ content=(
1529
+ f"Video {video_group_index} ({video_path.name}) — "
1530
+ "the following images belong to this video in chronological order. "
1531
+ "Answer the user's question about this video as if you watched it. "
1532
+ "Do not mention frames, timestamps, sampling, extraction, or this marker."
1533
+ ),
1534
+ content_format=ContentFormat.TEXT,
1535
+ mime_type="text/plain",
1536
+ file_path=None,
1537
+ metadata={
1538
+ "processor": "VideoFrameFallback",
1539
+ "source_video": video_path.name,
1540
+ "frame_count": len(frame_media),
1541
+ "timestamps_s": timestamps_s,
1542
+ "duration_s": duration_s,
1543
+ "bytes": file_bytes,
1544
+ },
1545
+ )
1546
+ new_media.append(marker)
1547
+
1548
+ new_media.extend(frame_media)
1549
+
1550
+ if build_enrichment_item is not None:
1551
+ enrichments.append(
1552
+ build_enrichment_item(
1553
+ status="used",
1554
+ input_modality="video",
1555
+ summary_kind="frames",
1556
+ policy="frames_caption",
1557
+ backend={"kind": "unknown", "source": "ffmpeg"},
1558
+ input_index=idx + 1,
1559
+ input_name=str(video_path.name),
1560
+ artifact={
1561
+ "frame_count": len(frame_media),
1562
+ "timestamps_s": timestamps_s,
1563
+ "duration_s": duration_s,
1564
+ "bytes": file_bytes,
1565
+ },
1566
+ )
1567
+ )
1568
+
1569
+ processed_media = new_media
1570
+ if enrichments:
1571
+ if media_enrichment is None:
1572
+ media_enrichment = enrichments
1573
+ else:
1574
+ media_enrichment.extend(enrichments)
1575
+
1576
+ elif policy == "auto":
1577
+ if model_supports_native_video:
1578
+ # Use native video when available.
1579
+ pass
1580
+ else:
1581
+ # Auto fallback: sample frames and proceed with existing image pipeline.
1582
+ # This works well for vision-capable models; for text-only models it requires a vision fallback.
1583
+ policy_to_use = "frames_caption"
1584
+ kwargs["video_policy"] = policy_to_use
1585
+ # Re-run this branch once with explicit policy.
1586
+ return self.generate_with_telemetry(
1587
+ prompt=prompt,
1588
+ messages=messages,
1589
+ system_prompt=system_prompt,
1590
+ tools=tools,
1591
+ media=processed_media,
1592
+ response_model=response_model,
1593
+ retry_strategy=retry_strategy,
1594
+ tool_call_tags=tool_call_tags,
1595
+ execute_tools=execute_tools,
1596
+ stream=stream,
1597
+ **kwargs,
1598
+ )
1599
+
1600
+ else:
1601
+ raise ValueError(f"Unknown video_policy '{policy}'. Expected one of: native_only, frames_caption, auto.")
1602
+
514
1603
  # Convert tools to ToolDefinition objects first (outside retry loop)
515
1604
  converted_tools = None
516
1605
  if tools:
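A usage sketch for the audio and video input policies added above. The parameter names (audio_policy, audio_language, video_policy, video_max_frames, video_sampling_strategy) are taken from the code; create_llm(), the model, and the media file paths are illustrative, and media= is assumed to accept plain file paths:

    # Parameter names come from the policy code above; everything else is illustrative.
    from abstractcore import create_llm  # assumed factory entry point

    llm = create_llm("ollama", model="qwen2.5vl:7b")

    # Speech-to-text fallback: the audio is transcribed (requires an STT capability plugin such
    # as abstractvoice) and injected as text context; the raw audio is not sent to the model.
    resp = llm.generate(
        "What is the caller asking for?",
        media=["call.wav"],
        audio_policy="speech_to_text",
        audio_language="en",
    )

    # Frame-sampling fallback for models without native video input.
    resp = llm.generate(
        "Describe what happens in this clip.",
        media=["clip.mp4"],
        video_policy="frames_caption",
        video_max_frames=5,
        video_sampling_strategy="uniform",
    )
    print(resp.metadata)  # enrichment/transparency records for the fallback are merged in here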
@@ -545,7 +1634,10 @@ class BaseProvider(AbstractCoreInterface, ABC):
545
1634
  if not should_execute_tools and converted_tools:
546
1635
  # If tools are provided but execution is disabled,
547
1636
  # we still pass them to the provider for generation but won't execute them
548
- self.logger.info("Tool execution disabled - tools will be generated but not executed")
1637
+ self.logger.debug(
1638
+ "Provider-side tool execution disabled (expected for runtime/host tool execution); "
1639
+ "tools will be sent for generation only."
1640
+ )
549
1641
 
550
1642
  # Define generation function for retry wrapper
551
1643
  def _execute_generation():
@@ -554,7 +1646,7 @@ class BaseProvider(AbstractCoreInterface, ABC):
554
1646
 
555
1647
  # Emit generation started event (covers request received)
556
1648
  event_data = {
557
- "prompt": prompt[:100] + "..." if len(prompt) > 100 else prompt,
1649
+ "prompt": preview_text(prompt, max_chars=100),
558
1650
  "has_tools": bool(tools),
559
1651
  "stream": stream,
560
1652
  "model": self.model,
@@ -613,7 +1705,11 @@ class BaseProvider(AbstractCoreInterface, ABC):
613
1705
  ttft_ms: Optional[float] = None
614
1706
  for processed_chunk in processor.process_stream(response, converted_tools):
615
1707
  if isinstance(processed_chunk.content, str) and processed_chunk.content:
616
- processed_chunk.content = self._strip_output_wrappers(processed_chunk.content)
1708
+ processed_chunk.content = strip_output_wrappers(
1709
+ processed_chunk.content,
1710
+ architecture_format=self.architecture_config,
1711
+ model_capabilities=self.model_capabilities,
1712
+ )
617
1713
  if ttft_ms is None:
618
1714
  has_content = isinstance(processed_chunk.content, str) and bool(processed_chunk.content)
619
1715
  has_tools = isinstance(processed_chunk.tool_calls, list) and bool(processed_chunk.tool_calls)
@@ -651,9 +1747,29 @@ class BaseProvider(AbstractCoreInterface, ABC):
651
1747
  if tool_call_tags and response.content and not self._should_clean_tool_call_markup(tool_call_tags):
652
1748
  response = self._apply_non_streaming_tag_rewriting(response, tool_call_tags)
653
1749
 
654
- # Strip model-specific output wrappers (e.g. GLM <|begin_of_box|>…<|end_of_box|>).
1750
+ # Normalize provider output (wrapper tokens, Harmony transcripts, think tags).
655
1751
  if response and isinstance(response.content, str) and response.content:
656
- response.content = self._strip_output_wrappers(response.content)
1752
+ cleaned, reasoning = normalize_assistant_text(
1753
+ response.content,
1754
+ architecture_format=self.architecture_config,
1755
+ model_capabilities=self.model_capabilities,
1756
+ )
1757
+ response.content = cleaned
1758
+ if isinstance(reasoning, str) and reasoning.strip():
1759
+ if response.metadata is None or not isinstance(response.metadata, dict):
1760
+ response.metadata = {}
1761
+ existing = response.metadata.get("reasoning")
1762
+ if isinstance(existing, str) and existing.strip():
1763
+ if reasoning.strip() not in existing:
1764
+ response.metadata["reasoning"] = f"{existing.strip()}\n\n{reasoning.strip()}"
1765
+ else:
1766
+ response.metadata["reasoning"] = reasoning.strip()
1767
+
1768
+ # Attach media enrichment transparency metadata (caption/STT/etc.).
1769
+ if media_enrichment and response:
1770
+ from ..media.enrichment import merge_enrichment_metadata
1771
+
1772
+ response.metadata = merge_enrichment_metadata(response.metadata, media_enrichment)
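In practical terms, callers can read the normalized text and any extracted reasoning from the response after this step; a short sketch (the enrichment keys written by merge_enrichment_metadata live in abstractcore/media/enrichment.py and are not spelled out in this hunk):

    response = llm.generate("Think step by step: what is 17 * 23?")
    print(response.content)              # cleaned text: wrappers and think tags removed
    meta = response.metadata or {}
    if "reasoning" in meta:
        print("extracted reasoning:", meta["reasoning"][:200])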
657
1773
 
658
1774
  # Add visual token calculation if media metadata is available
659
1775
  if media_metadata and response:
@@ -689,7 +1805,7 @@ class BaseProvider(AbstractCoreInterface, ABC):
689
1805
  emit_global(EventType.ERROR, {
690
1806
  "error": str(e),
691
1807
  "error_type": type(e).__name__,
692
- "prompt": prompt[:100] + "..." if len(prompt) > 100 else prompt,
1808
+ "prompt": preview_text(prompt, max_chars=100),
693
1809
  "model": self.model,
694
1810
  "provider": self.__class__.__name__
695
1811
  }, source=self.__class__.__name__)
@@ -980,12 +2096,37 @@ class BaseProvider(AbstractCoreInterface, ABC):
980
2096
  result_kwargs["max_output_tokens"] = effective_max_output_i
981
2097
 
982
2098
  # Add unified generation parameters with fallback hierarchy: kwargs → instance → defaults
983
- result_kwargs["temperature"] = result_kwargs.get("temperature", self.temperature)
984
- if self.seed is not None:
985
- result_kwargs["seed"] = result_kwargs.get("seed", self.seed)
2099
+ temperature = result_kwargs.get("temperature", self.temperature)
2100
+ if temperature is None:
2101
+ temperature = self.temperature
2102
+ result_kwargs["temperature"] = temperature
2103
+
2104
+ seed_value = self._normalize_seed(result_kwargs.get("seed", self.seed))
2105
+ if seed_value is not None:
2106
+ result_kwargs["seed"] = seed_value
2107
+ else:
2108
+ # Do not forward seed when unset/random (None or negative sentinel like -1).
2109
+ result_kwargs.pop("seed", None)
986
2110
 
987
2111
  return result_kwargs
988
2112
 
2113
+ @staticmethod
2114
+ def _normalize_seed(seed: Any) -> Optional[int]:
2115
+ """Normalize seed semantics across providers.
2116
+
2117
+ - None or any negative value -> None (meaning: don't send a provider seed / random).
2118
+ - Booleans -> None; other numeric values -> int(seed) when int(seed) >= 0, otherwise None.
2119
+ """
2120
+ try:
2121
+ if seed is None:
2122
+ return None
2123
+ if isinstance(seed, bool):
2124
+ return None
2125
+ seed_i = int(seed)
2126
+ return seed_i if seed_i >= 0 else None
2127
+ except Exception:
2128
+ return None
2129
+
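Given the implementation above, the normalization behaves as follows (a quick reference exercising the static method exactly as defined in this hunk):

    assert BaseProvider._normalize_seed(None) is None   # unset -> omit provider seed
    assert BaseProvider._normalize_seed(-1) is None      # negative sentinel -> random / omit
    assert BaseProvider._normalize_seed(True) is None    # bools are not valid seeds
    assert BaseProvider._normalize_seed(42) == 42
    assert BaseProvider._normalize_seed("7") == 7        # int()-coercible strings are accepted
    assert BaseProvider._normalize_seed(3.9) == 3        # floats truncate via int()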
989
2130
  def _extract_generation_params(self, **kwargs) -> Dict[str, Any]:
990
2131
  """
991
2132
  Extract generation parameters with consistent fallback hierarchy.
@@ -996,10 +2137,13 @@ class BaseProvider(AbstractCoreInterface, ABC):
996
2137
  params = {}
997
2138
 
998
2139
  # Temperature (always present)
999
- params["temperature"] = kwargs.get("temperature", self.temperature)
2140
+ temperature = kwargs.get("temperature", self.temperature)
2141
+ if temperature is None:
2142
+ temperature = self.temperature
2143
+ params["temperature"] = temperature
1000
2144
 
1001
2145
  # Seed (only if not None)
1002
- seed_value = kwargs.get("seed", self.seed)
2146
+ seed_value = self._normalize_seed(kwargs.get("seed", self.seed))
1003
2147
  if seed_value is not None:
1004
2148
  params["seed"] = seed_value
1005
2149
 
@@ -1041,7 +2185,10 @@ class BaseProvider(AbstractCoreInterface, ABC):
1041
2185
 
1042
2186
  if not should_execute:
1043
2187
  # Tool execution disabled - return response with tool calls but don't execute
1044
- self.logger.info("Tool execution disabled - returning response with tool calls")
2188
+ self.logger.debug(
2189
+ "Provider-side tool execution disabled (expected for runtime/host tool execution); "
2190
+ "returning response with tool calls."
2191
+ )
1045
2192
  return response
1046
2193
 
1047
2194
  # Emit tool started event
@@ -1098,7 +2245,8 @@ class BaseProvider(AbstractCoreInterface, ABC):
1098
2245
  finish_reason=response.finish_reason,
1099
2246
  raw_response=response.raw_response,
1100
2247
  usage=response.usage,
1101
- tool_calls=response.tool_calls # Keep original format
2248
+ tool_calls=response.tool_calls, # Keep original format
2249
+ metadata=response.metadata,
1102
2250
  )
1103
2251
 
1104
2252
  def _format_tool_results(self, tool_calls: List, tool_results: List) -> str:
@@ -1106,9 +2254,7 @@ class BaseProvider(AbstractCoreInterface, ABC):
1106
2254
  results_text = "\n\nTool Results:\n"
1107
2255
  for call, result in zip(tool_calls, tool_results):
1108
2256
  # Format parameters for display (limit size)
1109
- params_str = str(call.arguments) if call.arguments else "{}"
1110
- if len(params_str) > 100:
1111
- params_str = params_str[:97] + "..."
2257
+ params_str = preview_text(str(call.arguments) if call.arguments else "{}", max_chars=100)
1112
2258
 
1113
2259
  # Show tool name and parameters for transparency
1114
2260
  results_text += f"🔧 Tool: {call.name}({params_str})\n"
@@ -1174,26 +2320,341 @@ class BaseProvider(AbstractCoreInterface, ABC):
1174
2320
  """Update HTTP client timeout if the provider has one. Override in subclasses."""
1175
2321
  pass
1176
2322
 
1177
- # Memory management methods
1178
- def unload(self) -> None:
2323
+ # Prompt cache management methods
2324
+ def supports_prompt_cache(self) -> bool:
2325
+ """Return True if this provider supports best-effort prompt caching.
2326
+
2327
+ Semantics differ by provider:
2328
+ - Remote providers (OpenAI): `prompt_cache_key` is forwarded; cache is managed server-side.
2329
+ - Local providers (MLX / llama.cpp): in-process KV/prefix caches can be retained across calls.
1179
2330
  """
1180
- Unload the model from memory.
2331
+ return False
2332
+
2333
+ # Provider-specific prompt cache backend hooks (optional)
2334
+ #
2335
+ # Providers that implement in-process KV caching (MLX, llama.cpp, etc.) can override these to enable
2336
+ # `prompt_cache_update`, `prompt_cache_fork`, and `prompt_cache_prepare_modules`.
2337
+ def _prompt_cache_backend_create(self) -> Optional[Any]:
2338
+ return None
2339
+
2340
+ def _prompt_cache_backend_clone(self, cache_value: Any) -> Optional[Any]:
2341
+ _ = cache_value
2342
+ return None
1181
2343
 
1182
- For local providers (MLX, HuggingFace), this explicitly frees model memory.
1183
- For server-based providers (Ollama, LMStudio), this requests server unload.
1184
- For API providers (OpenAI, Anthropic), this is a no-op.
2344
+ def _prompt_cache_backend_append(
2345
+ self,
2346
+ cache_value: Any,
2347
+ *,
2348
+ prompt: str = "",
2349
+ messages: Optional[List[Dict[str, Any]]] = None,
2350
+ system_prompt: Optional[str] = None,
2351
+ tools: Optional[List[Dict[str, Any]]] = None,
2352
+ add_generation_prompt: bool = False,
2353
+ **kwargs,
2354
+ ) -> bool:
2355
+ _ = (cache_value, prompt, messages, system_prompt, tools, add_generation_prompt, kwargs)
2356
+ return False
2357
+
2358
+ def _prompt_cache_backend_token_count(self, cache_value: Any) -> Optional[int]:
2359
+ _ = cache_value
2360
+ return None
2361
+
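To illustrate the contract these hooks define, a toy in-process backend could look like the sketch below. This is schematic only: real backends (e.g. the MLX provider) hold KV-cache objects rather than text, and a concrete subclass would also have to implement the other abstract methods omitted here.

    class ToyCachedProvider(BaseProvider):
        """Illustrative only: the 'cache' is just accumulated prompt text."""

        def supports_prompt_cache(self) -> bool:
            return True

        def _prompt_cache_backend_create(self):
            return {"text": ""}

        def _prompt_cache_backend_clone(self, cache_value):
            return {"text": cache_value["text"]}

        def _prompt_cache_backend_append(self, cache_value, *, prompt="", messages=None,
                                         system_prompt=None, tools=None,
                                         add_generation_prompt=False, **kwargs):
            parts = [system_prompt or "", prompt or ""]
            parts += [m.get("content", "") for m in (messages or [])]
            cache_value["text"] += "\n".join(p for p in parts if p)
            return True

        def _prompt_cache_backend_token_count(self, cache_value):
            return len(cache_value["text"].split())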
2362
+ def _normalize_prompt_cache_key(self, key: Any) -> Optional[str]:
2363
+ if not isinstance(key, str):
2364
+ return None
2365
+ key = key.strip()
2366
+ return key if key else None
2367
+
2368
+ def _apply_default_prompt_cache_key(self, kwargs: Dict[str, Any]) -> None:
2369
+ # Explicit caller override wins (even if None / empty to disable).
2370
+ if "prompt_cache_key" in kwargs:
2371
+ kwargs["prompt_cache_key"] = self._normalize_prompt_cache_key(kwargs.get("prompt_cache_key"))
2372
+ return
2373
+
2374
+ if self._default_prompt_cache_key and self.supports_prompt_cache():
2375
+ kwargs["prompt_cache_key"] = self._default_prompt_cache_key
2376
+
2377
+ def get_prompt_cache_stats(self) -> Dict[str, Any]:
2378
+ """Return basic prompt cache stats (in-process store only)."""
2379
+ stats = self._prompt_cache_store.stats()
2380
+ stats["default_key"] = self._default_prompt_cache_key
2381
+ try:
2382
+ keys = self._prompt_cache_store.keys()
2383
+ if isinstance(keys, list):
2384
+ stats["keys"] = list(keys)
2385
+ meta_by_key: Dict[str, Any] = {}
2386
+ for k in keys:
2387
+ meta = self._prompt_cache_store.meta(k)
2388
+ if isinstance(meta, dict) and meta:
2389
+ meta_by_key[str(k)] = dict(meta)
2390
+ if meta_by_key:
2391
+ stats["meta_by_key"] = meta_by_key
2392
+ except Exception:
2393
+ pass
2394
+ return stats
1185
2395
 
1186
- After calling unload(), the provider instance should not be used for generation.
1187
- Create a new provider instance if you need to generate again.
2396
+ def prompt_cache_set(self, key: str, *, make_default: bool = True, **kwargs) -> bool:
2397
+ """Set the default prompt cache key for this provider instance.
1188
2398
 
1189
- Usage:
1190
- provider = create_llm("mlx", model="...")
1191
- response = provider.generate("Hello")
1192
- provider.unload() # Free memory
1193
- del provider # Remove reference
2399
+ Provider-specific cache allocation/warming is implemented by subclasses when applicable.
2400
+ """
2401
+ normalized = self._normalize_prompt_cache_key(key)
2402
+ if normalized is None:
2403
+ return False
2404
+ if not self.supports_prompt_cache():
2405
+ return False
2406
+ _ = kwargs
2407
+ # Best-effort: allocate backend cache if the provider supports it.
2408
+ if self._prompt_cache_store.get(normalized) is None:
2409
+ created = self._prompt_cache_backend_create()
2410
+ if created is not None:
2411
+ try:
2412
+ self._prompt_cache_store.set(normalized, created, meta={"backend": "provider"})
2413
+ except Exception:
2414
+ pass
2415
+ if make_default:
2416
+ self._default_prompt_cache_key = normalized
2417
+ return True
2418
+
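Taken together with `_apply_default_prompt_cache_key()` above, the intended calling pattern looks roughly like this (provider/model are placeholders; whether the key has any effect depends on `supports_prompt_cache()`):

    llm = create_llm("mlx", model="...")          # placeholder, as in the old usage example
    if llm.supports_prompt_cache():
        llm.prompt_cache_set("session-42")         # becomes the default prompt_cache_key
        llm.generate("First question ...")          # default key is injected into kwargs
        llm.generate("Follow-up ...", prompt_cache_key="session-42")  # or pass it explicitly
        print(llm.get_prompt_cache_stats())         # includes default_key, keys, meta_by_key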
2419
+ def prompt_cache_update(
2420
+ self,
2421
+ key: str,
2422
+ *,
2423
+ prompt: str = "",
2424
+ messages: Optional[List[Dict[str, Any]]] = None,
2425
+ system_prompt: Optional[str] = None,
2426
+ tools: Optional[List[Dict[str, Any]]] = None,
2427
+ add_generation_prompt: bool = False,
2428
+ ttl_s: Optional[float] = None,
2429
+ **kwargs,
2430
+ ) -> bool:
2431
+ """Append new prompt context into an existing cache key (best-effort).
2432
+
2433
+ Semantics:
2434
+ - Local runtimes can implement true KV prefill updates (append-only).
2435
+ - Remote providers typically cannot be “pre-filled” explicitly; they may ignore this.
2436
+
2437
+ Arguments are intentionally similar to `generate()` so higher-level code can reuse its own
2438
+ prompt/module construction logic.
2439
+ """
2440
+ normalized = self._normalize_prompt_cache_key(key)
2441
+ if normalized is None:
2442
+ return False
2443
+ if not self.supports_prompt_cache():
2444
+ return False
2445
+
2446
+ # Ensure the cache exists if the provider can allocate a backend cache object.
2447
+ cache_value = self._prompt_cache_store.get(normalized)
2448
+ if cache_value is None:
2449
+ if not self.prompt_cache_set(normalized, make_default=False):
2450
+ return False
2451
+ cache_value = self._prompt_cache_store.get(normalized)
2452
+ if cache_value is None:
2453
+ return False
2454
+
2455
+ ok = self._prompt_cache_backend_append(
2456
+ cache_value,
2457
+ prompt=str(prompt or ""),
2458
+ messages=messages,
2459
+ system_prompt=system_prompt,
2460
+ tools=tools,
2461
+ add_generation_prompt=bool(add_generation_prompt),
2462
+ **kwargs,
2463
+ )
2464
+ if not ok:
2465
+ return False
2466
+
2467
+ # Update TTL/metadata best-effort.
2468
+ if ttl_s is not None:
2469
+ try:
2470
+ meta = self._prompt_cache_store.meta(normalized) or {}
2471
+ self._prompt_cache_store.set(normalized, cache_value, ttl_s=ttl_s, meta=meta)
2472
+ except Exception:
2473
+ pass
2474
+ return True
2475
+
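A hedged sketch of warming a cache with shared context before the first call; this only does real prefill on providers whose backend hooks implement it, and simply returns False elsewhere (e.g. remote OpenAI-style providers, where caching stays server-side):

    warmed = llm.prompt_cache_update(
        "session-42",
        system_prompt="You are a meticulous code reviewer.",
        messages=[{"role": "user", "content": "Here is the repository overview ..."}],
        ttl_s=600,   # keep the warmed prefix around for ten minutes
    )
    if not warmed:
        pass  # fall back to plain, uncached generation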
2476
+ def prompt_cache_fork(
2477
+ self,
2478
+ from_key: str,
2479
+ to_key: str,
2480
+ *,
2481
+ make_default: bool = False,
2482
+ ttl_s: Optional[float] = None,
2483
+ **kwargs,
2484
+ ) -> bool:
2485
+ """Create a new cache key by cloning another cache (best-effort).
2486
+
2487
+ This is the primitive needed for hierarchical/module caches:
2488
+ - build stable shared prefixes (persona, memory blueprints, tool schemas)
2489
+ - fork them into per-session caches that can be appended/mutated safely.
2490
+ """
2491
+ _ = kwargs
2492
+ src = self._normalize_prompt_cache_key(from_key)
2493
+ dst = self._normalize_prompt_cache_key(to_key)
2494
+ if src is None or dst is None:
2495
+ return False
2496
+ if not self.supports_prompt_cache():
2497
+ return False
2498
+
2499
+ src_value = self._prompt_cache_store.get(src)
2500
+ if src_value is None:
2501
+ return False
2502
+
2503
+ cloned = self._prompt_cache_backend_clone(src_value)
2504
+ if cloned is None:
2505
+ return False
2506
+
2507
+ try:
2508
+ meta = self._prompt_cache_store.meta(src) or {}
2509
+ meta = dict(meta)
2510
+ meta.setdefault("forked_from", src)
2511
+ self._prompt_cache_store.set(dst, cloned, ttl_s=ttl_s, meta=meta)
2512
+ except Exception:
2513
+ return False
2514
+
2515
+ if make_default:
2516
+ self._default_prompt_cache_key = dst
2517
+ return True
2518
+
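For example, a stable shared prefix can be built once and then forked per session (key names and PERSONA_PROMPT are illustrative placeholders):

    llm.prompt_cache_set("prefix:persona", make_default=False)
    llm.prompt_cache_update("prefix:persona", system_prompt=PERSONA_PROMPT)
    llm.prompt_cache_fork("prefix:persona", "session:alice", make_default=True)
    llm.prompt_cache_fork("prefix:persona", "session:bob")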
2519
+ def prompt_cache_prepare_modules(
2520
+ self,
2521
+ *,
2522
+ namespace: str,
2523
+ modules: List[Union[PromptCacheModule, Dict[str, Any]]],
2524
+ make_default: bool = False,
2525
+ ttl_s: Optional[float] = None,
2526
+ version: int = 1,
2527
+ ) -> Dict[str, Any]:
2528
+ """Ensure hierarchical prefix caches exist for an ordered module list (best-effort).
2529
+
2530
+ This builds immutable prefix caches (by derived keys) so callers can:
2531
+ - reuse stable sub-prefixes (persona, memory blueprints, etc.)
2532
+ - fork the final prefix into a per-session cache for incremental chat
2533
+
2534
+ Returns a JSON-serializable dict containing per-module derived keys.
2535
+ """
2536
+ ns = str(namespace or "").strip()
2537
+ if not ns:
2538
+ return {"supported": False, "error": "namespace required"}
2539
+ if not self.supports_prompt_cache():
2540
+ return {"supported": False, "error": "provider does not support prompt caching"}
2541
+
2542
+ normalized_modules: List[PromptCacheModule] = []
2543
+ for m in modules or []:
2544
+ if isinstance(m, PromptCacheModule):
2545
+ normalized_modules.append(m.normalized())
2546
+ elif isinstance(m, dict):
2547
+ try:
2548
+ normalized_modules.append(PromptCacheModule(**m).normalized())
2549
+ except Exception:
2550
+ continue
2551
+
2552
+ if not normalized_modules:
2553
+ return {"supported": False, "error": "no modules provided"}
2554
+
2555
+ # Derive deterministic prefix keys per module boundary.
2556
+ prefix_hash = hashlib.sha256(f"acore-prompt-cache:{int(version)}".encode("utf-8")).hexdigest()
2557
+ derived: List[Dict[str, Any]] = []
2558
+ keys: List[str] = []
2559
+ for mod in normalized_modules:
2560
+ prefix_hash = hashlib.sha256((prefix_hash + mod.fingerprint(version=version)).encode("utf-8")).hexdigest()
2561
+ key = f"{ns}:{prefix_hash[:16]}"
2562
+ keys.append(key)
2563
+ derived.append({"module_id": mod.module_id, "cache_key": key, "module_hash": mod.fingerprint(version=version)})
2564
+
2565
+ # Find the longest existing prefix cache.
2566
+ start_idx = -1
2567
+ for i, key in enumerate(keys):
2568
+ if self._prompt_cache_store.get(key) is None:
2569
+ break
2570
+ start_idx = i
2571
+
2572
+ # Start from existing prefix (clone to avoid mutating the stored snapshot).
2573
+ current_cache: Optional[Any] = None
2574
+ if start_idx >= 0:
2575
+ existing = self._prompt_cache_store.get(keys[start_idx])
2576
+ if existing is not None:
2577
+ current_cache = self._prompt_cache_backend_clone(existing) or None
2578
+
2579
+ # If we have no starting cache, start from an empty backend cache.
2580
+ if current_cache is None:
2581
+ current_cache = self._prompt_cache_backend_create()
2582
+ if current_cache is None:
2583
+ return {"supported": False, "error": "provider does not implement in-process cache backend"}
2584
+
2585
+ # Build missing caches.
2586
+ for j in range(start_idx + 1, len(keys)):
2587
+ mod = normalized_modules[j]
2588
+ ok = self._prompt_cache_backend_append(
2589
+ current_cache,
2590
+ prompt=str(mod.prompt or ""),
2591
+ messages=mod.messages,
2592
+ system_prompt=mod.system_prompt,
2593
+ tools=mod.tools,
2594
+ add_generation_prompt=bool(mod.add_generation_prompt),
2595
+ )
2596
+ if not ok:
2597
+ return {"supported": False, "error": f"failed to append module '{mod.module_id}'"}
2598
+
2599
+ snapshot = self._prompt_cache_backend_clone(current_cache) or None
2600
+ if snapshot is None:
2601
+ return {"supported": False, "error": "provider does not support cache cloning"}
2602
+
2603
+ meta = {
2604
+ "namespace": ns,
2605
+ "module_id": mod.module_id,
2606
+ "module_hash": mod.fingerprint(version=version),
2607
+ "index": j,
2608
+ "backend": "provider",
2609
+ }
2610
+ tok = self._prompt_cache_backend_token_count(snapshot)
2611
+ if isinstance(tok, int) and tok >= 0:
2612
+ meta["token_count"] = tok
2613
+
2614
+ self._prompt_cache_store.set(keys[j], snapshot, ttl_s=ttl_s, meta=meta)
2615
+
2616
+ if make_default:
2617
+ self._default_prompt_cache_key = keys[-1]
2618
+
2619
+ return {
2620
+ "supported": True,
2621
+ "namespace": ns,
2622
+ "version": int(version),
2623
+ "modules": derived,
2624
+ "final_cache_key": keys[-1],
2625
+ }
2626
+
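A sketch of the module-based flow: module dicts are passed through PromptCacheModule(**m), so the exact accepted fields are defined by that dataclass; the ones used here mirror the attributes referenced above, and TOOL_SCHEMAS / KNOWLEDGE_SNIPPET are placeholders.

    plan = llm.prompt_cache_prepare_modules(
        namespace="support-bot",
        modules=[
            {"module_id": "persona", "system_prompt": "You are a support agent ..."},
            {"module_id": "tools", "tools": TOOL_SCHEMAS},
            {"module_id": "kb", "prompt": KNOWLEDGE_SNIPPET},
        ],
    )
    if plan.get("supported"):
        # Fork the immutable final prefix into a mutable per-session cache.
        llm.prompt_cache_fork(plan["final_cache_key"], "session:123", make_default=True)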
2627
+ def prompt_cache_clear(self, key: Optional[str] = None) -> bool:
2628
+ """Clear prompt caches for this provider instance (best-effort)."""
2629
+ normalized = self._normalize_prompt_cache_key(key) if key is not None else None
2630
+ if not self.supports_prompt_cache():
2631
+ return False
2632
+
2633
+ if normalized is None:
2634
+ self._default_prompt_cache_key = None
2635
+ self._prompt_cache_store.clear()
2636
+ return True
2637
+
2638
+ cleared = self._prompt_cache_store.delete(normalized)
2639
+ if self._default_prompt_cache_key == normalized:
2640
+ self._default_prompt_cache_key = None
2641
+ return cleared
2642
+
2643
+ # Memory management methods
2644
+ @abstractmethod
2645
+ def unload_model(self, model_name: str) -> None:
2646
+ """
2647
+ Unload/cleanup resources for a specific model.
2648
+
2649
+ This is the single canonical unload entrypoint across providers.
2650
+ Providers must implement this as a best-effort cleanup hook:
2651
+
2652
+ - In-process providers (e.g. MLX, HuggingFace): free local model resources.
2653
+ - Some self-hosted servers (e.g. Ollama): may request server-side eviction/unload.
2654
+ - OpenAI-compatible servers (e.g. LMStudio, vLLM, openai-compatible): typically only close client
2655
+ connections; server-side model unloading may not be available and is controlled by the server (TTL/eviction).
2656
+ - Cloud APIs (e.g. OpenAI, Anthropic): usually a no-op (safe to call).
1194
2657
  """
1195
- # Default implementation does nothing (suitable for API providers)
1196
- pass
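Because `unload_model()` is now abstract, every provider ships its own implementation. A minimal cloud-API style override would be a no-op along these lines (a sketch, not the actual OpenAI provider code):

    def unload_model(self, model_name: str) -> None:
        # Nothing to free for hosted APIs; kept as a safe no-op so callers can
        # invoke unload_model() uniformly across providers.
        self.logger.debug(f"unload_model({model_name!r}) is a no-op for this provider")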
1197
2658
 
1198
2659
  # Token configuration helpers - expose interface methods for user convenience
1199
2660
  def get_token_configuration_summary(self) -> str:
@@ -1202,7 +2663,19 @@ class BaseProvider(AbstractCoreInterface, ABC):
1202
2663
 
1203
2664
  def validate_token_constraints(self) -> List[str]:
1204
2665
  """Validate token configuration and return warnings/suggestions"""
1205
- return super().validate_token_constraints()
2666
+ warnings_list = super().validate_token_constraints()
2667
+
2668
+ # Embedding models are not text-generative: output token limits are irrelevant and can
2669
+ # legitimately be 0 (e.g. Nomic Embed). Suppress misleading output-token warnings.
2670
+ try:
2671
+ caps = getattr(self, "model_capabilities", None)
2672
+ model_type = caps.get("model_type") if isinstance(caps, dict) else None
2673
+ if isinstance(model_type, str) and model_type.strip().lower() == "embedding":
2674
+ warnings_list = [w for w in warnings_list if "max_output_tokens" not in str(w)]
2675
+ except Exception:
2676
+ pass
2677
+
2678
+ return warnings_list
1206
2679
 
1207
2680
  def calculate_token_budget(self, input_text: str, desired_output_tokens: int,
1208
2681
  safety_margin: float = 0.1) -> tuple[int, List[str]]:
@@ -1239,7 +2712,7 @@ class BaseProvider(AbstractCoreInterface, ABC):
1239
2712
  except ImportError as e:
1240
2713
  raise ImportError(
1241
2714
  f"Media processing requires additional dependencies. "
1242
- f"Install with: pip install abstractcore[media]. Error: {e}"
2715
+ f"Install with: pip install \"abstractcore[media]\". Error: {e}"
1243
2716
  )
1244
2717
 
1245
2718
  processed_media = []
@@ -1506,45 +2979,6 @@ class BaseProvider(AbstractCoreInterface, ABC):
1506
2979
  # Return original response if rewriting fails
1507
2980
  return response
1508
2981
 
1509
- def _strip_output_wrappers(self, content: str) -> str:
1510
- """Strip known model-specific wrapper tokens around assistant output.
1511
-
1512
- Some model/server combinations emit wrapper tokens like:
1513
- <|begin_of_box|> ... <|end_of_box|>
1514
- We remove these only when they appear as leading/trailing wrappers (not when
1515
- embedded mid-text).
1516
- """
1517
- if not isinstance(content, str) or not content:
1518
- return content
1519
-
1520
- wrappers: Dict[str, str] = {}
1521
- for src in (self.architecture_config, self.model_capabilities):
1522
- if not isinstance(src, dict):
1523
- continue
1524
- w = src.get("output_wrappers")
1525
- if not isinstance(w, dict):
1526
- continue
1527
- start = w.get("start")
1528
- end = w.get("end")
1529
- if isinstance(start, str) and start.strip():
1530
- wrappers.setdefault("start", start.strip())
1531
- if isinstance(end, str) and end.strip():
1532
- wrappers.setdefault("end", end.strip())
1533
-
1534
- if not wrappers:
1535
- return content
1536
-
1537
- out = content
1538
- start_token = wrappers.get("start")
1539
- end_token = wrappers.get("end")
1540
-
1541
- if isinstance(start_token, str) and start_token:
1542
- out = re.sub(r"^\s*" + re.escape(start_token) + r"\s*", "", out, count=1)
1543
- if isinstance(end_token, str) and end_token:
1544
- out = re.sub(r"\s*" + re.escape(end_token) + r"\s*$", "", out, count=1)
1545
-
1546
- return out
1547
-
1548
2982
  def _normalize_tool_calls_passthrough(
1549
2983
  self,
1550
2984
  *,
@@ -2014,6 +3448,7 @@ Please provide a structured response."""
2014
3448
  Returns:
2015
3449
  GenerateResponse, AsyncIterator[GenerateResponse] for streaming, or BaseModel for structured output
2016
3450
  """
3451
+ self._apply_default_prompt_cache_key(kwargs)
2017
3452
  response = await self._agenerate_internal(
2018
3453
  prompt, messages, system_prompt, tools, media, stream, **kwargs
2019
3454
  )