abstractcore-2.9.1-py3-none-any.whl → abstractcore-2.11.2-py3-none-any.whl

This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (83)
  1. abstractcore/__init__.py +7 -27
  2. abstractcore/apps/extractor.py +33 -100
  3. abstractcore/apps/intent.py +19 -0
  4. abstractcore/apps/judge.py +20 -1
  5. abstractcore/apps/summarizer.py +20 -1
  6. abstractcore/architectures/detection.py +34 -1
  7. abstractcore/architectures/response_postprocessing.py +313 -0
  8. abstractcore/assets/architecture_formats.json +38 -8
  9. abstractcore/assets/model_capabilities.json +781 -160
  10. abstractcore/compression/__init__.py +1 -2
  11. abstractcore/compression/glyph_processor.py +6 -4
  12. abstractcore/config/main.py +31 -19
  13. abstractcore/config/manager.py +389 -11
  14. abstractcore/config/vision_config.py +5 -5
  15. abstractcore/core/interface.py +151 -3
  16. abstractcore/core/session.py +16 -10
  17. abstractcore/download.py +1 -1
  18. abstractcore/embeddings/manager.py +20 -6
  19. abstractcore/endpoint/__init__.py +2 -0
  20. abstractcore/endpoint/app.py +458 -0
  21. abstractcore/mcp/client.py +3 -1
  22. abstractcore/media/__init__.py +52 -17
  23. abstractcore/media/auto_handler.py +42 -22
  24. abstractcore/media/base.py +44 -1
  25. abstractcore/media/capabilities.py +12 -33
  26. abstractcore/media/enrichment.py +105 -0
  27. abstractcore/media/handlers/anthropic_handler.py +19 -28
  28. abstractcore/media/handlers/local_handler.py +124 -70
  29. abstractcore/media/handlers/openai_handler.py +19 -31
  30. abstractcore/media/processors/__init__.py +4 -2
  31. abstractcore/media/processors/audio_processor.py +57 -0
  32. abstractcore/media/processors/office_processor.py +8 -3
  33. abstractcore/media/processors/pdf_processor.py +46 -3
  34. abstractcore/media/processors/text_processor.py +22 -24
  35. abstractcore/media/processors/video_processor.py +58 -0
  36. abstractcore/media/types.py +97 -4
  37. abstractcore/media/utils/image_scaler.py +20 -2
  38. abstractcore/media/utils/video_frames.py +219 -0
  39. abstractcore/media/vision_fallback.py +136 -22
  40. abstractcore/processing/__init__.py +32 -3
  41. abstractcore/processing/basic_deepsearch.py +15 -10
  42. abstractcore/processing/basic_intent.py +3 -2
  43. abstractcore/processing/basic_judge.py +3 -2
  44. abstractcore/processing/basic_summarizer.py +1 -1
  45. abstractcore/providers/__init__.py +3 -1
  46. abstractcore/providers/anthropic_provider.py +95 -8
  47. abstractcore/providers/base.py +1516 -81
  48. abstractcore/providers/huggingface_provider.py +546 -69
  49. abstractcore/providers/lmstudio_provider.py +35 -923
  50. abstractcore/providers/mlx_provider.py +382 -35
  51. abstractcore/providers/model_capabilities.py +5 -1
  52. abstractcore/providers/ollama_provider.py +99 -15
  53. abstractcore/providers/openai_compatible_provider.py +406 -180
  54. abstractcore/providers/openai_provider.py +188 -44
  55. abstractcore/providers/openrouter_provider.py +76 -0
  56. abstractcore/providers/registry.py +61 -5
  57. abstractcore/providers/streaming.py +138 -33
  58. abstractcore/providers/vllm_provider.py +92 -817
  59. abstractcore/server/app.py +461 -13
  60. abstractcore/server/audio_endpoints.py +139 -0
  61. abstractcore/server/vision_endpoints.py +1319 -0
  62. abstractcore/structured/handler.py +316 -41
  63. abstractcore/tools/common_tools.py +5501 -2012
  64. abstractcore/tools/comms_tools.py +1641 -0
  65. abstractcore/tools/core.py +37 -7
  66. abstractcore/tools/handler.py +4 -9
  67. abstractcore/tools/parser.py +49 -2
  68. abstractcore/tools/tag_rewriter.py +2 -1
  69. abstractcore/tools/telegram_tdlib.py +407 -0
  70. abstractcore/tools/telegram_tools.py +261 -0
  71. abstractcore/utils/cli.py +1085 -72
  72. abstractcore/utils/token_utils.py +2 -0
  73. abstractcore/utils/truncation.py +29 -0
  74. abstractcore/utils/version.py +3 -4
  75. abstractcore/utils/vlm_token_calculator.py +12 -2
  76. abstractcore-2.11.2.dist-info/METADATA +562 -0
  77. abstractcore-2.11.2.dist-info/RECORD +133 -0
  78. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/WHEEL +1 -1
  79. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/entry_points.txt +1 -0
  80. abstractcore-2.9.1.dist-info/METADATA +0 -1190
  81. abstractcore-2.9.1.dist-info/RECORD +0 -119
  82. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/licenses/LICENSE +0 -0
  83. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/top_level.txt +0 -0
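The hunks that follow all come from abstractcore/providers/base.py (entry 47 above, +1516 -81). Together they add a unified thinking= control, an in-process prompt cache, audio/video input policies, and seed/temperature normalization to BaseProvider. A minimal sketch of how the new request-level knobs are meant to be reached from a concrete provider; create_llm() and the provider/model names are assumptions not shown in this diff, while thinking= and prompt_cache_key are the parameters added below:

    # Sketch only: create_llm() and the provider/model names are illustrative assumptions;
    # thinking= and prompt_cache_key are the new knobs wired into BaseProvider in this diff.
    from abstractcore import create_llm  # assumed factory entry point

    llm = create_llm("ollama", model="qwen3:4b")
    resp = llm.generate(
        "Summarize the attached report in three bullet points.",
        thinking="low",                 # None/"auto", "on"/"off", or "low"/"medium"/"high"
        prompt_cache_key="session-42",  # OpenAI pass-through or local KV/prefix cache reuse
    )
    print(resp.content)                            # normalized answer text
    print((resp.metadata or {}).get("reasoning"))  # extracted reasoning trace, when present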
@@ -9,8 +9,10 @@ import warnings
9
9
  import json
10
10
  import re
11
11
  import socket
12
- from collections import deque
13
- from typing import List, Dict, Any, Optional, Union, Iterator, AsyncIterator, Type, TYPE_CHECKING
12
+ import hashlib
13
+ from collections import deque, OrderedDict
14
+ from dataclasses import dataclass, field
15
+ from typing import List, Dict, Any, Optional, Union, Iterator, AsyncIterator, Type, TYPE_CHECKING, Tuple
14
16
  from abc import ABC, abstractmethod
15
17
 
16
18
  try:
@@ -26,14 +28,20 @@ from ..events import EventType, Event
26
28
  from datetime import datetime
27
29
  from ..utils.structured_logging import get_logger
28
30
  from ..utils.jsonish import loads_dict_like
31
+ from ..utils.truncation import preview_text
29
32
  from ..exceptions import (
30
33
  ProviderAPIError,
31
34
  AuthenticationError,
32
35
  RateLimitError,
33
36
  InvalidRequestError,
37
+ UnsupportedFeatureError,
34
38
  ModelNotFoundError
35
39
  )
36
40
  from ..architectures import detect_architecture, get_architecture_format, get_model_capabilities
41
+ from ..architectures.response_postprocessing import (
42
+ normalize_assistant_text,
43
+ strip_output_wrappers,
44
+ )
37
45
  from ..tools import execute_tools
38
46
  from ..core.retry import RetryManager, RetryConfig
39
47
 
@@ -42,6 +50,178 @@ if TYPE_CHECKING: # pragma: no cover
42
50
  from ..media.types import MediaContent
43
51
 
44
52
 
53
+ @dataclass
54
+ class _PromptCacheEntry:
55
+ value: Any
56
+ created_at_s: float
57
+ last_accessed_at_s: float
58
+ ttl_s: Optional[float] = None
59
+ meta: Dict[str, Any] = field(default_factory=dict)
60
+
61
+
62
+ class PromptCacheStore:
63
+ """Best-effort in-process prompt cache store (LRU + optional TTL).
64
+
65
+ Providers can store arbitrary backend-specific cache objects keyed by a caller-provided string
66
+ (`prompt_cache_key`). This is primarily useful for local inference backends (MLX, llama.cpp).
67
+
68
+ Notes:
69
+ - This store is intentionally simple and in-process only.
70
+ - Callers should treat prompt caches as potentially sensitive (they contain user prompt state).
71
+ """
72
+
73
+ def __init__(self, *, max_entries: int = 32, default_ttl_s: Optional[float] = None):
74
+ self._max_entries = int(max_entries) if max_entries and int(max_entries) > 0 else 32
75
+ self._default_ttl_s = default_ttl_s if default_ttl_s is None else float(default_ttl_s)
76
+ self._entries: "OrderedDict[str, _PromptCacheEntry]" = OrderedDict()
77
+
78
+ def _is_expired(self, entry: _PromptCacheEntry) -> bool:
79
+ ttl_s = entry.ttl_s if entry.ttl_s is not None else self._default_ttl_s
80
+ if ttl_s is None:
81
+ return False
82
+ return (time.time() - entry.last_accessed_at_s) > float(ttl_s)
83
+
84
+ def get(self, key: str) -> Optional[Any]:
85
+ if not isinstance(key, str) or not key.strip():
86
+ return None
87
+ key = key.strip()
88
+ entry = self._entries.get(key)
89
+ if entry is None:
90
+ return None
91
+ if self._is_expired(entry):
92
+ self.delete(key)
93
+ return None
94
+ entry.last_accessed_at_s = time.time()
95
+ self._entries.move_to_end(key)
96
+ return entry.value
97
+
98
+ def set(
99
+ self,
100
+ key: str,
101
+ value: Any,
102
+ *,
103
+ ttl_s: Optional[float] = None,
104
+ meta: Optional[Dict[str, Any]] = None,
105
+ ) -> None:
106
+ if not isinstance(key, str) or not key.strip():
107
+ raise ValueError("prompt cache key must be a non-empty string")
108
+ key = key.strip()
109
+ now = time.time()
110
+ self._entries[key] = _PromptCacheEntry(
111
+ value=value,
112
+ created_at_s=now,
113
+ last_accessed_at_s=now,
114
+ ttl_s=ttl_s,
115
+ meta=dict(meta or {}),
116
+ )
117
+ self._entries.move_to_end(key)
118
+ while len(self._entries) > self._max_entries:
119
+ self._entries.popitem(last=False)
120
+
121
+ def delete(self, key: str) -> bool:
122
+ if not isinstance(key, str) or not key.strip():
123
+ return False
124
+ key = key.strip()
125
+ return self._entries.pop(key, None) is not None
126
+
127
+ def clear(self) -> None:
128
+ self._entries.clear()
129
+
130
+ def stats(self) -> Dict[str, Any]:
131
+ # Opportunistically purge expired entries.
132
+ expired = []
133
+ for k, v in self._entries.items():
134
+ if self._is_expired(v):
135
+ expired.append(k)
136
+ for k in expired:
137
+ self.delete(k)
138
+
139
+ return {
140
+ "entries": len(self._entries),
141
+ "max_entries": self._max_entries,
142
+ "default_ttl_s": self._default_ttl_s,
143
+ }
144
+
145
+ def keys(self) -> List[str]:
146
+ return list(self._entries.keys())
147
+
148
+ def meta(self, key: str) -> Optional[Dict[str, Any]]:
149
+ if not isinstance(key, str) or not key.strip():
150
+ return None
151
+ entry = self._entries.get(key.strip())
152
+ if entry is None:
153
+ return None
154
+ return dict(entry.meta or {})
155
+
156
+
157
+ @dataclass(frozen=True)
158
+ class PromptCacheModule:
159
+ """A single cacheable module of prompt context.
160
+
161
+ This is intentionally generic and JSON-serializable so higher-level layers (runtime/agent/memory)
162
+ can express cache intent without hard-coding provider-specific prompt formats.
163
+ """
164
+
165
+ module_id: str
166
+ system_prompt: Optional[str] = None
167
+ prompt: Optional[str] = None
168
+ messages: Optional[List[Dict[str, Any]]] = None
169
+ tools: Optional[List[Dict[str, Any]]] = None
170
+ add_generation_prompt: bool = False
171
+ scope: str = "private" # "private" | "shared" (advisory; enforcement is host-dependent)
172
+ meta: Dict[str, Any] = field(default_factory=dict)
173
+
174
+ def normalized(self) -> "PromptCacheModule":
175
+ module_id = str(self.module_id or "").strip()
176
+ system_prompt = str(self.system_prompt).strip() if isinstance(self.system_prompt, str) and self.system_prompt else None
177
+ prompt = str(self.prompt).strip() if isinstance(self.prompt, str) and self.prompt else None
178
+ messages = None
179
+ if isinstance(self.messages, list) and self.messages:
180
+ out: List[Dict[str, Any]] = []
181
+ for m in self.messages:
182
+ if isinstance(m, dict):
183
+ out.append(dict(m))
184
+ messages = out or None
185
+ tools = None
186
+ if isinstance(self.tools, list) and self.tools:
187
+ out_tools: List[Dict[str, Any]] = []
188
+ for t in self.tools:
189
+ if isinstance(t, dict):
190
+ out_tools.append(dict(t))
191
+ tools = out_tools or None
192
+ add_generation_prompt = bool(self.add_generation_prompt)
193
+ scope = str(self.scope or "private").strip().lower() or "private"
194
+ if scope not in {"private", "shared"}:
195
+ scope = "private"
196
+ meta = dict(self.meta or {})
197
+ return PromptCacheModule(
198
+ module_id=module_id,
199
+ system_prompt=system_prompt,
200
+ prompt=prompt,
201
+ messages=messages,
202
+ tools=tools,
203
+ add_generation_prompt=add_generation_prompt,
204
+ scope=scope,
205
+ meta=meta,
206
+ )
207
+
208
+ def fingerprint(self, *, version: int = 1) -> str:
209
+ """Stable module fingerprint for hierarchical cache keys (hex sha256)."""
210
+ mod = self.normalized()
211
+ payload = {
212
+ "v": int(version),
213
+ "module_id": mod.module_id,
214
+ "system_prompt": mod.system_prompt,
215
+ "prompt": mod.prompt,
216
+ "messages": mod.messages,
217
+ "tools": mod.tools,
218
+ "add_generation_prompt": bool(mod.add_generation_prompt),
219
+ "scope": mod.scope,
220
+ }
221
+ raw = json.dumps(payload, sort_keys=True, ensure_ascii=False, separators=(",", ":"))
222
+ return hashlib.sha256(raw.encode("utf-8")).hexdigest()
223
+
224
+
45
225
  class BaseProvider(AbstractCoreInterface, ABC):
46
226
  """
47
227
  Base provider class with integrated telemetry and events.
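A minimal sketch of the two prompt-cache primitives added above: PromptCacheStore holds arbitrary backend objects under caller-chosen keys (LRU eviction, optional TTL), and PromptCacheModule.fingerprint() yields a stable sha256 key suitable for hierarchical caching. The cached values here are illustrative:

    # Both classes are defined above in this file; the cached values are illustrative.
    from abstractcore.providers.base import PromptCacheModule, PromptCacheStore

    store = PromptCacheStore(max_entries=8, default_ttl_s=600)
    store.set("session-42", {"kv": "opaque backend state"}, meta={"backend": "mlx"})
    assert store.get("session-42") is not None   # access refreshes LRU order and the TTL clock
    print(store.stats())                         # {'entries': 1, 'max_entries': 8, 'default_ttl_s': 600.0}

    module = PromptCacheModule(
        module_id="system-core",
        system_prompt="You are a helpful assistant.",
        scope="shared",
    )
    key = module.fingerprint()                   # stable hex sha256 over the normalized payload
    store.set(key, {"prefix_tokens": []})        # backend-specific payload goes here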
@@ -60,6 +240,7 @@ class BaseProvider(AbstractCoreInterface, ABC):
60
240
  self.architecture_config = get_architecture_format(self.architecture)
61
241
  self.model_capabilities = get_model_capabilities(model)
62
242
 
243
+ # #[WARNING:TIMEOUT]
63
244
  # Setup timeout configuration (centralized defaults).
64
245
  #
65
246
  # Semantics:
@@ -136,12 +317,33 @@ class BaseProvider(AbstractCoreInterface, ABC):
136
317
  self.enable_tracing = kwargs.get('enable_tracing', False)
137
318
  self._traces = deque(maxlen=kwargs.get('max_traces', 100)) # Ring buffer for memory efficiency
138
319
 
320
+ # Prompt caching (best-effort; provider-specific behavior).
321
+ #
322
+ # - Remote providers (OpenAI): supports `prompt_cache_key` pass-through (server-managed caching).
323
+ # - Local runtimes (MLX / llama.cpp): can store KV/prefix caches in-process keyed by `prompt_cache_key`.
324
+ self._default_prompt_cache_key: Optional[str] = None
325
+ prompt_cache_max_entries = kwargs.get("prompt_cache_max_entries", kwargs.get("prompt_cache_max_items", 32))
326
+ prompt_cache_ttl_s = kwargs.get("prompt_cache_ttl_s", None)
327
+ self._prompt_cache_store = PromptCacheStore(
328
+ max_entries=int(prompt_cache_max_entries) if prompt_cache_max_entries is not None else 32,
329
+ default_ttl_s=prompt_cache_ttl_s,
330
+ )
331
+
139
332
  # Provider created successfully - no event emission needed
140
333
  # (The simplified event system focuses on generation and tool events only)
141
334
 
142
335
  # Set default token limits if not provided
143
336
  self._initialize_token_limits()
144
337
 
338
+ def __init_subclass__(cls, **kwargs): # pragma: no cover
339
+ super().__init_subclass__(**kwargs)
340
+ # Enforce a single unload path: providers must implement `unload_model()` and must not define `unload()`.
341
+ if "unload" in cls.__dict__:
342
+ raise TypeError(
343
+ f"{cls.__name__} defines unload(). "
344
+ "Providers must implement unload_model(model_name) and must not provide any other unload entrypoint."
345
+ )
346
+
145
347
  def _track_generation(self, prompt: str, response: Optional[GenerateResponse],
146
348
  start_time: float, success: bool = True,
147
349
  error: Optional[Exception] = None, stream: bool = False):
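The __init_subclass__ hook above makes the single-unload-path rule fail at class-definition time rather than at call time. A sketch of what it enforces; the subclass names are hypothetical:

    # Hypothetical subclasses showing the rule enforced by __init_subclass__ above.
    from abstractcore.providers.base import BaseProvider

    class GoodProvider(BaseProvider):
        def unload_model(self, model_name: str) -> None:
            ...  # the single supported unload entrypoint

    try:
        class BadProvider(BaseProvider):
            def unload(self) -> None:  # forbidden alias
                ...
    except TypeError as exc:
        print(exc)  # "... must implement unload_model(model_name) and must not provide any other unload entrypoint."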
@@ -174,7 +376,7 @@ class BaseProvider(AbstractCoreInterface, ABC):
174
376
 
175
377
  # Emit comprehensive event with all data in one dict
176
378
  event_data = {
177
- "prompt": prompt[:100] + "..." if len(prompt) > 100 else prompt,
379
+ "prompt": preview_text(prompt, max_chars=100),
178
380
  "success": success,
179
381
  "error": str(error) if error else None,
180
382
  "response_length": len(response.content) if response and response.content else 0,
@@ -222,7 +424,7 @@ class BaseProvider(AbstractCoreInterface, ABC):
222
424
  event_data = {
223
425
  "tool_name": tool_name,
224
426
  "arguments": arguments,
225
- "result": str(result)[:100] if result else None,
427
+ "result": preview_text(result, max_chars=100) if result else None,
226
428
  "error": str(error) if error else None,
227
429
  "success": success
228
430
  }
@@ -268,9 +470,11 @@ class BaseProvider(AbstractCoreInterface, ABC):
268
470
 
269
471
  # Extract generation parameters
270
472
  temperature = kwargs.get('temperature', self.temperature)
473
+ if temperature is None:
474
+ temperature = self.temperature
271
475
  max_tokens = kwargs.get('max_tokens', self.max_tokens)
272
476
  max_output_tokens = kwargs.get('max_output_tokens', self.max_output_tokens)
273
- seed = kwargs.get('seed', self.seed)
477
+ seed = self._normalize_seed(kwargs.get('seed', self.seed))
274
478
  top_p = kwargs.get('top_p', getattr(self, 'top_p', None))
275
479
  top_k = kwargs.get('top_k', getattr(self, 'top_k', None))
276
480
 
@@ -393,7 +597,10 @@ class BaseProvider(AbstractCoreInterface, ABC):
393
597
  if _looks_like_timeout(error) and not _has_explicit_duration(msg):
394
598
  t = _configured_timeout_s()
395
599
  if t is not None:
396
- return ProviderAPIError(f"{_provider_label()} API error: timed out after {t}s")
600
+ return ProviderAPIError(
601
+ f"{_provider_label()} API error: timed out after {t}s "
602
+ "(configured timeout; set timeout=None or default_timeout=0 for unlimited)"
603
+ )
397
604
  return ProviderAPIError(f"{_provider_label()} API error: timed out")
398
605
  return error
399
606
 
@@ -404,7 +611,10 @@ class BaseProvider(AbstractCoreInterface, ABC):
404
611
  if _looks_like_timeout(error):
405
612
  t = _configured_timeout_s()
406
613
  if t is not None:
407
- return ProviderAPIError(f"{_provider_label()} API error: timed out after {t}s")
614
+ return ProviderAPIError(
615
+ f"{_provider_label()} API error: timed out after {t}s "
616
+ "(configured timeout; set timeout=None or default_timeout=0 for unlimited)"
617
+ )
408
618
  return ProviderAPIError(f"{_provider_label()} API error: timed out")
409
619
 
410
620
  error_str = str(error).lower()
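The clarified timeout message above points at the two ways to lift the limit. A sketch under the assumption that the timeout setting is accepted as a provider keyword argument; this diff only shows the error text, not where the value is configured:

    # Assumption: timeout/default_timeout are provider configuration values as the message implies.
    from abstractcore import create_llm  # assumed factory, as in the sketch after the file list

    llm = create_llm("ollama", model="qwen3:4b", timeout=None)  # no client-side limit
    llm = create_llm("ollama", model="qwen3:4b", timeout=600)   # or raise it to 10 minutes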
@@ -418,6 +628,233 @@ class BaseProvider(AbstractCoreInterface, ABC):
418
628
  else:
419
629
  return ProviderAPIError(f"API error: {error}")
420
630
 
631
+ @staticmethod
632
+ def _normalize_thinking_request(thinking: Optional[Union[bool, str]]) -> Tuple[Optional[bool], Optional[str]]:
633
+ """Normalize `thinking=` into (enabled, level).
634
+
635
+ - enabled: True/False/None (None == "auto")
636
+ - level: Optional[str] in {"low","medium","high"} when requested
637
+ """
638
+ if thinking is None:
639
+ return None, None
640
+
641
+ if isinstance(thinking, bool):
642
+ return thinking, None
643
+
644
+ if isinstance(thinking, str):
645
+ s = thinking.strip().lower()
646
+ if not s or s == "auto":
647
+ return None, None
648
+ if s in {"on", "true", "yes"}:
649
+ return True, None
650
+ if s in {"off", "false", "no"}:
651
+ return False, None
652
+ if s in {"low", "medium", "high"}:
653
+ return True, s
654
+
655
+ raise ValueError('thinking must be one of: None, bool, "auto", "on", "off", "low", "medium", "high"')
656
+
657
+ def _model_reasoning_levels(self) -> List[str]:
658
+ levels = None
659
+ for src in (self.model_capabilities, self.architecture_config):
660
+ if not isinstance(src, dict):
661
+ continue
662
+ value = src.get("reasoning_levels")
663
+ if isinstance(value, list) and value:
664
+ levels = value
665
+ break
666
+ if not isinstance(levels, list):
667
+ return []
668
+ out: List[str] = []
669
+ for x in levels:
670
+ if isinstance(x, str) and x.strip():
671
+ out.append(x.strip().lower())
672
+ # Deduplicate while preserving order.
673
+ seen: set[str] = set()
674
+ uniq: List[str] = []
675
+ for x in out:
676
+ if x in seen:
677
+ continue
678
+ seen.add(x)
679
+ uniq.append(x)
680
+ return uniq
681
+
682
+ def _model_supports_thinking_control(self) -> bool:
683
+ caps = self.model_capabilities if isinstance(self.model_capabilities, dict) else {}
684
+ arch = self.architecture_config if isinstance(self.architecture_config, dict) else {}
685
+
686
+ if caps.get("thinking_support") is True:
687
+ return True
688
+ if isinstance(caps.get("thinking_tags"), (list, tuple)) and len(caps.get("thinking_tags")) == 2:
689
+ return True
690
+ if isinstance(caps.get("thinking_output_field"), str) and caps.get("thinking_output_field").strip():
691
+ return True
692
+ if self._model_reasoning_levels():
693
+ return True
694
+
695
+ if isinstance(arch.get("thinking_tags"), (list, tuple)) and len(arch.get("thinking_tags")) == 2:
696
+ return True
697
+ if isinstance(arch.get("thinking_control"), str) and arch.get("thinking_control").strip():
698
+ return True
699
+ if arch.get("reasoning_support") is True:
700
+ return True
701
+ if isinstance(arch.get("reasoning_levels"), list) and arch.get("reasoning_levels"):
702
+ return True
703
+
704
+ return False
705
+
706
+ def _apply_thinking_request(
707
+ self,
708
+ *,
709
+ thinking: Optional[Union[bool, str]],
710
+ prompt: str,
711
+ messages: Optional[List[Dict[str, str]]],
712
+ system_prompt: Optional[str],
713
+ kwargs: Dict[str, Any],
714
+ ) -> Tuple[str, Optional[List[Dict[str, str]]], Optional[str], Dict[str, Any]]:
715
+ """Apply unified thinking controls to the request."""
716
+ enabled, level = self._normalize_thinking_request(thinking)
717
+ if enabled is None and level is None:
718
+ return prompt, messages, system_prompt, kwargs
719
+
720
+ supports_control = self._model_supports_thinking_control()
721
+ reasoning_levels = self._model_reasoning_levels()
722
+
723
+ if level is not None and reasoning_levels and level not in reasoning_levels:
724
+ warnings.warn(
725
+ f"thinking level '{level}' requested but not supported for model '{self.model}' "
726
+ f"(supported: {reasoning_levels}); falling back to thinking='on'.",
727
+ RuntimeWarning,
728
+ stacklevel=3,
729
+ )
730
+ level = None
731
+ enabled = True
732
+
733
+ if level is not None and not reasoning_levels:
734
+ warnings.warn(
735
+ f"thinking level '{level}' requested but model '{self.model}' has no configured reasoning_levels; "
736
+ "falling back to thinking='on'.",
737
+ RuntimeWarning,
738
+ stacklevel=3,
739
+ )
740
+ level = None
741
+ enabled = True
742
+
743
+ handled_by_model_prompt = False
744
+
745
+ # Harmony (GPT-OSS): control via system message `Reasoning: low|medium|high`.
746
+ msg_fmt = str((self.architecture_config or {}).get("message_format") or "").strip().lower()
747
+ resp_fmt = str((self.model_capabilities or {}).get("response_format") or "").strip().lower()
748
+ is_harmony = msg_fmt == "harmony" or resp_fmt == "harmony"
749
+ if is_harmony:
750
+ target_level: Optional[str] = None
751
+ if level is not None:
752
+ target_level = level
753
+ elif enabled is False:
754
+ warnings.warn(
755
+ f"thinking='off' requested for Harmony model '{self.model}', but GPT-OSS reasoning traces "
756
+ "cannot be fully disabled; using Reasoning: low.",
757
+ RuntimeWarning,
758
+ stacklevel=3,
759
+ )
760
+ target_level = "low"
761
+ elif enabled is True:
762
+ # Make the default explicit when the caller opts-in.
763
+ target_level = "medium"
764
+
765
+ if target_level:
766
+ line = f"Reasoning: {target_level}"
767
+ if isinstance(system_prompt, str) and system_prompt.strip():
768
+ # Replace any existing Reasoning line; otherwise prepend.
769
+ if re.search(r"(?mi)^\\s*Reasoning\\s*:\\s*(low|medium|high)\\s*$", system_prompt):
770
+ system_prompt = re.sub(
771
+ r"(?mi)^\\s*Reasoning\\s*:\\s*(low|medium|high)\\s*$",
772
+ line,
773
+ system_prompt,
774
+ count=1,
775
+ )
776
+ else:
777
+ system_prompt = f"{line}\n{system_prompt}"
778
+ else:
779
+ system_prompt = line
780
+ handled_by_model_prompt = True
781
+
782
+ # Model-level control token for disabling thinking (e.g., GLM `/nothink`).
783
+ thinking_control = None
784
+ for src in (self.model_capabilities, self.architecture_config):
785
+ if not isinstance(src, dict):
786
+ continue
787
+ token = src.get("thinking_control")
788
+ if isinstance(token, str) and token.strip():
789
+ thinking_control = token.strip()
790
+
791
+ if enabled is False and thinking_control:
792
+ handled_by_model_prompt = True
793
+
794
+ def _append_control(text: str) -> str:
795
+ if thinking_control in text:
796
+ return text
797
+ return f"{text.rstrip()}\n{thinking_control}".strip()
798
+
799
+ if isinstance(prompt, str) and prompt.strip():
800
+ prompt = _append_control(prompt)
801
+ elif isinstance(messages, list) and messages:
802
+ # Append to the most recent user turn, if possible.
803
+ new_messages: List[Dict[str, str]] = []
804
+ appended = False
805
+ for m in messages:
806
+ if not isinstance(m, dict):
807
+ continue
808
+ new_messages.append(dict(m))
809
+ for m in reversed(new_messages):
810
+ if m.get("role") == "user" and isinstance(m.get("content"), str) and m["content"].strip():
811
+ m["content"] = _append_control(m["content"])
812
+ appended = True
813
+ break
814
+ messages = new_messages
815
+ if not appended:
816
+ warnings.warn(
817
+ f"thinking='off' requested for model '{self.model}', but no user prompt was available "
818
+ f"to append thinking_control='{thinking_control}'.",
819
+ RuntimeWarning,
820
+ stacklevel=3,
821
+ )
822
+
823
+ kwargs, handled_by_provider = self._apply_provider_thinking_kwargs(
824
+ enabled=enabled,
825
+ level=level,
826
+ kwargs=kwargs,
827
+ )
828
+
829
+ if not supports_control and thinking is not None:
830
+ warnings.warn(
831
+ f"thinking={thinking!r} requested but model '{self.model}' is not marked as thinking-capable "
832
+ "in model_capabilities.json; the request may be ignored.",
833
+ RuntimeWarning,
834
+ stacklevel=3,
835
+ )
836
+
837
+ if not handled_by_model_prompt and not handled_by_provider and (enabled is False or level is not None):
838
+ warnings.warn(
839
+ f"thinking={thinking!r} requested but provider '{self.provider or self.__class__.__name__}' "
840
+ "does not implement a thinking control mapping for this model; the request may be ignored.",
841
+ RuntimeWarning,
842
+ stacklevel=3,
843
+ )
844
+
845
+ return prompt, messages, system_prompt, kwargs
846
+
847
+ def _apply_provider_thinking_kwargs(
848
+ self,
849
+ *,
850
+ enabled: Optional[bool],
851
+ level: Optional[str],
852
+ kwargs: Dict[str, Any],
853
+ ) -> Tuple[Dict[str, Any], bool]:
854
+ """Provider-specific thinking knob hook (default: unsupported)."""
855
+ _ = (enabled, level)
856
+ return kwargs, False
857
+
421
858
  def generate_with_telemetry(self,
422
859
  prompt: str,
423
860
  messages: Optional[List[Dict[str, str]]] = None,
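The mapping implemented by _normalize_thinking_request() above, shown on a few representative inputs; the expected tuples follow directly from the code:

    # The (enabled, level) pairs below follow directly from the implementation above.
    from abstractcore.providers.base import BaseProvider

    norm = BaseProvider._normalize_thinking_request
    assert norm(None) == (None, None)      # "auto": leave the decision to the model/provider
    assert norm("auto") == (None, None)
    assert norm(True) == (True, None)
    assert norm("off") == (False, None)
    assert norm("HIGH") == (True, "high")  # case-insensitive; a level implies thinking is on
    # Any other value (e.g. "maybe", 3) raises ValueError.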
@@ -430,6 +867,7 @@ class BaseProvider(AbstractCoreInterface, ABC):
430
867
  tool_call_tags: Optional[str] = None, # Tool call tag rewriting
431
868
  execute_tools: Optional[bool] = None, # Tool execution control
432
869
  glyph_compression: Optional[str] = None, # Glyph compression preference
870
+ thinking: Optional[Union[bool, str]] = None, # Unified reasoning/thinking control
433
871
  **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse], BaseModel]:
434
872
  """
435
873
  Generate with integrated telemetry and error handling.
@@ -447,6 +885,7 @@ class BaseProvider(AbstractCoreInterface, ABC):
447
885
  tool_call_tags: Optional tool call tag format for rewriting
448
886
  execute_tools: Whether to execute tools automatically (True) or let agent handle execution (False)
449
887
  glyph_compression: Glyph compression preference ("auto", "always", "never")
888
+ thinking: Unified reasoning/thinking control (auto/on/off or low/medium/high when supported)
450
889
  """
451
890
  # Normalize token limit naming at the provider boundary.
452
891
  #
@@ -458,6 +897,18 @@ class BaseProvider(AbstractCoreInterface, ABC):
458
897
  if "max_output_tokens" not in kwargs and "max_tokens" in kwargs and kwargs.get("max_tokens") is not None:
459
898
  kwargs["max_output_tokens"] = kwargs.pop("max_tokens")
460
899
 
900
+ # Prompt caching: apply a default `prompt_cache_key` if configured.
901
+ self._apply_default_prompt_cache_key(kwargs)
902
+
903
+ # Apply unified thinking controls (provider-agnostic + provider-specific mappings).
904
+ prompt, messages, system_prompt, kwargs = self._apply_thinking_request(
905
+ thinking=thinking,
906
+ prompt=prompt,
907
+ messages=messages,
908
+ system_prompt=system_prompt,
909
+ kwargs=kwargs,
910
+ )
911
+
461
912
  # Handle structured output request
462
913
  if response_model is not None:
463
914
  if not PYDANTIC_AVAILABLE:
@@ -466,8 +917,12 @@ class BaseProvider(AbstractCoreInterface, ABC):
466
917
  "Install with: pip install pydantic>=2.0.0"
467
918
  )
468
919
 
469
- # Handle hybrid case: tools + structured output
470
- if tools is not None:
920
+ # Handle hybrid case: tools + structured output.
921
+ #
922
+ # NOTE: `tools=[]` should behave like "no tools". Treating an empty list as
923
+ # "tools present" triggers the hybrid 2-pass flow (unstructured call + structured
924
+ # follow-up) which is both slower and can cause provider-side timeouts/unloads.
925
+ if isinstance(tools, list) and len(tools) > 0:
471
926
  return self._handle_tools_with_structured_output(
472
927
  prompt=prompt,
473
928
  messages=messages,
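A small sketch of the distinction drawn in the comment above: an empty tools list now takes the plain structured-output path, while a non-empty list still triggers the hybrid two-pass flow. The Pydantic model, the toy tool, and the llm handle are illustrative, not from this diff:

    # Illustrative only: Verdict, search_tool, and llm are assumptions for the sketch.
    from pydantic import BaseModel

    class Verdict(BaseModel):
        label: str
        confidence: float

    def search_tool(query: str) -> str:
        """Toy tool used only to trigger the hybrid flow."""
        return "no results"

    # tools=[] is now treated as "no tools": single structured-output pass.
    v1 = llm.generate("Classify this ticket.", response_model=Verdict, tools=[])

    # A non-empty tool list still routes through the hybrid two-pass flow
    # (unstructured tool call first, structured follow-up second).
    v2 = llm.generate("Classify this ticket.", response_model=Verdict, tools=[search_tool])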
@@ -500,6 +955,7 @@ class BaseProvider(AbstractCoreInterface, ABC):
500
955
  # Process media content if provided
501
956
  processed_media = None
502
957
  media_metadata = None
958
+ media_enrichment = None
503
959
  if media:
504
960
  compression_pref = glyph_compression or kwargs.get('glyph_compression', 'auto')
505
961
  processed_media = self._process_media_content(media, compression_pref)
@@ -511,6 +967,639 @@ class BaseProvider(AbstractCoreInterface, ABC):
511
967
  if hasattr(media_content, 'metadata') and media_content.metadata:
512
968
  media_metadata.append(media_content.metadata)
513
969
 
970
+ # Audio input policy (v0): avoid placeholder degradation and require explicit fallbacks.
971
+ if processed_media:
972
+ try:
973
+ from ..media.types import ContentFormat, MediaType
974
+ from ..media.enrichment import build_enrichment_item
975
+ from ..capabilities.errors import CapabilityUnavailableError
976
+ except Exception:
977
+ ContentFormat = None # type: ignore[assignment]
978
+ MediaType = None # type: ignore[assignment]
979
+ build_enrichment_item = None # type: ignore[assignment]
980
+ CapabilityUnavailableError = Exception # type: ignore[assignment]
981
+
982
+ if MediaType is not None:
983
+ audio_items = [mc for mc in processed_media if getattr(mc, "media_type", None) == MediaType.AUDIO]
984
+ else:
985
+ audio_items = []
986
+
987
+ if audio_items:
988
+ # Resolve policy: per-call kwarg > config default.
989
+ policy_raw = kwargs.pop("audio_policy", None)
990
+ if policy_raw is None:
991
+ policy_raw = kwargs.pop("audio_handling_policy", None)
992
+ if policy_raw is None:
993
+ try:
994
+ from ..config.manager import get_config_manager
995
+
996
+ policy_raw = getattr(get_config_manager().config, "audio", None).strategy # type: ignore[union-attr]
997
+ except Exception:
998
+ policy_raw = "native_only"
999
+
1000
+ policy = str(policy_raw or "native_only").strip().lower()
1001
+ model_supports_audio = bool(getattr(self, "model_capabilities", {}).get("audio_support", False))
1002
+
1003
+ if policy in ("native_only", "native", "disabled"):
1004
+ if not model_supports_audio:
1005
+ raise UnsupportedFeatureError(
1006
+ f"Audio input is not supported by model '{self.model}'. "
1007
+ "Choose an audio-capable model, or pass audio_policy='speech_to_text' "
1008
+ "(requires an STT capability plugin, e.g. install abstractvoice)."
1009
+ )
1010
+ # Keep audio media for provider-native handling (provider support may still vary).
1011
+
1012
+ elif policy in ("speech_to_text", "stt"):
1013
+ stt_language = kwargs.pop("audio_language", None)
1014
+ if stt_language is None:
1015
+ stt_language = kwargs.pop("stt_language", None)
1016
+ if stt_language is None:
1017
+ try:
1018
+ from ..config.manager import get_config_manager
1019
+
1020
+ stt_language = getattr(get_config_manager().config, "audio", None).stt_language # type: ignore[union-attr]
1021
+ except Exception:
1022
+ stt_language = None
1023
+
1024
+ audio_context_parts: List[str] = []
1025
+ enrichments: List[Dict[str, Any]] = []
1026
+
1027
+ # Resolve backend id (best-effort) for transparency metadata.
1028
+ backend_id = getattr(getattr(self, "audio", None), "backend_id", None)
1029
+ backend = {"kind": "plugin"}
1030
+ if isinstance(backend_id, str) and backend_id.strip():
1031
+ backend["backend_id"] = backend_id.strip()
1032
+
1033
+ for idx, mc in enumerate(audio_items):
1034
+ name = None
1035
+ try:
1036
+ name = mc.metadata.get("file_name") if hasattr(mc, "metadata") and isinstance(mc.metadata, dict) else None
1037
+ except Exception:
1038
+ name = None
1039
+ if not isinstance(name, str) or not name.strip():
1040
+ name = mc.file_path if getattr(mc, "file_path", None) else f"audio_{idx+1}"
1041
+
1042
+ # Prefer a file path when available.
1043
+ audio_input: Any = None
1044
+ try:
1045
+ if getattr(mc, "file_path", None):
1046
+ audio_input = str(mc.file_path)
1047
+ elif getattr(mc, "content_format", None) == ContentFormat.FILE_PATH and isinstance(getattr(mc, "content", None), str):
1048
+ audio_input = str(mc.content)
1049
+ elif isinstance(getattr(mc, "content", None), (bytes, bytearray)):
1050
+ audio_input = bytes(mc.content)
1051
+ except Exception:
1052
+ audio_input = None
1053
+
1054
+ if audio_input is None:
1055
+ raise UnsupportedFeatureError("Audio STT fallback requires a file path or raw bytes for the audio input.")
1056
+
1057
+ try:
1058
+ transcript = self.audio.transcribe(audio_input, language=stt_language)
1059
+ except CapabilityUnavailableError as e: # type: ignore[misc]
1060
+ raise UnsupportedFeatureError(str(e))
1061
+
1062
+ transcript = str(transcript or "").strip()
1063
+ audio_context_parts.append(f"Audio {idx+1} ({name}): {transcript}")
1064
+ if build_enrichment_item is not None:
1065
+ enrichments.append(
1066
+ build_enrichment_item(
1067
+ status="used",
1068
+ input_modality="audio",
1069
+ summary_kind="transcript",
1070
+ policy="speech_to_text",
1071
+ backend=backend,
1072
+ input_index=idx + 1,
1073
+ input_name=str(name),
1074
+ injected_text=transcript,
1075
+ )
1076
+ )
1077
+
1078
+ # Remove audio media from the provider call (we injected text context instead).
1079
+ processed_media = [mc for mc in processed_media if getattr(mc, "media_type", None) != MediaType.AUDIO]
1080
+
1081
+ # Inject audio context into the prompt (similar recency semantics as vision fallback).
1082
+ original_prompt = prompt.strip() if isinstance(prompt, str) else ""
1083
+ parts: List[str] = []
1084
+ parts.append(
1085
+ "Audio context from attached audio file(s) "
1086
+ "(treat as directly observed; do not mention this section):"
1087
+ )
1088
+ parts.extend(audio_context_parts)
1089
+ if original_prompt:
1090
+ parts.append("Now answer the user's request:")
1091
+ parts.append(original_prompt)
1092
+ prompt = "\n\n".join(parts) if parts else original_prompt
1093
+
1094
+ media_enrichment = enrichments
1095
+
1096
+ elif policy == "auto":
1097
+ if model_supports_audio:
1098
+ pass # provider-native path
1099
+ else:
1100
+ # Explicit "auto" allows fallback, but never silently for default policy.
1101
+ # Re-enter through the explicit STT path by recursion is risky; inline minimal.
1102
+ stt_language = kwargs.pop("audio_language", None) or kwargs.pop("stt_language", None)
1103
+ audio_context_parts: List[str] = []
1104
+ enrichments: List[Dict[str, Any]] = []
1105
+ backend_id = getattr(getattr(self, "audio", None), "backend_id", None)
1106
+ backend = {"kind": "plugin"}
1107
+ if isinstance(backend_id, str) and backend_id.strip():
1108
+ backend["backend_id"] = backend_id.strip()
1109
+ for idx, mc in enumerate(audio_items):
1110
+ name = None
1111
+ try:
1112
+ name = mc.metadata.get("file_name") if hasattr(mc, "metadata") and isinstance(mc.metadata, dict) else None
1113
+ except Exception:
1114
+ name = None
1115
+ if not isinstance(name, str) or not name.strip():
1116
+ name = mc.file_path if getattr(mc, "file_path", None) else f"audio_{idx+1}"
1117
+ audio_input: Any = None
1118
+ try:
1119
+ if getattr(mc, "file_path", None):
1120
+ audio_input = str(mc.file_path)
1121
+ elif getattr(mc, "content_format", None) == ContentFormat.FILE_PATH and isinstance(getattr(mc, "content", None), str):
1122
+ audio_input = str(mc.content)
1123
+ elif isinstance(getattr(mc, "content", None), (bytes, bytearray)):
1124
+ audio_input = bytes(mc.content)
1125
+ except Exception:
1126
+ audio_input = None
1127
+ if audio_input is None:
1128
+ raise UnsupportedFeatureError("Audio STT fallback requires a file path or raw bytes for the audio input.")
1129
+ try:
1130
+ transcript = self.audio.transcribe(audio_input, language=stt_language)
1131
+ except CapabilityUnavailableError as e: # type: ignore[misc]
1132
+ raise UnsupportedFeatureError(str(e))
1133
+ transcript = str(transcript or "").strip()
1134
+ audio_context_parts.append(f"Audio {idx+1} ({name}): {transcript}")
1135
+ if build_enrichment_item is not None:
1136
+ enrichments.append(
1137
+ build_enrichment_item(
1138
+ status="used",
1139
+ input_modality="audio",
1140
+ summary_kind="transcript",
1141
+ policy="auto",
1142
+ backend=backend,
1143
+ input_index=idx + 1,
1144
+ input_name=str(name),
1145
+ injected_text=transcript,
1146
+ )
1147
+ )
1148
+ processed_media = [mc for mc in processed_media if getattr(mc, "media_type", None) != MediaType.AUDIO]
1149
+ original_prompt = prompt.strip() if isinstance(prompt, str) else ""
1150
+ parts: List[str] = []
1151
+ parts.append(
1152
+ "Audio context from attached audio file(s) "
1153
+ "(treat as directly observed; do not mention this section):"
1154
+ )
1155
+ parts.extend(audio_context_parts)
1156
+ if original_prompt:
1157
+ parts.append("Now answer the user's request:")
1158
+ parts.append(original_prompt)
1159
+ prompt = "\n\n".join(parts) if parts else original_prompt
1160
+ media_enrichment = enrichments
1161
+
1162
+ elif policy == "caption":
1163
+ raise UnsupportedFeatureError(
1164
+ "audio_policy='caption' is not configured in v0. "
1165
+ "Use audio_policy='speech_to_text' for speech, or configure a future audio caption backend."
1166
+ )
1167
+ else:
1168
+ raise ValueError(f"Unknown audio_policy '{policy}'. Expected one of: native_only, speech_to_text, auto, caption.")
1169
+
1170
+ # Video input policy (v0): allow native video where supported; otherwise fall back to sampled frames.
1171
+ # Note: most providers do not accept native video inputs; frame sampling provides a portable path.
1172
+ if processed_media:
1173
+ try:
1174
+ from ..media.types import MediaType
1175
+ from ..media.enrichment import build_enrichment_item
1176
+ except Exception:
1177
+ MediaType = None # type: ignore[assignment]
1178
+ build_enrichment_item = None # type: ignore[assignment]
1179
+
1180
+ if MediaType is not None:
1181
+ video_items = [mc for mc in processed_media if getattr(mc, "media_type", None) == MediaType.VIDEO]
1182
+ else:
1183
+ video_items = []
1184
+
1185
+ if video_items:
1186
+ policy_raw = kwargs.pop("video_policy", None)
1187
+ if policy_raw is None:
1188
+ policy_raw = kwargs.pop("video_handling_policy", None)
1189
+ if policy_raw is None:
1190
+ try:
1191
+ from ..config.manager import get_config_manager
1192
+
1193
+ policy_raw = getattr(get_config_manager().config, "video", None).strategy # type: ignore[union-attr]
1194
+ except Exception:
1195
+ policy_raw = "native_only"
1196
+
1197
+ policy = str(policy_raw or "native_only").strip().lower()
1198
+
1199
+ provider_name = str(getattr(self, "provider", "") or "").strip().lower()
1200
+ model_supports_native_video = bool(
1201
+ provider_name == "huggingface"
1202
+ and isinstance(getattr(self, "model_capabilities", None), dict)
1203
+ and getattr(self, "model_capabilities", {}).get("video_support", False)
1204
+ )
1205
+
1206
+ cfg_video = None
1207
+ try:
1208
+ from ..config.manager import get_config_manager
1209
+
1210
+ cfg_video = getattr(get_config_manager().config, "video", None)
1211
+ except Exception:
1212
+ cfg_video = None
1213
+
1214
+ # Sampling controls (best-effort; keep small by default).
1215
+ # NOTE: do not `pop` here: native video backends may also need the resolved values.
1216
+ max_frames_raw = kwargs.get("video_max_frames", None)
1217
+ if max_frames_raw is None:
1218
+ max_frames_raw = kwargs.get("max_video_frames", None)
1219
+ if max_frames_raw is None:
1220
+ fallback_default = getattr(cfg_video, "max_frames", 3) if cfg_video is not None else 3
1221
+ native_default = getattr(cfg_video, "max_frames_native", None) if cfg_video is not None else None
1222
+ if native_default is None:
1223
+ native_default = fallback_default
1224
+
1225
+ use_native_default = bool(
1226
+ model_supports_native_video and policy in ("native_only", "native", "disabled", "auto")
1227
+ )
1228
+ max_frames_raw = native_default if use_native_default else fallback_default
1229
+ try:
1230
+ max_frames = max(1, int(max_frames_raw))
1231
+ except Exception:
1232
+ max_frames = 3
1233
+
1234
+ frame_format_raw = kwargs.get("video_frame_format", None)
1235
+ if frame_format_raw is None:
1236
+ try:
1237
+ from ..config.manager import get_config_manager
1238
+
1239
+ frame_format_raw = getattr(get_config_manager().config, "video", None).frame_format # type: ignore[union-attr]
1240
+ except Exception:
1241
+ frame_format_raw = "jpg"
1242
+ frame_format = str(frame_format_raw or "jpg").strip().lower()
1243
+ if frame_format not in {"jpg", "jpeg", "png"}:
1244
+ frame_format = "jpg"
1245
+ if frame_format == "jpeg":
1246
+ frame_format = "jpg"
1247
+
1248
+ sampling_strategy_raw = kwargs.get("video_sampling_strategy", None)
1249
+ if sampling_strategy_raw is None:
1250
+ try:
1251
+ from ..config.manager import get_config_manager
1252
+
1253
+ sampling_strategy_raw = getattr(get_config_manager().config, "video", None).sampling_strategy # type: ignore[union-attr]
1254
+ except Exception:
1255
+ sampling_strategy_raw = "uniform"
1256
+ sampling_strategy = str(sampling_strategy_raw or "uniform").strip().lower()
1257
+ if sampling_strategy not in {"uniform", "keyframes"}:
1258
+ sampling_strategy = "uniform"
1259
+
1260
+ max_frame_side_raw = kwargs.get("video_max_frame_side", None)
1261
+ if max_frame_side_raw is None:
1262
+ max_frame_side_raw = kwargs.get("video_frame_max_side", None)
1263
+ if max_frame_side_raw is None:
1264
+ max_frame_side_raw = getattr(cfg_video, "max_frame_side", 1024) if cfg_video is not None else 1024
1265
+ try:
1266
+ max_frame_side = int(max_frame_side_raw) if max_frame_side_raw is not None else None
1267
+ except Exception:
1268
+ max_frame_side = 1024
1269
+ if isinstance(max_frame_side, int) and max_frame_side <= 0:
1270
+ max_frame_side = None
1271
+
1272
+ # Expose normalized sampling values to provider-native implementations.
1273
+ kwargs["video_max_frames"] = max_frames
1274
+ kwargs["video_frame_format"] = frame_format
1275
+ kwargs["video_sampling_strategy"] = sampling_strategy
1276
+ kwargs["video_max_frame_side"] = max_frame_side
1277
+
1278
+ if policy in ("native_only", "native", "disabled"):
1279
+ if not model_supports_native_video:
1280
+ raise UnsupportedFeatureError(
1281
+ f"Video input is not supported by model '{self.model}'. "
1282
+ "Choose a video-capable model, or pass video_policy='frames_caption' "
1283
+ "(samples frames and uses vision/image handling)."
1284
+ )
1285
+ # Keep video media for provider-native handling.
1286
+ try:
1287
+ from pathlib import Path
1288
+
1289
+ from ..media.utils.video_frames import probe_duration_s
1290
+
1291
+ for idx, mc in enumerate(video_items):
1292
+ video_path_raw = getattr(mc, "file_path", None) or getattr(mc, "content", None)
1293
+ if not isinstance(video_path_raw, str) or not video_path_raw.strip():
1294
+ continue
1295
+ vp = Path(video_path_raw)
1296
+ duration_s = probe_duration_s(vp)
1297
+ file_bytes = None
1298
+ try:
1299
+ file_bytes = int(vp.stat().st_size)
1300
+ except Exception:
1301
+ file_bytes = None
1302
+
1303
+ avg_gap_s = None
1304
+ try:
1305
+ if isinstance(duration_s, (int, float)) and duration_s > 0 and max_frames > 0:
1306
+ avg_gap_s = float(duration_s) / float(max_frames + 1)
1307
+ except Exception:
1308
+ avg_gap_s = None
1309
+
1310
+ self.logger.info(
1311
+ "Video input policy: native video enabled (video will be sampled/budgeted for model input).",
1312
+ provider=provider_name,
1313
+ model=self.model,
1314
+ video_policy=policy,
1315
+ video_index=idx + 1,
1316
+ video_name=vp.name,
1317
+ video_duration_s=duration_s,
1318
+ video_bytes=file_bytes,
1319
+ video_max_frames=max_frames,
1320
+ video_sampling_strategy=sampling_strategy,
1321
+ video_max_frame_side=max_frame_side,
1322
+ video_avg_gap_s=avg_gap_s,
1323
+ )
1324
+ if isinstance(avg_gap_s, float) and avg_gap_s >= 10.0:
1325
+ self.logger.warning(
1326
+ "Video sampling is sparse; important events may be missed. "
1327
+ "Consider increasing video_max_frames/video.max_frames_native or using keyframes sampling.",
1328
+ provider=provider_name,
1329
+ model=self.model,
1330
+ video_policy=policy,
1331
+ video_name=vp.name,
1332
+ video_duration_s=duration_s,
1333
+ video_max_frames=max_frames,
1334
+ video_avg_gap_s=avg_gap_s,
1335
+ )
1336
+ except Exception:
1337
+ pass
1338
+
1339
+ # Insert a short marker to disambiguate native-video inputs across turns.
1340
+ #
1341
+ # Without this, follow-ups like "and this one?" can be brittle for native
1342
+ # video VLMs (they may over-weight the previous text-only answer and ignore
1343
+ # that a *new* video is attached in the current call).
1344
+ try:
1345
+ from ..media.types import MediaContent, ContentFormat
1346
+ except Exception:
1347
+ MediaContent = None # type: ignore[assignment]
1348
+ ContentFormat = None # type: ignore[assignment]
1349
+
1350
+ if MediaContent is not None and ContentFormat is not None:
1351
+ try:
1352
+ from pathlib import Path
1353
+
1354
+ from ..media.utils.video_frames import probe_duration_s
1355
+ except Exception:
1356
+ Path = None # type: ignore[assignment]
1357
+ probe_duration_s = None # type: ignore[assignment]
1358
+
1359
+ new_media: List[Any] = []
1360
+ video_group_index = 0
1361
+ for mc in processed_media:
1362
+ if getattr(mc, "media_type", None) != MediaType.VIDEO: # type: ignore[operator]
1363
+ new_media.append(mc)
1364
+ continue
1365
+
1366
+ video_group_index += 1
1367
+ video_path_raw = getattr(mc, "file_path", None) or getattr(mc, "content", None)
1368
+
1369
+ video_name = f"video_{video_group_index}"
1370
+ duration_s = None
1371
+ file_bytes = None
1372
+ try:
1373
+ if Path is not None and isinstance(video_path_raw, str) and video_path_raw.strip():
1374
+ vp = Path(video_path_raw)
1375
+ video_name = vp.name or video_name
1376
+ try:
1377
+ file_bytes = int(vp.stat().st_size)
1378
+ except Exception:
1379
+ file_bytes = None
1380
+ if callable(probe_duration_s):
1381
+ try:
1382
+ duration_s = probe_duration_s(vp)
1383
+ except Exception:
1384
+ duration_s = None
1385
+ except Exception:
1386
+ duration_s = None
1387
+ file_bytes = None
1388
+
1389
+ marker = MediaContent(
1390
+ media_type=MediaType.TEXT,
1391
+ content=(
1392
+ f"Video {video_group_index} ({video_name}) is attached below. "
1393
+ "This is the current video for this user message. "
1394
+ "Answer the user's question about this video as if you watched it. "
1395
+ "If earlier turns mention other videos, images, or audio, ignore them unless the user explicitly asks you to compare. "
1396
+ "Do not mention tool activity, attachments lists, sampling, frames, extraction, or this marker."
1397
+ ),
1398
+ content_format=ContentFormat.TEXT,
1399
+ mime_type="text/plain",
1400
+ file_path=None,
1401
+ metadata={
1402
+ "processor": "VideoNativeInputMarker",
1403
+ "source_video": video_name,
1404
+ "duration_s": duration_s,
1405
+ "bytes": file_bytes,
1406
+ "max_frames": max_frames,
1407
+ "sampling_strategy": sampling_strategy,
1408
+ "max_frame_side": max_frame_side,
1409
+ },
1410
+ )
1411
+ new_media.append(marker)
1412
+ new_media.append(mc)
1413
+
1414
+ processed_media = new_media
1415
+
1416
+ elif policy in ("frames_caption", "frames", "frame_caption"):
1417
+ # Convert each video into a small set of sampled frames (images).
1418
+ try:
1419
+ from pathlib import Path
1420
+ import tempfile
1421
+
1422
+ from ..media import AutoMediaHandler
1423
+ from ..media.utils.video_frames import extract_video_frames, probe_duration_s
1424
+ except Exception as e:
1425
+ raise UnsupportedFeatureError(f"Video frame fallback is not available: {e}")
1426
+
1427
+ enrichments: List[Dict[str, Any]] = []
1428
+ new_media: List[Any] = []
1429
+
1430
+ video_group_index = 0
1431
+ for idx, mc in enumerate(processed_media):
1432
+ if getattr(mc, "media_type", None) != MediaType.VIDEO: # type: ignore[operator]
1433
+ new_media.append(mc)
1434
+ continue
1435
+
1436
+ video_group_index += 1
1437
+ video_path_raw = getattr(mc, "file_path", None) or getattr(mc, "content", None)
1438
+ if not isinstance(video_path_raw, str) or not video_path_raw.strip():
1439
+ raise UnsupportedFeatureError("Video frame fallback requires a video file path.")
1440
+ video_path = Path(video_path_raw)
1441
+ if not video_path.exists():
1442
+ raise UnsupportedFeatureError(f"Video file not found: {video_path}")
1443
+
1444
+ out_dir = Path(tempfile.mkdtemp(prefix="abstractcore_video_frames_"))
1445
+ duration_s = probe_duration_s(video_path)
1446
+ file_bytes = None
1447
+ try:
1448
+ file_bytes = int(video_path.stat().st_size)
1449
+ except Exception:
1450
+ file_bytes = None
1451
+ frames, timestamps_s = extract_video_frames(
1452
+ video_path,
1453
+ max_frames=max_frames,
1454
+ frame_format=frame_format,
1455
+ sampling_strategy=sampling_strategy,
1456
+ max_side=max_frame_side,
1457
+ output_dir=out_dir,
1458
+ )
1459
+ if not frames:
1460
+ raise UnsupportedFeatureError("Video frame fallback failed: no frames extracted.")
1461
+
1462
+ handler = AutoMediaHandler(enable_glyph_compression=False)
1463
+ frame_media: List[Any] = []
1464
+ max_res = None
1465
+ if isinstance(max_frame_side, int) and max_frame_side > 0:
1466
+ max_res = (max_frame_side, max_frame_side)
1467
+ for fp in frames:
1468
+ res = handler.process_file(
1469
+ fp,
1470
+ provider=self.provider,
1471
+ model=self.model,
1472
+ glyph_compression="never",
1473
+ max_resolution=max_res,
1474
+ )
1475
+ if res and getattr(res, "success", False) and getattr(res, "media_content", None) is not None:
1476
+ frame_media.append(res.media_content)
1477
+
1478
+ if not frame_media:
1479
+ raise UnsupportedFeatureError("Video frame fallback failed: extracted frames could not be processed as images.")
1480
+
1481
+ avg_gap_s = None
1482
+ try:
1483
+ if isinstance(duration_s, (int, float)) and duration_s > 0 and max_frames > 0:
1484
+ avg_gap_s = float(duration_s) / float(max_frames + 1)
1485
+ except Exception:
1486
+ avg_gap_s = None
1487
+
1488
+ self.logger.info(
1489
+ "Video input policy: frames_caption (sampling frames for downstream vision handling).",
1490
+ provider=provider_name,
1491
+ model=self.model,
1492
+ video_policy="frames_caption",
1493
+ video_index=video_group_index,
1494
+ video_name=video_path.name,
1495
+ video_duration_s=duration_s,
1496
+ video_bytes=file_bytes,
1497
+ extracted_frames=len(frame_media),
1498
+ video_max_frames=max_frames,
1499
+ video_sampling_strategy=sampling_strategy,
1500
+ video_max_frame_side=max_frame_side,
1501
+ video_avg_gap_s=avg_gap_s,
1502
+ )
1503
+ if isinstance(avg_gap_s, float) and avg_gap_s >= 10.0:
1504
+ self.logger.warning(
1505
+ "Video sampling is sparse; important events may be missed. "
1506
+ "Consider increasing video_max_frames/video.max_frames or using keyframes sampling.",
1507
+ provider=provider_name,
1508
+ model=self.model,
1509
+ video_policy="frames_caption",
1510
+ video_name=video_path.name,
1511
+ video_duration_s=duration_s,
1512
+ extracted_frames=len(frame_media),
1513
+ video_max_frames=max_frames,
1514
+ video_avg_gap_s=avg_gap_s,
1515
+ )
1516
+
1517
+ # Insert a short text marker to avoid the model treating sampled frames as
1518
+ # unrelated standalone images (especially in follow-up prompts like "and this one?").
1519
+ try:
1520
+ from ..media.types import MediaContent, ContentFormat
1521
+ except Exception:
1522
+ MediaContent = None # type: ignore[assignment]
1523
+ ContentFormat = None # type: ignore[assignment]
1524
+
1525
+ if MediaContent is not None and ContentFormat is not None:
1526
+ marker = MediaContent(
1527
+ media_type=MediaType.TEXT,
1528
+ content=(
1529
+ f"Video {video_group_index} ({video_path.name}) — "
1530
+ "the following images belong to this video in chronological order. "
1531
+ "Answer the user's question about this video as if you watched it. "
1532
+ "Do not mention frames, timestamps, sampling, extraction, or this marker."
1533
+ ),
1534
+ content_format=ContentFormat.TEXT,
1535
+ mime_type="text/plain",
1536
+ file_path=None,
1537
+ metadata={
1538
+ "processor": "VideoFrameFallback",
1539
+ "source_video": video_path.name,
1540
+ "frame_count": len(frame_media),
1541
+ "timestamps_s": timestamps_s,
1542
+ "duration_s": duration_s,
1543
+ "bytes": file_bytes,
1544
+ },
1545
+ )
1546
+ new_media.append(marker)
1547
+
1548
+ new_media.extend(frame_media)
1549
+
1550
+ if build_enrichment_item is not None:
1551
+ enrichments.append(
1552
+ build_enrichment_item(
1553
+ status="used",
1554
+ input_modality="video",
1555
+ summary_kind="frames",
1556
+ policy="frames_caption",
1557
+ backend={"kind": "unknown", "source": "ffmpeg"},
1558
+ input_index=idx + 1,
1559
+ input_name=str(video_path.name),
1560
+ artifact={
1561
+ "frame_count": len(frame_media),
1562
+ "timestamps_s": timestamps_s,
1563
+ "duration_s": duration_s,
1564
+ "bytes": file_bytes,
1565
+ },
1566
+ )
1567
+ )
1568
+
1569
+ processed_media = new_media
1570
+ if enrichments:
1571
+ if media_enrichment is None:
1572
+ media_enrichment = enrichments
1573
+ else:
1574
+ media_enrichment.extend(enrichments)
1575
+
1576
+ elif policy == "auto":
1577
+ if model_supports_native_video:
1578
+ # Use native video when available.
1579
+ pass
1580
+ else:
1581
+ # Auto fallback: sample frames and proceed with existing image pipeline.
1582
+ # This works well for vision-capable models; for text-only models it requires a vision fallback.
1583
+ policy_to_use = "frames_caption"
1584
+ kwargs["video_policy"] = policy_to_use
1585
+ # Re-run this branch once with explicit policy.
1586
+ return self.generate_with_telemetry(
1587
+ prompt=prompt,
1588
+ messages=messages,
1589
+ system_prompt=system_prompt,
1590
+ tools=tools,
1591
+ media=processed_media,
1592
+ response_model=response_model,
1593
+ retry_strategy=retry_strategy,
1594
+ tool_call_tags=tool_call_tags,
1595
+ execute_tools=execute_tools,
1596
+ stream=stream,
1597
+ **kwargs,
1598
+ )
1599
+
1600
+ else:
1601
+ raise ValueError(f"Unknown video_policy '{policy}'. Expected one of: native_only, frames_caption, auto.")
1602
+
514
1603
  # Convert tools to ToolDefinition objects first (outside retry loop)
515
1604
  converted_tools = None
516
1605
  if tools:
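A usage sketch for the audio and video input policies added above. The parameter names (audio_policy, audio_language, video_policy, video_max_frames, video_sampling_strategy) are taken from the code; create_llm(), the model, and the media file paths are illustrative, and media= is assumed to accept plain file paths:

    # Parameter names come from the policy code above; everything else is illustrative.
    from abstractcore import create_llm  # assumed factory entry point

    llm = create_llm("ollama", model="qwen2.5vl:7b")

    # Speech-to-text fallback: the audio is transcribed (requires an STT capability plugin such
    # as abstractvoice) and injected as text context; the raw audio is not sent to the model.
    resp = llm.generate(
        "What is the caller asking for?",
        media=["call.wav"],
        audio_policy="speech_to_text",
        audio_language="en",
    )

    # Frame-sampling fallback for models without native video input.
    resp = llm.generate(
        "Describe what happens in this clip.",
        media=["clip.mp4"],
        video_policy="frames_caption",
        video_max_frames=5,
        video_sampling_strategy="uniform",
    )
    print(resp.metadata)  # enrichment/transparency records for the fallback are merged in here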
@@ -545,7 +1634,10 @@ class BaseProvider(AbstractCoreInterface, ABC):
545
1634
  if not should_execute_tools and converted_tools:
546
1635
  # If tools are provided but execution is disabled,
547
1636
  # we still pass them to the provider for generation but won't execute them
548
- self.logger.info("Tool execution disabled - tools will be generated but not executed")
1637
+ self.logger.debug(
1638
+ "Provider-side tool execution disabled (expected for runtime/host tool execution); "
1639
+ "tools will be sent for generation only."
1640
+ )
549
1641
 
550
1642
  # Define generation function for retry wrapper
551
1643
  def _execute_generation():
@@ -554,7 +1646,7 @@ class BaseProvider(AbstractCoreInterface, ABC):
554
1646
 
555
1647
  # Emit generation started event (covers request received)
556
1648
  event_data = {
557
- "prompt": prompt[:100] + "..." if len(prompt) > 100 else prompt,
1649
+ "prompt": preview_text(prompt, max_chars=100),
558
1650
  "has_tools": bool(tools),
559
1651
  "stream": stream,
560
1652
  "model": self.model,
@@ -613,7 +1705,11 @@ class BaseProvider(AbstractCoreInterface, ABC):
613
1705
  ttft_ms: Optional[float] = None
614
1706
  for processed_chunk in processor.process_stream(response, converted_tools):
615
1707
  if isinstance(processed_chunk.content, str) and processed_chunk.content:
616
- processed_chunk.content = self._strip_output_wrappers(processed_chunk.content)
1708
+ processed_chunk.content = strip_output_wrappers(
1709
+ processed_chunk.content,
1710
+ architecture_format=self.architecture_config,
1711
+ model_capabilities=self.model_capabilities,
1712
+ )
617
1713
  if ttft_ms is None:
618
1714
  has_content = isinstance(processed_chunk.content, str) and bool(processed_chunk.content)
619
1715
  has_tools = isinstance(processed_chunk.tool_calls, list) and bool(processed_chunk.tool_calls)
@@ -651,9 +1747,29 @@ class BaseProvider(AbstractCoreInterface, ABC):
651
1747
  if tool_call_tags and response.content and not self._should_clean_tool_call_markup(tool_call_tags):
652
1748
  response = self._apply_non_streaming_tag_rewriting(response, tool_call_tags)
653
1749
 
654
- # Strip model-specific output wrappers (e.g. GLM <|begin_of_box|>…<|end_of_box|>).
1750
+ # Normalize provider output (wrapper tokens, Harmony transcripts, think tags).
655
1751
  if response and isinstance(response.content, str) and response.content:
656
- response.content = self._strip_output_wrappers(response.content)
1752
+ cleaned, reasoning = normalize_assistant_text(
1753
+ response.content,
1754
+ architecture_format=self.architecture_config,
1755
+ model_capabilities=self.model_capabilities,
1756
+ )
1757
+ response.content = cleaned
1758
+ if isinstance(reasoning, str) and reasoning.strip():
1759
+ if response.metadata is None or not isinstance(response.metadata, dict):
1760
+ response.metadata = {}
1761
+ existing = response.metadata.get("reasoning")
1762
+ if isinstance(existing, str) and existing.strip():
1763
+ if reasoning.strip() not in existing:
1764
+ response.metadata["reasoning"] = f"{existing.strip()}\n\n{reasoning.strip()}"
1765
+ else:
1766
+ response.metadata["reasoning"] = reasoning.strip()
1767
+
1768
+ # Attach media enrichment transparency metadata (caption/STT/etc.).
1769
+ if media_enrichment and response:
1770
+ from ..media.enrichment import merge_enrichment_metadata
1771
+
1772
+ response.metadata = merge_enrichment_metadata(response.metadata, media_enrichment)
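In practical terms, callers can read the normalized text and any extracted reasoning from the response after this step; a short sketch (the enrichment keys written by merge_enrichment_metadata live in abstractcore/media/enrichment.py and are not spelled out in this hunk):

    response = llm.generate("Think step by step: what is 17 * 23?")
    print(response.content)              # cleaned text: wrappers and think tags removed
    meta = response.metadata or {}
    if "reasoning" in meta:
        print("extracted reasoning:", meta["reasoning"][:200])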
657
1773
 
658
1774
  # Add visual token calculation if media metadata is available
659
1775
  if media_metadata and response:
@@ -689,7 +1805,7 @@ class BaseProvider(AbstractCoreInterface, ABC):
689
1805
  emit_global(EventType.ERROR, {
690
1806
  "error": str(e),
691
1807
  "error_type": type(e).__name__,
692
- "prompt": prompt[:100] + "..." if len(prompt) > 100 else prompt,
1808
+ "prompt": preview_text(prompt, max_chars=100),
693
1809
  "model": self.model,
694
1810
  "provider": self.__class__.__name__
695
1811
  }, source=self.__class__.__name__)
@@ -980,12 +2096,37 @@ class BaseProvider(AbstractCoreInterface, ABC):
980
2096
  result_kwargs["max_output_tokens"] = effective_max_output_i
981
2097
 
982
2098
  # Add unified generation parameters with fallback hierarchy: kwargs → instance → defaults
983
- result_kwargs["temperature"] = result_kwargs.get("temperature", self.temperature)
984
- if self.seed is not None:
985
- result_kwargs["seed"] = result_kwargs.get("seed", self.seed)
2099
+ temperature = result_kwargs.get("temperature", self.temperature)
2100
+ if temperature is None:
2101
+ temperature = self.temperature
2102
+ result_kwargs["temperature"] = temperature
2103
+
2104
+ seed_value = self._normalize_seed(result_kwargs.get("seed", self.seed))
2105
+ if seed_value is not None:
2106
+ result_kwargs["seed"] = seed_value
2107
+ else:
2108
+ # Do not forward seed when unset/random (None or negative sentinel like -1).
2109
+ result_kwargs.pop("seed", None)
986
2110
 
987
2111
  return result_kwargs
988
2112
 
2113
+ @staticmethod
2114
+ def _normalize_seed(seed: Any) -> Optional[int]:
2115
+ """Normalize seed semantics across providers.
2116
+
2117
+ - None or any negative value -> None (meaning: don't send a provider seed / random).
2118
+ - Booleans -> None; other numeric values -> int(seed) when int(seed) >= 0, otherwise None.
2119
+ """
2120
+ try:
2121
+ if seed is None:
2122
+ return None
2123
+ if isinstance(seed, bool):
2124
+ return None
2125
+ seed_i = int(seed)
2126
+ return seed_i if seed_i >= 0 else None
2127
+ except Exception:
2128
+ return None
2129
+
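Given the implementation above, the normalization behaves as follows (a quick reference exercising the static method exactly as defined in this hunk):

    assert BaseProvider._normalize_seed(None) is None   # unset -> omit provider seed
    assert BaseProvider._normalize_seed(-1) is None      # negative sentinel -> random / omit
    assert BaseProvider._normalize_seed(True) is None    # bools are not valid seeds
    assert BaseProvider._normalize_seed(42) == 42
    assert BaseProvider._normalize_seed("7") == 7        # int()-coercible strings are accepted
    assert BaseProvider._normalize_seed(3.9) == 3        # floats truncate via int()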
989
2130
  def _extract_generation_params(self, **kwargs) -> Dict[str, Any]:
990
2131
  """
991
2132
  Extract generation parameters with consistent fallback hierarchy.
@@ -996,10 +2137,13 @@ class BaseProvider(AbstractCoreInterface, ABC):
996
2137
  params = {}
997
2138
 
998
2139
  # Temperature (always present)
999
- params["temperature"] = kwargs.get("temperature", self.temperature)
2140
+ temperature = kwargs.get("temperature", self.temperature)
2141
+ if temperature is None:
2142
+ temperature = self.temperature
2143
+ params["temperature"] = temperature
1000
2144
 
1001
2145
  # Seed (only if not None)
1002
- seed_value = kwargs.get("seed", self.seed)
2146
+ seed_value = self._normalize_seed(kwargs.get("seed", self.seed))
1003
2147
  if seed_value is not None:
1004
2148
  params["seed"] = seed_value
1005
2149
 
@@ -1041,7 +2185,10 @@ class BaseProvider(AbstractCoreInterface, ABC):
1041
2185
 
1042
2186
  if not should_execute:
1043
2187
  # Tool execution disabled - return response with tool calls but don't execute
1044
- self.logger.info("Tool execution disabled - returning response with tool calls")
2188
+ self.logger.debug(
2189
+ "Provider-side tool execution disabled (expected for runtime/host tool execution); "
2190
+ "returning response with tool calls."
2191
+ )
1045
2192
  return response
1046
2193
 
1047
2194
  # Emit tool started event
@@ -1098,7 +2245,8 @@ class BaseProvider(AbstractCoreInterface, ABC):
1098
2245
  finish_reason=response.finish_reason,
1099
2246
  raw_response=response.raw_response,
1100
2247
  usage=response.usage,
1101
- tool_calls=response.tool_calls # Keep original format
2248
+ tool_calls=response.tool_calls, # Keep original format
2249
+ metadata=response.metadata,
1102
2250
  )
1103
2251
 
1104
2252
  def _format_tool_results(self, tool_calls: List, tool_results: List) -> str:
@@ -1106,9 +2254,7 @@ class BaseProvider(AbstractCoreInterface, ABC):
1106
2254
  results_text = "\n\nTool Results:\n"
1107
2255
  for call, result in zip(tool_calls, tool_results):
1108
2256
  # Format parameters for display (limit size)
1109
- params_str = str(call.arguments) if call.arguments else "{}"
1110
- if len(params_str) > 100:
1111
- params_str = params_str[:97] + "..."
2257
+ params_str = preview_text(str(call.arguments) if call.arguments else "{}", max_chars=100)
1112
2258
 
1113
2259
  # Show tool name and parameters for transparency
1114
2260
  results_text += f"🔧 Tool: {call.name}({params_str})\n"
@@ -1174,26 +2320,341 @@ class BaseProvider(AbstractCoreInterface, ABC):
1174
2320
  """Update HTTP client timeout if the provider has one. Override in subclasses."""
1175
2321
  pass
1176
2322
 
1177
- # Memory management methods
1178
- def unload(self) -> None:
2323
+ # Prompt cache management methods
2324
+ def supports_prompt_cache(self) -> bool:
2325
+ """Return True if this provider supports best-effort prompt caching.
2326
+
2327
+ Semantics differ by provider:
2328
+ - Remote providers (OpenAI): `prompt_cache_key` is forwarded; cache is managed server-side.
2329
+ - Local providers (MLX / llama.cpp): in-process KV/prefix caches can be retained across calls.
1179
2330
  """
1180
- Unload the model from memory.
2331
+ return False
2332
+
2333
+ # Provider-specific prompt cache backend hooks (optional)
2334
+ #
2335
+ # Providers that implement in-process KV caching (MLX, llama.cpp, etc.) can override these to enable
2336
+ # `prompt_cache_update`, `prompt_cache_fork`, and `prompt_cache_prepare_modules`.
2337
+ def _prompt_cache_backend_create(self) -> Optional[Any]:
2338
+ return None
2339
+
2340
+ def _prompt_cache_backend_clone(self, cache_value: Any) -> Optional[Any]:
2341
+ _ = cache_value
2342
+ return None
1181
2343
 
1182
- For local providers (MLX, HuggingFace), this explicitly frees model memory.
1183
- For server-based providers (Ollama, LMStudio), this requests server unload.
1184
- For API providers (OpenAI, Anthropic), this is a no-op.
2344
+ def _prompt_cache_backend_append(
2345
+ self,
2346
+ cache_value: Any,
2347
+ *,
2348
+ prompt: str = "",
2349
+ messages: Optional[List[Dict[str, Any]]] = None,
2350
+ system_prompt: Optional[str] = None,
2351
+ tools: Optional[List[Dict[str, Any]]] = None,
2352
+ add_generation_prompt: bool = False,
2353
+ **kwargs,
2354
+ ) -> bool:
2355
+ _ = (cache_value, prompt, messages, system_prompt, tools, add_generation_prompt, kwargs)
2356
+ return False
2357
+
2358
+ def _prompt_cache_backend_token_count(self, cache_value: Any) -> Optional[int]:
2359
+ _ = cache_value
2360
+ return None
2361
+
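To illustrate the contract these hooks define, a toy in-process backend could look like the sketch below. This is schematic only: real backends (e.g. the MLX provider) hold KV-cache objects rather than text, and a concrete subclass would also have to implement the other abstract methods omitted here.

    class ToyCachedProvider(BaseProvider):
        """Illustrative only: the 'cache' is just accumulated prompt text."""

        def supports_prompt_cache(self) -> bool:
            return True

        def _prompt_cache_backend_create(self):
            return {"text": ""}

        def _prompt_cache_backend_clone(self, cache_value):
            return {"text": cache_value["text"]}

        def _prompt_cache_backend_append(self, cache_value, *, prompt="", messages=None,
                                         system_prompt=None, tools=None,
                                         add_generation_prompt=False, **kwargs):
            parts = [system_prompt or "", prompt or ""]
            parts += [m.get("content", "") for m in (messages or [])]
            cache_value["text"] += "\n".join(p for p in parts if p)
            return True

        def _prompt_cache_backend_token_count(self, cache_value):
            return len(cache_value["text"].split())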
2362
+ def _normalize_prompt_cache_key(self, key: Any) -> Optional[str]:
2363
+ if not isinstance(key, str):
2364
+ return None
2365
+ key = key.strip()
2366
+ return key if key else None
2367
+
2368
+ def _apply_default_prompt_cache_key(self, kwargs: Dict[str, Any]) -> None:
2369
+ # Explicit caller override wins (even if None / empty to disable).
2370
+ if "prompt_cache_key" in kwargs:
2371
+ kwargs["prompt_cache_key"] = self._normalize_prompt_cache_key(kwargs.get("prompt_cache_key"))
2372
+ return
2373
+
2374
+ if self._default_prompt_cache_key and self.supports_prompt_cache():
2375
+ kwargs["prompt_cache_key"] = self._default_prompt_cache_key
2376
+
2377
+ def get_prompt_cache_stats(self) -> Dict[str, Any]:
2378
+ """Return basic prompt cache stats (in-process store only)."""
2379
+ stats = self._prompt_cache_store.stats()
2380
+ stats["default_key"] = self._default_prompt_cache_key
2381
+ try:
2382
+ keys = self._prompt_cache_store.keys()
2383
+ if isinstance(keys, list):
2384
+ stats["keys"] = list(keys)
2385
+ meta_by_key: Dict[str, Any] = {}
2386
+ for k in keys:
2387
+ meta = self._prompt_cache_store.meta(k)
2388
+ if isinstance(meta, dict) and meta:
2389
+ meta_by_key[str(k)] = dict(meta)
2390
+ if meta_by_key:
2391
+ stats["meta_by_key"] = meta_by_key
2392
+ except Exception:
2393
+ pass
2394
+ return stats
1185
2395
 
1186
- After calling unload(), the provider instance should not be used for generation.
1187
- Create a new provider instance if you need to generate again.
2396
+ def prompt_cache_set(self, key: str, *, make_default: bool = True, **kwargs) -> bool:
2397
+ """Set the default prompt cache key for this provider instance.
1188
2398
 
1189
- Usage:
1190
- provider = create_llm("mlx", model="...")
1191
- response = provider.generate("Hello")
1192
- provider.unload() # Free memory
1193
- del provider # Remove reference
2399
+ Provider-specific cache allocation/warming is implemented by subclasses when applicable.
2400
+ """
2401
+ normalized = self._normalize_prompt_cache_key(key)
2402
+ if normalized is None:
2403
+ return False
2404
+ if not self.supports_prompt_cache():
2405
+ return False
2406
+ _ = kwargs
2407
+ # Best-effort: allocate backend cache if the provider supports it.
2408
+ if self._prompt_cache_store.get(normalized) is None:
2409
+ created = self._prompt_cache_backend_create()
2410
+ if created is not None:
2411
+ try:
2412
+ self._prompt_cache_store.set(normalized, created, meta={"backend": "provider"})
2413
+ except Exception:
2414
+ pass
2415
+ if make_default:
2416
+ self._default_prompt_cache_key = normalized
2417
+ return True
2418
+
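Taken together with `_apply_default_prompt_cache_key()` above, the intended calling pattern looks roughly like this (provider/model are placeholders; whether the key has any effect depends on `supports_prompt_cache()`):

    llm = create_llm("mlx", model="...")          # placeholder, as in the old usage example
    if llm.supports_prompt_cache():
        llm.prompt_cache_set("session-42")         # becomes the default prompt_cache_key
        llm.generate("First question ...")          # default key is injected into kwargs
        llm.generate("Follow-up ...", prompt_cache_key="session-42")  # or pass it explicitly
        print(llm.get_prompt_cache_stats())         # includes default_key, keys, meta_by_key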
2419
+ def prompt_cache_update(
2420
+ self,
2421
+ key: str,
2422
+ *,
2423
+ prompt: str = "",
2424
+ messages: Optional[List[Dict[str, Any]]] = None,
2425
+ system_prompt: Optional[str] = None,
2426
+ tools: Optional[List[Dict[str, Any]]] = None,
2427
+ add_generation_prompt: bool = False,
2428
+ ttl_s: Optional[float] = None,
2429
+ **kwargs,
2430
+ ) -> bool:
2431
+ """Append new prompt context into an existing cache key (best-effort).
2432
+
2433
+ Semantics:
2434
+ - Local runtimes can implement true KV prefill updates (append-only).
2435
+ - Remote providers typically cannot be “pre-filled” explicitly; they may ignore this.
2436
+
2437
+ Arguments are intentionally similar to `generate()` so higher-level code can reuse its own
2438
+ prompt/module construction logic.
2439
+ """
2440
+ normalized = self._normalize_prompt_cache_key(key)
2441
+ if normalized is None:
2442
+ return False
2443
+ if not self.supports_prompt_cache():
2444
+ return False
2445
+
2446
+ # Ensure the cache exists if the provider can allocate a backend cache object.
2447
+ cache_value = self._prompt_cache_store.get(normalized)
2448
+ if cache_value is None:
2449
+ if not self.prompt_cache_set(normalized, make_default=False):
2450
+ return False
2451
+ cache_value = self._prompt_cache_store.get(normalized)
2452
+ if cache_value is None:
2453
+ return False
2454
+
2455
+ ok = self._prompt_cache_backend_append(
2456
+ cache_value,
2457
+ prompt=str(prompt or ""),
2458
+ messages=messages,
2459
+ system_prompt=system_prompt,
2460
+ tools=tools,
2461
+ add_generation_prompt=bool(add_generation_prompt),
2462
+ **kwargs,
2463
+ )
2464
+ if not ok:
2465
+ return False
2466
+
2467
+ # Update TTL/metadata best-effort.
2468
+ if ttl_s is not None:
2469
+ try:
2470
+ meta = self._prompt_cache_store.meta(normalized) or {}
2471
+ self._prompt_cache_store.set(normalized, cache_value, ttl_s=ttl_s, meta=meta)
2472
+ except Exception:
2473
+ pass
2474
+ return True
2475
+
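A hedged sketch of warming a cache with shared context before the first call; this only does real prefill on providers whose backend hooks implement it, and simply returns False elsewhere (e.g. remote OpenAI-style providers, where caching stays server-side):

    warmed = llm.prompt_cache_update(
        "session-42",
        system_prompt="You are a meticulous code reviewer.",
        messages=[{"role": "user", "content": "Here is the repository overview ..."}],
        ttl_s=600,   # keep the warmed prefix around for ten minutes
    )
    if not warmed:
        pass  # fall back to plain, uncached generation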
2476
+ def prompt_cache_fork(
2477
+ self,
2478
+ from_key: str,
2479
+ to_key: str,
2480
+ *,
2481
+ make_default: bool = False,
2482
+ ttl_s: Optional[float] = None,
2483
+ **kwargs,
2484
+ ) -> bool:
2485
+ """Create a new cache key by cloning another cache (best-effort).
2486
+
2487
+ This is the primitive needed for hierarchical/module caches:
2488
+ - build stable shared prefixes (persona, memory blueprints, tool schemas)
2489
+ - fork them into per-session caches that can be appended/mutated safely.
2490
+ """
2491
+ _ = kwargs
2492
+ src = self._normalize_prompt_cache_key(from_key)
2493
+ dst = self._normalize_prompt_cache_key(to_key)
2494
+ if src is None or dst is None:
2495
+ return False
2496
+ if not self.supports_prompt_cache():
2497
+ return False
2498
+
2499
+ src_value = self._prompt_cache_store.get(src)
2500
+ if src_value is None:
2501
+ return False
2502
+
2503
+ cloned = self._prompt_cache_backend_clone(src_value)
2504
+ if cloned is None:
2505
+ return False
2506
+
2507
+ try:
2508
+ meta = self._prompt_cache_store.meta(src) or {}
2509
+ meta = dict(meta)
2510
+ meta.setdefault("forked_from", src)
2511
+ self._prompt_cache_store.set(dst, cloned, ttl_s=ttl_s, meta=meta)
2512
+ except Exception:
2513
+ return False
2514
+
2515
+ if make_default:
2516
+ self._default_prompt_cache_key = dst
2517
+ return True
2518
+
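For example, a stable shared prefix can be built once and then forked per session (key names and PERSONA_PROMPT are illustrative placeholders):

    llm.prompt_cache_set("prefix:persona", make_default=False)
    llm.prompt_cache_update("prefix:persona", system_prompt=PERSONA_PROMPT)
    llm.prompt_cache_fork("prefix:persona", "session:alice", make_default=True)
    llm.prompt_cache_fork("prefix:persona", "session:bob")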
2519
+ def prompt_cache_prepare_modules(
2520
+ self,
2521
+ *,
2522
+ namespace: str,
2523
+ modules: List[Union[PromptCacheModule, Dict[str, Any]]],
2524
+ make_default: bool = False,
2525
+ ttl_s: Optional[float] = None,
2526
+ version: int = 1,
2527
+ ) -> Dict[str, Any]:
2528
+ """Ensure hierarchical prefix caches exist for an ordered module list (best-effort).
2529
+
2530
+ This builds immutable prefix caches (by derived keys) so callers can:
2531
+ - reuse stable sub-prefixes (persona, memory blueprints, etc.)
2532
+ - fork the final prefix into a per-session cache for incremental chat
2533
+
2534
+ Returns a JSON-serializable dict containing per-module derived keys.
2535
+ """
2536
+ ns = str(namespace or "").strip()
2537
+ if not ns:
2538
+ return {"supported": False, "error": "namespace required"}
2539
+ if not self.supports_prompt_cache():
2540
+ return {"supported": False, "error": "provider does not support prompt caching"}
2541
+
2542
+ normalized_modules: List[PromptCacheModule] = []
2543
+ for m in modules or []:
2544
+ if isinstance(m, PromptCacheModule):
2545
+ normalized_modules.append(m.normalized())
2546
+ elif isinstance(m, dict):
2547
+ try:
2548
+ normalized_modules.append(PromptCacheModule(**m).normalized())
2549
+ except Exception:
2550
+ continue
2551
+
2552
+ if not normalized_modules:
2553
+ return {"supported": False, "error": "no modules provided"}
2554
+
2555
+ # Derive deterministic prefix keys per module boundary.
2556
+ prefix_hash = hashlib.sha256(f"acore-prompt-cache:{int(version)}".encode("utf-8")).hexdigest()
2557
+ derived: List[Dict[str, Any]] = []
2558
+ keys: List[str] = []
2559
+ for mod in normalized_modules:
2560
+ prefix_hash = hashlib.sha256((prefix_hash + mod.fingerprint(version=version)).encode("utf-8")).hexdigest()
2561
+ key = f"{ns}:{prefix_hash[:16]}"
2562
+ keys.append(key)
2563
+ derived.append({"module_id": mod.module_id, "cache_key": key, "module_hash": mod.fingerprint(version=version)})
2564
+
2565
+ # Find the longest existing prefix cache.
2566
+ start_idx = -1
2567
+ for i, key in enumerate(keys):
2568
+ if self._prompt_cache_store.get(key) is None:
2569
+ break
2570
+ start_idx = i
2571
+
2572
+ # Start from existing prefix (clone to avoid mutating the stored snapshot).
2573
+ current_cache: Optional[Any] = None
2574
+ if start_idx >= 0:
2575
+ existing = self._prompt_cache_store.get(keys[start_idx])
2576
+ if existing is not None:
2577
+ current_cache = self._prompt_cache_backend_clone(existing) or None
2578
+
2579
+ # If we have no starting cache, start from an empty backend cache.
2580
+ if current_cache is None:
2581
+ current_cache = self._prompt_cache_backend_create()
2582
+ if current_cache is None:
2583
+ return {"supported": False, "error": "provider does not implement in-process cache backend"}
2584
+
2585
+ # Build missing caches.
2586
+ for j in range(start_idx + 1, len(keys)):
2587
+ mod = normalized_modules[j]
2588
+ ok = self._prompt_cache_backend_append(
2589
+ current_cache,
2590
+ prompt=str(mod.prompt or ""),
2591
+ messages=mod.messages,
2592
+ system_prompt=mod.system_prompt,
2593
+ tools=mod.tools,
2594
+ add_generation_prompt=bool(mod.add_generation_prompt),
2595
+ )
2596
+ if not ok:
2597
+ return {"supported": False, "error": f"failed to append module '{mod.module_id}'"}
2598
+
2599
+ snapshot = self._prompt_cache_backend_clone(current_cache) or None
2600
+ if snapshot is None:
2601
+ return {"supported": False, "error": "provider does not support cache cloning"}
2602
+
2603
+ meta = {
2604
+ "namespace": ns,
2605
+ "module_id": mod.module_id,
2606
+ "module_hash": mod.fingerprint(version=version),
2607
+ "index": j,
2608
+ "backend": "provider",
2609
+ }
2610
+ tok = self._prompt_cache_backend_token_count(snapshot)
2611
+ if isinstance(tok, int) and tok >= 0:
2612
+ meta["token_count"] = tok
2613
+
2614
+ self._prompt_cache_store.set(keys[j], snapshot, ttl_s=ttl_s, meta=meta)
2615
+
2616
+ if make_default:
2617
+ self._default_prompt_cache_key = keys[-1]
2618
+
2619
+ return {
2620
+ "supported": True,
2621
+ "namespace": ns,
2622
+ "version": int(version),
2623
+ "modules": derived,
2624
+ "final_cache_key": keys[-1],
2625
+ }
2626
+
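A sketch of the module-based flow: module dicts are passed through PromptCacheModule(**m), so the exact accepted fields are defined by that dataclass; the ones used here mirror the attributes referenced above, and TOOL_SCHEMAS / KNOWLEDGE_SNIPPET are placeholders.

    plan = llm.prompt_cache_prepare_modules(
        namespace="support-bot",
        modules=[
            {"module_id": "persona", "system_prompt": "You are a support agent ..."},
            {"module_id": "tools", "tools": TOOL_SCHEMAS},
            {"module_id": "kb", "prompt": KNOWLEDGE_SNIPPET},
        ],
    )
    if plan.get("supported"):
        # Fork the immutable final prefix into a mutable per-session cache.
        llm.prompt_cache_fork(plan["final_cache_key"], "session:123", make_default=True)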
2627
+ def prompt_cache_clear(self, key: Optional[str] = None) -> bool:
2628
+ """Clear prompt caches for this provider instance (best-effort)."""
2629
+ normalized = self._normalize_prompt_cache_key(key) if key is not None else None
2630
+ if not self.supports_prompt_cache():
2631
+ return False
2632
+
2633
+ if normalized is None:
2634
+ self._default_prompt_cache_key = None
2635
+ self._prompt_cache_store.clear()
2636
+ return True
2637
+
2638
+ cleared = self._prompt_cache_store.delete(normalized)
2639
+ if self._default_prompt_cache_key == normalized:
2640
+ self._default_prompt_cache_key = None
2641
+ return cleared
2642
+
2643
+ # Memory management methods
2644
+ @abstractmethod
2645
+ def unload_model(self, model_name: str) -> None:
2646
+ """
2647
+ Unload/cleanup resources for a specific model.
2648
+
2649
+ This is the single canonical unload entrypoint across providers.
2650
+ Providers must implement this as a best-effort cleanup hook:
2651
+
2652
+ - In-process providers (e.g. MLX, HuggingFace): free local model resources.
2653
+ - Some self-hosted servers (e.g. Ollama): may request server-side eviction/unload.
2654
+ - OpenAI-compatible servers (e.g. LMStudio, vLLM, openai-compatible): typically only close client
2655
+ connections; server-side model unloading may not be available and is controlled by the server (TTL/eviction).
2656
+ - Cloud APIs (e.g. OpenAI, Anthropic): usually a no-op (safe to call).
1194
2657
  """
1195
- # Default implementation does nothing (suitable for API providers)
1196
- pass
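Because `unload_model()` is now abstract, every provider ships its own implementation. A minimal cloud-API style override would be a no-op along these lines (a sketch, not the actual OpenAI provider code):

    def unload_model(self, model_name: str) -> None:
        # Nothing to free for hosted APIs; kept as a safe no-op so callers can
        # invoke unload_model() uniformly across providers.
        self.logger.debug(f"unload_model({model_name!r}) is a no-op for this provider")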
1197
2658
 
1198
2659
  # Token configuration helpers - expose interface methods for user convenience
1199
2660
  def get_token_configuration_summary(self) -> str:
@@ -1202,7 +2663,19 @@ class BaseProvider(AbstractCoreInterface, ABC):
1202
2663
 
1203
2664
  def validate_token_constraints(self) -> List[str]:
1204
2665
  """Validate token configuration and return warnings/suggestions"""
1205
- return super().validate_token_constraints()
2666
+ warnings_list = super().validate_token_constraints()
2667
+
2668
+ # Embedding models are not text-generative: output token limits are irrelevant and can
2669
+ # legitimately be 0 (e.g. Nomic Embed). Suppress misleading output-token warnings.
2670
+ try:
2671
+ caps = getattr(self, "model_capabilities", None)
2672
+ model_type = caps.get("model_type") if isinstance(caps, dict) else None
2673
+ if isinstance(model_type, str) and model_type.strip().lower() == "embedding":
2674
+ warnings_list = [w for w in warnings_list if "max_output_tokens" not in str(w)]
2675
+ except Exception:
2676
+ pass
2677
+
2678
+ return warnings_list
1206
2679
 
1207
2680
  def calculate_token_budget(self, input_text: str, desired_output_tokens: int,
1208
2681
  safety_margin: float = 0.1) -> tuple[int, List[str]]:
@@ -1239,7 +2712,7 @@ class BaseProvider(AbstractCoreInterface, ABC):
1239
2712
  except ImportError as e:
1240
2713
  raise ImportError(
1241
2714
  f"Media processing requires additional dependencies. "
1242
- f"Install with: pip install abstractcore[media]. Error: {e}"
2715
+ f"Install with: pip install \"abstractcore[media]\". Error: {e}"
1243
2716
  )
1244
2717
 
1245
2718
  processed_media = []
@@ -1506,45 +2979,6 @@ class BaseProvider(AbstractCoreInterface, ABC):
1506
2979
  # Return original response if rewriting fails
1507
2980
  return response
1508
2981
 
1509
- def _strip_output_wrappers(self, content: str) -> str:
1510
- """Strip known model-specific wrapper tokens around assistant output.
1511
-
1512
- Some model/server combinations emit wrapper tokens like:
1513
- <|begin_of_box|> ... <|end_of_box|>
1514
- We remove these only when they appear as leading/trailing wrappers (not when
1515
- embedded mid-text).
1516
- """
1517
- if not isinstance(content, str) or not content:
1518
- return content
1519
-
1520
- wrappers: Dict[str, str] = {}
1521
- for src in (self.architecture_config, self.model_capabilities):
1522
- if not isinstance(src, dict):
1523
- continue
1524
- w = src.get("output_wrappers")
1525
- if not isinstance(w, dict):
1526
- continue
1527
- start = w.get("start")
1528
- end = w.get("end")
1529
- if isinstance(start, str) and start.strip():
1530
- wrappers.setdefault("start", start.strip())
1531
- if isinstance(end, str) and end.strip():
1532
- wrappers.setdefault("end", end.strip())
1533
-
1534
- if not wrappers:
1535
- return content
1536
-
1537
- out = content
1538
- start_token = wrappers.get("start")
1539
- end_token = wrappers.get("end")
1540
-
1541
- if isinstance(start_token, str) and start_token:
1542
- out = re.sub(r"^\s*" + re.escape(start_token) + r"\s*", "", out, count=1)
1543
- if isinstance(end_token, str) and end_token:
1544
- out = re.sub(r"\s*" + re.escape(end_token) + r"\s*$", "", out, count=1)
1545
-
1546
- return out
1547
-
1548
2982
  def _normalize_tool_calls_passthrough(
1549
2983
  self,
1550
2984
  *,
@@ -2014,6 +3448,7 @@ Please provide a structured response."""
2014
3448
  Returns:
2015
3449
  GenerateResponse, AsyncIterator[GenerateResponse] for streaming, or BaseModel for structured output
2016
3450
  """
3451
+ self._apply_default_prompt_cache_key(kwargs)
2017
3452
  response = await self._agenerate_internal(
2018
3453
  prompt, messages, system_prompt, tools, media, stream, **kwargs
2019
3454
  )