openhands-sdk 1.3.0__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. openhands/sdk/__init__.py +4 -0
  2. openhands/sdk/agent/agent.py +55 -22
  3. openhands/sdk/agent/base.py +8 -1
  4. openhands/sdk/agent/prompts/system_prompt.j2 +1 -11
  5. openhands/sdk/agent/utils.py +5 -0
  6. openhands/sdk/context/agent_context.py +30 -0
  7. openhands/sdk/context/skills/__init__.py +2 -0
  8. openhands/sdk/context/skills/skill.py +202 -1
  9. openhands/sdk/conversation/__init__.py +5 -1
  10. openhands/sdk/conversation/base.py +15 -6
  11. openhands/sdk/conversation/conversation.py +10 -1
  12. openhands/sdk/conversation/conversation_stats.py +38 -1
  13. openhands/sdk/conversation/fifo_lock.py +14 -8
  14. openhands/sdk/conversation/impl/local_conversation.py +21 -5
  15. openhands/sdk/conversation/secret_source.py +1 -1
  16. openhands/sdk/conversation/state.py +8 -0
  17. openhands/sdk/conversation/types.py +5 -0
  18. openhands/sdk/event/conversation_state.py +8 -0
  19. openhands/sdk/llm/__init__.py +3 -0
  20. openhands/sdk/llm/llm.py +82 -16
  21. openhands/sdk/llm/llm_registry.py +1 -1
  22. openhands/sdk/llm/options/chat_options.py +12 -24
  23. openhands/sdk/llm/options/responses_options.py +9 -1
  24. openhands/sdk/llm/router/base.py +3 -0
  25. openhands/sdk/llm/streaming.py +9 -0
  26. openhands/sdk/llm/utils/model_features.py +12 -0
  27. openhands/sdk/logger/logger.py +7 -0
  28. openhands/sdk/tool/tool.py +18 -1
  29. openhands/sdk/utils/models.py +90 -9
  30. openhands/sdk/utils/truncate.py +81 -8
  31. openhands/sdk/workspace/__init__.py +3 -1
  32. openhands/sdk/workspace/models.py +7 -1
  33. openhands/sdk/workspace/remote/async_remote_workspace.py +22 -1
  34. openhands/sdk/workspace/remote/base.py +13 -0
  35. {openhands_sdk-1.3.0.dist-info → openhands_sdk-1.4.1.dist-info}/METADATA +2 -2
  36. {openhands_sdk-1.3.0.dist-info → openhands_sdk-1.4.1.dist-info}/RECORD +38 -37
  37. {openhands_sdk-1.3.0.dist-info → openhands_sdk-1.4.1.dist-info}/WHEEL +0 -0
  38. {openhands_sdk-1.3.0.dist-info → openhands_sdk-1.4.1.dist-info}/top_level.txt +0 -0
openhands/sdk/conversation/conversation_stats.py CHANGED
@@ -1,4 +1,6 @@
-from pydantic import BaseModel, Field, PrivateAttr
+from typing import Any
+
+from pydantic import BaseModel, Field, PrivateAttr, model_serializer
 
 from openhands.sdk.llm.llm_registry import RegistryEvent
 from openhands.sdk.llm.utils.metrics import Metrics
@@ -18,6 +20,41 @@ class ConversationStats(BaseModel):
 
     _restored_usage_ids: set[str] = PrivateAttr(default_factory=set)
 
+    @model_serializer(mode="wrap")
+    def _serialize_with_context(self, serializer: Any, info: Any) -> dict[str, Any]:
+        """Serialize metrics based on context.
+
+        By default, preserves full metrics history including costs,
+        response_latencies, and token_usages lists for persistence.
+
+        When context={'use_snapshot': True} is passed, converts Metrics to
+        MetricsSnapshot format to minimize payload size for network transmission.
+
+        Args:
+            serializer: Pydantic's default serializer
+            info: Serialization info containing context
+
+        Returns:
+            Dictionary with metrics serialized based on context
+        """
+        # Get the default serialization
+        data = serializer(self)
+
+        # Check if we should use snapshot serialization
+        context = info.context if info else None
+        use_snapshot = context.get("use_snapshot", False) if context else False
+
+        if use_snapshot and "usage_to_metrics" in data:
+            # Replace each Metrics with its snapshot
+            usage_to_snapshots = {}
+            for usage_id, metrics in self.usage_to_metrics.items():
+                snapshot = metrics.get_snapshot()
+                usage_to_snapshots[usage_id] = snapshot.model_dump()
+
+            data["usage_to_metrics"] = usage_to_snapshots
+
+        return data
+
     def get_combined_metrics(self) -> Metrics:
         total_metrics = Metrics()
         for metrics in self.usage_to_metrics.values():
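Note: the wrap-mode serializer above is what lets the same model emit either a full payload (for persistence) or a compact one (for the wire). For readers unfamiliar with the pattern, here is a small standalone Pydantic sketch of the same idea; the `Stats` model and its `values` field are illustrative, not part of the SDK, and `model_dump(context=...)` assumes Pydantic ≥ 2.7.

```python
from typing import Any

from pydantic import BaseModel, model_serializer


class Stats(BaseModel):
    # Full history kept by default; collapsed when a snapshot is requested.
    values: list[int] = []

    @model_serializer(mode="wrap")
    def _serialize(self, serializer: Any, info: Any) -> dict[str, Any]:
        data = serializer(self)  # default serialization first
        context = info.context if info else None
        if context and context.get("use_snapshot", False):
            # Replace the full list with a compact summary
            data["values"] = {"count": len(self.values), "total": sum(self.values)}
        return data


stats = Stats(values=[1, 2, 3])
print(stats.model_dump())                                # full history
print(stats.model_dump(context={"use_snapshot": True}))  # compact snapshot
```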
openhands/sdk/conversation/fifo_lock.py CHANGED
@@ -50,7 +50,6 @@ class FIFOLock:
         Returns:
             True if lock was acquired, False otherwise.
         """
-        me = threading.Condition(self._mutex)
         ident = threading.get_ident()
         start = time.monotonic()
 
@@ -60,21 +59,27 @@ class FIFOLock:
                 self._count += 1
                 return True
 
+            if self._owner is None and not self._waiters:
+                self._owner = ident
+                self._count = 1
+                return True
+
+            if not blocking:
+                # Give up immediately
+                return False
+
             # Add to wait queue
+            me = threading.Condition(self._mutex)
             self._waiters.append(me)
 
             while True:
                 # If I'm at the front of the queue and nobody owns it → acquire
                 if self._waiters[0] is me and self._owner is None:
+                    self._waiters.popleft()
                     self._owner = ident
                     self._count = 1
                     return True
 
-                if not blocking:
-                    # Give up immediately
-                    self._waiters.remove(me)
-                    return False
-
                 if timeout >= 0:
                     remaining = timeout - (time.monotonic() - start)
                     if remaining <= 0:
@@ -95,11 +100,12 @@ class FIFOLock:
         with self._mutex:
             if self._owner != ident:
                 raise RuntimeError("Cannot release lock not owned by current thread")
-
+            assert self._count >= 1, (
+                "When releasing the resource, the count must be >= 1"
+            )
             self._count -= 1
             if self._count == 0:
                 self._owner = None
-                self._waiters.popleft()
                 if self._waiters:
                     self._waiters[0].notify()
 
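Taken together, these hunks move queue bookkeeping to the acquiring side: the winning waiter pops itself from the queue, an uncontended acquire takes a fast path, and a non-blocking attempt returns before ever enqueueing. A minimal usage sketch follows; the no-argument constructor and the exact `acquire()` signature are assumptions not shown in this diff.

```python
import threading

from openhands.sdk.conversation.fifo_lock import FIFOLock

lock = FIFOLock()  # assumed no-arg constructor


def worker(name: str) -> None:
    # Waiters are served in arrival order; a failed non-blocking attempt
    # no longer briefly joins the wait queue.
    if lock.acquire(blocking=True, timeout=5.0):
        try:
            print(f"{name} acquired the lock")
        finally:
            lock.release()


threads = [threading.Thread(target=worker, args=(f"t{i}",)) for i in range(3)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```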
openhands/sdk/conversation/impl/local_conversation.py CHANGED
@@ -4,7 +4,6 @@ from collections.abc import Mapping
 from pathlib import Path
 
 from openhands.sdk.agent.base import AgentBase
-from openhands.sdk.agent.utils import make_llm_completion, prepare_llm_messages
 from openhands.sdk.context.prompts.prompt import render_template
 from openhands.sdk.conversation.base import BaseConversation
 from openhands.sdk.conversation.exceptions import ConversationRunError
@@ -15,7 +14,11 @@ from openhands.sdk.conversation.state import (
 )
 from openhands.sdk.conversation.stuck_detector import StuckDetector
 from openhands.sdk.conversation.title_utils import generate_conversation_title
-from openhands.sdk.conversation.types import ConversationCallbackType, ConversationID
+from openhands.sdk.conversation.types import (
+    ConversationCallbackType,
+    ConversationID,
+    ConversationTokenCallbackType,
+)
 from openhands.sdk.conversation.visualizer import (
     ConversationVisualizerBase,
     DefaultConversationVisualizer,
@@ -46,6 +49,7 @@ class LocalConversation(BaseConversation):
    _state: ConversationState
    _visualizer: ConversationVisualizerBase | None
    _on_event: ConversationCallbackType
+   _on_token: ConversationTokenCallbackType | None
    max_iteration_per_run: int
    _stuck_detector: StuckDetector | None
    llm_registry: LLMRegistry
@@ -58,6 +62,7 @@ class LocalConversation(BaseConversation):
         persistence_dir: str | Path | None = None,
         conversation_id: ConversationID | None = None,
         callbacks: list[ConversationCallbackType] | None = None,
+        token_callbacks: list[ConversationTokenCallbackType] | None = None,
         max_iteration_per_run: int = 500,
         stuck_detection: bool = True,
         visualizer: (
@@ -78,6 +83,7 @@ class LocalConversation(BaseConversation):
                 be used to identify the conversation. The user might want to
                 suffix their persistent filestore with this ID.
             callbacks: Optional list of callback functions to handle events
+            token_callbacks: Optional list of callbacks invoked for streaming deltas
             max_iteration_per_run: Maximum number of iterations per run
             visualizer: Visualization configuration. Can be:
                 - ConversationVisualizerBase subclass: Class to instantiate
@@ -143,6 +149,12 @@ class LocalConversation(BaseConversation):
             self._visualizer = None
 
         self._on_event = BaseConversation.compose_callbacks(composed_list)
+        self._on_token = (
+            BaseConversation.compose_callbacks(token_callbacks)
+            if token_callbacks
+            else None
+        )
+
         self.max_iteration_per_run = max_iteration_per_run
 
         # Initialize stuck detector
@@ -305,8 +317,9 @@ class LocalConversation(BaseConversation):
                     ConversationExecutionStatus.RUNNING
                 )
 
-                # step must mutate the SAME state object
-                self.agent.step(self, on_event=self._on_event)
+                self.agent.step(
+                    self, on_event=self._on_event, on_token=self._on_token
+                )
                 iteration += 1
 
                 # Check for non-finished terminal conditions
@@ -436,7 +449,7 @@ class LocalConversation(BaseConversation):
                 executable_tool = tool.as_executable()
                 executable_tool.executor.close()
             except NotImplementedError:
-                # Tool has no executor, skip it
+                # Tool has no executor, skip it without erroring
                 continue
             except Exception as e:
                 logger.warning(f"Error closing executor for tool '{tool.name}': {e}")
@@ -456,6 +469,9 @@ class LocalConversation(BaseConversation):
         Returns:
             A string response from the agent
         """
+        # Import here to avoid circular imports
+        from openhands.sdk.agent.utils import make_llm_completion, prepare_llm_messages
+
         template_dir = (
             Path(__file__).parent.parent.parent / "context" / "prompts" / "templates"
         )
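The new `token_callbacks` parameter takes the same callable shape as the LLM-level streaming hook. A hedged sketch of a callback that prints streamed text deltas; the chunk attribute access assumes LiteLLM's `ModelResponseStream` layout, and the remaining `LocalConversation` constructor arguments are omitted.

```python
from openhands.sdk.conversation.types import ConversationTokenCallbackType
from openhands.sdk.llm import LLMStreamChunk


def print_deltas(chunk: LLMStreamChunk) -> None:
    # LLMStreamChunk is litellm's ModelResponseStream; print any text delta.
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)


on_token: ConversationTokenCallbackType = print_deltas
# Wired up as LocalConversation(..., token_callbacks=[print_deltas]) per this diff;
# agent, workspace, and the other constructor arguments are omitted here.
```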
openhands/sdk/conversation/secret_source.py CHANGED
@@ -45,7 +45,7 @@ class LookupSecret(SecretSource):
     headers: dict[str, str] = Field(default_factory=dict)
 
     def get_value(self):
-        response = httpx.get(self.url, headers=self.headers)
+        response = httpx.get(self.url, headers=self.headers, timeout=30.0)
         response.raise_for_status()
         return response.text
 
openhands/sdk/conversation/state.py CHANGED
@@ -2,6 +2,7 @@
 import json
 from collections.abc import Sequence
 from enum import Enum
+from pathlib import Path
 from typing import Any, Self
 
 from pydantic import AliasChoices, Field, PrivateAttr
@@ -124,6 +125,13 @@ class ConversationState(OpenHandsModel):
     def events(self) -> EventLog:
         return self._events
 
+    @property
+    def env_observation_persistence_dir(self) -> str | None:
+        """Directory for persisting environment observation files."""
+        if self.persistence_dir is None:
+            return None
+        return str(Path(self.persistence_dir) / "observations")
+
     def set_on_state_change(self, callback: ConversationCallbackType | None) -> None:
         """Set a callback to be called when state changes.
 
openhands/sdk/conversation/types.py CHANGED
@@ -2,9 +2,14 @@ import uuid
 from collections.abc import Callable
 
 from openhands.sdk.event.base import Event
+from openhands.sdk.llm.streaming import TokenCallbackType
 
 
 ConversationCallbackType = Callable[[Event], None]
+"""Type alias for event callback functions."""
+
+ConversationTokenCallbackType = TokenCallbackType
+"""Callback type invoked for streaming LLM deltas."""
 
 ConversationID = uuid.UUID
 """Type alias for conversation IDs."""
openhands/sdk/event/conversation_state.py CHANGED
@@ -49,6 +49,14 @@ class ConversationStateUpdateEvent(Event):
 
     @field_validator("value")
     def validate_value(cls, value, info):
+        # Prevent circular import
+        from openhands.sdk.conversation.conversation_stats import ConversationStats
+
+        # For ConversationStats, use snapshot serialization to avoid
+        # sending lengthy lists over WebSocket
+        if isinstance(value, ConversationStats):
+            return value.model_dump(mode="json", context={"use_snapshot": True})
+
         key = info.data.get("key")
         if key is None:
             # Allow value without key for flexibility
openhands/sdk/llm/__init__.py CHANGED
@@ -12,6 +12,7 @@ from openhands.sdk.llm.message import (
     content_to_str,
 )
 from openhands.sdk.llm.router import RouterLLM
+from openhands.sdk.llm.streaming import LLMStreamChunk, TokenCallbackType
 from openhands.sdk.llm.utils.metrics import Metrics, MetricsSnapshot
 from openhands.sdk.llm.utils.unverified_models import (
     UNVERIFIED_MODELS_EXCLUDING_BEDROCK,
@@ -34,6 +35,8 @@ __all__ = [
    "RedactedThinkingBlock",
    "ReasoningItemModel",
    "content_to_str",
+   "LLMStreamChunk",
+   "TokenCallbackType",
    "Metrics",
    "MetricsSnapshot",
    "VERIFIED_MODELS",
openhands/sdk/llm/llm.py CHANGED
@@ -10,7 +10,6 @@ from typing import TYPE_CHECKING, Any, ClassVar, Literal, get_args, get_origin
 
 import httpx  # noqa: F401
 from pydantic import (
-    AliasChoices,
     BaseModel,
     ConfigDict,
     Field,
@@ -40,6 +39,7 @@ from typing import cast
 
 from litellm import (
     ChatCompletionToolParam,
+    CustomStreamWrapper,
    ResponseInputParam,
    completion as litellm_completion,
 )
@@ -72,6 +72,9 @@ from openhands.sdk.llm.message import (
 from openhands.sdk.llm.mixins.non_native_fc import NonNativeToolCallingMixin
 from openhands.sdk.llm.options.chat_options import select_chat_options
 from openhands.sdk.llm.options.responses_options import select_responses_options
+from openhands.sdk.llm.streaming import (
+    TokenCallbackType,
+)
 from openhands.sdk.llm.utils.metrics import Metrics, MetricsSnapshot
 from openhands.sdk.llm.utils.model_features import get_default_temperature, get_features
 from openhands.sdk.llm.utils.retry_mixin import RetryMixin
@@ -168,6 +171,19 @@ class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin):
         ge=1,
         description="The maximum number of output tokens. This is sent to the LLM.",
     )
+    model_canonical_name: str | None = Field(
+        default=None,
+        description=(
+            "Optional canonical model name for feature registry lookups. "
+            "The OpenHands SDK maintains a model feature registry that "
+            "maps model names to capabilities (e.g., vision support, "
+            "prompt caching, responses API support). When using proxied or "
+            "aliased model identifiers, set this field to the canonical "
+            "model name (e.g., 'openai/gpt-4o') to ensure correct "
+            "capability detection. If not provided, the 'model' field "
+            "will be used for capability lookups."
+        ),
+    )
     extra_headers: dict[str, str] | None = Field(
         default=None,
         description="Optional HTTP headers to forward to LiteLLM requests.",
@@ -184,6 +200,14 @@ class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin):
     )
     ollama_base_url: str | None = Field(default=None)
 
+    stream: bool = Field(
+        default=False,
+        description=(
+            "Enable streaming responses from the LLM. "
+            "When enabled, the provided `on_token` callback in .completions "
+            "and .responses will be invoked for each chunk of tokens."
+        ),
+    )
     drop_params: bool = Field(default=True)
     modify_params: bool = Field(
         default=True,
@@ -240,6 +264,14 @@ class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin):
         description="If True, ask for ['reasoning.encrypted_content'] "
         "in Responses API include.",
     )
+    # Prompt cache retention only applies to GPT-5+ models; filtered in chat options
+    prompt_cache_retention: str | None = Field(
+        default="24h",
+        description=(
+            "Retention policy for prompt cache. Only sent for GPT-5+ models; "
+            "explicitly stripped for all other models."
+        ),
+    )
     extended_thinking_budget: int | None = Field(
         default=200_000,
         description="The budget tokens for extended thinking, "
@@ -256,7 +288,6 @@ class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin):
     )
     usage_id: str = Field(
         default="default",
-        validation_alias=AliasChoices("usage_id", "service_id"),
         serialization_alias="usage_id",
         description=(
             "Unique usage identifier for the LLM. Used for registry lookups, "
@@ -338,7 +369,8 @@ class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin):
         if model_val.startswith("openhands/"):
             model_name = model_val.removeprefix("openhands/")
             d["model"] = f"litellm_proxy/{model_name}"
-            d["base_url"] = "https://llm-proxy.app.all-hands.dev/"
+            # Set base_url (default to the app proxy when base_url is unset)
+            d["base_url"] = d.get("base_url", "https://llm-proxy.app.all-hands.dev/")
 
         # HF doesn't support the OpenAI default value for top_p (1)
         if model_val.startswith("huggingface"):
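One behavioral consequence of the `base_url` hunk above: an explicitly configured `base_url` now survives the `openhands/` prefix rewrite instead of being overwritten with the public proxy URL. A hedged sketch; the model name and proxy URL are illustrative, not taken from this diff.

```python
from openhands.sdk.llm import LLM

# The "openhands/claude-sonnet-4" alias is still rewritten to
# "litellm_proxy/claude-sonnet-4", but the user-supplied base_url is kept
# rather than being replaced by https://llm-proxy.app.all-hands.dev/ as in 1.3.0.
llm = LLM(
    model="openhands/claude-sonnet-4",
    base_url="https://my-litellm-proxy.example.com/",  # hypothetical internal proxy
)
print(llm.model, llm.base_url)
```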
@@ -447,6 +479,7 @@
         tools: Sequence[ToolDefinition] | None = None,
         _return_metrics: bool = False,
         add_security_risk_prediction: bool = False,
+        on_token: TokenCallbackType | None = None,
         **kwargs,
     ) -> LLMResponse:
         """Generate a completion from the language model.
@@ -466,9 +499,11 @@
             >>> response = llm.completion(messages)
             >>> print(response.content)
         """
-        # Check if streaming is requested
-        if kwargs.get("stream", False):
-            raise ValueError("Streaming is not supported")
+        enable_streaming = bool(kwargs.get("stream", False)) or self.stream
+        if enable_streaming:
+            if on_token is None:
+                raise ValueError("Streaming requires an on_token callback")
+            kwargs["stream"] = True
 
         # 1) serialize messages
         formatted_messages = self.format_messages_for_llm(messages)
@@ -531,7 +566,12 @@
             self._telemetry.on_request(log_ctx=log_ctx)
             # Merge retry-modified kwargs (like temperature) with call_kwargs
             final_kwargs = {**call_kwargs, **retry_kwargs}
-            resp = self._transport_call(messages=formatted_messages, **final_kwargs)
+            resp = self._transport_call(
+                messages=formatted_messages,
+                **final_kwargs,
+                enable_streaming=enable_streaming,
+                on_token=on_token,
+            )
             raw_resp: ModelResponse | None = None
             if use_mock_tools:
                 raw_resp = copy.deepcopy(resp)
@@ -588,15 +628,15 @@
         store: bool | None = None,
         _return_metrics: bool = False,
         add_security_risk_prediction: bool = False,
+        on_token: TokenCallbackType | None = None,
         **kwargs,
     ) -> LLMResponse:
         """Alternative invocation path using OpenAI Responses API via LiteLLM.
 
         Maps Message[] -> (instructions, input[]) and returns LLMResponse.
-        Non-stream only for v1.
         """
         # Streaming not yet supported
-        if kwargs.get("stream", False):
+        if kwargs.get("stream", False) or self.stream or on_token is not None:
             raise ValueError("Streaming is not supported for Responses API yet")
 
         # Build instructions + input list using dedicated Responses formatter
@@ -707,7 +747,12 @@
     # Transport + helpers
     # =========================================================================
     def _transport_call(
-        self, *, messages: list[dict[str, Any]], **kwargs
+        self,
+        *,
+        messages: list[dict[str, Any]],
+        enable_streaming: bool = False,
+        on_token: TokenCallbackType | None = None,
+        **kwargs,
     ) -> ModelResponse:
         # litellm.modify_params is GLOBAL; guard it for thread-safety
         with self._litellm_modify_params_ctx(self.modify_params):
@@ -729,6 +774,11 @@
                     "ignore",
                     category=UserWarning,
                 )
+                warnings.filterwarnings(
+                    "ignore",
+                    category=DeprecationWarning,
+                    message="Accessing the 'model_fields' attribute.*",
+                )
                 # Extract api_key value with type assertion for type checker
                 api_key_value: str | None = None
                 if self.api_key:
@@ -747,6 +797,14 @@
                     messages=messages,
                     **kwargs,
                 )
+                if enable_streaming and on_token is not None:
+                    assert isinstance(ret, CustomStreamWrapper)
+                    chunks = []
+                    for chunk in ret:
+                        on_token(chunk)
+                        chunks.append(chunk)
+                    ret = litellm.stream_chunk_builder(chunks, messages=messages)
+
                 assert isinstance(ret, ModelResponse), (
                     f"Expected ModelResponse, got {type(ret)}"
                 )
@@ -764,11 +822,15 @@
     # =========================================================================
     # Capabilities, formatting, and info
     # =========================================================================
+    def _model_name_for_capabilities(self) -> str:
+        """Return canonical name for capability lookups (e.g., vision support)."""
+        return self.model_canonical_name or self.model
+
     def _init_model_info_and_caps(self) -> None:
         self._model_info = get_litellm_model_info(
             secret_api_key=self.api_key,
             base_url=self.base_url,
-            model=self.model,
+            model=self._model_name_for_capabilities(),
         )
 
         # Context window and max_output_tokens
@@ -828,9 +890,10 @@
         # we can go with it, but we will need to keep an eye if model_info is correct for Vertex or other providers  # noqa: E501
         # remove when litellm is updated to fix https://github.com/BerriAI/litellm/issues/5608  # noqa: E501
         # Check both the full model name and the name after proxy prefix for vision support  # noqa: E501
+        model_for_caps = self._model_name_for_capabilities()
         return (
-            supports_vision(self.model)
-            or supports_vision(self.model.split("/")[-1])
+            supports_vision(model_for_caps)
+            or supports_vision(model_for_caps.split("/")[-1])
             or (
                 self._model_info is not None
                 and self._model_info.get("supports_vision", False)
@@ -849,13 +912,16 @@
             return False
         # We don't need to look-up model_info, because
         # only Anthropic models need explicit caching breakpoints
-        return self.caching_prompt and get_features(self.model).supports_prompt_cache
+        return (
+            self.caching_prompt
+            and get_features(self._model_name_for_capabilities()).supports_prompt_cache
+        )
 
     def uses_responses_api(self) -> bool:
         """Whether this model uses the OpenAI Responses API path."""
 
         # by default, uses = supports
-        return get_features(self.model).supports_responses_api
+        return get_features(self._model_name_for_capabilities()).supports_responses_api
 
     @property
     def model_info(self) -> dict | None:
@@ -892,7 +958,7 @@
         message.cache_enabled = self.is_caching_prompt_active()
         message.vision_enabled = self.vision_is_active()
         message.function_calling_enabled = self.native_tool_calling
-        model_features = get_features(self.model)
+        model_features = get_features(self._model_name_for_capabilities())
         message.force_string_serializer = (
             self.force_string_serializer
             if self.force_string_serializer is not None
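Putting the streaming pieces together: enabling `stream` (either on the `LLM` config or per call) now requires an `on_token` callback, and `_transport_call` replays the chunks to the callback before reassembling them into a regular `ModelResponse` via `litellm.stream_chunk_builder`. A hedged usage sketch; the model name is illustrative, and `Message`/`TextContent` are assumed to still be exported from `openhands.sdk.llm` as in 1.3.x.

```python
from openhands.sdk.llm import LLM, LLMStreamChunk, Message, TextContent


def on_token(chunk: LLMStreamChunk) -> None:
    # Called once per streamed chunk while the full response is still being built.
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)


llm = LLM(model="litellm_proxy/anthropic/claude-sonnet-4", stream=True)  # illustrative
response = llm.completion(
    [Message(role="user", content=[TextContent(text="Say hello in one sentence.")])],
    on_token=on_token,
)
print()
print(response)
```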
openhands/sdk/llm/llm_registry.py CHANGED
@@ -82,7 +82,7 @@ class LLMRegistry:
         if usage_id in self._usage_to_llm:
             message = (
                 f"Usage ID '{usage_id}' already exists in registry. "
-                "Use a different usage_id on the LLM (previously service_id) or "
+                "Use a different usage_id on the LLM or "
                 "call get() to retrieve the existing LLM."
             )
             raise ValueError(message)
openhands/sdk/llm/options/chat_options.py CHANGED
@@ -4,7 +4,6 @@ from typing import Any
 
 from openhands.sdk.llm.options.common import apply_defaults_if_absent
 from openhands.sdk.llm.utils.model_features import get_features
-from openhands.sdk.utils.deprecation import warn_cleanup
 
 
 def select_chat_options(
@@ -35,28 +34,10 @@ def select_chat_options(
 
     # Reasoning-model quirks
     if get_features(llm.model).supports_reasoning_effort:
-        # Claude models use different parameter format
-        if "claude-opus-4-5" in llm.model.lower():
-            warn_cleanup(
-                "Claude Opus 4.5 effort parameter workaround",
-                cleanup_by="1.4.0",
-                details=(
-                    "LiteLLM does not yet redirect reasoning_effort to "
-                    "output_config.effort for Claude Opus 4.5. Remove this workaround "
-                    "once LiteLLM adds native support."
-                ),
-            )
-            # Claude uses output_config.effort instead of reasoning_effort
-            if llm.reasoning_effort is not None:
-                out["output_config"] = {"effort": llm.reasoning_effort}
-            # Claude requires beta header for effort parameter
-            if "extra_headers" not in out:
-                out["extra_headers"] = {}
-            out["extra_headers"]["anthropic-beta"] = "effort-2025-11-24"
-        else:
-            # OpenAI/other models use reasoning_effort parameter
-            if llm.reasoning_effort is not None:
-                out["reasoning_effort"] = llm.reasoning_effort
+        # LiteLLM automatically handles reasoning_effort for all models, including
+        # Claude Opus 4.5 (maps to output_config and adds beta header automatically)
+        if llm.reasoning_effort is not None:
+            out["reasoning_effort"] = llm.reasoning_effort
 
         # All reasoning models ignore temp/top_p
         out.pop("temperature", None)
@@ -98,7 +79,14 @@
     out.pop("tools", None)
     out.pop("tool_choice", None)
 
-    # Always forward extra_body if provided; let the LLM provider validate
+    # Send prompt_cache_retention only if model supports it
+    if (
+        get_features(llm.model).supports_prompt_cache_retention
+        and llm.prompt_cache_retention
+    ):
+        out["prompt_cache_retention"] = llm.prompt_cache_retention
+
+    # Pass through user-provided extra_body unchanged
     if llm.litellm_extra_body:
         out["extra_body"] = llm.litellm_extra_body
 
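Because `prompt_cache_retention` now defaults to `"24h"` on the `LLM` config and is only forwarded when the feature registry reports `supports_prompt_cache_retention`, opting out is just a matter of clearing the field. A hedged sketch with illustrative model names:

```python
from openhands.sdk.llm import LLM

# Keep the default 24h retention for a supporting model...
cached = LLM(model="openai/gpt-5")
# ...or clear it so select_chat_options() never sends the parameter at all.
uncached = LLM(model="openai/gpt-5", prompt_cache_retention=None)
```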
openhands/sdk/llm/options/responses_options.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
 from typing import Any
 
 from openhands.sdk.llm.options.common import apply_defaults_if_absent
+from openhands.sdk.llm.utils.model_features import get_features
 
 
 def select_responses_options(
@@ -50,7 +51,14 @@
     if llm.reasoning_summary:
         out["reasoning"]["summary"] = llm.reasoning_summary
 
-    # Always forward extra_body if provided; let the LLM provider validate
+    # Send prompt_cache_retention only if model supports it
+    if (
+        get_features(llm.model).supports_prompt_cache_retention
+        and llm.prompt_cache_retention
+    ):
+        out["prompt_cache_retention"] = llm.prompt_cache_retention
+
+    # Pass through user-provided extra_body unchanged
     if llm.litellm_extra_body:
         out["extra_body"] = llm.litellm_extra_body
 
openhands/sdk/llm/router/base.py CHANGED
@@ -10,6 +10,7 @@ from pydantic import (
 from openhands.sdk.llm.llm import LLM
 from openhands.sdk.llm.llm_response import LLMResponse
 from openhands.sdk.llm.message import Message
+from openhands.sdk.llm.streaming import TokenCallbackType
 from openhands.sdk.logger import get_logger
 from openhands.sdk.tool.tool import ToolDefinition
 
@@ -52,6 +53,7 @@ class RouterLLM(LLM):
         tools: Sequence[ToolDefinition] | None = None,
         return_metrics: bool = False,
         add_security_risk_prediction: bool = False,
+        on_token: TokenCallbackType | None = None,
         **kwargs,
     ) -> LLMResponse:
         """
@@ -70,6 +72,7 @@
             tools=tools,
             _return_metrics=return_metrics,
             add_security_risk_prediction=add_security_risk_prediction,
+            on_token=on_token,
             **kwargs,
         )
 
openhands/sdk/llm/streaming.py ADDED
@@ -0,0 +1,9 @@
+from collections.abc import Callable
+
+from litellm.types.utils import ModelResponseStream
+
+
+# Type alias for stream chunks
+LLMStreamChunk = ModelResponseStream
+
+TokenCallbackType = Callable[[LLMStreamChunk], None]