openhands-sdk 1.8.2__py3-none-any.whl → 1.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. openhands/sdk/agent/agent.py +64 -0
  2. openhands/sdk/agent/base.py +22 -10
  3. openhands/sdk/context/skills/skill.py +59 -1
  4. openhands/sdk/context/skills/utils.py +6 -65
  5. openhands/sdk/conversation/base.py +5 -0
  6. openhands/sdk/conversation/impl/remote_conversation.py +16 -3
  7. openhands/sdk/conversation/visualizer/base.py +23 -0
  8. openhands/sdk/critic/__init__.py +4 -1
  9. openhands/sdk/critic/base.py +17 -20
  10. openhands/sdk/critic/impl/__init__.py +2 -0
  11. openhands/sdk/critic/impl/agent_finished.py +9 -5
  12. openhands/sdk/critic/impl/api/__init__.py +18 -0
  13. openhands/sdk/critic/impl/api/chat_template.py +232 -0
  14. openhands/sdk/critic/impl/api/client.py +313 -0
  15. openhands/sdk/critic/impl/api/critic.py +90 -0
  16. openhands/sdk/critic/impl/api/taxonomy.py +180 -0
  17. openhands/sdk/critic/result.py +148 -0
  18. openhands/sdk/event/llm_convertible/action.py +10 -0
  19. openhands/sdk/event/llm_convertible/message.py +10 -0
  20. openhands/sdk/git/cached_repo.py +459 -0
  21. openhands/sdk/git/utils.py +118 -3
  22. openhands/sdk/hooks/__init__.py +7 -1
  23. openhands/sdk/hooks/config.py +154 -45
  24. openhands/sdk/llm/utils/model_features.py +3 -0
  25. openhands/sdk/plugin/__init__.py +17 -0
  26. openhands/sdk/plugin/fetch.py +231 -0
  27. openhands/sdk/plugin/plugin.py +61 -4
  28. openhands/sdk/plugin/types.py +394 -1
  29. {openhands_sdk-1.8.2.dist-info → openhands_sdk-1.9.1.dist-info}/METADATA +5 -1
  30. {openhands_sdk-1.8.2.dist-info → openhands_sdk-1.9.1.dist-info}/RECORD +32 -24
  31. {openhands_sdk-1.8.2.dist-info → openhands_sdk-1.9.1.dist-info}/WHEEL +1 -1
  32. {openhands_sdk-1.8.2.dist-info → openhands_sdk-1.9.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,180 @@
1
+ """Critic taxonomy - mapping of features to categories for visualization."""
2
+
3
+ import math
4
+ from typing import Any
5
+
6
+
7
+ # Feature to category mapping
8
+ FEATURE_CATEGORIES: dict[str, str] = {
9
+ # General Context & Task Classification
10
+ "user_goal_summary": "general_context",
11
+ "overall_sentiment": "general_context",
12
+ # Agent Behavioral Issues
13
+ "misunderstood_intention": "agent_behavioral_issues",
14
+ "did_not_follow_instruction": "agent_behavioral_issues",
15
+ "insufficient_analysis": "agent_behavioral_issues",
16
+ "insufficient_clarification": "agent_behavioral_issues",
17
+ "improper_tool_use_or_setup": "agent_behavioral_issues",
18
+ "loop_behavior": "agent_behavioral_issues",
19
+ "insufficient_testing": "agent_behavioral_issues",
20
+ "insufficient_debugging": "agent_behavioral_issues",
21
+ "incomplete_implementation": "agent_behavioral_issues",
22
+ "file_management_errors": "agent_behavioral_issues",
23
+ "scope_creep": "agent_behavioral_issues",
24
+ "risky_actions_or_permission": "agent_behavioral_issues",
25
+ "other_agent_issue": "agent_behavioral_issues",
26
+ # User Follow-Up Patterns
27
+ "follow_up_timing": "user_followup_patterns",
28
+ "clarification_or_restatement": "user_followup_patterns",
29
+ "correction": "user_followup_patterns",
30
+ "direction_change": "user_followup_patterns",
31
+ "vcs_update_requests": "user_followup_patterns",
32
+ "progress_or_scope_concern": "user_followup_patterns",
33
+ "frustration_or_complaint": "user_followup_patterns",
34
+ "removal_or_reversion_request": "user_followup_patterns",
35
+ "other_user_issue": "user_followup_patterns",
36
+ # Infrastructure Issues
37
+ "infrastructure_external_issue": "infrastructure_issues",
38
+ "infrastructure_agent_caused_issue": "infrastructure_issues",
39
+ }
40
+
41
+ # Category display names for visualization
42
+ CATEGORY_DISPLAY_NAMES: dict[str, str] = {
43
+ "general_context": "General Context",
44
+ "agent_behavioral_issues": "Detected Agent Behavioral Issues",
45
+ "user_followup_patterns": "Predicted User Follow-Up Patterns",
46
+ "infrastructure_issues": "Detected Infrastructure Issues",
47
+ }
48
+
49
+
50
def get_category(feature_name: str) -> str | None:
    """Get the category for a feature.

    Args:
        feature_name: Name of the feature

    Returns:
        Category name or None if not found
    """
    try:
        return FEATURE_CATEGORIES[feature_name]
    except KeyError:
        return None
60
+
61
+
62
+ def _softmax_normalize(probs: dict[str, float]) -> dict[str, float]:
63
+ """Apply softmax normalization to convert logits to probabilities.
64
+
65
+ Args:
66
+ probs: Dictionary of names to raw probability/logit values
67
+
68
+ Returns:
69
+ Dictionary with softmax-normalized probabilities that sum to 1.0
70
+ """
71
+ if not probs:
72
+ return {}
73
+
74
+ values = list(probs.values())
75
+ exp_values = [math.exp(v) for v in values]
76
+ exp_sum = sum(exp_values)
77
+ normalized = [exp_v / exp_sum for exp_v in exp_values]
78
+
79
+ return dict(zip(probs.keys(), normalized))
80
+
81
+
82
def categorize_features(
    probs_dict: dict[str, float],
    display_threshold: float = 0.2,
) -> dict[str, Any]:
    """Categorize features from probability dictionary into taxonomy groups.

    This function takes raw probability outputs from the critic model and
    organizes them into categories ready for visualization.

    Args:
        probs_dict: Dictionary of feature names to probability values
        display_threshold: Minimum probability to include a feature (default: 0.2)

    Returns:
        Dictionary with categorized features ready for visualization:
        {
            "sentiment": {
                "predicted": "Neutral",
                "probability": 0.77,
                "all": {"positive": 0.10, "neutral": 0.77, "negative": 0.13}
            },
            "agent_behavioral_issues": [
                {"name": "loop_behavior", "display_name": "Loop Behavior",
                 "probability": 0.85},
                ...
            ],
            "user_followup_patterns": [...],
            "infrastructure_issues": [...],
            "other": [...]
        }
    """
    result: dict[str, Any] = {
        "sentiment": None,
        "agent_behavioral_issues": [],
        "user_followup_patterns": [],
        "infrastructure_issues": [],
        "other": [],
    }

    # Extract "sentiment_*" features; their raw values are logits, so they
    # are softmax-normalized into a proper probability distribution.
    raw_sentiment_probs = {
        name.removeprefix("sentiment_"): prob
        for name, prob in probs_dict.items()
        if name.startswith("sentiment_")
    }
    if raw_sentiment_probs:
        sentiment_probs = _softmax_normalize(raw_sentiment_probs)
        predicted, probability = max(sentiment_probs.items(), key=lambda kv: kv[1])
        result["sentiment"] = {
            "predicted": predicted.capitalize(),
            "probability": probability,
            "all": sentiment_probs,
        }

    # Categorize the remaining features.
    for feature_name, prob in probs_dict.items():
        if feature_name.startswith("sentiment_"):
            continue  # sentiment features already processed above
        if feature_name == "success":
            continue  # redundant with the overall critic score
        if prob < display_threshold:
            continue  # below the display threshold

        # Use the shared lookup helper so the mapping logic lives in one place.
        category = get_category(feature_name)
        if category == "general_context":
            # Skip general context features for now
            continue

        feature_entry = {
            "name": feature_name,
            "display_name": feature_name.replace("_", " ").title(),
            "probability": prob,
        }
        # Unknown / unmapped categories fall into the "other" bucket.
        bucket = category if category in result else "other"
        result[bucket].append(feature_entry)

    # Sort each category by probability (descending).
    for key in (
        "agent_behavioral_issues",
        "user_followup_patterns",
        "infrastructure_issues",
        "other",
    ):
        result[key].sort(key=lambda entry: entry["probability"], reverse=True)

    return result
@@ -0,0 +1,148 @@
1
+ from typing import Any, ClassVar
2
+
3
+ from pydantic import BaseModel, Field
4
+ from rich.text import Text
5
+
6
+
7
class CriticResult(BaseModel):
    """A critic result is a score and a message.

    ``score`` is the predicted probability of agent success; ``metadata``
    may carry a ``categorized_features`` mapping (plus event ids) that
    ``visualize`` renders as grouped, color-coded likelihoods.
    """

    # Score at/above which the run is considered a success.
    THRESHOLD: ClassVar[float] = 0.5
    DISPLAY_THRESHOLD: ClassVar[float] = 0.2  # Only show scores above this threshold

    score: float = Field(
        description="A predicted probability of success between 0 and 1.",
        ge=0.0,
        le=1.0,
    )
    message: str | None = Field(description="An optional message explaining the score.")
    metadata: dict[str, Any] | None = Field(
        default=None,
        description=(
            "Optional metadata about the critic evaluation. "
            "Can include event_ids and categorized_features for visualization."
        ),
    )

    @property
    def success(self) -> bool:
        """Whether the agent is successful."""
        return self.score >= CriticResult.THRESHOLD

    @staticmethod
    def _get_star_rating(score: float) -> str:
        """Convert score (0-1) to a 5-star rating string.

        Each star represents 20% of the score.
        """
        filled_stars = round(score * 5)
        return "★" * filled_stars + "☆" * (5 - filled_stars)

    @staticmethod
    def _get_star_style(score: float) -> str:
        """Get the style for the star rating based on score."""
        if score >= 0.6:
            return "green"
        if score >= 0.4:
            return "yellow"
        return "red"

    @property
    def visualize(self) -> Text:
        """Return Rich Text representation of the critic result."""
        content = Text()
        content.append("\n\nCritic: agent success likelihood ", style="bold")

        # Display star rating with percentage, colored by confidence band.
        content.append(
            self._get_star_rating(self.score), style=self._get_star_style(self.score)
        )
        content.append(f" ({self.score * 100:.1f}%)", style="dim")

        # Prefer structured categorized features; fall back to the plain message.
        if self.metadata and "categorized_features" in self.metadata:
            self._append_categorized_features(
                content, self.metadata["categorized_features"]
            )
        elif self.message:
            content.append(f"\n {self.message}\n")
        else:
            content.append("\n")

        return content

    def _append_categorized_features(
        self, content: Text, categorized: dict[str, Any]
    ) -> None:
        """Append categorized features to content, each category on its own line."""
        # (category key, rendered label, neutral-color flag) in display order.
        sections = (
            ("agent_behavioral_issues", "Potential Issues: ", False),
            ("user_followup_patterns", "Likely Follow-up: ", False),
            ("infrastructure_issues", "Infrastructure: ", False),
            ("other", "Other: ", True),
        )
        for key, label, is_other in sections:
            features = categorized.get(key, [])
            if features:
                content.append("\n ")
                content.append(label, style="bold")
                self._append_feature_list_inline(content, features, is_other=is_other)
        # The original appended "\n" on both the has-content and empty paths,
        # so a single unconditional append preserves behavior.
        content.append("\n")

    def _append_feature_list_inline(
        self,
        content: Text,
        features: list[dict[str, Any]],
        is_other: bool = False,
    ) -> None:
        """Append features inline with likelihood percentages.

        Args:
            content: Text object to append to (mutated in place).
            features: Entries with ``display_name``/``name`` and ``probability``.
            is_other: When True, render likelihoods without severity colors.
        """
        for i, feature in enumerate(features):
            display_name = feature.get("display_name", feature.get("name", "Unknown"))
            prob = feature.get("probability", 0.0)

            # Severity coloring: red for high-likelihood issues, yellow for
            # moderate, dim otherwise; "other" metrics stay neutral white.
            if is_other:
                prob_style = "white"
            elif prob >= 0.7:
                prob_style = "red bold"
            elif prob >= 0.5:
                prob_style = "yellow"
            else:
                prob_style = "dim"

            # Add dot separator between features.
            if i > 0:
                content.append(" · ", style="dim")

            content.append(f"{display_name}", style="white")
            content.append(f" (likelihood {prob * 100:.0f}%)", style=prob_style)
@@ -3,6 +3,7 @@ from collections.abc import Sequence
3
3
  from pydantic import Field
4
4
  from rich.text import Text
5
5
 
6
+ from openhands.sdk.critic.result import CriticResult
6
7
  from openhands.sdk.event.base import N_CHAR_PREVIEW, EventID, LLMConvertibleEvent
7
8
  from openhands.sdk.event.types import SourceType, ToolCallID
8
9
  from openhands.sdk.llm import (
@@ -65,6 +66,11 @@ class ActionEvent(LLMConvertibleEvent):
65
66
  description="The LLM's assessment of the safety risk of this action.",
66
67
  )
67
68
 
69
+ critic_result: CriticResult | None = Field(
70
+ default=None,
71
+ description="Optional critic evaluation of this action and preceding history.",
72
+ )
73
+
68
74
  summary: str | None = Field(
69
75
  default=None,
70
76
  description=(
@@ -125,6 +131,10 @@ class ActionEvent(LLMConvertibleEvent):
125
131
  content.append("Function call:\n", style="bold")
126
132
  content.append(f"- {self.tool_call.name} ({self.tool_call.id})\n")
127
133
 
134
+ # Display critic result if available
135
+ if self.critic_result is not None:
136
+ content.append(self.critic_result.visualize)
137
+
128
138
  return content
129
139
 
130
140
  def to_llm_message(self) -> Message:
@@ -5,6 +5,7 @@ from typing import ClassVar
5
5
  from pydantic import ConfigDict, Field
6
6
  from rich.text import Text
7
7
 
8
+ from openhands.sdk.critic.result import CriticResult
8
9
  from openhands.sdk.event.base import N_CHAR_PREVIEW, EventID, LLMConvertibleEvent
9
10
  from openhands.sdk.event.types import SourceType
10
11
  from openhands.sdk.llm import (
@@ -51,6 +52,11 @@ class MessageEvent(LLMConvertibleEvent):
51
52
  ),
52
53
  )
53
54
 
55
+ critic_result: CriticResult | None = Field(
56
+ default=None,
57
+ description="Optional critic evaluation of this message and preceding history.",
58
+ )
59
+
54
60
  @property
55
61
  def reasoning_content(self) -> str:
56
62
  return self.llm_message.reasoning_content or ""
@@ -101,6 +107,10 @@ class MessageEvent(LLMConvertibleEvent):
101
107
  )
102
108
  content.append(" ".join(text_parts))
103
109
 
110
+ # Display critic result if available
111
+ if self.critic_result is not None:
112
+ content.append(self.critic_result.visualize)
113
+
104
114
  return content
105
115
 
106
116
  def to_llm_message(self) -> Message: