openhands-sdk 1.8.2__py3-none-any.whl → 1.9.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- openhands/sdk/agent/agent.py +64 -0
- openhands/sdk/agent/base.py +22 -10
- openhands/sdk/context/skills/skill.py +59 -1
- openhands/sdk/context/skills/utils.py +6 -65
- openhands/sdk/conversation/base.py +5 -0
- openhands/sdk/conversation/impl/remote_conversation.py +16 -3
- openhands/sdk/conversation/visualizer/base.py +23 -0
- openhands/sdk/critic/__init__.py +4 -1
- openhands/sdk/critic/base.py +17 -20
- openhands/sdk/critic/impl/__init__.py +2 -0
- openhands/sdk/critic/impl/agent_finished.py +9 -5
- openhands/sdk/critic/impl/api/__init__.py +18 -0
- openhands/sdk/critic/impl/api/chat_template.py +232 -0
- openhands/sdk/critic/impl/api/client.py +313 -0
- openhands/sdk/critic/impl/api/critic.py +90 -0
- openhands/sdk/critic/impl/api/taxonomy.py +180 -0
- openhands/sdk/critic/result.py +148 -0
- openhands/sdk/event/llm_convertible/action.py +10 -0
- openhands/sdk/event/llm_convertible/message.py +10 -0
- openhands/sdk/git/cached_repo.py +459 -0
- openhands/sdk/git/utils.py +118 -3
- openhands/sdk/hooks/__init__.py +7 -1
- openhands/sdk/hooks/config.py +154 -45
- openhands/sdk/llm/utils/model_features.py +3 -0
- openhands/sdk/plugin/__init__.py +17 -0
- openhands/sdk/plugin/fetch.py +231 -0
- openhands/sdk/plugin/plugin.py +61 -4
- openhands/sdk/plugin/types.py +394 -1
- {openhands_sdk-1.8.2.dist-info → openhands_sdk-1.9.1.dist-info}/METADATA +5 -1
- {openhands_sdk-1.8.2.dist-info → openhands_sdk-1.9.1.dist-info}/RECORD +32 -24
- {openhands_sdk-1.8.2.dist-info → openhands_sdk-1.9.1.dist-info}/WHEEL +1 -1
- {openhands_sdk-1.8.2.dist-info → openhands_sdk-1.9.1.dist-info}/top_level.txt +0 -0
openhands/sdk/critic/impl/api/taxonomy.py (new file)

@@ -0,0 +1,180 @@
+"""Critic taxonomy - mapping of features to categories for visualization."""
+
+import math
+from typing import Any
+
+
+# Feature to category mapping
+FEATURE_CATEGORIES: dict[str, str] = {
+    # General Context & Task Classification
+    "user_goal_summary": "general_context",
+    "overall_sentiment": "general_context",
+    # Agent Behavioral Issues
+    "misunderstood_intention": "agent_behavioral_issues",
+    "did_not_follow_instruction": "agent_behavioral_issues",
+    "insufficient_analysis": "agent_behavioral_issues",
+    "insufficient_clarification": "agent_behavioral_issues",
+    "improper_tool_use_or_setup": "agent_behavioral_issues",
+    "loop_behavior": "agent_behavioral_issues",
+    "insufficient_testing": "agent_behavioral_issues",
+    "insufficient_debugging": "agent_behavioral_issues",
+    "incomplete_implementation": "agent_behavioral_issues",
+    "file_management_errors": "agent_behavioral_issues",
+    "scope_creep": "agent_behavioral_issues",
+    "risky_actions_or_permission": "agent_behavioral_issues",
+    "other_agent_issue": "agent_behavioral_issues",
+    # User Follow-Up Patterns
+    "follow_up_timing": "user_followup_patterns",
+    "clarification_or_restatement": "user_followup_patterns",
+    "correction": "user_followup_patterns",
+    "direction_change": "user_followup_patterns",
+    "vcs_update_requests": "user_followup_patterns",
+    "progress_or_scope_concern": "user_followup_patterns",
+    "frustration_or_complaint": "user_followup_patterns",
+    "removal_or_reversion_request": "user_followup_patterns",
+    "other_user_issue": "user_followup_patterns",
+    # Infrastructure Issues
+    "infrastructure_external_issue": "infrastructure_issues",
+    "infrastructure_agent_caused_issue": "infrastructure_issues",
+}
+
+# Category display names for visualization
+CATEGORY_DISPLAY_NAMES: dict[str, str] = {
+    "general_context": "General Context",
+    "agent_behavioral_issues": "Detected Agent Behavioral Issues",
+    "user_followup_patterns": "Predicted User Follow-Up Patterns",
+    "infrastructure_issues": "Detected Infrastructure Issues",
+}
+
+
+def get_category(feature_name: str) -> str | None:
+    """Get the category for a feature.
+
+    Args:
+        feature_name: Name of the feature
+
+    Returns:
+        Category name or None if not found
+    """
+    return FEATURE_CATEGORIES.get(feature_name)
+
+
+def _softmax_normalize(probs: dict[str, float]) -> dict[str, float]:
+    """Apply softmax normalization to convert logits to probabilities.
+
+    Args:
+        probs: Dictionary of names to raw probability/logit values
+
+    Returns:
+        Dictionary with softmax-normalized probabilities that sum to 1.0
+    """
+    if not probs:
+        return {}
+
+    values = list(probs.values())
+    exp_values = [math.exp(v) for v in values]
+    exp_sum = sum(exp_values)
+    normalized = [exp_v / exp_sum for exp_v in exp_values]
+
+    return dict(zip(probs.keys(), normalized))
+
+
+def categorize_features(
+    probs_dict: dict[str, float],
+    display_threshold: float = 0.2,
+) -> dict[str, Any]:
+    """Categorize features from probability dictionary into taxonomy groups.
+
+    This function takes raw probability outputs from the critic model and
+    organizes them into categories ready for visualization.
+
+    Args:
+        probs_dict: Dictionary of feature names to probability values
+        display_threshold: Minimum probability to include a feature (default: 0.2)
+
+    Returns:
+        Dictionary with categorized features ready for visualization:
+        {
+            "sentiment": {
+                "predicted": "Neutral",
+                "probability": 0.77,
+                "all": {"positive": 0.10, "neutral": 0.77, "negative": 0.13}
+            },
+            "agent_behavioral_issues": [
+                {"name": "loop_behavior", "display_name": "Loop Behavior",
+                 "probability": 0.85},
+                ...
+            ],
+            "user_followup_patterns": [...],
+            "infrastructure_issues": [...],
+            "other": [...]
+        }
+    """
+    result: dict[str, Any] = {
+        "sentiment": None,
+        "agent_behavioral_issues": [],
+        "user_followup_patterns": [],
+        "infrastructure_issues": [],
+        "other": [],
+    }
+
+    # Extract sentiment features and apply softmax normalization
+    raw_sentiment_probs = {}
+    for feature_name, prob in probs_dict.items():
+        if feature_name.startswith("sentiment_"):
+            short_name = feature_name.replace("sentiment_", "")
+            raw_sentiment_probs[short_name] = prob
+
+    if raw_sentiment_probs:
+        # Apply softmax normalization to convert logits to probabilities
+        sentiment_probs = _softmax_normalize(raw_sentiment_probs)
+        max_sentiment = max(sentiment_probs.items(), key=lambda x: x[1])
+        result["sentiment"] = {
+            "predicted": max_sentiment[0].capitalize(),
+            "probability": max_sentiment[1],
+            "all": sentiment_probs,
+        }
+
+    # Categorize other features
+    for feature_name, prob in probs_dict.items():
+        # Skip sentiment features (already processed)
+        if feature_name.startswith("sentiment_"):
+            continue
+
+        # Skip 'success' as it's redundant with the score
+        if feature_name == "success":
+            continue
+
+        # Skip features below threshold
+        if prob < display_threshold:
+            continue
+
+        category = FEATURE_CATEGORIES.get(feature_name)
+        feature_entry = {
+            "name": feature_name,
+            "display_name": feature_name.replace("_", " ").title(),
+            "probability": prob,
+        }
+
+        if category == "general_context":
+            # Skip general context features for now
+            continue
+        elif category == "agent_behavioral_issues":
+            result["agent_behavioral_issues"].append(feature_entry)
+        elif category == "user_followup_patterns":
+            result["user_followup_patterns"].append(feature_entry)
+        elif category == "infrastructure_issues":
+            result["infrastructure_issues"].append(feature_entry)
+        else:
+            result["other"].append(feature_entry)
+
+    # Sort each category by probability (descending)
+    for key in [
+        "agent_behavioral_issues",
+        "user_followup_patterns",
+        "infrastructure_issues",
+        "other",
+    ]:
+        result[key] = sorted(result[key], key=lambda x: x["probability"], reverse=True)
+
+    return result
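For a sense of how the new categorize_features helper behaves, here is a minimal usage sketch. The import path follows the file list above; the feature names and probability values are invented for illustration.

    from openhands.sdk.critic.impl.api.taxonomy import categorize_features

    # Hypothetical raw critic outputs. "sentiment_*" values are treated as
    # logits and softmax-normalized; all other values are probabilities.
    probs = {
        "sentiment_positive": 0.2,
        "sentiment_neutral": 1.5,
        "sentiment_negative": -0.4,
        "loop_behavior": 0.85,         # kept: maps to agent_behavioral_issues
        "correction": 0.45,            # kept: maps to user_followup_patterns
        "success": 0.90,               # skipped: redundant with the score
        "insufficient_testing": 0.05,  # skipped: below display_threshold
    }

    result = categorize_features(probs, display_threshold=0.2)
    print(result["sentiment"]["predicted"])  # "Neutral" (softmax winner)
    print(result["agent_behavioral_issues"])
    # [{'name': 'loop_behavior', 'display_name': 'Loop Behavior', 'probability': 0.85}]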
openhands/sdk/critic/result.py (new file)

@@ -0,0 +1,148 @@
+from typing import Any, ClassVar
+
+from pydantic import BaseModel, Field
+from rich.text import Text
+
+
+class CriticResult(BaseModel):
+    """A critic result is a score and a message."""
+
+    THRESHOLD: ClassVar[float] = 0.5
+    DISPLAY_THRESHOLD: ClassVar[float] = 0.2  # Only show scores above this threshold
+
+    score: float = Field(
+        description="A predicted probability of success between 0 and 1.",
+        ge=0.0,
+        le=1.0,
+    )
+    message: str | None = Field(description="An optional message explaining the score.")
+    metadata: dict[str, Any] | None = Field(
+        default=None,
+        description=(
+            "Optional metadata about the critic evaluation. "
+            "Can include event_ids and categorized_features for visualization."
+        ),
+    )
+
+    @property
+    def success(self) -> bool:
+        """Whether the agent is successful."""
+        return self.score >= CriticResult.THRESHOLD
+
+    @staticmethod
+    def _get_star_rating(score: float) -> str:
+        """Convert score (0-1) to a 5-star rating string.
+
+        Each star represents 20% of the score.
+        """
+        filled_stars = round(score * 5)
+        empty_stars = 5 - filled_stars
+        return "★" * filled_stars + "☆" * empty_stars
+
+    @staticmethod
+    def _get_star_style(score: float) -> str:
+        """Get the style for the star rating based on score."""
+        if score >= 0.6:
+            return "green"
+        elif score >= 0.4:
+            return "yellow"
+        else:
+            return "red"
+
+    @property
+    def visualize(self) -> Text:
+        """Return Rich Text representation of the critic result."""
+        content = Text()
+        content.append("\n\nCritic: agent success likelihood ", style="bold")
+
+        # Display star rating with percentage
+        stars = self._get_star_rating(self.score)
+        style = self._get_star_style(self.score)
+        percentage = self.score * 100
+        content.append(stars, style=style)
+        content.append(f" ({percentage:.1f}%)", style="dim")
+
+        # Use categorized features from metadata if available
+        if self.metadata and "categorized_features" in self.metadata:
+            categorized = self.metadata["categorized_features"]
+            self._append_categorized_features(content, categorized)
+        else:
+            # Fallback: display message as-is
+            if self.message:
+                content.append(f"\n {self.message}\n")
+            else:
+                content.append("\n")
+
+        return content
+
+    def _append_categorized_features(
+        self, content: Text, categorized: dict[str, Any]
+    ) -> None:
+        """Append categorized features to content, each category on its own line."""
+        has_content = False
+
+        # Agent behavioral issues
+        agent_issues = categorized.get("agent_behavioral_issues", [])
+        if agent_issues:
+            content.append("\n ")
+            content.append("Potential Issues: ", style="bold")
+            self._append_feature_list_inline(content, agent_issues)
+            has_content = True
+
+        # User follow-up patterns
+        user_patterns = categorized.get("user_followup_patterns", [])
+        if user_patterns:
+            content.append("\n ")
+            content.append("Likely Follow-up: ", style="bold")
+            self._append_feature_list_inline(content, user_patterns)
+            has_content = True
+
+        # Infrastructure issues
+        infra_issues = categorized.get("infrastructure_issues", [])
+        if infra_issues:
+            content.append("\n ")
+            content.append("Infrastructure: ", style="bold")
+            self._append_feature_list_inline(content, infra_issues)
+            has_content = True
+
+        # Other metrics
+        other = categorized.get("other", [])
+        if other:
+            content.append("\n ")
+            content.append("Other: ", style="bold")
+            self._append_feature_list_inline(content, other, is_other=True)
+            has_content = True
+
+        if not has_content:
+            content.append("\n")
+        else:
+            content.append("\n")
+
+    def _append_feature_list_inline(
+        self,
+        content: Text,
+        features: list[dict[str, Any]],
+        is_other: bool = False,
+    ) -> None:
+        """Append features inline with likelihood percentages."""
+        for i, feature in enumerate(features):
+            display_name = feature.get("display_name", feature.get("name", "Unknown"))
+            prob = feature.get("probability", 0.0)
+            percentage = prob * 100
+
+            # Get style based on probability
+            if is_other:
+                prob_style = "white"
+            elif prob >= 0.7:
+                prob_style = "red bold"
+            elif prob >= 0.5:
+                prob_style = "yellow"
+            else:
+                prob_style = "dim"
+
+            # Add dot separator between features
+            if i > 0:
+                content.append(" · ", style="dim")
+
+            content.append(f"{display_name}", style="white")
+            content.append(f" (likelihood {percentage:.0f}%)", style=prob_style)
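The score-to-stars mapping in CriticResult is easy to sanity-check by hand; a small sketch (values invented, _get_star_rating is the private helper from the file above):

    from openhands.sdk.critic.result import CriticResult

    res = CriticResult(score=0.77, message="Task likely completed")
    print(res.success)  # True: 0.77 >= CriticResult.THRESHOLD (0.5)

    # Each star covers 20% of the score; round(0.77 * 5) == 4 filled stars.
    print(CriticResult._get_star_rating(res.score))  # "★★★★☆"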
openhands/sdk/event/llm_convertible/action.py

@@ -3,6 +3,7 @@ from collections.abc import Sequence
 from pydantic import Field
 from rich.text import Text
 
+from openhands.sdk.critic.result import CriticResult
 from openhands.sdk.event.base import N_CHAR_PREVIEW, EventID, LLMConvertibleEvent
 from openhands.sdk.event.types import SourceType, ToolCallID
 from openhands.sdk.llm import (
@@ -65,6 +66,11 @@ class ActionEvent(LLMConvertibleEvent):
         description="The LLM's assessment of the safety risk of this action.",
     )
 
+    critic_result: CriticResult | None = Field(
+        default=None,
+        description="Optional critic evaluation of this action and preceding history.",
+    )
+
     summary: str | None = Field(
         default=None,
         description=(
@@ -125,6 +131,10 @@ class ActionEvent(LLMConvertibleEvent):
         content.append("Function call:\n", style="bold")
         content.append(f"- {self.tool_call.name} ({self.tool_call.id})\n")
 
+        # Display critic result if available
+        if self.critic_result is not None:
+            content.append(self.critic_result.visualize)
+
         return content
 
     def to_llm_message(self) -> Message:
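The message.py hunks below repeat this exact pattern. Stripped of the SDK's event machinery, what is being added is just an optional field plus a rendering hook; a standalone sketch (EventSketch is illustrative, not an SDK class):

    from pydantic import BaseModel, Field
    from rich.text import Text

    from openhands.sdk.critic.result import CriticResult


    class EventSketch(BaseModel):
        """Illustrative stand-in for ActionEvent/MessageEvent."""

        text: str
        critic_result: CriticResult | None = Field(default=None)

        @property
        def visualize(self) -> Text:
            content = Text(self.text)
            # Same hook the diff adds: append the critic rendering if present.
            if self.critic_result is not None:
                content.append(self.critic_result.visualize)
            return content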
openhands/sdk/event/llm_convertible/message.py

@@ -5,6 +5,7 @@ from typing import ClassVar
 from pydantic import ConfigDict, Field
 from rich.text import Text
 
+from openhands.sdk.critic.result import CriticResult
 from openhands.sdk.event.base import N_CHAR_PREVIEW, EventID, LLMConvertibleEvent
 from openhands.sdk.event.types import SourceType
 from openhands.sdk.llm import (
@@ -51,6 +52,11 @@ class MessageEvent(LLMConvertibleEvent):
         ),
     )
 
+    critic_result: CriticResult | None = Field(
+        default=None,
+        description="Optional critic evaluation of this message and preceding history.",
+    )
+
     @property
     def reasoning_content(self) -> str:
         return self.llm_message.reasoning_content or ""
@@ -101,6 +107,10 @@ class MessageEvent(LLMConvertibleEvent):
         )
         content.append(" ".join(text_parts))
 
+        # Display critic result if available
+        if self.critic_result is not None:
+            content.append(self.critic_result.visualize)
+
         return content
 
     def to_llm_message(self) -> Message:
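Taken together, the pieces suggest this flow: run the critic, categorize its raw feature probabilities with the taxonomy helper, store them in CriticResult.metadata under the "categorized_features" key that visualize looks up, and attach the result to the triggering event. A hedged end-to-end sketch (the wiring is assumed; only the field and function names come from this diff):

    from openhands.sdk.critic.impl.api.taxonomy import categorize_features
    from openhands.sdk.critic.result import CriticResult

    raw_probs = {"success": 0.72, "loop_behavior": 0.61, "correction": 0.35}  # hypothetical
    critic_result = CriticResult(
        score=raw_probs["success"],  # assumption: "success" doubles as the score
        message=None,
        metadata={"categorized_features": categorize_features(raw_probs)},
    )
    # event.critic_result = critic_result  # on an ActionEvent or MessageEvent
    # critic_result.visualize              # renders stars plus categorized features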