strands-agents-evals 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- {strands_agents_evals-0.1.3.dist-info → strands_agents_evals-0.1.5.dist-info}/METADATA +2 -1
- {strands_agents_evals-0.1.3.dist-info → strands_agents_evals-0.1.5.dist-info}/RECORD +25 -18
- strands_evals/evaluators/__init__.py +4 -0
- strands_evals/evaluators/conciseness_evaluator.py +139 -0
- strands_evals/evaluators/evaluator.py +4 -0
- strands_evals/evaluators/faithfulness_evaluator.py +21 -16
- strands_evals/evaluators/goal_success_rate_evaluator.py +21 -16
- strands_evals/evaluators/harmfulness_evaluator.py +21 -16
- strands_evals/evaluators/helpfulness_evaluator.py +21 -16
- strands_evals/evaluators/interactions_evaluator.py +6 -4
- strands_evals/evaluators/output_evaluator.py +6 -4
- strands_evals/evaluators/prompt_templates/conciseness/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/conciseness/conciseness_v0.py +9 -0
- strands_evals/evaluators/prompt_templates/response_relevance/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/response_relevance/response_relevance_v0.py +29 -0
- strands_evals/evaluators/response_relevance_evaluator.py +144 -0
- strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +19 -8
- strands_evals/evaluators/tool_selection_accuracy_evaluator.py +19 -8
- strands_evals/evaluators/trajectory_evaluator.py +6 -4
- strands_evals/experiment.py +281 -90
- strands_evals/extractors/trace_extractor.py +13 -1
- strands_evals/utils.py +37 -0
- {strands_agents_evals-0.1.3.dist-info → strands_agents_evals-0.1.5.dist-info}/WHEEL +0 -0
- {strands_agents_evals-0.1.3.dist-info → strands_agents_evals-0.1.5.dist-info}/licenses/LICENSE +0 -0
- {strands_agents_evals-0.1.3.dist-info → strands_agents_evals-0.1.5.dist-info}/licenses/NOTICE +0 -0
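Two evaluators are new in 0.1.5: a conciseness evaluator and a response relevance evaluator (shown in full in the first hunk below). A hedged usage sketch, assuming the four lines added to `strands_evals/evaluators/__init__.py` re-export both classes under the names used in their modules, and that `ConcisenessEvaluator` mirrors the constructor pattern of `ResponseRelevanceEvaluator`:

```python
# Hypothetical usage sketch -- the import path and ConcisenessEvaluator's constructor
# are assumptions; ResponseRelevanceEvaluator's signature is taken from the hunk below.
from strands_evals.evaluators import ConcisenessEvaluator, ResponseRelevanceEvaluator

relevance = ResponseRelevanceEvaluator(version="v0", include_inputs=True)
conciseness = ConcisenessEvaluator()  # assumed to follow the same prompt-template pattern
```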
strands_evals/evaluators/response_relevance_evaluator.py (new file)

```diff
@@ -0,0 +1,144 @@
+from enum import Enum
+from typing import cast
+
+from pydantic import BaseModel, Field
+from strands import Agent
+from strands.agent.agent_result import AgentResult
+from strands.models.model import Model
+from typing_extensions import TypeVar, Union
+
+from ..types.evaluation import EvaluationData, EvaluationOutput
+from ..types.trace import EvaluationLevel, TextContent, ToolExecution, TraceLevelInput
+from .evaluator import Evaluator
+from .prompt_templates.response_relevance import get_template
+
+InputT = TypeVar("InputT")
+OutputT = TypeVar("OutputT")
+
+
+class ResponseRelevanceScore(str, Enum):
+    """Categorical response relevance ratings."""
+
+    NOT_AT_ALL = "Not At All"
+    NOT_GENERALLY = "Not Generally"
+    NEUTRAL_MIXED = "Neutral/Mixed"
+    GENERALLY_YES = "Generally Yes"
+    COMPLETELY_YES = "Completely Yes"
+
+
+class ResponseRelevanceRating(BaseModel):
+    """Structured output for response relevance evaluation."""
+
+    reasoning: str = Field(description="Step by step reasoning to derive the final score")
+    score: ResponseRelevanceScore = Field(description="Categorical response relevance rating")
+
+
+class ResponseRelevanceEvaluator(Evaluator[InputT, OutputT]):
+    """Evaluates the relevance of agent responses to user questions."""
+
+    evaluation_level = EvaluationLevel.TRACE_LEVEL
+
+    _score_mapping = {
+        ResponseRelevanceScore.NOT_AT_ALL: 0.0,
+        ResponseRelevanceScore.NOT_GENERALLY: 0.25,
+        ResponseRelevanceScore.NEUTRAL_MIXED: 0.5,
+        ResponseRelevanceScore.GENERALLY_YES: 0.75,
+        ResponseRelevanceScore.COMPLETELY_YES: 1.0,
+    }
+
+    def __init__(
+        self,
+        version: str = "v0",
+        model: Union[Model, str, None] = None,
+        system_prompt: str | None = None,
+        include_inputs: bool = True,
+    ):
+        super().__init__()
+        self.system_prompt = system_prompt or get_template(version).SYSTEM_PROMPT
+        self.version = version
+        self.model = model
+        self.include_inputs = include_inputs
+
+    def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+        parsed_input = self._get_last_turn(evaluation_case)
+        prompt = self._format_prompt(parsed_input)
+        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+        result = evaluator_agent(prompt, structured_output_model=ResponseRelevanceRating)
+        return self._create_evaluation_output(result)
+
+    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+        parsed_input = self._get_last_turn(evaluation_case)
+        prompt = self._format_prompt(parsed_input)
+        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+        result = await evaluator_agent.invoke_async(prompt, structured_output_model=ResponseRelevanceRating)
+        return self._create_evaluation_output(result)
+
+    def _create_evaluation_output(self, result: AgentResult) -> list[EvaluationOutput]:
+        rating = cast(ResponseRelevanceRating, result.structured_output)
+        normalized_score = self._score_mapping[rating.score]
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 0.5,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]
+
+    def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
+        """Extract the most recent turn from the conversation for evaluation."""
+        parsed_inputs = self._parse_trajectory(evaluation_case)
+        if not parsed_inputs:
+            raise ValueError(
+                "No turn-level inputs could be parsed from the trajectory. "
+                "Ensure actual_trajectory is a Session with at least one AgentInvocationSpan."
+            )
+        return parsed_inputs[-1]
+
+    def _extract_user_prompt(self, parsed_input: TraceLevelInput) -> str:
+        """Extract user prompt from last message in session history.
+
+        Args:
+            parsed_input: Trace-level input containing session history
+
+        Returns:
+            User prompt text, or empty string if not available
+        """
+        if not parsed_input.session_history:
+            return ""
+
+        last_msg = parsed_input.session_history[-1]
+        if not isinstance(last_msg, list) and self._has_text_content(last_msg):
+            first_content = last_msg.content[0]
+            if isinstance(first_content, TextContent):
+                return first_content.text
+
+        return ""
+
+    def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
+        """Format evaluation prompt from parsed trace data.
+
+        Args:
+            parsed_input: Trace-level input containing agent response and session history
+
+        Returns:
+            Formatted prompt string with conversation context and target response
+        """
+        parts = []
+
+        if parsed_input.session_history:
+            history_lines = []
+            for msg in parsed_input.session_history:
+                if isinstance(msg, list) and msg and isinstance(msg[0], ToolExecution):
+                    continue  # Skip tool execution lists
+                if not isinstance(msg, list) and self._has_text_content(msg):
+                    first_content = msg.content[0]
+                    if isinstance(first_content, TextContent):
+                        history_lines.append(f"{msg.role.value.capitalize()}: {first_content.text}")
+            history_str = "\n".join(history_lines)
+            parts.append(f"# Previous turns:\n{history_str}")
+
+        user_prompt = self._extract_user_prompt(parsed_input)
+        parts.append(f"# Target turn to evaluate:\nUser: {user_prompt}\nAssistant: {parsed_input.agent_response.text}")
+
+        return "\n\n".join(parts)
```
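The scoring behaviour of the new evaluator is contained in `_score_mapping` and `_create_evaluation_output`: the judge's categorical rating is normalized to the [0, 1] range and the turn passes at 0.5 or above. A minimal standalone sketch of that mapping (the labels and values are copied from the hunk above; the helper itself is illustrative):

```python
# Standalone sketch of the normalization introduced in ResponseRelevanceEvaluator.
# The label strings and values mirror _score_mapping; the function is illustrative only.
SCORE_MAPPING = {
    "Not At All": 0.0,
    "Not Generally": 0.25,
    "Neutral/Mixed": 0.5,
    "Generally Yes": 0.75,
    "Completely Yes": 1.0,
}


def normalize(label: str) -> tuple[float, bool]:
    """Return (normalized score, pass/fail at the 0.5 threshold used by the evaluator)."""
    score = SCORE_MAPPING[label]
    return score, score >= 0.5


assert normalize("Generally Yes") == (0.75, True)
assert normalize("Not Generally") == (0.25, False)
```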
strands_evals/evaluators/tool_parameter_accuracy_evaluator.py

```diff
@@ -1,4 +1,5 @@
 from enum import Enum
+from typing import cast
 
 from pydantic import BaseModel, Field
 from strands import Agent
@@ -56,12 +57,17 @@ class ToolParameterAccuracyEvaluator(Evaluator[InputT, OutputT]):
         for tool_input in tool_inputs:
             prompt = self._format_prompt(tool_input)
             evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-
+            result = evaluator_agent(prompt, structured_output_model=ToolParameterAccuracyRating)
+            rating = cast(ToolParameterAccuracyRating, result.structured_output)
             normalized_score = self._score_mapping[rating.score]
-
-
+            results.append(
+                EvaluationOutput(
+                    score=normalized_score,
+                    test_pass=normalized_score == 1.0,
+                    reason=rating.reasoning,
+                    label=rating.score,
+                )
             )
-            results.append(result)
 
         return results
 
@@ -72,12 +78,17 @@ class ToolParameterAccuracyEvaluator(Evaluator[InputT, OutputT]):
         for tool_input in tool_inputs:
             prompt = self._format_prompt(tool_input)
             evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-
+            result = await evaluator_agent.invoke_async(prompt, structured_output_model=ToolParameterAccuracyRating)
+            rating = cast(ToolParameterAccuracyRating, result.structured_output)
             normalized_score = self._score_mapping[rating.score]
-
-
+            results.append(
+                EvaluationOutput(
+                    score=normalized_score,
+                    test_pass=normalized_score == 1.0,
+                    reason=rating.reasoning,
+                    label=rating.score,
+                )
             )
-            results.append(result)
 
         return results
 
```
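These hunks, and the tool selection hunks that follow, converge on one result-handling pattern: invoke the judge with a structured output model, cast `result.structured_output` to the rating type, and emit one `EvaluationOutput` per tool call, passing only on a perfect score. A condensed sketch of that pattern with stand-in names (nothing here beyond the pattern itself comes from the package):

```python
# Illustrative sketch of the per-tool-call handling introduced above.
# `Rating`, `score_mapping`, and `judge` are stand-ins, not names from strands_evals.
from typing import Callable, cast

from pydantic import BaseModel


class Rating(BaseModel):
    reasoning: str
    score: str


def collect_outputs(judge: Callable, prompts: list[str], score_mapping: dict[str, float]) -> list[dict]:
    results = []
    for prompt in prompts:
        result = judge(prompt, structured_output_model=Rating)
        rating = cast(Rating, result.structured_output)
        normalized = score_mapping[rating.score]
        results.append(
            {
                "score": normalized,
                "test_pass": normalized == 1.0,  # tool-level checks pass only on a perfect score
                "reason": rating.reasoning,
                "label": rating.score,
            }
        )
    return results
```

Note the stricter pass threshold than the trace-level response relevance evaluator, which passes at 0.5.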
strands_evals/evaluators/tool_selection_accuracy_evaluator.py

```diff
@@ -1,4 +1,5 @@
 from enum import Enum
+from typing import cast
 
 from pydantic import BaseModel, Field
 from strands import Agent
@@ -56,12 +57,17 @@ class ToolSelectionAccuracyEvaluator(Evaluator[InputT, OutputT]):
         for tool_input in tool_inputs:
             prompt = self._format_prompt(tool_input)
             evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-
+            result = evaluator_agent(prompt, structured_output_model=ToolSelectionRating)
+            rating = cast(ToolSelectionRating, result.structured_output)
             normalized_score = self._score_mapping[rating.score]
-
-
+            results.append(
+                EvaluationOutput(
+                    score=normalized_score,
+                    test_pass=normalized_score == 1.0,
+                    reason=rating.reasoning,
+                    label=rating.score,
+                )
             )
-            results.append(result)
 
         return results
 
@@ -72,12 +78,17 @@ class ToolSelectionAccuracyEvaluator(Evaluator[InputT, OutputT]):
         for tool_input in tool_inputs:
            prompt = self._format_prompt(tool_input)
             evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-
+            result = await evaluator_agent.invoke_async(prompt, structured_output_model=ToolSelectionRating)
+            rating = cast(ToolSelectionRating, result.structured_output)
             normalized_score = self._score_mapping[rating.score]
-
-
+            results.append(
+                EvaluationOutput(
+                    score=normalized_score,
+                    test_pass=normalized_score == 1.0,
+                    reason=rating.reasoning,
+                    label=rating.score,
+                )
             )
-            results.append(result)
 
         return results
 
```
strands_evals/evaluators/trajectory_evaluator.py

```diff
@@ -1,3 +1,5 @@
+from typing import cast
+
 from strands import Agent
 from strands.models.model import Model
 from typing_extensions import Any, TypeVar, Union
@@ -74,8 +76,8 @@ class TrajectoryEvaluator(Evaluator[InputT, OutputT]):
             include_inputs=self.include_inputs,
             uses_trajectory=True,
         )
-        result = evaluator_agent
-        return [result]
+        result = evaluator_agent(evaluation_prompt, structured_output_model=EvaluationOutput)
+        return [cast(EvaluationOutput, result.structured_output)]
 
     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         """
@@ -96,5 +98,5 @@ class TrajectoryEvaluator(Evaluator[InputT, OutputT]):
             include_inputs=self.include_inputs,
             uses_trajectory=True,
         )
-        result = await evaluator_agent.
-        return [result]
+        result = await evaluator_agent.invoke_async(evaluation_prompt, structured_output_model=EvaluationOutput)
+        return [cast(EvaluationOutput, result.structured_output)]
```
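Across all of these evaluators, 0.1.5 settles on a single judge-invocation pattern: call the agent synchronously, or await `invoke_async`, always passing `structured_output_model`, and read the parsed model back from `result.structured_output`. A minimal sketch of that pattern, assuming only the `strands` Agent calls as they appear in the hunks above (the `Verdict` model is a stand-in):

```python
# Sketch of the sync/async judge pattern used throughout these hunks.
# Only the Agent call shapes seen in the diff are assumed; Verdict is a stand-in model.
from typing import cast

from pydantic import BaseModel
from strands import Agent


class Verdict(BaseModel):
    reasoning: str
    score: str


def judge_sync(agent: Agent, prompt: str) -> Verdict:
    result = agent(prompt, structured_output_model=Verdict)
    return cast(Verdict, result.structured_output)


async def judge_async(agent: Agent, prompt: str) -> Verdict:
    result = await agent.invoke_async(prompt, structured_output_model=Verdict)
    return cast(Verdict, result.structured_output)
```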