strands-agents-evals 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {strands_agents_evals-0.1.3.dist-info → strands_agents_evals-0.1.5.dist-info}/METADATA +2 -1
  2. {strands_agents_evals-0.1.3.dist-info → strands_agents_evals-0.1.5.dist-info}/RECORD +25 -18
  3. strands_evals/evaluators/__init__.py +4 -0
  4. strands_evals/evaluators/conciseness_evaluator.py +139 -0
  5. strands_evals/evaluators/evaluator.py +4 -0
  6. strands_evals/evaluators/faithfulness_evaluator.py +21 -16
  7. strands_evals/evaluators/goal_success_rate_evaluator.py +21 -16
  8. strands_evals/evaluators/harmfulness_evaluator.py +21 -16
  9. strands_evals/evaluators/helpfulness_evaluator.py +21 -16
  10. strands_evals/evaluators/interactions_evaluator.py +6 -4
  11. strands_evals/evaluators/output_evaluator.py +6 -4
  12. strands_evals/evaluators/prompt_templates/conciseness/__init__.py +11 -0
  13. strands_evals/evaluators/prompt_templates/conciseness/conciseness_v0.py +9 -0
  14. strands_evals/evaluators/prompt_templates/response_relevance/__init__.py +11 -0
  15. strands_evals/evaluators/prompt_templates/response_relevance/response_relevance_v0.py +29 -0
  16. strands_evals/evaluators/response_relevance_evaluator.py +144 -0
  17. strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +19 -8
  18. strands_evals/evaluators/tool_selection_accuracy_evaluator.py +19 -8
  19. strands_evals/evaluators/trajectory_evaluator.py +6 -4
  20. strands_evals/experiment.py +281 -90
  21. strands_evals/extractors/trace_extractor.py +13 -1
  22. strands_evals/utils.py +37 -0
  23. {strands_agents_evals-0.1.3.dist-info → strands_agents_evals-0.1.5.dist-info}/WHEEL +0 -0
  24. {strands_agents_evals-0.1.3.dist-info → strands_agents_evals-0.1.5.dist-info}/licenses/LICENSE +0 -0
  25. {strands_agents_evals-0.1.3.dist-info → strands_agents_evals-0.1.5.dist-info}/licenses/NOTICE +0 -0
strands_evals/evaluators/response_relevance_evaluator.py (new file)
@@ -0,0 +1,144 @@
+ from enum import Enum
+ from typing import cast
+
+ from pydantic import BaseModel, Field
+ from strands import Agent
+ from strands.agent.agent_result import AgentResult
+ from strands.models.model import Model
+ from typing_extensions import TypeVar, Union
+
+ from ..types.evaluation import EvaluationData, EvaluationOutput
+ from ..types.trace import EvaluationLevel, TextContent, ToolExecution, TraceLevelInput
+ from .evaluator import Evaluator
+ from .prompt_templates.response_relevance import get_template
+
+ InputT = TypeVar("InputT")
+ OutputT = TypeVar("OutputT")
+
+
+ class ResponseRelevanceScore(str, Enum):
+     """Categorical response relevance ratings."""
+
+     NOT_AT_ALL = "Not At All"
+     NOT_GENERALLY = "Not Generally"
+     NEUTRAL_MIXED = "Neutral/Mixed"
+     GENERALLY_YES = "Generally Yes"
+     COMPLETELY_YES = "Completely Yes"
+
+
+ class ResponseRelevanceRating(BaseModel):
+     """Structured output for response relevance evaluation."""
+
+     reasoning: str = Field(description="Step by step reasoning to derive the final score")
+     score: ResponseRelevanceScore = Field(description="Categorical response relevance rating")
+
+
+ class ResponseRelevanceEvaluator(Evaluator[InputT, OutputT]):
+     """Evaluates the relevance of agent responses to user questions."""
+
+     evaluation_level = EvaluationLevel.TRACE_LEVEL
+
+     _score_mapping = {
+         ResponseRelevanceScore.NOT_AT_ALL: 0.0,
+         ResponseRelevanceScore.NOT_GENERALLY: 0.25,
+         ResponseRelevanceScore.NEUTRAL_MIXED: 0.5,
+         ResponseRelevanceScore.GENERALLY_YES: 0.75,
+         ResponseRelevanceScore.COMPLETELY_YES: 1.0,
+     }
+
+     def __init__(
+         self,
+         version: str = "v0",
+         model: Union[Model, str, None] = None,
+         system_prompt: str | None = None,
+         include_inputs: bool = True,
+     ):
+         super().__init__()
+         self.system_prompt = system_prompt or get_template(version).SYSTEM_PROMPT
+         self.version = version
+         self.model = model
+         self.include_inputs = include_inputs
+
+     def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+         parsed_input = self._get_last_turn(evaluation_case)
+         prompt = self._format_prompt(parsed_input)
+         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+         result = evaluator_agent(prompt, structured_output_model=ResponseRelevanceRating)
+         return self._create_evaluation_output(result)
+
+     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+         parsed_input = self._get_last_turn(evaluation_case)
+         prompt = self._format_prompt(parsed_input)
+         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+         result = await evaluator_agent.invoke_async(prompt, structured_output_model=ResponseRelevanceRating)
+         return self._create_evaluation_output(result)
+
+     def _create_evaluation_output(self, result: AgentResult) -> list[EvaluationOutput]:
+         rating = cast(ResponseRelevanceRating, result.structured_output)
+         normalized_score = self._score_mapping[rating.score]
+         return [
+             EvaluationOutput(
+                 score=normalized_score,
+                 test_pass=normalized_score >= 0.5,
+                 reason=rating.reasoning,
+                 label=rating.score,
+             )
+         ]
+
+     def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
+         """Extract the most recent turn from the conversation for evaluation."""
+         parsed_inputs = self._parse_trajectory(evaluation_case)
+         if not parsed_inputs:
+             raise ValueError(
+                 "No turn-level inputs could be parsed from the trajectory. "
+                 "Ensure actual_trajectory is a Session with at least one AgentInvocationSpan."
+             )
+         return parsed_inputs[-1]
+
+     def _extract_user_prompt(self, parsed_input: TraceLevelInput) -> str:
+         """Extract user prompt from last message in session history.
+
+         Args:
+             parsed_input: Trace-level input containing session history
+
+         Returns:
+             User prompt text, or empty string if not available
+         """
+         if not parsed_input.session_history:
+             return ""
+
+         last_msg = parsed_input.session_history[-1]
+         if not isinstance(last_msg, list) and self._has_text_content(last_msg):
+             first_content = last_msg.content[0]
+             if isinstance(first_content, TextContent):
+                 return first_content.text
+
+         return ""
+
+     def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
+         """Format evaluation prompt from parsed trace data.
+
+         Args:
+             parsed_input: Trace-level input containing agent response and session history
+
+         Returns:
+             Formatted prompt string with conversation context and target response
+         """
+         parts = []
+
+         if parsed_input.session_history:
+             history_lines = []
+             for msg in parsed_input.session_history:
+                 if isinstance(msg, list) and msg and isinstance(msg[0], ToolExecution):
+                     continue  # Skip tool execution lists
+                 if not isinstance(msg, list) and self._has_text_content(msg):
+                     first_content = msg.content[0]
+                     if isinstance(first_content, TextContent):
+                         history_lines.append(f"{msg.role.value.capitalize()}: {first_content.text}")
+             history_str = "\n".join(history_lines)
+             parts.append(f"# Previous turns:\n{history_str}")
+
+         user_prompt = self._extract_user_prompt(parsed_input)
+         parts.append(f"# Target turn to evaluate:\nUser: {user_prompt}\nAssistant: {parsed_input.agent_response.text}")
+
+         return "\n\n".join(parts)
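For orientation, a minimal usage sketch of the new evaluator (not part of the package diff). The constructor arguments come from __init__ above; the assumption that the evaluation case carries a recorded Session in an actual_trajectory field follows the error message in _get_last_turn, and the placeholder evaluation_case is hypothetical.

    # Hedged usage sketch for ResponseRelevanceEvaluator (not from the diff itself).
    from strands_evals.evaluators.response_relevance_evaluator import ResponseRelevanceEvaluator

    # model may be a Model instance, a model id string, or None (the default)
    evaluator = ResponseRelevanceEvaluator(version="v0")

    # EvaluationData built from a recorded Session with at least one AgentInvocationSpan;
    # its construction is not shown in this diff, so it is left as a placeholder here.
    evaluation_case = ...
    outputs = evaluator.evaluate(evaluation_case)

    for output in outputs:
        # score is the categorical label mapped onto 0.0-1.0; test_pass is True at 0.5 (Neutral/Mixed) or above
        print(output.label, output.score, output.test_pass, output.reason)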
strands_evals/evaluators/tool_parameter_accuracy_evaluator.py
@@ -1,4 +1,5 @@
  from enum import Enum
+ from typing import cast

  from pydantic import BaseModel, Field
  from strands import Agent
@@ -56,12 +57,17 @@ class ToolParameterAccuracyEvaluator(Evaluator[InputT, OutputT]):
          for tool_input in tool_inputs:
              prompt = self._format_prompt(tool_input)
              evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-             rating = evaluator_agent.structured_output(ToolParameterAccuracyRating, prompt)
+             result = evaluator_agent(prompt, structured_output_model=ToolParameterAccuracyRating)
+             rating = cast(ToolParameterAccuracyRating, result.structured_output)
              normalized_score = self._score_mapping[rating.score]
-             result = EvaluationOutput(
-                 score=normalized_score, test_pass=normalized_score == 1.0, reason=rating.reasoning, label=rating.score
+             results.append(
+                 EvaluationOutput(
+                     score=normalized_score,
+                     test_pass=normalized_score == 1.0,
+                     reason=rating.reasoning,
+                     label=rating.score,
+                 )
              )
-             results.append(result)

          return results
@@ -72,12 +78,17 @@ class ToolParameterAccuracyEvaluator(Evaluator[InputT, OutputT]):
          for tool_input in tool_inputs:
              prompt = self._format_prompt(tool_input)
              evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-             rating = await evaluator_agent.structured_output_async(ToolParameterAccuracyRating, prompt)
+             result = await evaluator_agent.invoke_async(prompt, structured_output_model=ToolParameterAccuracyRating)
+             rating = cast(ToolParameterAccuracyRating, result.structured_output)
              normalized_score = self._score_mapping[rating.score]
-             result = EvaluationOutput(
-                 score=normalized_score, test_pass=normalized_score == 1.0, reason=rating.reasoning, label=rating.score
+             results.append(
+                 EvaluationOutput(
+                     score=normalized_score,
+                     test_pass=normalized_score == 1.0,
+                     reason=rating.reasoning,
+                     label=rating.score,
+                 )
              )
-             results.append(result)

          return results
strands_evals/evaluators/tool_selection_accuracy_evaluator.py
@@ -1,4 +1,5 @@
  from enum import Enum
+ from typing import cast

  from pydantic import BaseModel, Field
  from strands import Agent
@@ -56,12 +57,17 @@ class ToolSelectionAccuracyEvaluator(Evaluator[InputT, OutputT]):
          for tool_input in tool_inputs:
              prompt = self._format_prompt(tool_input)
              evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-             rating = evaluator_agent.structured_output(ToolSelectionRating, prompt)
+             result = evaluator_agent(prompt, structured_output_model=ToolSelectionRating)
+             rating = cast(ToolSelectionRating, result.structured_output)
              normalized_score = self._score_mapping[rating.score]
-             result = EvaluationOutput(
-                 score=normalized_score, test_pass=normalized_score == 1.0, reason=rating.reasoning, label=rating.score
+             results.append(
+                 EvaluationOutput(
+                     score=normalized_score,
+                     test_pass=normalized_score == 1.0,
+                     reason=rating.reasoning,
+                     label=rating.score,
+                 )
              )
-             results.append(result)

          return results
@@ -72,12 +78,17 @@ class ToolSelectionAccuracyEvaluator(Evaluator[InputT, OutputT]):
          for tool_input in tool_inputs:
              prompt = self._format_prompt(tool_input)
              evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-             rating = await evaluator_agent.structured_output_async(ToolSelectionRating, prompt)
+             result = await evaluator_agent.invoke_async(prompt, structured_output_model=ToolSelectionRating)
+             rating = cast(ToolSelectionRating, result.structured_output)
              normalized_score = self._score_mapping[rating.score]
-             result = EvaluationOutput(
-                 score=normalized_score, test_pass=normalized_score == 1.0, reason=rating.reasoning, label=rating.score
+             results.append(
+                 EvaluationOutput(
+                     score=normalized_score,
+                     test_pass=normalized_score == 1.0,
+                     reason=rating.reasoning,
+                     label=rating.score,
+                 )
              )
-             results.append(result)

          return results
strands_evals/evaluators/trajectory_evaluator.py
@@ -1,3 +1,5 @@
+ from typing import cast
+
  from strands import Agent
  from strands.models.model import Model
  from typing_extensions import Any, TypeVar, Union
@@ -74,8 +76,8 @@ class TrajectoryEvaluator(Evaluator[InputT, OutputT]):
              include_inputs=self.include_inputs,
              uses_trajectory=True,
          )
-         result = evaluator_agent.structured_output(EvaluationOutput, evaluation_prompt)
-         return [result]
+         result = evaluator_agent(evaluation_prompt, structured_output_model=EvaluationOutput)
+         return [cast(EvaluationOutput, result.structured_output)]

      async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
          """
@@ -96,5 +98,5 @@ class TrajectoryEvaluator(Evaluator[InputT, OutputT]):
              include_inputs=self.include_inputs,
              uses_trajectory=True,
          )
-         result = await evaluator_agent.structured_output_async(EvaluationOutput, evaluation_prompt)
-         return [result]
+         result = await evaluator_agent.invoke_async(evaluation_prompt, structured_output_model=EvaluationOutput)
+         return [cast(EvaluationOutput, result.structured_output)]
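The change repeated across the tool-parameter, tool-selection, and trajectory evaluators is the same API migration: the removed structured_output()/structured_output_async() helpers are replaced by a plain agent invocation that passes structured_output_model and then reads the typed value from AgentResult.structured_output. A minimal sketch of the new call shape, with the helper names rate/rate_async introduced only for illustration:

    from typing import cast

    from pydantic import BaseModel
    from strands import Agent


    def rate(evaluator_agent: Agent, prompt: str, rating_model: type[BaseModel]) -> BaseModel:
        # 0.1.5 call shape: pass structured_output_model to the agent call and read
        # the parsed model instance back from result.structured_output.
        result = evaluator_agent(prompt, structured_output_model=rating_model)
        return cast(BaseModel, result.structured_output)


    async def rate_async(evaluator_agent: Agent, prompt: str, rating_model: type[BaseModel]) -> BaseModel:
        # Async variant: invoke_async replaces the removed structured_output_async helper.
        result = await evaluator_agent.invoke_async(prompt, structured_output_model=rating_model)
        return cast(BaseModel, result.structured_output)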