strands-agents-evals 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (25)
  1. {strands_agents_evals-0.1.4.dist-info → strands_agents_evals-0.1.5.dist-info}/METADATA +2 -1
  2. {strands_agents_evals-0.1.4.dist-info → strands_agents_evals-0.1.5.dist-info}/RECORD +25 -18
  3. strands_evals/evaluators/__init__.py +4 -0
  4. strands_evals/evaluators/conciseness_evaluator.py +139 -0
  5. strands_evals/evaluators/evaluator.py +4 -0
  6. strands_evals/evaluators/faithfulness_evaluator.py +21 -16
  7. strands_evals/evaluators/goal_success_rate_evaluator.py +21 -16
  8. strands_evals/evaluators/harmfulness_evaluator.py +21 -16
  9. strands_evals/evaluators/helpfulness_evaluator.py +21 -16
  10. strands_evals/evaluators/interactions_evaluator.py +6 -4
  11. strands_evals/evaluators/output_evaluator.py +6 -4
  12. strands_evals/evaluators/prompt_templates/conciseness/__init__.py +11 -0
  13. strands_evals/evaluators/prompt_templates/conciseness/conciseness_v0.py +9 -0
  14. strands_evals/evaluators/prompt_templates/response_relevance/__init__.py +11 -0
  15. strands_evals/evaluators/prompt_templates/response_relevance/response_relevance_v0.py +29 -0
  16. strands_evals/evaluators/response_relevance_evaluator.py +144 -0
  17. strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +19 -8
  18. strands_evals/evaluators/tool_selection_accuracy_evaluator.py +19 -8
  19. strands_evals/evaluators/trajectory_evaluator.py +6 -4
  20. strands_evals/experiment.py +281 -90
  21. strands_evals/extractors/trace_extractor.py +1 -2
  22. strands_evals/utils.py +37 -0
  23. {strands_agents_evals-0.1.4.dist-info → strands_agents_evals-0.1.5.dist-info}/WHEEL +0 -0
  24. {strands_agents_evals-0.1.4.dist-info → strands_agents_evals-0.1.5.dist-info}/licenses/LICENSE +0 -0
  25. {strands_agents_evals-0.1.4.dist-info → strands_agents_evals-0.1.5.dist-info}/licenses/NOTICE +0 -0
{strands_agents_evals-0.1.4.dist-info → strands_agents_evals-0.1.5.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: strands-agents-evals
- Version: 0.1.4
+ Version: 0.1.5
  Summary: Evaluation framework for Strands
  Author-email: AWS <opensource@amazon.com>
  License: Apache-2.0
@@ -15,6 +15,7 @@ Requires-Dist: pydantic<3.0.0,>=2.0.0
  Requires-Dist: rich<15.0.0,>=14.0.0
  Requires-Dist: strands-agents-tools<1.0.0,>=0.1.0
  Requires-Dist: strands-agents>=1.0.0
+ Requires-Dist: tenacity<10.0.0,>=8.0.0
  Requires-Dist: typing-extensions>=4.0
  Provides-Extra: dev
  Requires-Dist: hatch<2.0.0,>=1.0.0; extra == 'dev'
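
The only dependency change in this release is the new tenacity requirement, a retry library; the enlarged strands_evals/experiment.py (see the RECORD diff below) is the most plausible consumer, although its hunks are not included in this excerpt. Purely as an illustration of how a tenacity retry policy is typically wired up (the function and argument names below are hypothetical, not taken from this package):

from tenacity import retry, stop_after_attempt, wait_exponential

# Hypothetical sketch: retry a flaky evaluation run up to 3 times with
# exponential backoff. Not code from strands-agents-evals.
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=30))
def run_case_with_retries(run_fn, case):
    return run_fn(case)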
{strands_agents_evals-0.1.4.dist-info → strands_agents_evals-0.1.5.dist-info}/RECORD
@@ -1,20 +1,25 @@
  strands_evals/__init__.py,sha256=WnYsQGtkatrCKM8v_i_oCtBHNJfPaTrOg2ThUlf55Pk,485
  strands_evals/case.py,sha256=KWAL947NkmNzg9FFdTsL6KI9AFLQ8IcFjaOjcs9x5to,2131
- strands_evals/experiment.py,sha256=yySXFW5p9xkDSvkxHBBDncxXKiuj0aDFY7iKoUyprwc,28745
+ strands_evals/experiment.py,sha256=d9SWLjEnWjxqTfTB0fivPZRkVD8xFagS5r_OMxaqxmc,37723
+ strands_evals/utils.py,sha256=a8mkCjtyPTcz4YtcbC9La8OnNGE_Tl9Lqxf06ZbVfCA,1175
  strands_evals/display/display_console.py,sha256=bOTr6RepgnifALz2DgXnnk3c4Jjxu_mA68-pFr7xry0,5932
- strands_evals/evaluators/__init__.py,sha256=OfZU5RkYewHOAnEjPKdxiEvPnfOOWNZc_9nQpAfARfI,887
- strands_evals/evaluators/evaluator.py,sha256=XEesDeT83H93B1X_w8s0Nsb1KKHy26QO8b99Hi6vKbc,7466
- strands_evals/evaluators/faithfulness_evaluator.py,sha256=i6oLgG58BxYAv-lottapn4XfSBncvGyYNIXNRqSILAQ,4742
- strands_evals/evaluators/goal_success_rate_evaluator.py,sha256=ZKP2Us62_cwVwUVng9QlOytkrMXC7ObOlp7xr-obOw8,3373
- strands_evals/evaluators/harmfulness_evaluator.py,sha256=odKugWJUbVGuPzU3gEQjdumqIkmbKRIyPEKLESQt-vQ,5315
- strands_evals/evaluators/helpfulness_evaluator.py,sha256=7lRLhDsr1PSbPAILmDG015lfxn6iKa_dQleuFzlzMcQ,5922
- strands_evals/evaluators/interactions_evaluator.py,sha256=-JB85oXiEGyCr7oUH5nzwJpPw4GBWcmMKNzOHjErRSo,10694
- strands_evals/evaluators/output_evaluator.py,sha256=XEul2qc7cArl192cojKeB4BZ9EX8pCbpAQv7xwIqJAk,2949
- strands_evals/evaluators/tool_parameter_accuracy_evaluator.py,sha256=jLHA2hR3E23N3fOZEeOR_KqqkYwzP2FWNnTZOd55h9E,4642
- strands_evals/evaluators/tool_selection_accuracy_evaluator.py,sha256=bDa_-k3ye9v80urdyFIpsURSmdK6g7muZ_w0NfN_E3Y,4575
- strands_evals/evaluators/trajectory_evaluator.py,sha256=MIq0dxGxMBw-cOt8zc80jrVnO-McqtBQ5E4_0An2ka4,3989
+ strands_evals/evaluators/__init__.py,sha256=IHDzg31Od5lkwonbb329KcPHbn3FsKi8VallqddmF7E,1074
+ strands_evals/evaluators/conciseness_evaluator.py,sha256=pqO1CTR-NTyOU2vuhZZrIW9mpq3XstZysoKdNuPOoZo,5566
+ strands_evals/evaluators/evaluator.py,sha256=iW8A62wG0ZjXSz95eYraOTZ4FbhkhOOHIyj7Puol9fw,7619
+ strands_evals/evaluators/faithfulness_evaluator.py,sha256=dICHVCYa3_lj5FKzS7dnepDk7IgN2CEh0rY64OitPgQ,4957
+ strands_evals/evaluators/goal_success_rate_evaluator.py,sha256=jglDcwNu9vEZLYwq7XsQrvvIWiTPQl_Eqs0BxDfY4r0,3586
+ strands_evals/evaluators/harmfulness_evaluator.py,sha256=OeEjTogThr6FIzPFwlNiDiXzNhP2ET3hzw7NfDgcOMs,5528
+ strands_evals/evaluators/helpfulness_evaluator.py,sha256=5PvTTqOU1Q3Wvuxxa4KZvE_es_LtdL5EGvhhKU305NA,6135
+ strands_evals/evaluators/interactions_evaluator.py,sha256=j1zCrLGQxSzFe02eM4qkWHyJtokKWFqhzKz2_AzVTLE,10822
+ strands_evals/evaluators/output_evaluator.py,sha256=0pfM4oJ9UOTeJ1NYweBufUpgCLUSPs5Xy9xrKXGyCIg,3077
+ strands_evals/evaluators/response_relevance_evaluator.py,sha256=SJf4Q65_LWEMHre4cupFdC3hMCvLwYiXPP2IS20jMFY,5885
+ strands_evals/evaluators/tool_parameter_accuracy_evaluator.py,sha256=YYBsMxS8o2UUcK75-6zBqZGwDzCmj7ymZrVEmzPB7xI,4989
+ strands_evals/evaluators/tool_selection_accuracy_evaluator.py,sha256=RdzM1peqGmawfwwYO6J5U27y2_lNp8FM-qmTKw-M8LM,4906
+ strands_evals/evaluators/trajectory_evaluator.py,sha256=YJSjsy_4E0TJl-ERmelKObF-yF0npVExKWlTPDeLmQQ,4117
  strands_evals/evaluators/prompt_templates/case_prompt_template.py,sha256=NQH3flsOxQOp4sLAQ0g6Q4YjkpuIWzNJqZ8-bSIn78c,2687
  strands_evals/evaluators/prompt_templates/prompt_templates.py,sha256=tx4auXcHo-wxsQtJ9wCJGQbbURmhsRkB4LS-CzmAwwA,14468
+ strands_evals/evaluators/prompt_templates/conciseness/__init__.py,sha256=G6L1Jw4fAPS9R0CwPcCQ77v-O9NDya5gx3Kwt-MTsvo,176
+ strands_evals/evaluators/prompt_templates/conciseness/conciseness_v0.py,sha256=02anpXMdfLS1X21oZPr2-2-M_-Cm7LeOsxMdMRxoUUM,583
  strands_evals/evaluators/prompt_templates/faithfulness/__init__.py,sha256=_18J1msOfOikbvyYefAmOJEFaBJ2G75ybkTyQeHNvRA,178
  strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py,sha256=0NVqDYnppwg-pBb6PW6PGaOuBwlYz7bVJLu0EgkI4dk,1410
  strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py,sha256=cfOeRMakkG_FL7_1ewJl3AuCJGT13ZSWM7tnInLJetw,188
@@ -23,6 +28,8 @@ strands_evals/evaluators/prompt_templates/harmfulness/__init__.py,sha256=Y2KQcYK
  strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py,sha256=uGvZlfcPxfdHpUR_If2-xpMCX6-ynFlT0MnjZzqH3xA,1108
  strands_evals/evaluators/prompt_templates/helpfulness/__init__.py,sha256=8j55Lwo3qmkVDflZWXwxjauot7IAayN5ua1X9yQS1vM,176
  strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py,sha256=MUlK5B0RDdQdcL5Ke_cp5V6sbuVZcwdlBnUoC3Ju9B4,1569
+ strands_evals/evaluators/prompt_templates/response_relevance/__init__.py,sha256=Yn7xcYvWmsby4oE6z0WYhLT-IU-z7hQ-QYNM1EwGGaE,190
+ strands_evals/evaluators/prompt_templates/response_relevance/response_relevance_v0.py,sha256=KXfNZHnQx7L4qu3o1rg7rvfnD0UjI9fIWOsMAcrDaRY,1663
  strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py,sha256=lFx-tDGveafaqAAEvVuYU3-Pj9G0-14GlSrMkq80wX0,200
  strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py,sha256=D_m1aJN7AXeA8Z2iIysxvwx52TtiozGZFWvnXvG8Tms,2259
  strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py,sha256=AiMdoIS4-MqGeeNQkzPkjBnOxW5W5-jw20Qbn6fOjik,200
@@ -31,7 +38,7 @@ strands_evals/extractors/__init__.py,sha256=Jmlrk-m8sSS_LwmCVSloIkg3BjOgRzNEezja
  strands_evals/extractors/graph_extractor.py,sha256=TeT-58JB9roqSvy2ELz1kg8WF5YO-cfLlGZTO0F9s_4,1105
  strands_evals/extractors/swarm_extractor.py,sha256=Sm1XFCkAGVdF3XDyO3iF-20I8C6sAQ8JPNP5fgotOFU,2682
  strands_evals/extractors/tools_use_extractor.py,sha256=emLL63LKldL2IA2u5wZL0ZhklZJqX0KLr5xFRt-S4i4,6600
- strands_evals/extractors/trace_extractor.py,sha256=TJKl0OdjFhh-htlV1Wxzem8TQdb0rxa-efkq_e0pAdo,7287
+ strands_evals/extractors/trace_extractor.py,sha256=Qbxi5UE9KgqQdJR1HIpqLAXdze_M9lJ_ASSaB5MOzk8,7259
  strands_evals/generators/__init__.py,sha256=B1F30DAIf0kPyBdE4PAZvSby-dTelqb_7hFJoATqVb0,89
  strands_evals/generators/experiment_generator.py,sha256=6wLTL0iG2b0YAiu0w8dDiaBxOIy7p_Fs7l3hCjgQc0w,22655
  strands_evals/generators/topic_planner.py,sha256=FtgTVDlV9hWJyO8E4Z__nEWvvrOJzmTW4y6yZ9Alx1A,2436
@@ -61,8 +68,8 @@ strands_evals/types/evaluation_report.py,sha256=vT86zO4Qn9CQbULo3aziGMdG-1qWLdcB
  strands_evals/types/trace.py,sha256=BFoEylzAlENyPH702T5MDz-_H21-Wfx-FFTSXX1tDfY,4844
  strands_evals/types/simulation/__init__.py,sha256=-mz5lW6qFfIMm4dJGaP9pXY3xeiefLbB0XevjdFykkU,133
  strands_evals/types/simulation/actor.py,sha256=ESTV8165c3Ad5QT4yYmjm-A-oZdwZ0Rf0Lq7zokjTPo,1163
- strands_agents_evals-0.1.4.dist-info/METADATA,sha256=VQm_tm1Umm3fi_HfujW0Ovm_XyvQQCjEJrAL4-dGjKQ,17721
- strands_agents_evals-0.1.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- strands_agents_evals-0.1.4.dist-info/licenses/LICENSE,sha256=yIWKWnZEC7ykhOE0z330Y4XQkN0YssWOZQ-TkliALls,10141
- strands_agents_evals-0.1.4.dist-info/licenses/NOTICE,sha256=Eg13ogOmcI7JpMjxniFnKG81vwU3X8X7P_IlpvVg5RU,66
- strands_agents_evals-0.1.4.dist-info/RECORD,,
+ strands_agents_evals-0.1.5.dist-info/METADATA,sha256=cj7lVVMEKiOonsr3Zm4Qbjf-AHgZa0TsmFYvFiBpGVU,17760
+ strands_agents_evals-0.1.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ strands_agents_evals-0.1.5.dist-info/licenses/LICENSE,sha256=yIWKWnZEC7ykhOE0z330Y4XQkN0YssWOZQ-TkliALls,10141
+ strands_agents_evals-0.1.5.dist-info/licenses/NOTICE,sha256=Eg13ogOmcI7JpMjxniFnKG81vwU3X8X7P_IlpvVg5RU,66
+ strands_agents_evals-0.1.5.dist-info/RECORD,,
strands_evals/evaluators/__init__.py
@@ -1,3 +1,4 @@
+ from .conciseness_evaluator import ConcisenessEvaluator
  from .evaluator import Evaluator
  from .faithfulness_evaluator import FaithfulnessEvaluator
  from .goal_success_rate_evaluator import GoalSuccessRateEvaluator
@@ -5,6 +6,7 @@ from .harmfulness_evaluator import HarmfulnessEvaluator
  from .helpfulness_evaluator import HelpfulnessEvaluator
  from .interactions_evaluator import InteractionsEvaluator
  from .output_evaluator import OutputEvaluator
+ from .response_relevance_evaluator import ResponseRelevanceEvaluator
  from .tool_parameter_accuracy_evaluator import ToolParameterAccuracyEvaluator
  from .tool_selection_accuracy_evaluator import ToolSelectionAccuracyEvaluator
  from .trajectory_evaluator import TrajectoryEvaluator
@@ -18,6 +20,8 @@ __all__ = [
      "HarmfulnessEvaluator",
      "GoalSuccessRateEvaluator",
      "FaithfulnessEvaluator",
+     "ResponseRelevanceEvaluator",
      "ToolSelectionAccuracyEvaluator",
      "ToolParameterAccuracyEvaluator",
+     "ConcisenessEvaluator",
  ]
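
Both new evaluators are now importable from the evaluators package. A minimal usage sketch, assuming default arguments (only the ConcisenessEvaluator signature is visible in this excerpt; ResponseRelevanceEvaluator is assumed to accept a default constructor):

from strands_evals.evaluators import ConcisenessEvaluator, ResponseRelevanceEvaluator

conciseness = ConcisenessEvaluator(version="v0")  # signature shown in the file below
relevance = ResponseRelevanceEvaluator()          # assumed default constructor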
strands_evals/evaluators/conciseness_evaluator.py
@@ -0,0 +1,139 @@
+ from enum import Enum
+ from typing import cast
+
+ from pydantic import BaseModel, Field
+ from strands import Agent
+ from strands.models.model import Model
+ from typing_extensions import TypeVar, Union
+
+ from ..types.evaluation import EvaluationData, EvaluationOutput
+ from ..types.trace import EvaluationLevel, TextContent, ToolExecution, TraceLevelInput
+ from .evaluator import Evaluator
+ from .prompt_templates.conciseness import get_template
+
+ InputT = TypeVar("InputT")
+ OutputT = TypeVar("OutputT")
+
+
+ class ConcisenessScore(str, Enum):
+     """Categorical conciseness ratings."""
+
+     NOT_CONCISE = "Not Concise"
+     PARTIALLY_CONCISE = "Partially Concise"
+     PERFECTLY_CONCISE = "Perfectly Concise"
+
+
+ class ConcisenessRating(BaseModel):
+     """Structured output for conciseness evaluation."""
+
+     reasoning: str = Field(description="Step by step reasoning to derive the final score")
+     score: ConcisenessScore = Field(description="Categorical conciseness rating")
+
+
+ class ConcisenessEvaluator(Evaluator[InputT, OutputT]):
+     """Evaluates how concise the assistant's response is."""
+
+     evaluation_level = EvaluationLevel.TRACE_LEVEL
+
+     _score_mapping = {
+         ConcisenessScore.NOT_CONCISE: 0.0,
+         ConcisenessScore.PARTIALLY_CONCISE: 0.5,
+         ConcisenessScore.PERFECTLY_CONCISE: 1.0,
+     }
+
+     def __init__(
+         self,
+         version: str = "v0",
+         model: Union[Model, str, None] = None,
+         system_prompt: str | None = None,
+         include_inputs: bool = True,
+     ):
+         super().__init__()
+         self.system_prompt = system_prompt or get_template(version).SYSTEM_PROMPT
+         self.version = version
+         self.model = model
+         self.include_inputs = include_inputs
+
+     def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+         parsed_input = self._get_last_turn(evaluation_case)
+         prompt = self._format_prompt(parsed_input)
+         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+         result = evaluator_agent(prompt, structured_output_model=ConcisenessRating)
+         return self._create_evaluation_output(result)
+
+     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+         parsed_input = self._get_last_turn(evaluation_case)
+         prompt = self._format_prompt(parsed_input)
+         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+         result = await evaluator_agent.invoke_async(prompt, structured_output_model=ConcisenessRating)
+         return self._create_evaluation_output(result)
+
+     def _create_evaluation_output(self, result) -> list[EvaluationOutput]:
+         rating = cast(ConcisenessRating, result.structured_output)
+         normalized_score = self._score_mapping[rating.score]
+         return [
+             EvaluationOutput(
+                 score=normalized_score,
+                 test_pass=normalized_score >= 0.5,
+                 reason=rating.reasoning,
+                 label=rating.score,
+             )
+         ]
+
+     def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
+         """Extract the most recent turn from the conversation for evaluation."""
+         parsed_inputs = self._parse_trajectory(evaluation_case)
+         if not parsed_inputs:
+             raise ValueError(
+                 "No turn-level inputs could be parsed from the trajectory. "
+                 "Ensure actual_trajectory is a Session with at least one AgentInvocationSpan."
+             )
+         return parsed_inputs[-1]
+
+     def _extract_user_prompt(self, parsed_input: TraceLevelInput) -> str:
+         """Extract user prompt from last message in session history.
+
+         Args:
+             parsed_input: Trace-level input containing session history
+
+         Returns:
+             User prompt text, or empty string if not available
+         """
+         if not parsed_input.session_history:
+             return ""
+
+         last_msg = parsed_input.session_history[-1]
+         if not isinstance(last_msg, list) and self._has_text_content(last_msg):
+             first_content = last_msg.content[0]
+             if isinstance(first_content, TextContent):
+                 return first_content.text
+
+         return ""
+
+     def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
+         """Format evaluation prompt from parsed trace data.
+
+         Args:
+             parsed_input: Trace-level input containing agent response and session history
+
+         Returns:
+             Formatted prompt string with conversation history and target turn
+         """
+         parts = []
+
+         if parsed_input.session_history:
+             history_lines = []
+             for msg in parsed_input.session_history:
+                 if isinstance(msg, list) and msg and isinstance(msg[0], ToolExecution):
+                     continue  # Skip tool execution lists
+                 if not isinstance(msg, list) and self._has_text_content(msg):
+                     first_content = msg.content[0]
+                     if isinstance(first_content, TextContent):
+                         history_lines.append(f"{msg.role.value.capitalize()}: {first_content.text}")
+             history_str = "\n".join(history_lines)
+             parts.append(f"# Previous turns:\n{history_str}")
+
+         user_prompt = self._extract_user_prompt(parsed_input)
+         parts.append(f"# Target turn to evaluate:\nUser: {user_prompt}\nAssistant: {parsed_input.agent_response.text}")
+
+         return "\n\n".join(parts)
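
For reference, the label-to-score normalization defined above works out as follows; the values are copied from _score_mapping and the pass rule from _create_evaluation_output:

# Restates ConcisenessEvaluator._score_mapping; test_pass is normalized_score >= 0.5.
score_mapping = {"Not Concise": 0.0, "Partially Concise": 0.5, "Perfectly Concise": 1.0}
for label, score in score_mapping.items():
    print(f"{label}: score={score}, test_pass={score >= 0.5}")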
strands_evals/evaluators/evaluator.py
@@ -63,6 +63,10 @@ class Evaluator(Generic[InputT, OutputT]):

      @staticmethod
      def _default_aggregator(outputs: list[EvaluationOutput]) -> tuple[float, bool, str]:
+         # Handle empty outputs list to avoid division by zero
+         if not outputs:
+             return (0.0, False, "No evaluation outputs produced")
+
          avg_score = sum(o.score for o in outputs) / len(outputs)
          all_pass = all(o.test_pass for o in outputs)
          combined_reason = " | ".join(o.reason for o in outputs if o.reason)
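
The added guard returns a sentinel triple when an evaluator produces no outputs; without it, the average on the following line would raise ZeroDivisionError. A condensed restatement of the aggregator after this change (the standalone function name is invented for the sketch; the field names match the surrounding code):

def default_aggregator_sketch(outputs):
    # Mirrors Evaluator._default_aggregator as of 0.1.5.
    if not outputs:
        return (0.0, False, "No evaluation outputs produced")
    avg_score = sum(o.score for o in outputs) / len(outputs)
    all_pass = all(o.test_pass for o in outputs)
    combined_reason = " | ".join(o.reason for o in outputs if o.reason)
    return (avg_score, all_pass, combined_reason)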
strands_evals/evaluators/faithfulness_evaluator.py
@@ -1,4 +1,5 @@
  from enum import Enum
+ from typing import cast

  from pydantic import BaseModel, Field
  from strands import Agent
@@ -59,29 +60,33 @@ class FaithfulnessEvaluator(Evaluator[InputT, OutputT]):
          parsed_input = self._get_last_turn(evaluation_case)
          prompt = self._format_prompt(parsed_input)
          evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-         rating = evaluator_agent.structured_output(FaithfulnessRating, prompt)
+         result = evaluator_agent(prompt, structured_output_model=FaithfulnessRating)
+         rating = cast(FaithfulnessRating, result.structured_output)
          normalized_score = self._score_mapping[rating.score]
-         result = EvaluationOutput(
-             score=normalized_score,
-             test_pass=normalized_score >= 0.5,
-             reason=rating.reasoning,
-             label=rating.score,
-         )
-         return [result]
+         return [
+             EvaluationOutput(
+                 score=normalized_score,
+                 test_pass=normalized_score >= 0.5,
+                 reason=rating.reasoning,
+                 label=rating.score,
+             )
+         ]

      async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
          parsed_input = self._get_last_turn(evaluation_case)
          prompt = self._format_prompt(parsed_input)
          evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-         rating = await evaluator_agent.structured_output_async(FaithfulnessRating, prompt)
+         result = await evaluator_agent.invoke_async(prompt, structured_output_model=FaithfulnessRating)
+         rating = cast(FaithfulnessRating, result.structured_output)
          normalized_score = self._score_mapping[rating.score]
-         result = EvaluationOutput(
-             score=normalized_score,
-             test_pass=normalized_score >= 0.5,
-             reason=rating.reasoning,
-             label=rating.score,
-         )
-         return [result]
+         return [
+             EvaluationOutput(
+                 score=normalized_score,
+                 test_pass=normalized_score >= 0.5,
+                 reason=rating.reasoning,
+                 label=rating.score,
+             )
+         ]

      def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
          """Extract the most recent turn from the conversation for evaluation."""
strands_evals/evaluators/goal_success_rate_evaluator.py
@@ -1,4 +1,5 @@
  from enum import Enum
+ from typing import cast

  from pydantic import BaseModel, Field
  from strands import Agent
@@ -53,29 +54,33 @@ class GoalSuccessRateEvaluator(Evaluator[InputT, OutputT]):
          session_input = self._parse_trajectory(evaluation_case)
          prompt = self._format_prompt(session_input)
          evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-         rating = evaluator_agent.structured_output(GoalSuccessRating, prompt)
+         result = evaluator_agent(prompt, structured_output_model=GoalSuccessRating)
+         rating = cast(GoalSuccessRating, result.structured_output)
          normalized_score = self._score_mapping[rating.score]
-         result = EvaluationOutput(
-             score=normalized_score,
-             test_pass=normalized_score >= 1.0,
-             reason=rating.reasoning,
-             label=rating.score,
-         )
-         return [result]
+         return [
+             EvaluationOutput(
+                 score=normalized_score,
+                 test_pass=normalized_score >= 1.0,
+                 reason=rating.reasoning,
+                 label=rating.score,
+             )
+         ]

      async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
          session_input = self._parse_trajectory(evaluation_case)
          prompt = self._format_prompt(session_input)
          evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-         rating = await evaluator_agent.structured_output_async(GoalSuccessRating, prompt)
+         result = await evaluator_agent.invoke_async(prompt, structured_output_model=GoalSuccessRating)
+         rating = cast(GoalSuccessRating, result.structured_output)
          normalized_score = self._score_mapping[rating.score]
-         result = EvaluationOutput(
-             score=normalized_score,
-             test_pass=normalized_score >= 1.0,
-             reason=rating.reasoning,
-             label=rating.score,
-         )
-         return [result]
+         return [
+             EvaluationOutput(
+                 score=normalized_score,
+                 test_pass=normalized_score >= 1.0,
+                 reason=rating.reasoning,
+                 label=rating.score,
+             )
+         ]

      def _format_prompt(self, session_input: SessionLevelInput) -> str:
          """Format evaluation prompt from session-level input."""
strands_evals/evaluators/harmfulness_evaluator.py
@@ -1,4 +1,5 @@
  from enum import Enum
+ from typing import cast

  from pydantic import BaseModel, Field
  from strands import Agent
@@ -52,29 +53,33 @@ class HarmfulnessEvaluator(Evaluator[InputT, OutputT]):
          parsed_input = self._get_last_turn(evaluation_case)
          prompt = self._format_prompt(parsed_input)
          evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-         rating = evaluator_agent.structured_output(HarmfulnessRating, prompt)
+         result = evaluator_agent(prompt, structured_output_model=HarmfulnessRating)
+         rating = cast(HarmfulnessRating, result.structured_output)
          normalized_score = self._score_mapping[rating.score]
-         result = EvaluationOutput(
-             score=normalized_score,
-             test_pass=normalized_score == 1.0,
-             reason=rating.reasoning,
-             label=rating.score,
-         )
-         return [result]
+         return [
+             EvaluationOutput(
+                 score=normalized_score,
+                 test_pass=normalized_score == 1.0,
+                 reason=rating.reasoning,
+                 label=rating.score,
+             )
+         ]

      async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
          parsed_input = self._get_last_turn(evaluation_case)
          prompt = self._format_prompt(parsed_input)
          evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-         rating = await evaluator_agent.structured_output_async(HarmfulnessRating, prompt)
+         result = await evaluator_agent.invoke_async(prompt, structured_output_model=HarmfulnessRating)
+         rating = cast(HarmfulnessRating, result.structured_output)
          normalized_score = self._score_mapping[rating.score]
-         result = EvaluationOutput(
-             score=normalized_score,
-             test_pass=normalized_score == 1.0,
-             reason=rating.reasoning,
-             label=rating.score,
-         )
-         return [result]
+         return [
+             EvaluationOutput(
+                 score=normalized_score,
+                 test_pass=normalized_score == 1.0,
+                 reason=rating.reasoning,
+                 label=rating.score,
+             )
+         ]

      def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
          """Extract the most recent turn from the conversation for evaluation."""
strands_evals/evaluators/helpfulness_evaluator.py
@@ -1,4 +1,5 @@
  from enum import Enum
+ from typing import cast

  from pydantic import BaseModel, Field
  from strands import Agent
@@ -65,29 +66,33 @@ class HelpfulnessEvaluator(Evaluator[InputT, OutputT]):
          parsed_input = self._get_last_turn(evaluation_case)
          prompt = self._format_prompt(parsed_input)
          evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-         rating = evaluator_agent.structured_output(HelpfulnessRating, prompt)
+         result = evaluator_agent(prompt, structured_output_model=HelpfulnessRating)
+         rating = cast(HelpfulnessRating, result.structured_output)
          normalized_score = self._score_mapping[rating.score]
-         result = EvaluationOutput(
-             score=normalized_score,
-             test_pass=normalized_score >= 0.5,
-             reason=rating.reasoning,
-             label=rating.score,
-         )
-         return [result]
+         return [
+             EvaluationOutput(
+                 score=normalized_score,
+                 test_pass=normalized_score >= 0.5,
+                 reason=rating.reasoning,
+                 label=rating.score,
+             )
+         ]

      async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
          parsed_input = self._get_last_turn(evaluation_case)
          prompt = self._format_prompt(parsed_input)
          evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-         rating = await evaluator_agent.structured_output_async(HelpfulnessRating, prompt)
+         result = await evaluator_agent.invoke_async(prompt, structured_output_model=HelpfulnessRating)
+         rating = cast(HelpfulnessRating, result.structured_output)
          normalized_score = self._score_mapping[rating.score]
-         result = EvaluationOutput(
-             score=normalized_score,
-             test_pass=normalized_score >= 0.5,
-             reason=rating.reasoning,
-             label=rating.score,
-         )
-         return [result]
+         return [
+             EvaluationOutput(
+                 score=normalized_score,
+                 test_pass=normalized_score >= 0.5,
+                 reason=rating.reasoning,
+                 label=rating.score,
+             )
+         ]

      def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
          """Extract the most recent turn from the conversation for evaluation."""
strands_evals/evaluators/interactions_evaluator.py
@@ -1,3 +1,5 @@
+ from typing import cast
+
  from strands import Agent
  from strands.agent.conversation_manager import SlidingWindowConversationManager
  from strands.models.model import Model
@@ -198,8 +200,8 @@ class InteractionsEvaluator(Evaluator[InputT, OutputT]):
          for i in range(num_interactions):
              is_last = i == num_interactions - 1
              evaluation_prompt = self._compose_prompt(evaluation_case, i, is_last)
-             result = evaluator_agent.structured_output(EvaluationOutput, evaluation_prompt)
-             results.append(result)
+             result = evaluator_agent(evaluation_prompt, structured_output_model=EvaluationOutput)
+             results.append(cast(EvaluationOutput, result.structured_output))

          return results

@@ -238,7 +240,7 @@ class InteractionsEvaluator(Evaluator[InputT, OutputT]):
          for i in range(num_interactions):
              is_last = i == num_interactions - 1
              evaluation_prompt = self._compose_prompt(evaluation_case, i, is_last)
-             result = await evaluator_agent.structured_output_async(EvaluationOutput, evaluation_prompt)
-             results.append(result)
+             result = await evaluator_agent.invoke_async(evaluation_prompt, structured_output_model=EvaluationOutput)
+             results.append(cast(EvaluationOutput, result.structured_output))

          return results
strands_evals/evaluators/output_evaluator.py
@@ -1,3 +1,5 @@
+ from typing import cast
+
  from strands import Agent
  from strands.models.model import Model
  from typing_extensions import TypeVar, Union
@@ -51,8 +53,8 @@ class OutputEvaluator(Evaluator[InputT, OutputT]):
          evaluation_prompt = compose_test_prompt(
              evaluation_case=evaluation_case, rubric=self.rubric, include_inputs=self.include_inputs
          )
-         result = evaluator_agent.structured_output(EvaluationOutput, evaluation_prompt)
-         return [result]
+         result = evaluator_agent(evaluation_prompt, structured_output_model=EvaluationOutput)
+         return [cast(EvaluationOutput, result.structured_output)]

      async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
          """
@@ -68,5 +70,5 @@ class OutputEvaluator(Evaluator[InputT, OutputT]):
          evaluation_prompt = compose_test_prompt(
              evaluation_case=evaluation_case, rubric=self.rubric, include_inputs=self.include_inputs
          )
-         result = await evaluator_agent.structured_output_async(EvaluationOutput, evaluation_prompt)
-         return [result]
+         result = await evaluator_agent.invoke_async(evaluation_prompt, structured_output_model=EvaluationOutput)
+         return [cast(EvaluationOutput, result.structured_output)]
strands_evals/evaluators/prompt_templates/conciseness/__init__.py
@@ -0,0 +1,11 @@
+ from . import conciseness_v0
+
+ VERSIONS = {
+     "v0": conciseness_v0,
+ }
+
+ DEFAULT_VERSION = "v0"
+
+
+ def get_template(version: str = DEFAULT_VERSION):
+     return VERSIONS[version]
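
This registry mirrors the existing per-metric template packages; ConcisenessEvaluator.__init__ (shown earlier) resolves its system prompt through it:

from strands_evals.evaluators.prompt_templates.conciseness import get_template

# "v0" is currently the only registered version (DEFAULT_VERSION above).
system_prompt = get_template("v0").SYSTEM_PROMPT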
strands_evals/evaluators/prompt_templates/conciseness/conciseness_v0.py
@@ -0,0 +1,9 @@
+ SYSTEM_PROMPT = """You are evaluating how concise the Assistant's response is.
+ A concise response provides exactly what was requested using the minimum necessary words, without extra explanations, pleasantries, or repetition unless specifically asked for.
+
+ ## Scoring
+ - Perfectly Concise: delivers exactly what was asked with no unnecessary content
+ - Partially Concise: minor extra wording but still focused
+ - Not Concise: verbose, repetitive, or includes substantial unnecessary content
+
+ **IMPORTANT**: The agent prompt and tools ALWAYS takes priority over your own knowledge."""
strands_evals/evaluators/prompt_templates/response_relevance/__init__.py
@@ -0,0 +1,11 @@
+ from . import response_relevance_v0
+
+ VERSIONS = {
+     "v0": response_relevance_v0,
+ }
+
+ DEFAULT_VERSION = "v0"
+
+
+ def get_template(version: str = DEFAULT_VERSION):
+     return VERSIONS[version]
strands_evals/evaluators/prompt_templates/response_relevance/response_relevance_v0.py
@@ -0,0 +1,29 @@
+ SYSTEM_PROMPT = """You are an objective judge evaluating the relevance of an AI assistant's response to the user's question. Your task is to assess how focused the response is on addressing the given question.
+
+ # Evaluation Guidelines:
+
+ When evaluating the relevance of the response, consider the following rubrics:
+
+ - If everything in the response can be understood to directly address the input, the response is perfectly relevant.
+ - If anything in the response is unrelated to the input, the response is less relevant.
+ - Relevance only evaluates whether the response is on topic. Content that indicates that the assistant understood the question, but was unable to answer it truthfully, faithfully, coherently or correctly still counts as a relevant response. Only content that is extraneous to answering the question should be penalized.
+ - Duplicate information does not penalize relevance. The response could say the same thing multiple times. If that thing is a relevant answer to the user's query, relevance is not penalized.
+
+ # Rating Scale:
+
+ 1. Not At All
+ - No part of the response is relevant to the question
+
+ 2. Not Generally
+ - An overwhelming amount of the response is irrelevant or the relevant information is not a direct answer
+
+ 3. Neutral/Mixed
+ - Roughly half of the response is relevant to the question
+
+ 4. Generally Yes
+ - An overwhelming amount of the response is relevant to the question
+
+ 5. Completely Yes
+ - Every piece of the response is relevant to the question
+
+ IMPORTANT: The tool output ALWAYS takes priority over your own knowledge. Focus on whether the response addresses the user's question, not on factual accuracy."""
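
The ResponseRelevanceEvaluator that consumes this prompt (strands_evals/evaluators/response_relevance_evaluator.py in the RECORD diff) is not shown in this excerpt, so its exact score mapping is unknown. As an assumption only, a 5-point scale like this one is commonly normalized to [0, 1] in equal steps:

# Assumed normalization; not taken from the package source.
scale = ["Not At All", "Not Generally", "Neutral/Mixed", "Generally Yes", "Completely Yes"]
normalized = {label: i / (len(scale) - 1) for i, label in enumerate(scale)}
print(normalized)  # {'Not At All': 0.0, ..., 'Completely Yes': 1.0}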