strands-agents-evals 0.1.0 (strands_agents_evals-0.1.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. strands_agents_evals-0.1.0.dist-info/METADATA +408 -0
  2. strands_agents_evals-0.1.0.dist-info/RECORD +68 -0
  3. strands_agents_evals-0.1.0.dist-info/WHEEL +4 -0
  4. strands_agents_evals-0.1.0.dist-info/licenses/LICENSE +175 -0
  5. strands_agents_evals-0.1.0.dist-info/licenses/NOTICE +1 -0
  6. strands_evals/__init__.py +22 -0
  7. strands_evals/case.py +53 -0
  8. strands_evals/display/display_console.py +150 -0
  9. strands_evals/evaluators/__init__.py +23 -0
  10. strands_evals/evaluators/evaluator.py +182 -0
  11. strands_evals/evaluators/faithfulness_evaluator.py +116 -0
  12. strands_evals/evaluators/goal_success_rate_evaluator.py +90 -0
  13. strands_evals/evaluators/harmfulness_evaluator.py +135 -0
  14. strands_evals/evaluators/helpfulness_evaluator.py +148 -0
  15. strands_evals/evaluators/interactions_evaluator.py +244 -0
  16. strands_evals/evaluators/output_evaluator.py +72 -0
  17. strands_evals/evaluators/prompt_templates/case_prompt_template.py +63 -0
  18. strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +11 -0
  19. strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +30 -0
  20. strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +11 -0
  21. strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +17 -0
  22. strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +11 -0
  23. strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +8 -0
  24. strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +11 -0
  25. strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +38 -0
  26. strands_evals/evaluators/prompt_templates/prompt_templates.py +176 -0
  27. strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +11 -0
  28. strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +40 -0
  29. strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +11 -0
  30. strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +23 -0
  31. strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +112 -0
  32. strands_evals/evaluators/tool_selection_accuracy_evaluator.py +112 -0
  33. strands_evals/evaluators/trajectory_evaluator.py +100 -0
  34. strands_evals/experiment.py +652 -0
  35. strands_evals/extractors/__init__.py +3 -0
  36. strands_evals/extractors/graph_extractor.py +30 -0
  37. strands_evals/extractors/swarm_extractor.py +73 -0
  38. strands_evals/extractors/tools_use_extractor.py +164 -0
  39. strands_evals/extractors/trace_extractor.py +166 -0
  40. strands_evals/generators/__init__.py +3 -0
  41. strands_evals/generators/experiment_generator.py +498 -0
  42. strands_evals/generators/prompt_template/prompt_templates.py +75 -0
  43. strands_evals/generators/topic_planner.py +60 -0
  44. strands_evals/mappers/__init__.py +6 -0
  45. strands_evals/mappers/session_mapper.py +27 -0
  46. strands_evals/mappers/strands_in_memory_session_mapper.py +473 -0
  47. strands_evals/simulation/README.md +323 -0
  48. strands_evals/simulation/__init__.py +6 -0
  49. strands_evals/simulation/actor_simulator.py +292 -0
  50. strands_evals/simulation/profiles/__init__.py +5 -0
  51. strands_evals/simulation/profiles/actor_profile.py +26 -0
  52. strands_evals/simulation/prompt_templates/__init__.py +11 -0
  53. strands_evals/simulation/prompt_templates/actor_profile_extraction.py +25 -0
  54. strands_evals/simulation/prompt_templates/actor_system_prompt.py +64 -0
  55. strands_evals/simulation/prompt_templates/goal_completion.py +27 -0
  56. strands_evals/simulation/tools/__init__.py +5 -0
  57. strands_evals/simulation/tools/goal_completion.py +93 -0
  58. strands_evals/telemetry/__init__.py +15 -0
  59. strands_evals/telemetry/_cloudwatch_logger.py +209 -0
  60. strands_evals/telemetry/config.py +207 -0
  61. strands_evals/telemetry/tracer.py +38 -0
  62. strands_evals/tools/evaluation_tools.py +67 -0
  63. strands_evals/types/__init__.py +11 -0
  64. strands_evals/types/evaluation.py +105 -0
  65. strands_evals/types/evaluation_report.py +244 -0
  66. strands_evals/types/simulation/__init__.py +5 -0
  67. strands_evals/types/simulation/actor.py +34 -0
  68. strands_evals/types/trace.py +205 -0
strands_evals/__init__.py ADDED
@@ -0,0 +1,22 @@
+ __version__ = "0.1.0"
+
+ from . import evaluators, extractors, generators, simulation, telemetry, types
+ from .case import Case
+ from .experiment import Experiment
+ from .simulation import ActorSimulator, UserSimulator
+ from .telemetry import StrandsEvalsTelemetry, get_tracer
+
+ __all__ = [
+     "Experiment",
+     "Case",
+     "evaluators",
+     "extractors",
+     "types",
+     "generators",
+     "simulation",
+     "telemetry",
+     "StrandsEvalsTelemetry",
+     "get_tracer",
+     "ActorSimulator",
+     "UserSimulator",
+ ]
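The package root re-exports the main entry points, so downstream code can import them without reaching into submodules. A minimal usage sketch based only on the names exported above (the Experiment constructor is defined in strands_evals/experiment.py and is not shown in this section, so it is only referenced here):

# Sketch of the public surface exposed by strands_evals/__init__.py.
from strands_evals import Case, Experiment, evaluators

case = Case(input="What is 2x2?", expected_output="2x2 is 4.")
print(case.session_id)  # a UUID4 is generated automatically (see case.py below)
# Experiment and the evaluators subpackage are re-exported for wiring cases to evaluators.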
strands_evals/case.py ADDED
@@ -0,0 +1,53 @@
+ import uuid
+
+ from pydantic import BaseModel, Field
+ from typing_extensions import Any, Generic, TypeVar
+
+ from .types.evaluation import Interaction
+
+ InputT = TypeVar("InputT")
+ OutputT = TypeVar("OutputT")
+
+
+ class Case(BaseModel, Generic[InputT, OutputT]):
+     """
+     A single test case, representing a row in an Experiment.
+
+     Each test case represents a single test scenario with inputs to test.
+     Optionally, a test case may contain a name, expected outputs, an expected trajectory, expected interactions,
+     and arbitrary metadata.
+
+     Attributes:
+         input: The input to the task, e.g. the query to the agent.
+         name: The name of the test case. This will be used to identify the test in the summary report.
+         session_id: The session ID for the test case. Automatically generates a UUID4 if not provided.
+         expected_output: The expected response given the input, e.g. the agent's response.
+         expected_trajectory: The expected trajectory of a task given the input, e.g. the sequence of tools.
+         expected_interactions: The expected interaction sequence given the input (ideal for multi-agent systems).
+         metadata: Additional information about the test case.
+
+     Example:
+         case = Case[str, str](name="Simple Math",
+                               input="What is 2x2?",
+                               expected_output="2x2 is 4.",
+                               expected_trajectory=["calculator"],
+                               metadata={"category": "math"})
+
+         simple_test_case = Case(input="What is 2x2?")
+
+         case_with_interaction = Case(
+             input="What is 2x2?",
+             expected_interactions=[
+                 {"agent_1": "Hello, what would you like to do?"},
+                 {"agent_2": "What is 2x2?"}
+             ]
+         )
+     """
+
+     name: str | None = None
+     session_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
+     input: InputT
+     expected_output: OutputT | None = None
+     expected_trajectory: list[Any] | None = None
+     expected_interactions: list[Interaction] | None = None
+     metadata: dict[str, Any] | None = None
strands_evals/display/display_console.py ADDED
@@ -0,0 +1,150 @@
+ from rich.console import Console
+ from rich.panel import Panel
+ from rich.prompt import Prompt
+ from rich.table import Table
+ from rich.tree import Tree
+
+ console = Console()
+
+
+ class CollapsibleTableReportDisplay:
+     """
+     Interactive console display for evaluation reports with expandable/collapsible test case details.
+
+     This class provides an interactive rich console interface for displaying evaluation results
+     with the ability to expand or collapse individual test cases to show or hide details.
+
+     Attributes:
+         items: Dictionary of test cases with their details and expansion state
+         overall_score: The overall evaluation score
+         include_input: Whether to display input values in the table
+         include_output: Whether to display output values in the table
+
+
+     items should follow this structure:
+     {
+         "0": {"details": {
+             "name": str,
+             "score": float,
+             "test_pass": bool,
+             "reason": str,
+             ...  # will display everything that's given, like actual_output etc.
+         },
+         "detailed_results": list[EvaluationOutput],
+         "expanded": bool,
+         }
+     }
+     """
+
+     def __init__(self, items: dict, overall_score: float):
+         """
+         Initialize the collapsible table display.
+
+         Args:
+             items: Dictionary of test cases with their details and expansion state
+             overall_score: The overall evaluation score
+         """
+         self.items = items
+         self.overall_score = overall_score
+
+     def display_items(self):
+         """
+         Display the evaluation report as a rich table with expandable/collapsible rows.
+
+         Renders a table showing test case results with expansion indicators.
+         Expanded rows show full details, while collapsed rows show minimal information.
+         """
+         overall_score_string = f"[bold blue]Overall Score: {self.overall_score:.2f}[/bold blue]"
+         pass_count = sum([1 if case["details"]["test_pass"] else 0 for case in self.items.values()])
+         pass_rate = pass_count / len(self.items)
+         overall_pass_rate = f"[bold blue]Pass Rate: {pass_rate}[/bold blue]"
+         spacing = " "
+         console.print(Panel(f"{overall_score_string}{spacing}{overall_pass_rate}", title="📊 Evaluation Report"))
+
+         # Create Table and headers
+         table = Table(title="Test Case Results", show_lines=True)
+         colors_mapping = {
+             "index": "cyan",
+             "name": "magenta",
+             "score": "green",
+         }
+         headers = ["index"] + list(self.items["0"]["details"].keys())
+         for header in headers:
+             if header in colors_mapping:
+                 table.add_column(header, style=colors_mapping[header])
+             else:
+                 table.add_column(header, style="yellow")
+
+         for key, item in self.items.items():
+             symbol = "▼" if item["expanded"] else "▶"
+             case = item["details"]
+             pass_status = "✅" if case["test_pass"] else "❌"
+             other_fields = list(case.values())[3:]
+             if item["expanded"]:  # We always render at least the index, name, score, test_pass, and reason
+                 renderables = [
+                     f"{symbol} {key}",
+                     case.get("name", f"Test {key}"),
+                     case.get("score"),
+                     pass_status,
+                 ] + other_fields
+             else:
+                 renderables = [
+                     f"{symbol} {key}",
+                     case.get("name", f"Test {key}"),
+                     case.get("score"),
+                     pass_status,
+                 ] + len(other_fields) * ["..."]
+             table.add_row(*renderables)
+
+         console.print(table)
+
+         for key, item in self.items.items():
+             if item["expanded"] and item.get("detailed_results"):
+                 detailed_results = item["detailed_results"]
+                 if len(detailed_results) > 1:  # Only show if multiple metrics
+                     tree = Tree(f"[bold cyan]📋 Detailed Metrics for Case {key}[/bold cyan]")
+                     for i, result in enumerate(detailed_results):
+                         status = "✅" if result.test_pass else "❌"
+                         metric_node = tree.add(f"[yellow]Metric {i + 1}[/yellow]: Score={result.score:.2f} {status}")
+                         if result.reason:
+                             metric_node.add(f"[dim]{result.reason}[/dim]")
+                     console.print(tree)
+                     console.print()
+
+     def run(self, static: bool = False):
+         """
+         Run the interactive display loop. If static, the terminal will only display the report.
+
+         Args:
+             static: Whether to display only or allow interaction with the report.
+
+         Provides an interactive console interface where users can:
+         - Expand/collapse individual test cases by entering their number
+         - Expand all test cases with 'o'
+         - Collapse all test cases with 'c'
+         - Quit the interactive view with 'q'
+         """
+         while True:
+             console.clear()
+             self.display_items()
+
+             if static:
+                 return
+
+             choice = Prompt.ask(
+                 "\nEnter the test case number to expand/collapse it, o to expand all, "
+                 "and c to collapse all (q to quit)."
+             )
+
+             if choice.lower() == "q":
+                 break
+
+             if choice.lower() == "o":
+                 for key in self.items:
+                     self.items[key]["expanded"] = True
+             elif choice.lower() == "c":
+                 for key in self.items:
+                     self.items[key]["expanded"] = False
+             else:
+                 if choice in self.items:
+                     self.items[choice]["expanded"] = not self.items[choice]["expanded"]
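For reference, a short sketch of driving this display directly; the items dict follows the structure documented in the class docstring, and the field values are invented for illustration (strings are used so rich can render each cell as-is):

# Hypothetical report data matching the documented items structure.
items = {
    "0": {
        "details": {"name": "Simple Math", "score": "1.00", "test_pass": True, "reason": "Correct answer"},
        "detailed_results": [],
        "expanded": False,
    }
}

display = CollapsibleTableReportDisplay(items=items, overall_score=1.0)
display.run(static=True)  # render the panel and table once, without the interactive prompt loop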
strands_evals/evaluators/__init__.py ADDED
@@ -0,0 +1,23 @@
+ from .evaluator import Evaluator
+ from .faithfulness_evaluator import FaithfulnessEvaluator
+ from .goal_success_rate_evaluator import GoalSuccessRateEvaluator
+ from .harmfulness_evaluator import HarmfulnessEvaluator
+ from .helpfulness_evaluator import HelpfulnessEvaluator
+ from .interactions_evaluator import InteractionsEvaluator
+ from .output_evaluator import OutputEvaluator
+ from .tool_parameter_accuracy_evaluator import ToolParameterAccuracyEvaluator
+ from .tool_selection_accuracy_evaluator import ToolSelectionAccuracyEvaluator
+ from .trajectory_evaluator import TrajectoryEvaluator
+
+ __all__ = [
+     "Evaluator",
+     "OutputEvaluator",
+     "TrajectoryEvaluator",
+     "InteractionsEvaluator",
+     "HelpfulnessEvaluator",
+     "HarmfulnessEvaluator",
+     "GoalSuccessRateEvaluator",
+     "FaithfulnessEvaluator",
+     "ToolSelectionAccuracyEvaluator",
+     "ToolParameterAccuracyEvaluator",
+ ]
strands_evals/evaluators/evaluator.py ADDED
@@ -0,0 +1,182 @@
+ import inspect
+ import logging
+
+ from strands.models.model import Model
+ from typing_extensions import Any, Generic, TypeGuard, TypeVar, Union
+
+ from ..extractors import TraceExtractor
+ from ..types.evaluation import EvaluationData, EvaluationOutput
+ from ..types.trace import AssistantMessage, Context, EvaluationLevel, Session, TextContent, ToolConfig, UserMessage
+
+ logger = logging.getLogger(__name__)
+
+ InputT = TypeVar("InputT")
+ OutputT = TypeVar("OutputT")
+
+ DEFAULT_BEDROCK_MODEL_ID = "us.anthropic.claude-sonnet-4-20250514-v1:0"
+
+
+ class Evaluator(Generic[InputT, OutputT]):
+     """
+     Base class for evaluators.
+
+     Evaluators can assess the performance of a task on all test cases.
+     Subclasses must implement the `evaluate` method.
+     """
+
+     # Optional: subclasses can set this to enable trace parsing
+     evaluation_level: EvaluationLevel | None = None
+     _trace_extractor: TraceExtractor | None = None
+
+     def __init__(self, trace_extractor: TraceExtractor | None = None):
+         """Initialize evaluator with optional custom trace extractor.
+
+         Args:
+             trace_extractor: Custom trace extractor. If None and evaluation_level is set,
+                 a default TraceExtractor will be created.
+         """
+         self.aggregator = self._default_aggregator
+         if trace_extractor:
+             self._trace_extractor = trace_extractor
+         elif self.evaluation_level:
+             self._trace_extractor = TraceExtractor(self.evaluation_level)
+
+     def _get_model_id(self, model: Union[Model, str, None]) -> str:
+         """Extract model_id from a Model instance or string for serialization.
+
+         This helper method should be called in subclass __init__ methods that accept a model parameter.
+
+         Args:
+             model: Model instance, string model ID, or None
+
+         Returns:
+             The model ID string, DEFAULT_BEDROCK_MODEL_ID if None, or an empty string for invalid types
+         """
+         if isinstance(model, str):
+             return model
+         elif isinstance(model, Model) and hasattr(model, "config") and isinstance(model.config, dict):
+             return model.config.get("model_id", "")
+         elif model is None:
+             return DEFAULT_BEDROCK_MODEL_ID
+         else:
+             return ""
+
+     @staticmethod
+     def _default_aggregator(outputs: list[EvaluationOutput]) -> tuple[float, bool, str]:
+         avg_score = sum(o.score for o in outputs) / len(outputs)
+         all_pass = all(o.test_pass for o in outputs)
+         combined_reason = " | ".join(o.reason for o in outputs if o.reason)
+         return avg_score, all_pass, combined_reason
+
+     def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+         """
+         Evaluate the performance of the task on the given test cases.
+
+         Args:
+             evaluation_case: The test case with all of the necessary context to be evaluated.
+
+         Raises:
+             NotImplementedError: This method is not implemented in the base class.
+         """
+         raise NotImplementedError("This method should be implemented in subclasses.")
+
+     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+         """
+         Evaluate the performance of the task on the given test cases asynchronously.
+
+         Args:
+             evaluation_case: The test case with all of the necessary context to be evaluated.
+
+         Raises:
+             NotImplementedError: This method is not implemented in the base class.
+         """
+         raise NotImplementedError(
+             "This method should be implemented in subclasses, especially if you want to run evaluations asynchronously."
+         )
+
+     def _parse_trajectory(self, evaluation_case: EvaluationData[InputT, OutputT]) -> Any:
+         """Parse Session trajectory using TraceExtractor."""
+         if not self._trace_extractor:
+             raise ValueError("No trace extractor configured. Set evaluation_level or provide trace_extractor.")
+
+         trajectory = evaluation_case.actual_trajectory
+         if not isinstance(trajectory, Session):
+             raise TypeError(
+                 f"Trace parsing requires actual_trajectory to be a Session object, got {type(trajectory).__name__}."
+             )
+
+         return self._trace_extractor.extract(trajectory)
+
+     def _format_tools(self, tools: list[ToolConfig]) -> str:
+         """Format available tools for prompt display."""
+         return "\n".join([f"- {tool.name}: {tool.description or 'No description'}" for tool in tools])
+
+     def _format_session_history(self, contexts: list[Context]) -> str:
+         """Format session history with tool executions for prompt display."""
+         lines = []
+         for ctx in contexts:
+             lines.append(f"User: {ctx.user_prompt.text}")
+             if ctx.tool_execution_history:
+                 for tool_exec in ctx.tool_execution_history:
+                     lines.append(f"Action: {tool_exec.tool_call.name}({tool_exec.tool_call.arguments})")
+                     lines.append(f"Tool: {tool_exec.tool_result.content}")
+             lines.append(f"Assistant: {ctx.agent_response.text}")
+         return "\n".join(lines)
+
+     def _has_text_content(self, msg: UserMessage | AssistantMessage) -> TypeGuard[UserMessage | AssistantMessage]:
+         """Check if a message object has accessible text content.
+
+         Args:
+             msg: Message object to check (UserMessage or AssistantMessage)
+
+         Returns:
+             True if msg has a content attribute with at least one item that is TextContent
+         """
+         return (
+             hasattr(msg, "content")
+             and bool(msg.content)
+             and len(msg.content) > 0
+             and isinstance(msg.content[0], TextContent)
+         )
+
+     @classmethod
+     def get_type_name(cls) -> str:
+         """
+         Get the name of the evaluator type.
+
+         Returns:
+             str: The name of the evaluator type.
+         """
+         return cls.__name__
+
+     def to_dict(self) -> dict:
+         """
+         Convert the evaluator into a dictionary.
+
+         Returns:
+             dict: A dictionary containing the evaluator's information. Omits private attributes
+                 (attributes starting with '_') and attributes with default values.
+         """
+
+         _dict = {"evaluator_type": self.get_type_name()}
+
+         # Get default values from __init__ signature
+         sig = inspect.signature(self.__class__.__init__)
+         defaults = {k: v.default for k, v in sig.parameters.items() if v.default != inspect.Parameter.empty}
+         exclude_attrs = {"aggregator"}
+         for k, v in self.__dict__.items():
+             if not k.startswith("_") and k not in exclude_attrs:
+                 # Handle the model attribute specially
+                 if k == "model":
+                     if isinstance(v, Model):
+                         # Serialize Model instance to model_id
+                         _dict["model_id"] = self._get_model_id(v)
+                     elif v is None and "model" in defaults and defaults["model"] is None:
+                         # model=None is the default; serialize as model_id with the default value
+                         _dict["model_id"] = self._get_model_id(None)
+                     elif v is not None:
+                         # String model ID, include as-is
+                         _dict[k] = v
+                 elif k not in defaults or v != defaults[k]:
+                     _dict[k] = v
+         return _dict
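Concrete evaluators subclass Evaluator and implement evaluate (and usually evaluate_async). A minimal sketch of a custom evaluator, assuming EvaluationData exposes actual_output and expected_output fields (its definition lives in strands_evals/types/evaluation.py and is not shown in this section):

# Hypothetical subclass; the EvaluationData field names are assumptions.
class ExactMatchEvaluator(Evaluator[str, str]):
    def evaluate(self, evaluation_case: EvaluationData[str, str]) -> list[EvaluationOutput]:
        matched = evaluation_case.actual_output == evaluation_case.expected_output  # assumed fields
        return [
            EvaluationOutput(
                score=1.0 if matched else 0.0,
                test_pass=matched,
                reason="Exact match" if matched else "Output differs from the expected output",
            )
        ]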
strands_evals/evaluators/faithfulness_evaluator.py ADDED
@@ -0,0 +1,116 @@
+ from enum import Enum
+
+ from pydantic import BaseModel, Field
+ from strands import Agent
+ from strands.models.model import Model
+ from typing_extensions import TypeVar, Union
+
+ from ..types.evaluation import EvaluationData, EvaluationOutput
+ from ..types.trace import EvaluationLevel, TraceLevelInput
+ from .evaluator import Evaluator
+ from .prompt_templates.faithfulness import get_template
+
+ InputT = TypeVar("InputT")
+ OutputT = TypeVar("OutputT")
+
+
+ class FaithfulnessScore(str, Enum):
+     """Categorical faithfulness ratings."""
+
+     NOT_AT_ALL = "Not At All"
+     NOT_GENERALLY = "Not Generally"
+     NEUTRAL = "Neutral/Mixed"
+     GENERALLY_YES = "Generally Yes"
+     COMPLETELY_YES = "Completely Yes"
+
+
+ class FaithfulnessRating(BaseModel):
+     """Structured output for faithfulness evaluation."""
+
+     reasoning: str = Field(description="Step by step reasoning to derive the final score")
+     score: FaithfulnessScore = Field(description="Categorical faithfulness rating")
+
+
+ class FaithfulnessEvaluator(Evaluator[InputT, OutputT]):
+     """Evaluates faithfulness of agent responses against conversation history."""
+
+     evaluation_level = EvaluationLevel.TRACE_LEVEL
+
+     _score_mapping = {
+         FaithfulnessScore.NOT_AT_ALL: 0.0,
+         FaithfulnessScore.NOT_GENERALLY: 0.25,
+         FaithfulnessScore.NEUTRAL: 0.5,
+         FaithfulnessScore.GENERALLY_YES: 0.75,
+         FaithfulnessScore.COMPLETELY_YES: 1.0,
+     }
+
+     def __init__(
+         self,
+         version: str = "v0",
+         model: Union[Model, str, None] = None,
+         system_prompt: str | None = None,
+     ):
+         super().__init__()
+         self.system_prompt = system_prompt if system_prompt is not None else get_template(version).SYSTEM_PROMPT
+         self.version = version
+         self.model = model
+
+     def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+         parsed_input = self._get_last_turn(evaluation_case)
+         prompt = self._format_prompt(parsed_input)
+         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+         rating = evaluator_agent.structured_output(FaithfulnessRating, prompt)
+         normalized_score = self._score_mapping[rating.score]
+         result = EvaluationOutput(
+             score=normalized_score,
+             test_pass=normalized_score >= 0.5,
+             reason=rating.reasoning,
+             label=rating.score,
+         )
+         return [result]
+
+     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+         parsed_input = self._get_last_turn(evaluation_case)
+         prompt = self._format_prompt(parsed_input)
+         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+         rating = await evaluator_agent.structured_output_async(FaithfulnessRating, prompt)
+         normalized_score = self._score_mapping[rating.score]
+         result = EvaluationOutput(
+             score=normalized_score,
+             test_pass=normalized_score >= 0.5,
+             reason=rating.reasoning,
+             label=rating.score,
+         )
+         return [result]
+
+     def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
+         """Extract the most recent turn from the conversation for evaluation."""
+         parsed_inputs = self._parse_trajectory(evaluation_case)
+         if not parsed_inputs:
+             raise ValueError(
+                 "No turn-level inputs could be parsed from the trajectory. "
+                 "Ensure actual_trajectory is a Session with at least one AgentInvocationSpan."
+             )
+         return parsed_inputs[-1]
+
+     def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
+         """Format evaluation prompt from parsed turn data."""
+         parts = []
+
+         if parsed_input.session_history:
+             history_lines = []
+             for msg in parsed_input.session_history:
+                 if isinstance(msg, list):
+                     # Handle tool execution lists
+                     for tool_exec in msg:
+                         history_lines.append(f"Action: {tool_exec.tool_call.name}({tool_exec.tool_call.arguments})")
+                         history_lines.append(f"Tool: {tool_exec.tool_result.content}")
+                 else:
+                     text = msg.content[0].text if msg.content and hasattr(msg.content[0], "text") else ""
+                     history_lines.append(f"{msg.role.value.capitalize()}: {text}")
+             history_str = "\n".join(history_lines)
+             parts.append(f"# Conversation History:\n{history_str}")
+
+         parts.append(f"# Assistant's Response:\n{parsed_input.agent_response.text}")
+
+         return "\n\n".join(parts)
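Because the categorical rating is normalized through _score_mapping, a judgment of "Neutral/Mixed" (0.5) or better clears the 0.5 pass threshold used in evaluate, while "Not Generally" (0.25) and "Not At All" (0.0) fail. A small construction and serialization sketch (to_dict comes from the Evaluator base class; no model call happens until evaluate runs):

# Defaults to the v0 prompt template and, via Evaluator._get_model_id, DEFAULT_BEDROCK_MODEL_ID.
evaluator = FaithfulnessEvaluator()
config = evaluator.to_dict()
# Based on to_dict() above, config contains "evaluator_type", "system_prompt", and "model_id".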
strands_evals/evaluators/goal_success_rate_evaluator.py ADDED
@@ -0,0 +1,90 @@
+ from enum import Enum
+
+ from pydantic import BaseModel, Field
+ from strands import Agent
+ from strands.models.model import Model
+ from typing_extensions import TypeVar, Union
+
+ from ..types.evaluation import EvaluationData, EvaluationOutput
+ from ..types.trace import EvaluationLevel, SessionLevelInput
+ from .evaluator import Evaluator
+ from .prompt_templates.goal_success_rate import get_template
+
+ InputT = TypeVar("InputT")
+ OutputT = TypeVar("OutputT")
+
+
+ class GoalSuccessScore(str, Enum):
+     """Binary goal success ratings."""
+
+     YES = "Yes"
+     NO = "No"
+
+
+ class GoalSuccessRating(BaseModel):
+     """Structured output for goal success evaluation."""
+
+     reasoning: str = Field(description="Step by step reasoning to derive the final score")
+     score: GoalSuccessScore = Field(description="Score should be one of 'Yes' or 'No'")
+
+
+ class GoalSuccessRateEvaluator(Evaluator[InputT, OutputT]):
+     """Evaluates whether all user goals were successfully achieved in a conversation."""
+
+     evaluation_level = EvaluationLevel.SESSION_LEVEL
+
+     _score_mapping = {
+         GoalSuccessScore.YES: 1.0,
+         GoalSuccessScore.NO: 0.0,
+     }
+
+     def __init__(
+         self,
+         version: str = "v0",
+         model: Union[Model, str, None] = None,
+         system_prompt: str | None = None,
+     ):
+         super().__init__()
+         self.system_prompt = system_prompt if system_prompt is not None else get_template(version).SYSTEM_PROMPT
+         self.version = version
+         self.model = model
+
+     def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+         session_input = self._parse_trajectory(evaluation_case)
+         prompt = self._format_prompt(session_input)
+         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+         rating = evaluator_agent.structured_output(GoalSuccessRating, prompt)
+         normalized_score = self._score_mapping[rating.score]
+         result = EvaluationOutput(
+             score=normalized_score,
+             test_pass=normalized_score >= 1.0,
+             reason=rating.reasoning,
+             label=rating.score,
+         )
+         return [result]
+
+     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+         session_input = self._parse_trajectory(evaluation_case)
+         prompt = self._format_prompt(session_input)
+         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+         rating = await evaluator_agent.structured_output_async(GoalSuccessRating, prompt)
+         normalized_score = self._score_mapping[rating.score]
+         result = EvaluationOutput(
+             score=normalized_score,
+             test_pass=normalized_score >= 1.0,
+             reason=rating.reasoning,
+             label=rating.score,
+         )
+         return [result]
+
+     def _format_prompt(self, session_input: SessionLevelInput) -> str:
+         """Format evaluation prompt from session-level input."""
+         parts = []
+
+         if session_input.available_tools:
+             parts.append(f"# Available tools\n{self._format_tools(session_input.available_tools)}")
+
+         if session_input.session_history:
+             parts.append(f"# Conversation record\n{self._format_session_history(session_input.session_history)}")
+
+         return "\n\n".join(parts)