strands-agents-evals 0.1.0 (strands_agents_evals-0.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- strands_agents_evals-0.1.0.dist-info/METADATA +408 -0
- strands_agents_evals-0.1.0.dist-info/RECORD +68 -0
- strands_agents_evals-0.1.0.dist-info/WHEEL +4 -0
- strands_agents_evals-0.1.0.dist-info/licenses/LICENSE +175 -0
- strands_agents_evals-0.1.0.dist-info/licenses/NOTICE +1 -0
- strands_evals/__init__.py +22 -0
- strands_evals/case.py +53 -0
- strands_evals/display/display_console.py +150 -0
- strands_evals/evaluators/__init__.py +23 -0
- strands_evals/evaluators/evaluator.py +182 -0
- strands_evals/evaluators/faithfulness_evaluator.py +116 -0
- strands_evals/evaluators/goal_success_rate_evaluator.py +90 -0
- strands_evals/evaluators/harmfulness_evaluator.py +135 -0
- strands_evals/evaluators/helpfulness_evaluator.py +148 -0
- strands_evals/evaluators/interactions_evaluator.py +244 -0
- strands_evals/evaluators/output_evaluator.py +72 -0
- strands_evals/evaluators/prompt_templates/case_prompt_template.py +63 -0
- strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +30 -0
- strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +17 -0
- strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +8 -0
- strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +38 -0
- strands_evals/evaluators/prompt_templates/prompt_templates.py +176 -0
- strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +40 -0
- strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +23 -0
- strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +112 -0
- strands_evals/evaluators/tool_selection_accuracy_evaluator.py +112 -0
- strands_evals/evaluators/trajectory_evaluator.py +100 -0
- strands_evals/experiment.py +652 -0
- strands_evals/extractors/__init__.py +3 -0
- strands_evals/extractors/graph_extractor.py +30 -0
- strands_evals/extractors/swarm_extractor.py +73 -0
- strands_evals/extractors/tools_use_extractor.py +164 -0
- strands_evals/extractors/trace_extractor.py +166 -0
- strands_evals/generators/__init__.py +3 -0
- strands_evals/generators/experiment_generator.py +498 -0
- strands_evals/generators/prompt_template/prompt_templates.py +75 -0
- strands_evals/generators/topic_planner.py +60 -0
- strands_evals/mappers/__init__.py +6 -0
- strands_evals/mappers/session_mapper.py +27 -0
- strands_evals/mappers/strands_in_memory_session_mapper.py +473 -0
- strands_evals/simulation/README.md +323 -0
- strands_evals/simulation/__init__.py +6 -0
- strands_evals/simulation/actor_simulator.py +292 -0
- strands_evals/simulation/profiles/__init__.py +5 -0
- strands_evals/simulation/profiles/actor_profile.py +26 -0
- strands_evals/simulation/prompt_templates/__init__.py +11 -0
- strands_evals/simulation/prompt_templates/actor_profile_extraction.py +25 -0
- strands_evals/simulation/prompt_templates/actor_system_prompt.py +64 -0
- strands_evals/simulation/prompt_templates/goal_completion.py +27 -0
- strands_evals/simulation/tools/__init__.py +5 -0
- strands_evals/simulation/tools/goal_completion.py +93 -0
- strands_evals/telemetry/__init__.py +15 -0
- strands_evals/telemetry/_cloudwatch_logger.py +209 -0
- strands_evals/telemetry/config.py +207 -0
- strands_evals/telemetry/tracer.py +38 -0
- strands_evals/tools/evaluation_tools.py +67 -0
- strands_evals/types/__init__.py +11 -0
- strands_evals/types/evaluation.py +105 -0
- strands_evals/types/evaluation_report.py +244 -0
- strands_evals/types/simulation/__init__.py +5 -0
- strands_evals/types/simulation/actor.py +34 -0
- strands_evals/types/trace.py +205 -0

strands_evals/__init__.py ADDED
@@ -0,0 +1,22 @@
+__version__ = "0.1.0"
+
+from . import evaluators, extractors, generators, simulation, telemetry, types
+from .case import Case
+from .experiment import Experiment
+from .simulation import ActorSimulator, UserSimulator
+from .telemetry import StrandsEvalsTelemetry, get_tracer
+
+__all__ = [
+    "Experiment",
+    "Case",
+    "evaluators",
+    "extractors",
+    "types",
+    "generators",
+    "simulation",
+    "telemetry",
+    "StrandsEvalsTelemetry",
+    "get_tracer",
+    "ActorSimulator",
+    "UserSimulator",
+]
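
For orientation, a minimal usage sketch (assumed usage, based only on the re-exports above): the package exposes its main entry points at the top level, so downstream code can import them directly.

from strands_evals import Case, Experiment, evaluators

case = Case(input="What is 2x2?")  # Case is defined in strands_evals/case.py below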

strands_evals/case.py ADDED
@@ -0,0 +1,53 @@
+import uuid
+
+from pydantic import BaseModel, Field
+from typing_extensions import Any, Generic, TypeVar
+
+from .types.evaluation import Interaction
+
+InputT = TypeVar("InputT")
+OutputT = TypeVar("OutputT")
+
+
+class Case(BaseModel, Generic[InputT, OutputT]):
+    """
+    A single test case, representing a row in an Experiment.
+
+    Each test case represents a single test scenario with inputs to test.
+    Optionally, a test case may contain a name, expected outputs, expected trajectory, expected interactions,
+    and arbitrary metadata.
+
+    Attributes:
+        input: The input to the task, e.g. the query to the agent.
+        name: The name of the test case. This will be used to identify the test in the summary report.
+        session_id: The session ID for the test case. Automatically generates a UUID4 if not provided.
+        expected_output: The expected response given the input, e.g. the agent's response.
+        expected_trajectory: The expected trajectory of a task given the input, e.g. a sequence of tools.
+        expected_interactions: The expected interaction sequence given the input (ideal for multi-agent systems).
+        metadata: Additional information about the test case.
+
+    Example:
+        case = Case[str, str](name="Simple Math",
+                              input="What is 2x2?",
+                              expected_output="2x2 is 4.",
+                              expected_trajectory=["calculator"],
+                              metadata={"category": "math"})
+
+        simple_test_case = Case(input="What is 2x2?")
+
+        case_with_interaction = Case(
+            input="What is 2x2?",
+            expected_interactions=[
+                {"agent_1": "Hello, what would you like to do?"},
+                {"agent_2": "What is 2x2?"}
+            ]
+        )
+    """
+
+    name: str | None = None
+    session_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
+    input: InputT
+    expected_output: OutputT | None = None
+    expected_trajectory: list[Any] | None = None
+    expected_interactions: list[Interaction] | None = None
+    metadata: dict[str, Any] | None = None

strands_evals/display/display_console.py ADDED
@@ -0,0 +1,150 @@
+from rich.console import Console
+from rich.panel import Panel
+from rich.prompt import Prompt
+from rich.table import Table
+from rich.tree import Tree
+
+console = Console()
+
+
+class CollapsibleTableReportDisplay:
+    """
+    Interactive console display for evaluation reports with expandable/collapsible test case details.
+
+    This class provides an interactive rich console interface for displaying evaluation results
+    with the ability to expand or collapse individual test cases to show or hide details.
+
+    Attributes:
+        items: Dictionary of test cases with their details and expansion state
+        overall_score: The overall evaluation score
+        include_input: Whether to display input values in the table
+        include_output: Whether to display output values in the table
+
+    items should follow the following structure:
+    {
+        "0": {
+            "details": {
+                "name": str,
+                "score": float,
+                "test_pass": bool,
+                "reason": str,
+                ...  # will display everything that's given, like actual_output etc.
+            },
+            "detailed_results": list[EvaluationOutput],
+            "expanded": bool,
+        },
+    }
+    """
+
+    def __init__(self, items: dict, overall_score: float):
+        """
+        Initialize the collapsible table display.
+
+        Args:
+            items: Dictionary of test cases with their details and expansion state
+            overall_score: The overall evaluation score
+        """
+        self.items = items
+        self.overall_score = overall_score
+
+    def display_items(self):
+        """
+        Display the evaluation report as a rich table with expandable/collapsible rows.
+
+        Renders a table showing test case results with expansion indicators.
+        Expanded rows show full details, while collapsed rows show minimal information.
+        """
+        overall_score_string = f"[bold blue]Overall Score: {self.overall_score:.2f}[/bold blue]"
+        pass_count = sum([1 if case["details"]["test_pass"] else 0 for case in self.items.values()])
+        pass_rate = pass_count / len(self.items)
+        overall_pass_rate = f"[bold blue]Pass Rate: {pass_rate}[/bold blue]"
+        spacing = " "
+        console.print(Panel(f"{overall_score_string}{spacing}{overall_pass_rate}", title="📊 Evaluation Report"))
+
+        # Create Table and headers
+        table = Table(title="Test Case Results", show_lines=True)
+        colors_mapping = {
+            "index": "cyan",
+            "name": "magenta",
+            "score": "green",
+        }
+        headers = ["index"] + list(self.items["0"]["details"].keys())
+        for header in headers:
+            if header in colors_mapping:
+                table.add_column(header, style=colors_mapping[header])
+            else:
+                table.add_column(header, style="yellow")
+
+        for key, item in self.items.items():
+            symbol = "▼" if item["expanded"] else "▶"
+            case = item["details"]
+            pass_status = "✅" if case["test_pass"] else "❌"
+            other_fields = list(case.values())[3:]
+            if item["expanded"]:  # We always want to render at least the index, name, score, test_pass, and reason
+                renderables = [
+                    f"{symbol} {key}",
+                    case.get("name", f"Test {key}"),
+                    case.get("score"),
+                    pass_status,
+                ] + other_fields
+            else:
+                renderables = [
+                    f"{symbol} {key}",
+                    case.get("name", f"Test {key}"),
+                    case.get("score"),
+                    pass_status,
+                ] + len(other_fields) * ["..."]
+            table.add_row(*renderables)
+
+        console.print(table)
+
+        for key, item in self.items.items():
+            if item["expanded"] and item.get("detailed_results"):
+                detailed_results = item["detailed_results"]
+                if len(detailed_results) > 1:  # Only show if multiple metrics
+                    tree = Tree(f"[bold cyan]📋 Detailed Metrics for Case {key}[/bold cyan]")
+                    for i, result in enumerate(detailed_results):
+                        status = "✅" if result.test_pass else "❌"
+                        metric_node = tree.add(f"[yellow]Metric {i + 1}[/yellow]: Score={result.score:.2f} {status}")
+                        if result.reason:
+                            metric_node.add(f"[dim]{result.reason}[/dim]")
+                    console.print(tree)
+                    console.print()
+
+    def run(self, static: bool = False):
+        """
+        Run the interactive display loop. If static, then the terminal will only display the report.
+
+        Args:
+            static: Whether to display only or allow interaction with the report.
+
+        Provides an interactive console interface where users can:
+        - Expand/collapse individual test cases by entering their number
+        - Expand all test cases with 'o'
+        - Collapse all test cases with 'c'
+        - Quit the interactive view with 'q'
+        """
+        while True:
+            console.clear()
+            self.display_items()
+
+            if static:
+                return
+
+            choice = Prompt.ask(
+                "\nEnter the test case number to expand/collapse it, o to expand all, "
+                "and c to collapse all (q to quit)."
+            )
+
+            if choice.lower() == "q":
+                break
+
+            if choice.lower() == "o":
+                for key in self.items:
+                    self.items[key]["expanded"] = True
+            elif choice.lower() == "c":
+                for key in self.items:
+                    self.items[key]["expanded"] = False
+            else:
+                if choice in self.items:
+                    self.items[choice]["expanded"] = not self.items[choice]["expanded"]
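
A minimal sketch of driving the display above. The keys and field names follow the items structure described in the class docstring; the concrete values are illustrative assumptions.

items = {
    "0": {
        "details": {"name": "Simple Math", "score": 1.0, "test_pass": True, "reason": "Correct answer"},
        "detailed_results": [],  # per-metric EvaluationOutput objects would go here
        "expanded": False,
    }
}
report = CollapsibleTableReportDisplay(items=items, overall_score=1.0)
report.run(static=True)  # static=True renders the report once and skips the interactive prompt loop

Note that display_items() reads the "expanded" flag from each per-case entry, so every entry in items needs one.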

strands_evals/evaluators/__init__.py ADDED
@@ -0,0 +1,23 @@
+from .evaluator import Evaluator
+from .faithfulness_evaluator import FaithfulnessEvaluator
+from .goal_success_rate_evaluator import GoalSuccessRateEvaluator
+from .harmfulness_evaluator import HarmfulnessEvaluator
+from .helpfulness_evaluator import HelpfulnessEvaluator
+from .interactions_evaluator import InteractionsEvaluator
+from .output_evaluator import OutputEvaluator
+from .tool_parameter_accuracy_evaluator import ToolParameterAccuracyEvaluator
+from .tool_selection_accuracy_evaluator import ToolSelectionAccuracyEvaluator
+from .trajectory_evaluator import TrajectoryEvaluator
+
+__all__ = [
+    "Evaluator",
+    "OutputEvaluator",
+    "TrajectoryEvaluator",
+    "InteractionsEvaluator",
+    "HelpfulnessEvaluator",
+    "HarmfulnessEvaluator",
+    "GoalSuccessRateEvaluator",
+    "FaithfulnessEvaluator",
+    "ToolSelectionAccuracyEvaluator",
+    "ToolParameterAccuracyEvaluator",
+]

strands_evals/evaluators/evaluator.py ADDED
@@ -0,0 +1,182 @@
+import inspect
+import logging
+
+from strands.models.model import Model
+from typing_extensions import Any, Generic, TypeGuard, TypeVar, Union
+
+from ..extractors import TraceExtractor
+from ..types.evaluation import EvaluationData, EvaluationOutput
+from ..types.trace import AssistantMessage, Context, EvaluationLevel, Session, TextContent, ToolConfig, UserMessage
+
+logger = logging.getLogger(__name__)
+
+InputT = TypeVar("InputT")
+OutputT = TypeVar("OutputT")
+
+DEFAULT_BEDROCK_MODEL_ID = "us.anthropic.claude-sonnet-4-20250514-v1:0"
+
+
+class Evaluator(Generic[InputT, OutputT]):
+    """
+    Base class for evaluators.
+
+    Evaluators can assess the performance of a task on all test cases.
+    Subclasses must implement the `evaluate` method.
+    """
+
+    # Optional: subclasses can set this to enable trace parsing
+    evaluation_level: EvaluationLevel | None = None
+    _trace_extractor: TraceExtractor | None = None
+
+    def __init__(self, trace_extractor: TraceExtractor | None = None):
+        """Initialize evaluator with optional custom trace extractor.
+
+        Args:
+            trace_extractor: Custom trace extractor. If None and evaluation_level is set,
+                a default TraceExtractor will be created.
+        """
+        self.aggregator = self._default_aggregator
+        if trace_extractor:
+            self._trace_extractor = trace_extractor
+        elif self.evaluation_level:
+            self._trace_extractor = TraceExtractor(self.evaluation_level)
+
+    def _get_model_id(self, model: Union[Model, str, None]) -> str:
+        """Extract model_id from a Model instance or string for serialization.
+
+        This helper method should be called in subclass __init__ methods that accept a model parameter.
+
+        Args:
+            model: Model instance, string model ID, or None
+
+        Returns:
+            The model ID string, DEFAULT_BEDROCK_MODEL_ID if None, or an empty string for invalid types
+        """
+        if isinstance(model, str):
+            return model
+        elif isinstance(model, Model) and hasattr(model, "config") and isinstance(model.config, dict):
+            return model.config.get("model_id", "")
+        elif model is None:
+            return DEFAULT_BEDROCK_MODEL_ID
+        else:
+            return ""
+
+    @staticmethod
+    def _default_aggregator(outputs: list[EvaluationOutput]) -> tuple[float, bool, str]:
+        avg_score = sum(o.score for o in outputs) / len(outputs)
+        all_pass = all(o.test_pass for o in outputs)
+        combined_reason = " | ".join(o.reason for o in outputs if o.reason)
+        return avg_score, all_pass, combined_reason
+
+    def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+        """
+        Evaluate the performance of the task on the given test cases.
+
+        Args:
+            evaluation_case: The test case with all of the necessary context to be evaluated.
+
+        Raises:
+            NotImplementedError: This method is not implemented in the base class.
+        """
+        raise NotImplementedError("This method should be implemented in subclasses.")
+
+    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+        """
+        Evaluate the performance of the task on the given test cases asynchronously.
+
+        Args:
+            evaluation_case: The test case with all of the necessary context to be evaluated.
+
+        Raises:
+            NotImplementedError: This method is not implemented in the base class.
+        """
+        raise NotImplementedError(
+            "This method should be implemented in subclasses, especially if you want to run evaluations asynchronously."
+        )
+
+    def _parse_trajectory(self, evaluation_case: EvaluationData[InputT, OutputT]) -> Any:
+        """Parse Session trajectory using TraceExtractor."""
+        if not self._trace_extractor:
+            raise ValueError("No trace extractor configured. Set evaluation_level or provide trace_extractor.")
+
+        trajectory = evaluation_case.actual_trajectory
+        if not isinstance(trajectory, Session):
+            raise TypeError(
+                f"Trace parsing requires actual_trajectory to be a Session object, got {type(trajectory).__name__}."
+            )
+
+        return self._trace_extractor.extract(trajectory)
+
+    def _format_tools(self, tools: list[ToolConfig]) -> str:
+        """Format available tools for prompt display."""
+        return "\n".join([f"- {tool.name}: {tool.description or 'No description'}" for tool in tools])
+
+    def _format_session_history(self, contexts: list[Context]) -> str:
+        """Format session history with tool executions for prompt display."""
+        lines = []
+        for ctx in contexts:
+            lines.append(f"User: {ctx.user_prompt.text}")
+            if ctx.tool_execution_history:
+                for tool_exec in ctx.tool_execution_history:
+                    lines.append(f"Action: {tool_exec.tool_call.name}({tool_exec.tool_call.arguments})")
+                    lines.append(f"Tool: {tool_exec.tool_result.content}")
+            lines.append(f"Assistant: {ctx.agent_response.text}")
+        return "\n".join(lines)
+
+    def _has_text_content(self, msg: UserMessage | AssistantMessage) -> TypeGuard[UserMessage | AssistantMessage]:
+        """Check if a message object has accessible text content.
+
+        Args:
+            msg: Message object to check (UserMessage or AssistantMessage)
+
+        Returns:
+            True if msg has a content attribute with at least one item that is TextContent
+        """
+        return (
+            hasattr(msg, "content")
+            and bool(msg.content)
+            and len(msg.content) > 0
+            and isinstance(msg.content[0], TextContent)
+        )
+
+    @classmethod
+    def get_type_name(cls) -> str:
+        """
+        Get the name of the evaluator type.
+
+        Returns:
+            str: The name of the evaluator type.
+        """
+        return cls.__name__
+
+    def to_dict(self) -> dict:
+        """
+        Convert the evaluator into a dictionary.
+
+        Returns:
+            dict: A dictionary containing the evaluator's information. Omits private attributes
+                (attributes starting with '_') and attributes with default values.
+        """
+
+        _dict = {"evaluator_type": self.get_type_name()}
+
+        # Get default values from __init__ signature
+        sig = inspect.signature(self.__class__.__init__)
+        defaults = {k: v.default for k, v in sig.parameters.items() if v.default != inspect.Parameter.empty}
+        exclude_attrs = {"aggregator"}
+        for k, v in self.__dict__.items():
+            if not k.startswith("_") and k not in exclude_attrs:
+                # Handle model attribute specially
+                if k == "model":
+                    if isinstance(v, Model):
+                        # Serialize Model instance to model_id
+                        _dict["model_id"] = self._get_model_id(v)
+                    elif v is None and "model" in defaults and defaults["model"] is None:
+                        # model=None is the default, serialize as model_id with the default value
+                        _dict["model_id"] = self._get_model_id(None)
+                    elif v is not None:
+                        # String model ID, include as-is
+                        _dict[k] = v
+                elif k not in defaults or v != defaults[k]:
+                    _dict[k] = v
+        return _dict
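
A minimal sketch of the default aggregation defined above. Constructing EvaluationOutput with only score, test_pass, and reason is an assumption here; the full model lives in strands_evals/types/evaluation.py, which is not shown in this diff.

outputs = [
    EvaluationOutput(score=1.0, test_pass=True, reason="tool choice correct"),
    EvaluationOutput(score=0.5, test_pass=False, reason="missing parameter"),
]
avg_score, all_pass, combined_reason = Evaluator._default_aggregator(outputs)
# avg_score == 0.75, all_pass == False,
# combined_reason == "tool choice correct | missing parameter"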

strands_evals/evaluators/faithfulness_evaluator.py ADDED
@@ -0,0 +1,116 @@
+from enum import Enum
+
+from pydantic import BaseModel, Field
+from strands import Agent
+from strands.models.model import Model
+from typing_extensions import TypeVar, Union
+
+from ..types.evaluation import EvaluationData, EvaluationOutput
+from ..types.trace import EvaluationLevel, TraceLevelInput
+from .evaluator import Evaluator
+from .prompt_templates.faithfulness import get_template
+
+InputT = TypeVar("InputT")
+OutputT = TypeVar("OutputT")
+
+
+class FaithfulnessScore(str, Enum):
+    """Categorical faithfulness ratings."""
+
+    NOT_AT_ALL = "Not At All"
+    NOT_GENERALLY = "Not Generally"
+    NEUTRAL = "Neutral/Mixed"
+    GENERALLY_YES = "Generally Yes"
+    COMPLETELY_YES = "Completely Yes"
+
+
+class FaithfulnessRating(BaseModel):
+    """Structured output for faithfulness evaluation."""
+
+    reasoning: str = Field(description="Step by step reasoning to derive the final score")
+    score: FaithfulnessScore = Field(description="Categorical faithfulness rating")
+
+
+class FaithfulnessEvaluator(Evaluator[InputT, OutputT]):
+    """Evaluates faithfulness of agent responses against conversation history."""
+
+    evaluation_level = EvaluationLevel.TRACE_LEVEL
+
+    _score_mapping = {
+        FaithfulnessScore.NOT_AT_ALL: 0.0,
+        FaithfulnessScore.NOT_GENERALLY: 0.25,
+        FaithfulnessScore.NEUTRAL: 0.5,
+        FaithfulnessScore.GENERALLY_YES: 0.75,
+        FaithfulnessScore.COMPLETELY_YES: 1.0,
+    }
+
+    def __init__(
+        self,
+        version: str = "v0",
+        model: Union[Model, str, None] = None,
+        system_prompt: str | None = None,
+    ):
+        super().__init__()
+        self.system_prompt = system_prompt if system_prompt is not None else get_template(version).SYSTEM_PROMPT
+        self.version = version
+        self.model = model
+
+    def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+        parsed_input = self._get_last_turn(evaluation_case)
+        prompt = self._format_prompt(parsed_input)
+        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+        rating = evaluator_agent.structured_output(FaithfulnessRating, prompt)
+        normalized_score = self._score_mapping[rating.score]
+        result = EvaluationOutput(
+            score=normalized_score,
+            test_pass=normalized_score >= 0.5,
+            reason=rating.reasoning,
+            label=rating.score,
+        )
+        return [result]
+
+    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+        parsed_input = self._get_last_turn(evaluation_case)
+        prompt = self._format_prompt(parsed_input)
+        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+        rating = await evaluator_agent.structured_output_async(FaithfulnessRating, prompt)
+        normalized_score = self._score_mapping[rating.score]
+        result = EvaluationOutput(
+            score=normalized_score,
+            test_pass=normalized_score >= 0.5,
+            reason=rating.reasoning,
+            label=rating.score,
+        )
+        return [result]
+
+    def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
+        """Extract the most recent turn from the conversation for evaluation."""
+        parsed_inputs = self._parse_trajectory(evaluation_case)
+        if not parsed_inputs:
+            raise ValueError(
+                "No turn-level inputs could be parsed from the trajectory. "
+                "Ensure actual_trajectory is a Session with at least one AgentInvocationSpan."
+            )
+        return parsed_inputs[-1]
+
+    def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
+        """Format evaluation prompt from parsed turn data."""
+        parts = []
+
+        if parsed_input.session_history:
+            history_lines = []
+            for msg in parsed_input.session_history:
+                if isinstance(msg, list):
+                    # Handle tool execution lists
+                    for tool_exec in msg:
+                        history_lines.append(f"Action: {tool_exec.tool_call.name}({tool_exec.tool_call.arguments})")
+                        history_lines.append(f"Tool: {tool_exec.tool_result.content}")
+                else:
+                    text = msg.content[0].text if msg.content and hasattr(msg.content[0], "text") else ""
+                    history_lines.append(f"{msg.role.value.capitalize()}: {text}")
+            history_str = "\n".join(history_lines)
+            parts.append(f"# Conversation History:\n{history_str}")
+
+        parts.append(f"# Assistant's Response:\n{parsed_input.agent_response.text}")
+
+        return "\n\n".join(parts)
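
An illustrative sketch of how evaluate() above turns the judge's categorical rating into a normalized score and pass/fail decision (the rating values below are made up):

rating = FaithfulnessRating(
    reasoning="The response is mostly grounded in the conversation history.",
    score=FaithfulnessScore.GENERALLY_YES,
)
normalized_score = FaithfulnessEvaluator._score_mapping[rating.score]  # 0.75
test_pass = normalized_score >= 0.5  # True; only "Not Generally" and "Not At All" fall below the threshold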

strands_evals/evaluators/goal_success_rate_evaluator.py ADDED
@@ -0,0 +1,90 @@
+from enum import Enum
+
+from pydantic import BaseModel, Field
+from strands import Agent
+from strands.models.model import Model
+from typing_extensions import TypeVar, Union
+
+from ..types.evaluation import EvaluationData, EvaluationOutput
+from ..types.trace import EvaluationLevel, SessionLevelInput
+from .evaluator import Evaluator
+from .prompt_templates.goal_success_rate import get_template
+
+InputT = TypeVar("InputT")
+OutputT = TypeVar("OutputT")
+
+
+class GoalSuccessScore(str, Enum):
+    """Binary goal success ratings."""
+
+    YES = "Yes"
+    NO = "No"
+
+
+class GoalSuccessRating(BaseModel):
+    """Structured output for goal success evaluation."""
+
+    reasoning: str = Field(description="Step by step reasoning to derive the final score")
+    score: GoalSuccessScore = Field(description="Score should be one of 'Yes' or 'No'")
+
+
+class GoalSuccessRateEvaluator(Evaluator[InputT, OutputT]):
+    """Evaluates whether all user goals were successfully achieved in a conversation."""
+
+    evaluation_level = EvaluationLevel.SESSION_LEVEL
+
+    _score_mapping = {
+        GoalSuccessScore.YES: 1.0,
+        GoalSuccessScore.NO: 0.0,
+    }
+
+    def __init__(
+        self,
+        version: str = "v0",
+        model: Union[Model, str, None] = None,
+        system_prompt: str | None = None,
+    ):
+        super().__init__()
+        self.system_prompt = system_prompt if system_prompt is not None else get_template(version).SYSTEM_PROMPT
+        self.version = version
+        self.model = model
+
+    def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+        session_input = self._parse_trajectory(evaluation_case)
+        prompt = self._format_prompt(session_input)
+        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+        rating = evaluator_agent.structured_output(GoalSuccessRating, prompt)
+        normalized_score = self._score_mapping[rating.score]
+        result = EvaluationOutput(
+            score=normalized_score,
+            test_pass=normalized_score >= 1.0,
+            reason=rating.reasoning,
+            label=rating.score,
+        )
+        return [result]
+
+    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+        session_input = self._parse_trajectory(evaluation_case)
+        prompt = self._format_prompt(session_input)
+        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+        rating = await evaluator_agent.structured_output_async(GoalSuccessRating, prompt)
+        normalized_score = self._score_mapping[rating.score]
+        result = EvaluationOutput(
+            score=normalized_score,
+            test_pass=normalized_score >= 1.0,
+            reason=rating.reasoning,
+            label=rating.score,
+        )
+        return [result]
+
+    def _format_prompt(self, session_input: SessionLevelInput) -> str:
+        """Format evaluation prompt from session-level input."""
+        parts = []
+
+        if session_input.available_tools:
+            parts.append(f"# Available tools\n{self._format_tools(session_input.available_tools)}")
+
+        if session_input.session_history:
+            parts.append(f"# Conversation record\n{self._format_session_history(session_input.session_history)}")
+
+        return "\n\n".join(parts)