strands-agents-evals 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. strands_agents_evals-0.1.0.dist-info/METADATA +408 -0
  2. strands_agents_evals-0.1.0.dist-info/RECORD +68 -0
  3. strands_agents_evals-0.1.0.dist-info/WHEEL +4 -0
  4. strands_agents_evals-0.1.0.dist-info/licenses/LICENSE +175 -0
  5. strands_agents_evals-0.1.0.dist-info/licenses/NOTICE +1 -0
  6. strands_evals/__init__.py +22 -0
  7. strands_evals/case.py +53 -0
  8. strands_evals/display/display_console.py +150 -0
  9. strands_evals/evaluators/__init__.py +23 -0
  10. strands_evals/evaluators/evaluator.py +182 -0
  11. strands_evals/evaluators/faithfulness_evaluator.py +116 -0
  12. strands_evals/evaluators/goal_success_rate_evaluator.py +90 -0
  13. strands_evals/evaluators/harmfulness_evaluator.py +135 -0
  14. strands_evals/evaluators/helpfulness_evaluator.py +148 -0
  15. strands_evals/evaluators/interactions_evaluator.py +244 -0
  16. strands_evals/evaluators/output_evaluator.py +72 -0
  17. strands_evals/evaluators/prompt_templates/case_prompt_template.py +63 -0
  18. strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +11 -0
  19. strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +30 -0
  20. strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +11 -0
  21. strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +17 -0
  22. strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +11 -0
  23. strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +8 -0
  24. strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +11 -0
  25. strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +38 -0
  26. strands_evals/evaluators/prompt_templates/prompt_templates.py +176 -0
  27. strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +11 -0
  28. strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +40 -0
  29. strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +11 -0
  30. strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +23 -0
  31. strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +112 -0
  32. strands_evals/evaluators/tool_selection_accuracy_evaluator.py +112 -0
  33. strands_evals/evaluators/trajectory_evaluator.py +100 -0
  34. strands_evals/experiment.py +652 -0
  35. strands_evals/extractors/__init__.py +3 -0
  36. strands_evals/extractors/graph_extractor.py +30 -0
  37. strands_evals/extractors/swarm_extractor.py +73 -0
  38. strands_evals/extractors/tools_use_extractor.py +164 -0
  39. strands_evals/extractors/trace_extractor.py +166 -0
  40. strands_evals/generators/__init__.py +3 -0
  41. strands_evals/generators/experiment_generator.py +498 -0
  42. strands_evals/generators/prompt_template/prompt_templates.py +75 -0
  43. strands_evals/generators/topic_planner.py +60 -0
  44. strands_evals/mappers/__init__.py +6 -0
  45. strands_evals/mappers/session_mapper.py +27 -0
  46. strands_evals/mappers/strands_in_memory_session_mapper.py +473 -0
  47. strands_evals/simulation/README.md +323 -0
  48. strands_evals/simulation/__init__.py +6 -0
  49. strands_evals/simulation/actor_simulator.py +292 -0
  50. strands_evals/simulation/profiles/__init__.py +5 -0
  51. strands_evals/simulation/profiles/actor_profile.py +26 -0
  52. strands_evals/simulation/prompt_templates/__init__.py +11 -0
  53. strands_evals/simulation/prompt_templates/actor_profile_extraction.py +25 -0
  54. strands_evals/simulation/prompt_templates/actor_system_prompt.py +64 -0
  55. strands_evals/simulation/prompt_templates/goal_completion.py +27 -0
  56. strands_evals/simulation/tools/__init__.py +5 -0
  57. strands_evals/simulation/tools/goal_completion.py +93 -0
  58. strands_evals/telemetry/__init__.py +15 -0
  59. strands_evals/telemetry/_cloudwatch_logger.py +209 -0
  60. strands_evals/telemetry/config.py +207 -0
  61. strands_evals/telemetry/tracer.py +38 -0
  62. strands_evals/tools/evaluation_tools.py +67 -0
  63. strands_evals/types/__init__.py +11 -0
  64. strands_evals/types/evaluation.py +105 -0
  65. strands_evals/types/evaluation_report.py +244 -0
  66. strands_evals/types/simulation/__init__.py +5 -0
  67. strands_evals/types/simulation/actor.py +34 -0
  68. strands_evals/types/trace.py +205 -0
@@ -0,0 +1,73 @@
1
+ from strands.multiagent import MultiAgentResult, SwarmResult
2
+
3
+
4
def extract_swarm_handoffs(swarm_result: SwarmResult) -> list[dict]:
    """
    Extract handoff information from swarm execution results.

    Every node in ``swarm_result.results`` whose result is neither an exception nor a
    nested ``MultiAgentResult`` yields exactly one entry. The entry's ``to`` field is the
    ``agent_name`` passed to the ``handoff_to_agent`` tool when that tool appears in the
    node's tool metrics, and ``None`` otherwise.

    Args:
        swarm_result: Result object from swarm execution

    Returns:
        list: Handoff information with from/to agents and output messages
            [{from: str, to: str | None, messages: list[str]}, ...]
    """
    hand_off_info = []
    for node_name, node_info in swarm_result.results.items():
        # Failed nodes and nested multi-agent results are skipped.
        if isinstance(node_info.result, (Exception, MultiAgentResult)):
            continue

        # Keep only text blocks: content entries such as tool-use blocks carry no
        # "text" key, and indexing them unconditionally would raise KeyError.
        messages = [block["text"] for block in node_info.result.message["content"] if "text" in block]

        # tool_metrics is keyed by tool name, so a direct lookup replaces the scan.
        tool_metrics = node_info.result.metrics.tool_metrics
        target = None
        if "handoff_to_agent" in tool_metrics:
            target = tool_metrics["handoff_to_agent"].tool["input"]["agent_name"]

        hand_off_info.append({"from": node_name, "to": target, "messages": messages})

    return hand_off_info
31
+
32
+
33
def extract_swarm_interactions_from_handoffs(handoffs_info: list[dict]) -> list[dict]:
    """
    Turn handoff records into the interaction format used for evaluation.

    Each handoff record becomes one interaction named after its ``from`` node. A node's
    dependencies are the ``from`` nodes of every record that handed off to it.

    Args:
        handoffs_info: Handoff records as produced by extract_swarm_handoffs

    Returns:
        list: Interactions with node names, messages, and dependencies
            [{node_name: str, messages: list[str], dependencies: list[str]}, ...]
    """
    upstream: dict[str, list[str]] = {}
    interactions = []

    # First pass: record who handed off to whom and create the bare interactions.
    for record in handoffs_info:
        target = record["to"]
        source = record["from"]
        upstream.setdefault(target, []).append(source)
        interactions.append({"node_name": source, "messages": record["messages"]})

    # Second pass: attach dependencies once every handoff has been seen.
    for interaction in interactions:
        interaction["dependencies"] = upstream.setdefault(interaction["node_name"], [])

    return interactions
59
+
60
+
61
def extract_swarm_interactions(swarm_result: SwarmResult) -> list[dict]:
    """
    Build evaluation interactions directly from a swarm execution result.

    Convenience wrapper that chains extract_swarm_handoffs and
    extract_swarm_interactions_from_handoffs.

    Args:
        swarm_result: Result object from swarm execution

    Returns:
        list: Interactions with node names, messages, and dependencies
            [{node_name: str, messages: list[str], dependencies: list[str]}, ...]
    """
    return extract_swarm_interactions_from_handoffs(extract_swarm_handoffs(swarm_result))
@@ -0,0 +1,164 @@
1
+ from typing import Union, cast
2
+
3
+ from strands import Agent
4
+
5
+ from ..types.trace import Session, ToolLevelInput
6
+ from .trace_extractor import TraceExtractor
7
+
8
+
9
def extract_agent_tools_used_from_messages(agent_messages):
    """
    Extract tool usage information from agent message history.

    Every ``toolUse`` block found in an assistant message produces one entry, so an
    assistant turn that requests several tools yields several entries. The result for
    each call is looked up by ``toolUseId`` in the user messages that follow.

    Args:
        agent_messages: List of message dictionaries from agent conversation

    Returns:
        list: Tool usage information with name, input, and tool_result
            [{name: str, input: dict, tool_result: str}, ...]
            ``tool_result`` is None when no matching result text is found.
    """
    tools_used = []
    for index, message in enumerate(agent_messages):
        if message.get("role") != "assistant":
            continue
        # `or []` tolerates a missing or None content field.
        for block in message.get("content") or []:
            tool = block.get("toolUse")
            if not tool:
                continue
            tools_used.append(
                {
                    "name": tool.get("name"),
                    "input": tool.get("input"),
                    "tool_result": _find_tool_result_text(agent_messages, index + 1, tool.get("toolUseId")),
                }
            )
    return tools_used


def _find_tool_result_text(agent_messages, start, tool_id):
    """Return the first text of the toolResult matching tool_id at or after index start, else None."""
    for later_message in agent_messages[start:]:
        if later_message.get("role") != "user":
            continue
        # Scan every block: results of parallel tool calls share one user message, and
        # plain-text blocks (no "toolResult" key) must be skipped rather than dereferenced.
        for block in later_message.get("content") or []:
            tool_result = block.get("toolResult")
            if not tool_result or tool_result.get("toolUseId") != tool_id:
                continue
            result_content = tool_result.get("content") or []
            if result_content:
                return result_content[0].get("text")
    return None
53
+
54
+
55
def extract_agent_tools_used_from_metrics(agent_result):
    """
    Summarize the per-tool metrics recorded on an agent execution result.

    Args:
        agent_result: Agent result object exposing ``metrics.tool_metrics``, a mapping
            from tool name to an entry with ``tool``, ``call_count``, ``success_count``
            and ``total_time`` attributes

    Returns:
        list: One dict per entry of ``tool_metrics``, in mapping order
            [{
                name: str,
                input: dict,
                call_count: int,
                success_count: int,
                total_time: float
            }, ...]
    """
    return [
        {
            "name": name,
            "input": entry.tool.get("input"),
            "call_count": entry.call_count,
            "success_count": entry.success_count,
            "total_time": entry.total_time,
        }
        for name, entry in agent_result.metrics.tool_metrics.items()
    ]
85
+
86
+
87
def extract_agent_tools_used_from_trace(session: Session) -> list[dict]:
    """
    Extract tool usage information from trace data (Session object).

    The session is parsed with a TraceExtractor configured for TOOL_LEVEL, and each
    resulting ToolLevelInput is flattened into the same dict shape that
    extract_agent_tools_used_from_messages returns.

    Args:
        session: Session object containing trace data

    Returns:
        list: Tool usage information with name, input, and tool_result
            [{name: str, input: dict, tool_result: str}, ...]
            ``tool_result`` is None when the execution has no tool result.
    """
    from ..types.trace import EvaluationLevel

    tool_level_inputs = cast(
        list[ToolLevelInput],
        TraceExtractor(evaluation_level=EvaluationLevel.TOOL_LEVEL).extract(session),
    )

    # Flatten each tool execution into the message-based extraction format.
    return [
        {
            "name": details.tool_call.name,
            "input": details.tool_call.arguments,
            "tool_result": details.tool_result.content if details.tool_result else None,
        }
        for details in (item.tool_execution_details for item in tool_level_inputs)
    ]
118
+
119
+
120
def extract_agent_tools_used(source: Union[list, Session]) -> list[dict]:
    """
    Extract tool usage information from either agent messages or trace data.

    Unified entry point that dispatches on the type of ``source``:
    - a Session object is handled by extract_agent_tools_used_from_trace
    - a list is handled by extract_agent_tools_used_from_messages

    Args:
        source: Either agent_messages (list) or Session object

    Returns:
        list: Tool usage information with name, input, and tool_result
            [{name: str, input: dict, tool_result: str}, ...]

    Raises:
        TypeError: If source is neither a list nor a Session object
    """
    # Session is checked first, matching the documented precedence.
    if isinstance(source, Session):
        return extract_agent_tools_used_from_trace(source)
    if isinstance(source, list):
        return extract_agent_tools_used_from_messages(source)
    raise TypeError(f"source must be either a list (agent messages) or Session object, got {type(source).__name__}")
143
+
144
+
145
def extract_tools_description(agent: Agent, is_short: bool = True):
    """
    Return the tools registered on an agent, keyed by tool name.

    Args:
        agent (Agent): Target agent whose tool registry is read
        is_short (bool, optional): When True, map each tool name to its description only;
            when False, return the full tool configuration unchanged. Defaults to True.

    Returns:
        dict: Tool name and its corresponding description
            {<tool_name>: <tool_description>, ...}
            or the complete per-tool configuration when ``is_short`` is False.
    """
    tools_config = agent.tool_registry.get_all_tools_config()
    if not is_short:
        return tools_config

    return {name: config["description"] for name, config in tools_config.items()}
@@ -0,0 +1,166 @@
1
+ import logging
2
+
3
+ from typing_extensions import Union
4
+
5
+ from ..types.trace import (
6
+ AgentInvocationSpan,
7
+ AssistantMessage,
8
+ Context,
9
+ EvaluationLevel,
10
+ Session,
11
+ SessionLevelInput,
12
+ SpanInfo,
13
+ TextContent,
14
+ ToolConfig,
15
+ ToolExecution,
16
+ ToolExecutionSpan,
17
+ ToolLevelInput,
18
+ TraceLevelInput,
19
+ UserMessage,
20
+ )
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class TraceExtractor:
    """Builds evaluator inputs from a Session at trace, tool, or session granularity."""

    def __init__(self, evaluation_level: EvaluationLevel):
        """Remember which evaluation level ``extract`` should produce."""
        self.evaluation_level = evaluation_level

    def extract(self, session: Session) -> Union[list[TraceLevelInput], list[ToolLevelInput], SessionLevelInput]:
        """Dispatch to the extractor matching the configured evaluation level.

        Raises:
            TypeError: If ``session`` is not a Session instance.
            ValueError: If the configured evaluation level is not supported.
        """
        if not isinstance(session, Session):
            raise TypeError(f"Expected Session object, got {type(session).__name__}")

        level = self.evaluation_level
        if level == EvaluationLevel.TRACE_LEVEL:
            return self._extract_trace_level(session)
        if level == EvaluationLevel.TOOL_LEVEL:
            return self._extract_tool_level(session)
        if level == EvaluationLevel.SESSION_LEVEL:
            return self._extract_session_level(session)
        raise ValueError(f"Unsupported evaluation level: {self.evaluation_level}")

    def _extract_trace_level(self, session: Session) -> list[TraceLevelInput]:
        """Produce one input per agent invocation, each carrying the conversation so far."""
        results: list[TraceLevelInput] = []
        history: list[Union[UserMessage, AssistantMessage]] = []

        # Walk all agent invocation spans in trace order, ignoring other span kinds.
        agent_spans = (
            span for trace in session.traces for span in trace.spans if isinstance(span, AgentInvocationSpan)
        )
        for span in agent_spans:
            try:
                prompt = TextContent(text=span.user_prompt)
                history.append(UserMessage(content=[prompt]))
            except (AttributeError, TypeError, ValueError) as e:
                logger.warning(f"Failed to create user message: {e}")
                continue

            # Snapshot the history so later turns do not leak into this input.
            results.append(
                TraceLevelInput(
                    span_info=span.span_info,
                    agent_response=TextContent(text=span.agent_response),
                    session_history=history.copy(),
                )
            )

            try:
                reply = TextContent(text=span.agent_response)
                history.append(AssistantMessage(content=[reply]))
            except (AttributeError, TypeError, ValueError) as e:
                logger.warning(f"Failed to create assistant message: {e}")

        return results

    def _extract_tool_level(self, session: Session) -> list[ToolLevelInput]:
        """Produce one input per tool execution span, with the history preceding it."""
        results: list[ToolLevelInput] = []
        history: list[Union[UserMessage, list[ToolExecution], AssistantMessage]] = []
        tools: list[ToolConfig] = []

        for trace in session.traces:
            agent_span = self._find_agent_invocation_span(trace)
            tool_spans = self._find_tool_execution_spans(trace)

            if agent_span:
                # The most recent non-empty tool list stays in effect for later traces.
                if agent_span.available_tools:
                    tools = agent_span.available_tools
                if agent_span.user_prompt:
                    history.append(UserMessage(content=[TextContent(text=agent_span.user_prompt)]))

            # Every tool call in this trace sees the same history snapshot.
            results.extend(
                ToolLevelInput(
                    span_info=tool_span.span_info,
                    available_tools=tools,
                    tool_execution_details=tool_span,
                    session_history=history.copy(),
                )
                for tool_span in tool_spans
            )

            if tool_spans:
                history.append(
                    [ToolExecution(tool_call=span.tool_call, tool_result=span.tool_result) for span in tool_spans]
                )

            if agent_span and agent_span.agent_response:
                history.append(AssistantMessage(content=[TextContent(text=agent_span.agent_response)]))

        return results

    def _find_agent_invocation_span(self, trace) -> AgentInvocationSpan | None:
        """Return the first AgentInvocationSpan of the trace, or None when there is none."""
        return next((span for span in trace.spans if isinstance(span, AgentInvocationSpan)), None)

    def _find_tool_execution_spans(self, trace) -> list[ToolExecutionSpan]:
        """Return every ToolExecutionSpan of the trace, in span order."""
        return [candidate for candidate in trace.spans if isinstance(candidate, ToolExecutionSpan)]

    def _extract_session_level(self, session: Session) -> SessionLevelInput:
        """Collapse the whole session into a single input with the full turn history.

        Raises:
            ValueError: If no AgentInvocationSpan yields span info.
        """
        session_history: list[Context] = []
        available_tools: list[ToolConfig] = []
        span_info: SpanInfo | None = None

        for trace in session.traces:
            tool_calls: list[ToolExecutionSpan] = [
                span for span in trace.spans if isinstance(span, ToolExecutionSpan)
            ]

            for span in trace.spans:
                if not isinstance(span, AgentInvocationSpan):
                    continue

                # The first span info and the first non-empty tool list win.
                if not span_info:
                    span_info = span.span_info
                if span.available_tools and not available_tools:
                    available_tools = span.available_tools

                tool_executions = None
                if tool_calls:
                    tool_executions = [
                        ToolExecution(tool_call=tc.tool_call, tool_result=tc.tool_result) for tc in tool_calls
                    ]

                session_history.append(
                    Context(
                        user_prompt=TextContent(text=span.user_prompt),
                        agent_response=TextContent(text=span.agent_response),
                        tool_execution_history=tool_executions,
                    )
                )

        if not span_info:
            raise ValueError("No AgentInvocationSpan found in session")

        return SessionLevelInput(
            span_info=span_info,
            session_history=session_history,
            available_tools=available_tools or None,
        )
@@ -0,0 +1,3 @@
1
+ from .experiment_generator import ExperimentGenerator
2
+
3
+ __all__ = ["ExperimentGenerator"]