PyPI - strands-agents-evals - Versions diffs - 0.1.0__py3-none-any.whl - Mend

strands-agents-evals 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

strands_agents_evals-0.1.0.dist-info/METADATA +408 -0
strands_agents_evals-0.1.0.dist-info/RECORD +68 -0
strands_agents_evals-0.1.0.dist-info/WHEEL +4 -0
strands_agents_evals-0.1.0.dist-info/licenses/LICENSE +175 -0
strands_agents_evals-0.1.0.dist-info/licenses/NOTICE +1 -0
strands_evals/__init__.py +22 -0
strands_evals/case.py +53 -0
strands_evals/display/display_console.py +150 -0
strands_evals/evaluators/__init__.py +23 -0
strands_evals/evaluators/evaluator.py +182 -0
strands_evals/evaluators/faithfulness_evaluator.py +116 -0
strands_evals/evaluators/goal_success_rate_evaluator.py +90 -0
strands_evals/evaluators/harmfulness_evaluator.py +135 -0
strands_evals/evaluators/helpfulness_evaluator.py +148 -0
strands_evals/evaluators/interactions_evaluator.py +244 -0
strands_evals/evaluators/output_evaluator.py +72 -0
strands_evals/evaluators/prompt_templates/case_prompt_template.py +63 -0
strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +11 -0
strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +30 -0
strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +11 -0
strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +17 -0
strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +11 -0
strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +8 -0
strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +11 -0
strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +38 -0
strands_evals/evaluators/prompt_templates/prompt_templates.py +176 -0
strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +11 -0
strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +40 -0
strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +11 -0
strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +23 -0
strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +112 -0
strands_evals/evaluators/tool_selection_accuracy_evaluator.py +112 -0
strands_evals/evaluators/trajectory_evaluator.py +100 -0
strands_evals/experiment.py +652 -0
strands_evals/extractors/__init__.py +3 -0
strands_evals/extractors/graph_extractor.py +30 -0
strands_evals/extractors/swarm_extractor.py +73 -0
strands_evals/extractors/tools_use_extractor.py +164 -0
strands_evals/extractors/trace_extractor.py +166 -0
strands_evals/generators/__init__.py +3 -0
strands_evals/generators/experiment_generator.py +498 -0
strands_evals/generators/prompt_template/prompt_templates.py +75 -0
strands_evals/generators/topic_planner.py +60 -0
strands_evals/mappers/__init__.py +6 -0
strands_evals/mappers/session_mapper.py +27 -0
strands_evals/mappers/strands_in_memory_session_mapper.py +473 -0
strands_evals/simulation/README.md +323 -0
strands_evals/simulation/__init__.py +6 -0
strands_evals/simulation/actor_simulator.py +292 -0
strands_evals/simulation/profiles/__init__.py +5 -0
strands_evals/simulation/profiles/actor_profile.py +26 -0
strands_evals/simulation/prompt_templates/__init__.py +11 -0
strands_evals/simulation/prompt_templates/actor_profile_extraction.py +25 -0
strands_evals/simulation/prompt_templates/actor_system_prompt.py +64 -0
strands_evals/simulation/prompt_templates/goal_completion.py +27 -0
strands_evals/simulation/tools/__init__.py +5 -0
strands_evals/simulation/tools/goal_completion.py +93 -0
strands_evals/telemetry/__init__.py +15 -0
strands_evals/telemetry/_cloudwatch_logger.py +209 -0
strands_evals/telemetry/config.py +207 -0
strands_evals/telemetry/tracer.py +38 -0
strands_evals/tools/evaluation_tools.py +67 -0
strands_evals/types/__init__.py +11 -0
strands_evals/types/evaluation.py +105 -0
strands_evals/types/evaluation_report.py +244 -0
strands_evals/types/simulation/__init__.py +5 -0
strands_evals/types/simulation/actor.py +34 -0
strands_evals/types/trace.py +205 -0

strands_evals/extractors/swarm_extractor.py ADDED Viewed

@@ -0,0 +1,73 @@
+from strands.multiagent import MultiAgentResult, SwarmResult
def extract_swarm_handoffs(swarm_result: SwarmResult) -> list[dict]:
    """
    Extract handoff information from swarm execution results.

    Nodes whose result is an ``Exception`` or a nested ``MultiAgentResult`` are
    skipped. Every remaining node yields exactly one entry: a handoff entry if
    its tool metrics contain ``handoff_to_agent``, otherwise an entry whose
    ``to`` is ``None``.

    Args:
        swarm_result: Result object from swarm execution

    Returns:
        list: Handoff information with from/to agents and output messages
        [{from: str, to: str | None, messages: list[str]}, ...]
    """
    hand_off_info = []
    for node_name, node_info in swarm_result.results.items():
        node_result = node_info.result
        if isinstance(node_result, (Exception, MultiAgentResult)):
            continue

        # Keep only the text blocks: message content may also hold blocks
        # without a "text" key (e.g. tool use), which would raise KeyError.
        messages = [block["text"] for block in node_result.message["content"] if "text" in block]

        added = False
        for tool_name, tool_info in node_result.metrics.tool_metrics.items():
            if tool_name != "handoff_to_agent":
                continue
            hand_off_info.append(
                {"from": node_name, "to": tool_info.tool["input"]["agent_name"], "messages": messages}
            )
            added = True

        if not added:
            # No handoff recorded for this node: emit it with no target.
            hand_off_info.append({"from": node_name, "to": None, "messages": messages})
    return hand_off_info
def extract_swarm_interactions_from_handoffs(handoffs_info: list[dict]) -> list[dict]:
    """
    Convert handoff information to interaction format for evaluation.

    A node's dependencies are the nodes that handed off to it, in the order the
    handoffs appear in ``handoffs_info``. Entries whose ``to`` is ``None`` do
    not contribute a dependency.

    Args:
        handoffs_info: List of handoff information from extract_swarm_handoffs

    Returns:
        list: Interactions with node names, messages, and dependencies
        [{node_name: str, messages: list[str], dependencies: list[str]}, ...]
        One interaction is produced per handoff entry, in input order. Each
        interaction owns its own ``dependencies`` list.
    """
    # Map every handoff target to the nodes that handed off to it.
    dependencies: dict[str, list[str]] = {}
    for handoff in handoffs_info:
        target = handoff["to"]
        if target is None:
            # Terminal node: nothing depends on it through this entry.
            continue
        dependencies.setdefault(target, []).append(handoff["from"])

    # Copy the dependency list per interaction so callers mutating one result
    # do not silently change another interaction with the same node name.
    return [
        {
            "node_name": handoff["from"],
            "messages": handoff["messages"],
            "dependencies": list(dependencies.get(handoff["from"], [])),
        }
        for handoff in handoffs_info
    ]
def extract_swarm_interactions(swarm_result: SwarmResult) -> list[dict]:
    """
    Build evaluation interactions directly from a swarm result.

    Chains extract_swarm_handoffs and extract_swarm_interactions_from_handoffs.

    Args:
        swarm_result: Result object from swarm execution

    Returns:
        list: Interactions with node names, messages, and dependencies
        [{node_name: str, messages: list[str], dependencies: list[str]}, ...]
    """
    return extract_swarm_interactions_from_handoffs(extract_swarm_handoffs(swarm_result))

strands_evals/extractors/tools_use_extractor.py ADDED Viewed

@@ -0,0 +1,164 @@
+from typing import Union, cast
+from strands import Agent
+from ..types.trace import Session, ToolLevelInput
+from .trace_extractor import TraceExtractor
def _find_tool_result_text(agent_messages, start, tool_id):
    """Return the text of the tool result matching ``tool_id``, or None if not found.

    Scans user messages from index ``start`` onwards and inspects every content
    block, so results of several tool calls answered in one message are found.
    """
    for later_message in agent_messages[start:]:
        if later_message.get("role") != "user":
            continue
        for block in later_message.get("content") or []:
            tool_result = block.get("toolResult")
            # Plain text blocks have no "toolResult"; skip them and other ids.
            if not tool_result or tool_result.get("toolUseId") != tool_id:
                continue
            result_content = tool_result.get("content") or []
            if result_content:
                return result_content[0].get("text")
    return None


def extract_agent_tools_used_from_messages(agent_messages):
    """
    Extract tool usage information from agent message history.

    Every ``toolUse`` block of every assistant message produces one entry, in
    message order. The result is looked up in the following user messages by
    matching ``toolUseId``; ``tool_result`` is None when no matching result
    with content is found or when its first content block has no text.

    Args:
        agent_messages: List of message dictionaries from agent conversation

    Returns:
        list: Tool usage information with name, input, and tool_result
        [{name: str, input: dict, tool_result: str}, ...]
    """
    tools_used = []
    for index, message in enumerate(agent_messages):
        if message.get("role") != "assistant":
            continue
        # "content" may be missing or None; treat that as no blocks.
        for block in message.get("content") or []:
            tool = block.get("toolUse")
            if not tool:
                continue
            tools_used.append(
                {
                    "name": tool.get("name"),
                    "input": tool.get("input"),
                    # The result can only appear after the requesting message.
                    "tool_result": _find_tool_result_text(agent_messages, index + 1, tool.get("toolUseId")),
                }
            )
    return tools_used
def extract_agent_tools_used_from_metrics(agent_result):
    """
    Summarize per-tool metrics recorded on an agent execution result.

    Reads ``agent_result.metrics.tool_metrics`` and emits one entry per tool
    name, in the mapping's iteration order.

    Args:
        agent_result: Agent result object containing metrics

    Returns:
        list: Tool metrics with name, input, counts, and timing
        [{
            name: str,
            input: dict,
            call_count: int,
            success_count: int,
            total_time: float
        }, ...]
    """
    return [
        {
            "name": name,
            "input": metric.tool.get("input"),
            "call_count": metric.call_count,
            "success_count": metric.success_count,
            "total_time": metric.total_time,
        }
        for name, metric in agent_result.metrics.tool_metrics.items()
    ]
def extract_agent_tools_used_from_trace(session: Session) -> list[dict]:
    """
    Extract tool usage information from trace data (Session object).

    The session is parsed with a TraceExtractor configured for TOOL_LEVEL, and
    each resulting ToolLevelInput is flattened into the dictionary layout that
    extract_agent_tools_used_from_messages produces.

    Args:
        session: Session object containing trace data

    Returns:
        list: Tool usage information with name, input, and tool_result
        [{name: str, input: dict, tool_result: str}, ...]
        ``tool_result`` is None when the execution has no tool result.
    """
    from ..types.trace import EvaluationLevel

    tool_level_extractor = TraceExtractor(evaluation_level=EvaluationLevel.TOOL_LEVEL)
    tool_level_inputs = cast(list[ToolLevelInput], tool_level_extractor.extract(session))

    # Flatten each execution into the message-based extraction format.
    return [
        {
            "name": item.tool_execution_details.tool_call.name,
            "input": item.tool_execution_details.tool_call.arguments,
            "tool_result": (
                item.tool_execution_details.tool_result.content
                if item.tool_execution_details.tool_result
                else None
            ),
        }
        for item in tool_level_inputs
    ]
def extract_agent_tools_used(source: Union[list, Session]) -> list[dict]:
    """
    Extract tool usage information from either agent messages or trace data.

    Single entry point that dispatches on the type of ``source``:
    - a Session object is handled by extract_agent_tools_used_from_trace
    - a list is handled by extract_agent_tools_used_from_messages

    Args:
        source: Either agent_messages (list) or Session object

    Returns:
        list: Tool usage information with name, input, and tool_result
        [{name: str, input: dict, tool_result: str}, ...]

    Raises:
        TypeError: If source is neither a list nor a Session object
    """
    # Session is checked first, matching the documented precedence.
    if isinstance(source, Session):
        return extract_agent_tools_used_from_trace(source)
    if isinstance(source, list):
        return extract_agent_tools_used_from_messages(source)
    raise TypeError(f"source must be either a list (agent messages) or Session object, got {type(source).__name__}")
def extract_tools_description(agent: Agent, is_short: bool = True):
    """
    Describe every tool registered on the given agent.

    Args:
        agent (Agent): Target agent to extract tool registry from
        is_short (bool, optional): When True, keep only each tool's description;
            when False, return the registry's tool configuration unchanged.
            Defaults to True.

    Returns:
        dict: Tool name and its corresponding description
        {<tool_name>: <tool_description>, ...}
    """
    tools_config = agent.tool_registry.get_all_tools_config()
    if not is_short:
        return tools_config
    return {name: config["description"] for name, config in tools_config.items()}

strands_evals/extractors/trace_extractor.py ADDED Viewed

@@ -0,0 +1,166 @@
+import logging
+from typing_extensions import Union
+from ..types.trace import (
+    AgentInvocationSpan,
+    AssistantMessage,
+    Context,
+    EvaluationLevel,
+    Session,
+    SessionLevelInput,
+    SpanInfo,
+    TextContent,
+    ToolConfig,
+    ToolExecution,
+    ToolExecutionSpan,
+    ToolLevelInput,
+    TraceLevelInput,
+    UserMessage,
+)
+logger = logging.getLogger(__name__)
class TraceExtractor:
    """Builds structured evaluation inputs from a Session at a fixed evaluation level."""

    def __init__(self, evaluation_level: EvaluationLevel):
        """Store the evaluation level that ``extract`` dispatches on."""
        self.evaluation_level = evaluation_level

    def extract(self, session: Session) -> Union[list[TraceLevelInput], list[ToolLevelInput], SessionLevelInput]:
        """Extract evaluation inputs for the configured level.

        Args:
            session: Session whose traces are converted.

        Returns:
            A list of TraceLevelInput, a list of ToolLevelInput, or a single
            SessionLevelInput, depending on ``self.evaluation_level``.

        Raises:
            TypeError: If ``session`` is not a Session instance.
            ValueError: If the configured evaluation level is not supported.
        """
        if not isinstance(session, Session):
            raise TypeError(f"Expected Session object, got {type(session).__name__}")

        level = self.evaluation_level
        if level == EvaluationLevel.TRACE_LEVEL:
            return self._extract_trace_level(session)
        if level == EvaluationLevel.TOOL_LEVEL:
            return self._extract_tool_level(session)
        if level == EvaluationLevel.SESSION_LEVEL:
            return self._extract_session_level(session)
        raise ValueError(f"Unsupported evaluation level: {self.evaluation_level}")

    def _extract_trace_level(self, session: Session) -> list[TraceLevelInput]:
        """Build one input per agent invocation, with the conversation up to that turn."""
        inputs: list[TraceLevelInput] = []
        history: list[Union[UserMessage, AssistantMessage]] = []

        agent_spans = (
            span for trace in session.traces for span in trace.spans if isinstance(span, AgentInvocationSpan)
        )
        for span in agent_spans:
            # The user turn goes into the history first, so the snapshot below
            # already contains the prompt this response answers.
            try:
                prompt = TextContent(text=span.user_prompt)
                history.append(UserMessage(content=[prompt]))
            except (AttributeError, TypeError, ValueError) as e:
                logger.warning(f"Failed to create user message: {e}")
                continue

            inputs.append(
                TraceLevelInput(
                    span_info=span.span_info,
                    agent_response=TextContent(text=span.agent_response),
                    # Copy, because the shared history keeps growing afterwards.
                    session_history=list(history),
                )
            )

            try:
                reply = TextContent(text=span.agent_response)
                history.append(AssistantMessage(content=[reply]))
            except (AttributeError, TypeError, ValueError) as e:
                logger.warning(f"Failed to create assistant message: {e}")
        return inputs

    def _extract_tool_level(self, session: Session) -> list[ToolLevelInput]:
        """Build one input per tool execution, with the history preceding its trace's tool calls."""
        inputs: list[ToolLevelInput] = []
        history: list[Union[UserMessage, list[ToolExecution], AssistantMessage]] = []
        # Carried across traces: a trace without tool configs reuses the last seen ones.
        tools: list[ToolConfig] = []

        for trace in session.traces:
            agent_span = self._find_agent_invocation_span(trace)
            tool_spans = self._find_tool_execution_spans(trace)

            if agent_span and agent_span.available_tools:
                tools = agent_span.available_tools
            if agent_span and agent_span.user_prompt:
                history.append(UserMessage(content=[TextContent(text=agent_span.user_prompt)]))

            # All tool spans of a trace see the same history snapshot.
            inputs.extend(
                ToolLevelInput(
                    span_info=tool_span.span_info,
                    available_tools=tools,
                    tool_execution_details=tool_span,
                    session_history=list(history),
                )
                for tool_span in tool_spans
            )

            if tool_spans:
                history.append(
                    [ToolExecution(tool_call=span.tool_call, tool_result=span.tool_result) for span in tool_spans]
                )
            if agent_span and agent_span.agent_response:
                history.append(AssistantMessage(content=[TextContent(text=agent_span.agent_response)]))
        return inputs

    def _find_agent_invocation_span(self, trace) -> AgentInvocationSpan | None:
        """Return the first AgentInvocationSpan of the trace, or None if it has none."""
        return next((span for span in trace.spans if isinstance(span, AgentInvocationSpan)), None)

    def _find_tool_execution_spans(self, trace) -> list[ToolExecutionSpan]:
        """Return every ToolExecutionSpan of the trace, in span order."""
        found: list[ToolExecutionSpan] = []
        for span in trace.spans:
            if isinstance(span, ToolExecutionSpan):
                found.append(span)
        return found

    def _extract_session_level(self, session: Session) -> SessionLevelInput:
        """Build a single input covering the whole session.

        Raises:
            ValueError: If the session contains no AgentInvocationSpan.
        """
        contexts: list[Context] = []
        tools: list[ToolConfig] = []
        first_span_info: SpanInfo | None = None

        for trace in session.traces:
            tool_spans = [span for span in trace.spans if isinstance(span, ToolExecutionSpan)]

            for span in trace.spans:
                if not isinstance(span, AgentInvocationSpan):
                    continue

                # Span info and tool configs come from the first span providing them.
                if not first_span_info:
                    first_span_info = span.span_info
                if span.available_tools and not tools:
                    tools = span.available_tools

                # Every agent invocation of a trace gets all of that trace's tool executions.
                executions = None
                if tool_spans:
                    executions = [
                        ToolExecution(tool_call=tool_span.tool_call, tool_result=tool_span.tool_result)
                        for tool_span in tool_spans
                    ]

                contexts.append(
                    Context(
                        user_prompt=TextContent(text=span.user_prompt),
                        agent_response=TextContent(text=span.agent_response),
                        tool_execution_history=executions,
                    )
                )

        if not first_span_info:
            raise ValueError("No AgentInvocationSpan found in session")

        return SessionLevelInput(
            span_info=first_span_info,
            session_history=contexts,
            available_tools=tools or None,
        )

strands_evals/generators/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .experiment_generator import ExperimentGenerator
+__all__ = ["ExperimentGenerator"]