strands-agents-evals 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: strands-agents-evals
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: Evaluation framework for Strands
5
5
  Author-email: AWS <opensource@amazon.com>
6
6
  License: Apache-2.0
@@ -30,8 +30,8 @@ strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection
30
30
  strands_evals/extractors/__init__.py,sha256=Jmlrk-m8sSS_LwmCVSloIkg3BjOgRzNEezjaAGMw5rw,74
31
31
  strands_evals/extractors/graph_extractor.py,sha256=TeT-58JB9roqSvy2ELz1kg8WF5YO-cfLlGZTO0F9s_4,1105
32
32
  strands_evals/extractors/swarm_extractor.py,sha256=Sm1XFCkAGVdF3XDyO3iF-20I8C6sAQ8JPNP5fgotOFU,2682
33
- strands_evals/extractors/tools_use_extractor.py,sha256=o2e9ZuPqQ_hdcrDkl1Rw9h7Ipfc-CsLNkWemxcRvglg,6409
34
- strands_evals/extractors/trace_extractor.py,sha256=l7gk5rUFoUcxQduPJz49OX66SdgeK1MLt81aF1yr4Lc,6653
33
+ strands_evals/extractors/tools_use_extractor.py,sha256=emLL63LKldL2IA2u5wZL0ZhklZJqX0KLr5xFRt-S4i4,6600
34
+ strands_evals/extractors/trace_extractor.py,sha256=TJKl0OdjFhh-htlV1Wxzem8TQdb0rxa-efkq_e0pAdo,7287
35
35
  strands_evals/generators/__init__.py,sha256=B1F30DAIf0kPyBdE4PAZvSby-dTelqb_7hFJoATqVb0,89
36
36
  strands_evals/generators/experiment_generator.py,sha256=6wLTL0iG2b0YAiu0w8dDiaBxOIy7p_Fs7l3hCjgQc0w,22655
37
37
  strands_evals/generators/topic_planner.py,sha256=FtgTVDlV9hWJyO8E4Z__nEWvvrOJzmTW4y6yZ9Alx1A,2436
@@ -61,8 +61,8 @@ strands_evals/types/evaluation_report.py,sha256=vT86zO4Qn9CQbULo3aziGMdG-1qWLdcB
61
61
  strands_evals/types/trace.py,sha256=BFoEylzAlENyPH702T5MDz-_H21-Wfx-FFTSXX1tDfY,4844
62
62
  strands_evals/types/simulation/__init__.py,sha256=-mz5lW6qFfIMm4dJGaP9pXY3xeiefLbB0XevjdFykkU,133
63
63
  strands_evals/types/simulation/actor.py,sha256=ESTV8165c3Ad5QT4yYmjm-A-oZdwZ0Rf0Lq7zokjTPo,1163
64
- strands_agents_evals-0.1.2.dist-info/METADATA,sha256=eO-nDiGzAJxDmmwBwPVvNfTdds0408trSL4cKqLR8b4,17721
65
- strands_agents_evals-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
66
- strands_agents_evals-0.1.2.dist-info/licenses/LICENSE,sha256=yIWKWnZEC7ykhOE0z330Y4XQkN0YssWOZQ-TkliALls,10141
67
- strands_agents_evals-0.1.2.dist-info/licenses/NOTICE,sha256=Eg13ogOmcI7JpMjxniFnKG81vwU3X8X7P_IlpvVg5RU,66
68
- strands_agents_evals-0.1.2.dist-info/RECORD,,
64
+ strands_agents_evals-0.1.4.dist-info/METADATA,sha256=VQm_tm1Umm3fi_HfujW0Ovm_XyvQQCjEJrAL4-dGjKQ,17721
65
+ strands_agents_evals-0.1.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
66
+ strands_agents_evals-0.1.4.dist-info/licenses/LICENSE,sha256=yIWKWnZEC7ykhOE0z330Y4XQkN0YssWOZQ-TkliALls,10141
67
+ strands_agents_evals-0.1.4.dist-info/licenses/NOTICE,sha256=Eg13ogOmcI7JpMjxniFnKG81vwU3X8X7P_IlpvVg5RU,66
68
+ strands_agents_evals-0.1.4.dist-info/RECORD,,
@@ -22,37 +22,39 @@ def extract_agent_tools_used_from_messages(agent_messages):
22
22
  if message.get("role") == "assistant":
23
23
  message_info = message.get("content")
24
24
  if len(message_info) > 0:
25
- tool = None
25
+ tools = []
26
26
  for message in message_info:
27
27
  if "toolUse" in message:
28
+ tools.append(message.get("toolUse"))
29
+
30
+ for tool in tools:
31
+ if tool:
32
+ tool_name = tool.get("name")
33
+ tool_input = tool.get("input")
34
+ tool_id = tool.get("toolUseId")
35
+ # get the tool result from the next message
36
+ tool_result = None
37
+ is_error = False
38
+ next_message_i = i + 1
39
+ while next_message_i < len(agent_messages):
40
+ next_message = agent_messages[next_message_i]
41
+ next_message_i += 1
42
+
43
+ if next_message.get("role") == "user":
44
+ content = next_message.get("content")
45
+ if content:
46
+ tool_result_dict = content[0].get("toolResult")
47
+ if tool_result_dict and tool_result_dict.get("toolUseId") == tool_id:
48
+ tool_result_content = tool_result_dict.get("content", [])
49
+ if len(tool_result_content) > 0:
50
+ tool_result = tool_result_content[0].get("text")
51
+ is_error = tool_result_dict.get("status") == "error"
52
+ break
53
+
54
+ tools_used.append(
55
+ {"name": tool_name, "input": tool_input, "tool_result": tool_result, "is_error": is_error}
56
+ )
28
57
  tool = message.get("toolUse")
29
-
30
- if tool:
31
- tool_name = tool.get("name")
32
- tool_input = tool.get("input")
33
- tool_id = tool.get("toolUseId")
34
- # get the tool result from the next message
35
- tool_result = None
36
- is_error = False
37
- next_message_i = i + 1
38
- while next_message_i < len(agent_messages):
39
- next_message = agent_messages[next_message_i]
40
- next_message_i += 1
41
-
42
- if next_message.get("role") == "user":
43
- content = next_message.get("content")
44
- if content:
45
- tool_result_dict = content[0].get("toolResult")
46
- if tool_result_dict and tool_result_dict.get("toolUseId") == tool_id:
47
- tool_result_content = tool_result_dict.get("content", [])
48
- if len(tool_result_content) > 0:
49
- tool_result = tool_result_content[0].get("text")
50
- is_error = tool_result_dict.get("status") == "error"
51
- break
52
-
53
- tools_used.append(
54
- {"name": tool_name, "input": tool_input, "tool_result": tool_result, "is_error": is_error}
55
- )
56
58
  return tools_used
57
59
 
58
60
 
@@ -45,9 +45,11 @@ class TraceExtractor:
45
45
  def _extract_trace_level(self, session: Session) -> list[TraceLevelInput]:
46
46
  """Extract trace-level inputs with session history up to each turn."""
47
47
  evaluation_inputs: list[TraceLevelInput] = []
48
- previous_turns: list[Union[UserMessage, AssistantMessage]] = []
48
+ previous_turns: list[Union[UserMessage, list[ToolExecution], AssistantMessage]] = []
49
49
 
50
50
  for trace in session.traces:
51
+ tool_spans = self._find_tool_execution_spans(trace)
52
+
51
53
  for span in trace.spans:
52
54
  if not isinstance(span, AgentInvocationSpan):
53
55
  continue
@@ -59,6 +61,17 @@ class TraceExtractor:
59
61
  logger.warning(f"Failed to create user message: {e}")
60
62
  continue
61
63
 
64
+ # Include tool executions in session history
65
+ if tool_spans:
66
+ try:
67
+ tool_executions = [
68
+ ToolExecution(tool_call=ts.tool_call, tool_result=ts.tool_result)
69
+ for ts in tool_spans
70
+ ]
71
+ previous_turns.append(tool_executions)
72
+ except (AttributeError, TypeError, ValueError) as e:
73
+ logger.warning(f"Failed to create tool executions: {e}")
74
+
62
75
  trace_input = TraceLevelInput(
63
76
  span_info=span.span_info,
64
77
  agent_response=TextContent(text=span.agent_response),