strands-agents-evals 0.1.2__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/PKG-INFO +1 -1
  2. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/extractors/tools_use_extractor.py +30 -28
  3. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/extractors/trace_extractor.py +14 -1
  4. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/extractors/test_tools_use_extractor.py +64 -0
  5. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  6. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  7. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  8. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  9. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/.github/dependabot.yml +0 -0
  10. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/.github/workflows/integration-test.yml +0 -0
  11. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/.github/workflows/pr-and-push.yml +0 -0
  12. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/.github/workflows/pypi-publish-on-release.yml +0 -0
  13. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/.github/workflows/test-lint.yml +0 -0
  14. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/.gitignore +0 -0
  15. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/.pre-commit-config.yaml +0 -0
  16. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/CODE_OF_CONDUCT.md +0 -0
  17. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/CONTRIBUTING.md +0 -0
  18. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/LICENSE +0 -0
  19. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/NOTICE +0 -0
  20. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/README.md +0 -0
  21. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/STYLE_GUIDE.md +0 -0
  22. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/pyproject.toml +0 -0
  23. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/__init__.py +0 -0
  24. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/__init__.py +0 -0
  25. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/case.py +0 -0
  26. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/display/display_console.py +0 -0
  27. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/__init__.py +0 -0
  28. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/evaluator.py +0 -0
  29. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/faithfulness_evaluator.py +0 -0
  30. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/goal_success_rate_evaluator.py +0 -0
  31. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/harmfulness_evaluator.py +0 -0
  32. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/helpfulness_evaluator.py +0 -0
  33. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/interactions_evaluator.py +0 -0
  34. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/output_evaluator.py +0 -0
  35. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/case_prompt_template.py +0 -0
  36. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +0 -0
  37. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +0 -0
  38. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +0 -0
  39. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +0 -0
  40. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +0 -0
  41. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +0 -0
  42. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +0 -0
  43. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +0 -0
  44. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/prompt_templates.py +0 -0
  45. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +0 -0
  46. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +0 -0
  47. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +0 -0
  48. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +0 -0
  49. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +0 -0
  50. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/tool_selection_accuracy_evaluator.py +0 -0
  51. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/trajectory_evaluator.py +0 -0
  52. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/experiment.py +0 -0
  53. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/extractors/__init__.py +0 -0
  54. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/extractors/graph_extractor.py +0 -0
  55. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/extractors/swarm_extractor.py +0 -0
  56. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/generators/__init__.py +0 -0
  57. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/generators/experiment_generator.py +0 -0
  58. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/generators/prompt_template/prompt_templates.py +0 -0
  59. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/generators/topic_planner.py +0 -0
  60. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/mappers/__init__.py +0 -0
  61. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/mappers/session_mapper.py +0 -0
  62. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/mappers/strands_in_memory_session_mapper.py +0 -0
  63. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/simulation/README.md +0 -0
  64. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/simulation/__init__.py +0 -0
  65. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/simulation/actor_simulator.py +0 -0
  66. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/simulation/profiles/__init__.py +0 -0
  67. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/simulation/profiles/actor_profile.py +0 -0
  68. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/simulation/prompt_templates/__init__.py +0 -0
  69. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/simulation/prompt_templates/actor_profile_extraction.py +0 -0
  70. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/simulation/prompt_templates/actor_system_prompt.py +0 -0
  71. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/simulation/prompt_templates/goal_completion.py +0 -0
  72. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/simulation/tools/__init__.py +0 -0
  73. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/simulation/tools/goal_completion.py +0 -0
  74. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/telemetry/__init__.py +0 -0
  75. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/telemetry/_cloudwatch_logger.py +0 -0
  76. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/telemetry/config.py +0 -0
  77. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/telemetry/tracer.py +0 -0
  78. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/tools/evaluation_tools.py +0 -0
  79. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/types/__init__.py +0 -0
  80. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/types/evaluation.py +0 -0
  81. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/types/evaluation_report.py +0 -0
  82. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/types/simulation/__init__.py +0 -0
  83. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/types/simulation/actor.py +0 -0
  84. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/src/strands_evals/types/trace.py +0 -0
  85. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/__init__.py +0 -0
  86. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/evaluators/test_evaluator.py +0 -0
  87. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/evaluators/test_faithfulness_evaluator.py +0 -0
  88. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py +0 -0
  89. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/evaluators/test_harmfulness_evaluator.py +0 -0
  90. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/evaluators/test_helpfulness_evaluator.py +0 -0
  91. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/evaluators/test_interactions_evaluator.py +0 -0
  92. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/evaluators/test_output_evaluator.py +0 -0
  93. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/evaluators/test_tool_parameter_accuracy_evaluator.py +0 -0
  94. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/evaluators/test_tool_selection_accuracy_evaluator.py +0 -0
  95. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/evaluators/test_trajectory_evaluator.py +0 -0
  96. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/extractors/test_graph_extractor.py +0 -0
  97. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/extractors/test_swarm_extractor.py +0 -0
  98. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/extractors/test_trace_extractor.py +0 -0
  99. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/generators/test_experiment_generator.py +0 -0
  100. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/generators/test_topic_planner.py +0 -0
  101. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/mappers/__init__.py +0 -0
  102. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/mappers/test_strands_in_memory_mapper.py +0 -0
  103. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/simulation/__init__.py +0 -0
  104. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/simulation/test_actor_simulator.py +0 -0
  105. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/simulation/test_goal_completion.py +0 -0
  106. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/telemetry/test_config.py +0 -0
  107. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/telemetry/test_tracer.py +0 -0
  108. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/test_cases.py +0 -0
  109. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/test_experiment.py +0 -0
  110. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/tools/test_evaluation_tools.py +0 -0
  111. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/strands_evals/types/test_trace.py +0 -0
  112. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests/test_integration.py +0 -0
  113. {strands_agents_evals-0.1.2 → strands_agents_evals-0.1.4}/tests_integ/test_output_evaluator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: strands-agents-evals
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: Evaluation framework for Strands
5
5
  Author-email: AWS <opensource@amazon.com>
6
6
  License: Apache-2.0
@@ -22,37 +22,39 @@ def extract_agent_tools_used_from_messages(agent_messages):
22
22
  if message.get("role") == "assistant":
23
23
  message_info = message.get("content")
24
24
  if len(message_info) > 0:
25
- tool = None
25
+ tools = []
26
26
  for message in message_info:
27
27
  if "toolUse" in message:
28
+ tools.append(message.get("toolUse"))
29
+
30
+ for tool in tools:
31
+ if tool:
32
+ tool_name = tool.get("name")
33
+ tool_input = tool.get("input")
34
+ tool_id = tool.get("toolUseId")
35
+ # get the tool result from the next message
36
+ tool_result = None
37
+ is_error = False
38
+ next_message_i = i + 1
39
+ while next_message_i < len(agent_messages):
40
+ next_message = agent_messages[next_message_i]
41
+ next_message_i += 1
42
+
43
+ if next_message.get("role") == "user":
44
+ content = next_message.get("content")
45
+ if content:
46
+ tool_result_dict = content[0].get("toolResult")
47
+ if tool_result_dict and tool_result_dict.get("toolUseId") == tool_id:
48
+ tool_result_content = tool_result_dict.get("content", [])
49
+ if len(tool_result_content) > 0:
50
+ tool_result = tool_result_content[0].get("text")
51
+ is_error = tool_result_dict.get("status") == "error"
52
+ break
53
+
54
+ tools_used.append(
55
+ {"name": tool_name, "input": tool_input, "tool_result": tool_result, "is_error": is_error}
56
+ )
28
57
  tool = message.get("toolUse")
29
-
30
- if tool:
31
- tool_name = tool.get("name")
32
- tool_input = tool.get("input")
33
- tool_id = tool.get("toolUseId")
34
- # get the tool result from the next message
35
- tool_result = None
36
- is_error = False
37
- next_message_i = i + 1
38
- while next_message_i < len(agent_messages):
39
- next_message = agent_messages[next_message_i]
40
- next_message_i += 1
41
-
42
- if next_message.get("role") == "user":
43
- content = next_message.get("content")
44
- if content:
45
- tool_result_dict = content[0].get("toolResult")
46
- if tool_result_dict and tool_result_dict.get("toolUseId") == tool_id:
47
- tool_result_content = tool_result_dict.get("content", [])
48
- if len(tool_result_content) > 0:
49
- tool_result = tool_result_content[0].get("text")
50
- is_error = tool_result_dict.get("status") == "error"
51
- break
52
-
53
- tools_used.append(
54
- {"name": tool_name, "input": tool_input, "tool_result": tool_result, "is_error": is_error}
55
- )
56
58
  return tools_used
57
59
 
58
60
 
@@ -45,9 +45,11 @@ class TraceExtractor:
45
45
  def _extract_trace_level(self, session: Session) -> list[TraceLevelInput]:
46
46
  """Extract trace-level inputs with session history up to each turn."""
47
47
  evaluation_inputs: list[TraceLevelInput] = []
48
- previous_turns: list[Union[UserMessage, AssistantMessage]] = []
48
+ previous_turns: list[Union[UserMessage, list[ToolExecution], AssistantMessage]] = []
49
49
 
50
50
  for trace in session.traces:
51
+ tool_spans = self._find_tool_execution_spans(trace)
52
+
51
53
  for span in trace.spans:
52
54
  if not isinstance(span, AgentInvocationSpan):
53
55
  continue
@@ -59,6 +61,17 @@ class TraceExtractor:
59
61
  logger.warning(f"Failed to create user message: {e}")
60
62
  continue
61
63
 
64
+ # Include tool executions in session history
65
+ if tool_spans:
66
+ try:
67
+ tool_executions = [
68
+ ToolExecution(tool_call=ts.tool_call, tool_result=ts.tool_result)
69
+ for ts in tool_spans
70
+ ]
71
+ previous_turns.append(tool_executions)
72
+ except (AttributeError, TypeError, ValueError) as e:
73
+ logger.warning(f"Failed to create tool executions: {e}")
74
+
62
75
  trace_input = TraceLevelInput(
63
76
  span_info=span.span_info,
64
77
  agent_response=TextContent(text=span.agent_response),
@@ -48,6 +48,70 @@ def test_tools_use_extractor_extract_from_messages_with_tools():
48
48
  assert result[0]["is_error"] is False
49
49
 
50
50
 
51
+ def test_tools_use_extractor_extract_from_messages_with_multiple_tools():
52
+ """Test extracting multiple tool usages from messages"""
53
+ messages = [
54
+ {"role": "user", "content": [{"text": "Calculate 2+2 and search for weather"}]},
55
+ {
56
+ "role": "assistant",
57
+ "content": [
58
+ {"text": "I'll calculate and search for you."},
59
+ {
60
+ "toolUse": {
61
+ "toolUseId": "tool1",
62
+ "name": "calculator",
63
+ "input": {"expression": "2+2"},
64
+ }
65
+ },
66
+ {
67
+ "toolUse": {
68
+ "toolUseId": "tool2",
69
+ "name": "web_search",
70
+ "input": {"query": "current weather"},
71
+ }
72
+ },
73
+ ],
74
+ },
75
+ {
76
+ "role": "user",
77
+ "content": [
78
+ {
79
+ "toolResult": {
80
+ "status": "success",
81
+ "content": [{"text": "Result: 4"}],
82
+ "toolUseId": "tool1",
83
+ }
84
+ }
85
+ ],
86
+ },
87
+ {
88
+ "role": "user",
89
+ "content": [
90
+ {
91
+ "toolResult": {
92
+ "status": "success",
93
+ "content": [{"text": "Sunny, 25°C"}],
94
+ "toolUseId": "tool2",
95
+ }
96
+ }
97
+ ],
98
+ },
99
+ {"role": "assistant", "content": [{"text": "Results: 4 and sunny weather."}]},
100
+ ]
101
+
102
+ result = extract_agent_tools_used_from_messages(messages)
103
+
104
+ assert len(result) == 2
105
+ assert result[0]["name"] == "calculator"
106
+ assert result[0]["input"] == {"expression": "2+2"}
107
+ assert result[0]["tool_result"] == "Result: 4"
108
+ assert result[0]["is_error"] is False
109
+ assert result[1]["name"] == "web_search"
110
+ assert result[1]["input"] == {"query": "current weather"}
111
+ assert result[1]["tool_result"] == "Sunny, 25°C"
112
+ assert result[1]["is_error"] is False
113
+
114
+
51
115
  def test_tools_use_extractor_extract_from_messages_no_tools():
52
116
  """Test extracting tool usage from messages without tool usage"""
53
117
  messages = [