strands-agents-evals 0.1.3__tar.gz → 0.1.4__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (113)
  1. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/PKG-INFO +1 -1
  2. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/extractors/trace_extractor.py +14 -1
  3. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  4. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  5. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  6. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  7. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/.github/dependabot.yml +0 -0
  8. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/.github/workflows/integration-test.yml +0 -0
  9. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/.github/workflows/pr-and-push.yml +0 -0
  10. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/.github/workflows/pypi-publish-on-release.yml +0 -0
  11. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/.github/workflows/test-lint.yml +0 -0
  12. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/.gitignore +0 -0
  13. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/.pre-commit-config.yaml +0 -0
  14. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/CODE_OF_CONDUCT.md +0 -0
  15. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/CONTRIBUTING.md +0 -0
  16. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/LICENSE +0 -0
  17. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/NOTICE +0 -0
  18. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/README.md +0 -0
  19. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/STYLE_GUIDE.md +0 -0
  20. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/pyproject.toml +0 -0
  21. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/__init__.py +0 -0
  22. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/__init__.py +0 -0
  23. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/case.py +0 -0
  24. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/display/display_console.py +0 -0
  25. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/__init__.py +0 -0
  26. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/evaluator.py +0 -0
  27. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/faithfulness_evaluator.py +0 -0
  28. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/goal_success_rate_evaluator.py +0 -0
  29. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/harmfulness_evaluator.py +0 -0
  30. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/helpfulness_evaluator.py +0 -0
  31. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/interactions_evaluator.py +0 -0
  32. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/output_evaluator.py +0 -0
  33. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/case_prompt_template.py +0 -0
  34. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +0 -0
  35. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +0 -0
  36. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +0 -0
  37. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +0 -0
  38. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +0 -0
  39. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +0 -0
  40. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +0 -0
  41. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +0 -0
  42. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/prompt_templates.py +0 -0
  43. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +0 -0
  44. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +0 -0
  45. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +0 -0
  46. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +0 -0
  47. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +0 -0
  48. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/tool_selection_accuracy_evaluator.py +0 -0
  49. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/evaluators/trajectory_evaluator.py +0 -0
  50. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/experiment.py +0 -0
  51. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/extractors/__init__.py +0 -0
  52. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/extractors/graph_extractor.py +0 -0
  53. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/extractors/swarm_extractor.py +0 -0
  54. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/extractors/tools_use_extractor.py +0 -0
  55. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/generators/__init__.py +0 -0
  56. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/generators/experiment_generator.py +0 -0
  57. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/generators/prompt_template/prompt_templates.py +0 -0
  58. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/generators/topic_planner.py +0 -0
  59. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/mappers/__init__.py +0 -0
  60. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/mappers/session_mapper.py +0 -0
  61. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/mappers/strands_in_memory_session_mapper.py +0 -0
  62. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/simulation/README.md +0 -0
  63. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/simulation/__init__.py +0 -0
  64. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/simulation/actor_simulator.py +0 -0
  65. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/simulation/profiles/__init__.py +0 -0
  66. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/simulation/profiles/actor_profile.py +0 -0
  67. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/simulation/prompt_templates/__init__.py +0 -0
  68. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/simulation/prompt_templates/actor_profile_extraction.py +0 -0
  69. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/simulation/prompt_templates/actor_system_prompt.py +0 -0
  70. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/simulation/prompt_templates/goal_completion.py +0 -0
  71. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/simulation/tools/__init__.py +0 -0
  72. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/simulation/tools/goal_completion.py +0 -0
  73. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/telemetry/__init__.py +0 -0
  74. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/telemetry/_cloudwatch_logger.py +0 -0
  75. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/telemetry/config.py +0 -0
  76. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/telemetry/tracer.py +0 -0
  77. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/tools/evaluation_tools.py +0 -0
  78. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/types/__init__.py +0 -0
  79. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/types/evaluation.py +0 -0
  80. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/types/evaluation_report.py +0 -0
  81. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/types/simulation/__init__.py +0 -0
  82. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/types/simulation/actor.py +0 -0
  83. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/src/strands_evals/types/trace.py +0 -0
  84. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/__init__.py +0 -0
  85. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/evaluators/test_evaluator.py +0 -0
  86. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/evaluators/test_faithfulness_evaluator.py +0 -0
  87. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py +0 -0
  88. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/evaluators/test_harmfulness_evaluator.py +0 -0
  89. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/evaluators/test_helpfulness_evaluator.py +0 -0
  90. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/evaluators/test_interactions_evaluator.py +0 -0
  91. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/evaluators/test_output_evaluator.py +0 -0
  92. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/evaluators/test_tool_parameter_accuracy_evaluator.py +0 -0
  93. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/evaluators/test_tool_selection_accuracy_evaluator.py +0 -0
  94. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/evaluators/test_trajectory_evaluator.py +0 -0
  95. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/extractors/test_graph_extractor.py +0 -0
  96. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/extractors/test_swarm_extractor.py +0 -0
  97. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/extractors/test_tools_use_extractor.py +0 -0
  98. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/extractors/test_trace_extractor.py +0 -0
  99. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/generators/test_experiment_generator.py +0 -0
  100. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/generators/test_topic_planner.py +0 -0
  101. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/mappers/__init__.py +0 -0
  102. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/mappers/test_strands_in_memory_mapper.py +0 -0
  103. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/simulation/__init__.py +0 -0
  104. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/simulation/test_actor_simulator.py +0 -0
  105. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/simulation/test_goal_completion.py +0 -0
  106. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/telemetry/test_config.py +0 -0
  107. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/telemetry/test_tracer.py +0 -0
  108. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/test_cases.py +0 -0
  109. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/test_experiment.py +0 -0
  110. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/tools/test_evaluation_tools.py +0 -0
  111. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/strands_evals/types/test_trace.py +0 -0
  112. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests/test_integration.py +0 -0
  113. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.4}/tests_integ/test_output_evaluator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: strands-agents-evals
3
- Version: 0.1.3
3
+ Version: 0.1.4
4
4
  Summary: Evaluation framework for Strands
5
5
  Author-email: AWS <opensource@amazon.com>
6
6
  License: Apache-2.0
@@ -45,9 +45,11 @@ class TraceExtractor:
45
45
  def _extract_trace_level(self, session: Session) -> list[TraceLevelInput]:
46
46
  """Extract trace-level inputs with session history up to each turn."""
47
47
  evaluation_inputs: list[TraceLevelInput] = []
48
- previous_turns: list[Union[UserMessage, AssistantMessage]] = []
48
+ previous_turns: list[Union[UserMessage, list[ToolExecution], AssistantMessage]] = []
49
49
 
50
50
  for trace in session.traces:
51
+ tool_spans = self._find_tool_execution_spans(trace)
52
+
51
53
  for span in trace.spans:
52
54
  if not isinstance(span, AgentInvocationSpan):
53
55
  continue
@@ -59,6 +61,17 @@ class TraceExtractor:
59
61
  logger.warning(f"Failed to create user message: {e}")
60
62
  continue
61
63
 
64
+ # Include tool executions in session history
65
+ if tool_spans:
66
+ try:
67
+ tool_executions = [
68
+ ToolExecution(tool_call=ts.tool_call, tool_result=ts.tool_result)
69
+ for ts in tool_spans
70
+ ]
71
+ previous_turns.append(tool_executions)
72
+ except (AttributeError, TypeError, ValueError) as e:
73
+ logger.warning(f"Failed to create tool executions: {e}")
74
+
62
75
  trace_input = TraceLevelInput(
63
76
  span_info=span.span_info,
64
77
  agent_response=TextContent(text=span.agent_response),