strands-agents-evals 0.1.0__tar.gz → 0.1.2__tar.gz

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (113)
  1. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.github/workflows/pypi-publish-on-release.yml +2 -2
  2. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/PKG-INFO +72 -3
  3. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/README.md +71 -2
  4. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/pyproject.toml +2 -1
  5. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/__init__.py +0 -2
  6. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/experiment.py +21 -14
  7. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/extractors/tools_use_extractor.py +6 -2
  8. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/extractors/test_tools_use_extractor.py +69 -0
  9. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/test_experiment.py +36 -0
  10. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/test_integration.py +42 -0
  11. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  12. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  13. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  14. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  15. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.github/dependabot.yml +0 -0
  16. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.github/workflows/integration-test.yml +0 -0
  17. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.github/workflows/pr-and-push.yml +0 -0
  18. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.github/workflows/test-lint.yml +0 -0
  19. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.gitignore +0 -0
  20. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.pre-commit-config.yaml +0 -0
  21. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/CODE_OF_CONDUCT.md +0 -0
  22. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/CONTRIBUTING.md +0 -0
  23. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/LICENSE +0 -0
  24. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/NOTICE +0 -0
  25. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/STYLE_GUIDE.md +0 -0
  26. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/__init__.py +0 -0
  27. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/case.py +0 -0
  28. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/display/display_console.py +0 -0
  29. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/__init__.py +0 -0
  30. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/evaluator.py +0 -0
  31. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/faithfulness_evaluator.py +0 -0
  32. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/goal_success_rate_evaluator.py +0 -0
  33. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/harmfulness_evaluator.py +0 -0
  34. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/helpfulness_evaluator.py +0 -0
  35. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/interactions_evaluator.py +0 -0
  36. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/output_evaluator.py +0 -0
  37. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/case_prompt_template.py +0 -0
  38. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +0 -0
  39. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +0 -0
  40. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +0 -0
  41. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +0 -0
  42. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +0 -0
  43. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +0 -0
  44. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +0 -0
  45. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +0 -0
  46. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/prompt_templates.py +0 -0
  47. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +0 -0
  48. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +0 -0
  49. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +0 -0
  50. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +0 -0
  51. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +0 -0
  52. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/tool_selection_accuracy_evaluator.py +0 -0
  53. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/trajectory_evaluator.py +0 -0
  54. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/extractors/__init__.py +0 -0
  55. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/extractors/graph_extractor.py +0 -0
  56. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/extractors/swarm_extractor.py +0 -0
  57. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/extractors/trace_extractor.py +0 -0
  58. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/generators/__init__.py +0 -0
  59. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/generators/experiment_generator.py +0 -0
  60. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/generators/prompt_template/prompt_templates.py +0 -0
  61. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/generators/topic_planner.py +0 -0
  62. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/mappers/__init__.py +0 -0
  63. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/mappers/session_mapper.py +0 -0
  64. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/mappers/strands_in_memory_session_mapper.py +0 -0
  65. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/README.md +0 -0
  66. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/__init__.py +0 -0
  67. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/actor_simulator.py +0 -0
  68. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/profiles/__init__.py +0 -0
  69. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/profiles/actor_profile.py +0 -0
  70. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/prompt_templates/__init__.py +0 -0
  71. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/prompt_templates/actor_profile_extraction.py +0 -0
  72. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/prompt_templates/actor_system_prompt.py +0 -0
  73. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/prompt_templates/goal_completion.py +0 -0
  74. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/tools/__init__.py +0 -0
  75. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/tools/goal_completion.py +0 -0
  76. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/telemetry/__init__.py +0 -0
  77. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/telemetry/_cloudwatch_logger.py +0 -0
  78. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/telemetry/config.py +0 -0
  79. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/telemetry/tracer.py +0 -0
  80. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/tools/evaluation_tools.py +0 -0
  81. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/types/__init__.py +0 -0
  82. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/types/evaluation.py +0 -0
  83. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/types/evaluation_report.py +0 -0
  84. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/types/simulation/__init__.py +0 -0
  85. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/types/simulation/actor.py +0 -0
  86. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/types/trace.py +0 -0
  87. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/__init__.py +0 -0
  88. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_evaluator.py +0 -0
  89. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_faithfulness_evaluator.py +0 -0
  90. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py +0 -0
  91. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_harmfulness_evaluator.py +0 -0
  92. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_helpfulness_evaluator.py +0 -0
  93. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_interactions_evaluator.py +0 -0
  94. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_output_evaluator.py +0 -0
  95. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_tool_parameter_accuracy_evaluator.py +0 -0
  96. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_tool_selection_accuracy_evaluator.py +0 -0
  97. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_trajectory_evaluator.py +0 -0
  98. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/extractors/test_graph_extractor.py +0 -0
  99. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/extractors/test_swarm_extractor.py +0 -0
  100. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/extractors/test_trace_extractor.py +0 -0
  101. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/generators/test_experiment_generator.py +0 -0
  102. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/generators/test_topic_planner.py +0 -0
  103. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/mappers/__init__.py +0 -0
  104. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/mappers/test_strands_in_memory_mapper.py +0 -0
  105. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/simulation/__init__.py +0 -0
  106. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/simulation/test_actor_simulator.py +0 -0
  107. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/simulation/test_goal_completion.py +0 -0
  108. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/telemetry/test_config.py +0 -0
  109. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/telemetry/test_tracer.py +0 -0
  110. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/test_cases.py +0 -0
  111. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/tools/test_evaluation_tools.py +0 -0
  112. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/types/test_trace.py +0 -0
  113. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests_integ/test_output_evaluator.py +0 -0
.github/workflows/pypi-publish-on-release.yml

@@ -52,7 +52,7 @@ jobs:
           hatch build
 
       - name: Store the distribution packages
-        uses: actions/upload-artifact@v5
+        uses: actions/upload-artifact@v6
        with:
           name: python-package-distributions
           path: dist/
@@ -74,7 +74,7 @@ jobs:
 
     steps:
       - name: Download all the dists
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v7
        with:
           name: python-package-distributions
           path: dist/
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: strands-agents-evals
-Version: 0.1.0
+Version: 0.1.2
 Summary: Evaluation framework for Strands
 Author-email: AWS <opensource@amazon.com>
 License: Apache-2.0
@@ -68,6 +68,7 @@ Strands Evaluation is a powerful framework for evaluating AI agents and LLM appl
 ## Feature Overview
 
 - **Multiple Evaluation Types**: Output evaluation, trajectory analysis, tool usage assessment, and interaction evaluation
+- **Dynamic Simulators**: Multi-turn conversation simulation with realistic user behavior and goal-oriented interactions
 - **LLM-as-a-Judge**: Built-in evaluators using language models for sophisticated assessment with structured scoring
 - **Trace-based Evaluation**: Analyze agent behavior through OpenTelemetry execution traces
 - **Automated Experiment Generation**: Generate comprehensive test suites from context descriptions
@@ -226,6 +227,73 @@ reports = experiment.run_evaluations(user_task_function)
 reports[0].run_display()
 ```
 
+### Multi-turn Conversation Simulation
+
+Simulate realistic user interactions with dynamic, goal-oriented conversations using ActorSimulator:
+
+```python
+from strands import Agent
+from strands_evals import Case, Experiment, ActorSimulator
+from strands_evals.evaluators import HelpfulnessEvaluator, GoalSuccessRateEvaluator
+from strands_evals.mappers import StrandsInMemorySessionMapper
+from strands_evals.telemetry import StrandsEvalsTelemetry
+
+# Setup telemetry
+telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
+memory_exporter = telemetry.in_memory_exporter
+
+def task_function(case: Case) -> dict:
+    # Create simulator to drive conversation
+    simulator = ActorSimulator.from_case_for_user_simulator(
+        case=case,
+        max_turns=10
+    )
+
+    # Create agent to evaluate
+    agent = Agent(
+        trace_attributes={
+            "gen_ai.conversation.id": case.session_id,
+            "session.id": case.session_id
+        },
+        callback_handler=None
+    )
+
+    # Run multi-turn conversation
+    all_spans = []
+    user_message = case.input
+
+    while simulator.has_next():
+        memory_exporter.clear()
+        agent_response = agent(user_message)
+        turn_spans = list(memory_exporter.get_finished_spans())
+        all_spans.extend(turn_spans)
+
+        user_result = simulator.act(str(agent_response))
+        user_message = str(user_result.structured_output.message)
+
+    # Map to session for evaluation
+    mapper = StrandsInMemorySessionMapper()
+    session = mapper.map_to_session(all_spans, session_id=case.session_id)
+
+    return {"output": str(agent_response), "trajectory": session}
+
+# Use evaluators to assess simulated conversations
+evaluators = [
+    HelpfulnessEvaluator(),
+    GoalSuccessRateEvaluator()
+]
+
+experiment = Experiment(cases=test_cases, evaluators=evaluators)
+reports = experiment.run_evaluations(task_function)
+```
+
+**Key Benefits:**
+- **Dynamic Interactions**: Simulator adapts responses based on agent behavior
+- **Goal-Oriented Testing**: Verify agents can complete user objectives through dialogue
+- **Realistic Conversations**: Generate authentic multi-turn interaction patterns
+- **No Predefined Scripts**: Test agents without hardcoded conversation paths
+- **Comprehensive Evaluation**: Combine with trace-based evaluators for full assessment
+
 ### Automated Experiment Generation
 
 Generate comprehensive test suites automatically from context descriptions:
@@ -388,8 +456,9 @@ reports[0].run_display() # Interactive display with metrics breakdown
 
 For detailed guidance & examples, explore our documentation:
 
-- [User Guide](https://strandsagents.com/latest//user-guide/evals-sdk/quickstart.md)
-- [Evaluator Reference](https://strandsagents.com/latest/user-guide/evals-sdk/evaluators/)
+- [User Guide](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/quickstart/)
+- [Evaluator Reference](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/evaluators/)
+- [Simulators Guide](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/simulators/)
 
 ## Contributing ❤️
 
README.md

@@ -36,6 +36,7 @@ Strands Evaluation is a powerful framework for evaluating AI agents and LLM appl
 ## Feature Overview
 
 - **Multiple Evaluation Types**: Output evaluation, trajectory analysis, tool usage assessment, and interaction evaluation
+- **Dynamic Simulators**: Multi-turn conversation simulation with realistic user behavior and goal-oriented interactions
 - **LLM-as-a-Judge**: Built-in evaluators using language models for sophisticated assessment with structured scoring
 - **Trace-based Evaluation**: Analyze agent behavior through OpenTelemetry execution traces
 - **Automated Experiment Generation**: Generate comprehensive test suites from context descriptions
@@ -194,6 +195,73 @@ reports = experiment.run_evaluations(user_task_function)
 reports[0].run_display()
 ```
 
+### Multi-turn Conversation Simulation
+
+Simulate realistic user interactions with dynamic, goal-oriented conversations using ActorSimulator:
+
+```python
+from strands import Agent
+from strands_evals import Case, Experiment, ActorSimulator
+from strands_evals.evaluators import HelpfulnessEvaluator, GoalSuccessRateEvaluator
+from strands_evals.mappers import StrandsInMemorySessionMapper
+from strands_evals.telemetry import StrandsEvalsTelemetry
+
+# Setup telemetry
+telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
+memory_exporter = telemetry.in_memory_exporter
+
+def task_function(case: Case) -> dict:
+    # Create simulator to drive conversation
+    simulator = ActorSimulator.from_case_for_user_simulator(
+        case=case,
+        max_turns=10
+    )
+
+    # Create agent to evaluate
+    agent = Agent(
+        trace_attributes={
+            "gen_ai.conversation.id": case.session_id,
+            "session.id": case.session_id
+        },
+        callback_handler=None
+    )
+
+    # Run multi-turn conversation
+    all_spans = []
+    user_message = case.input
+
+    while simulator.has_next():
+        memory_exporter.clear()
+        agent_response = agent(user_message)
+        turn_spans = list(memory_exporter.get_finished_spans())
+        all_spans.extend(turn_spans)
+
+        user_result = simulator.act(str(agent_response))
+        user_message = str(user_result.structured_output.message)
+
+    # Map to session for evaluation
+    mapper = StrandsInMemorySessionMapper()
+    session = mapper.map_to_session(all_spans, session_id=case.session_id)
+
+    return {"output": str(agent_response), "trajectory": session}
+
+# Use evaluators to assess simulated conversations
+evaluators = [
+    HelpfulnessEvaluator(),
+    GoalSuccessRateEvaluator()
+]
+
+experiment = Experiment(cases=test_cases, evaluators=evaluators)
+reports = experiment.run_evaluations(task_function)
+```
+
+**Key Benefits:**
+- **Dynamic Interactions**: Simulator adapts responses based on agent behavior
+- **Goal-Oriented Testing**: Verify agents can complete user objectives through dialogue
+- **Realistic Conversations**: Generate authentic multi-turn interaction patterns
+- **No Predefined Scripts**: Test agents without hardcoded conversation paths
+- **Comprehensive Evaluation**: Combine with trace-based evaluators for full assessment
+
 ### Automated Experiment Generation
 
 Generate comprehensive test suites automatically from context descriptions:
@@ -356,8 +424,9 @@ reports[0].run_display() # Interactive display with metrics breakdown
 
 For detailed guidance & examples, explore our documentation:
 
-- [User Guide](https://strandsagents.com/latest//user-guide/evals-sdk/quickstart.md)
-- [Evaluator Reference](https://strandsagents.com/latest/user-guide/evals-sdk/evaluators/)
+- [User Guide](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/quickstart/)
+- [Evaluator Reference](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/evaluators/)
+- [Simulators Guide](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/simulators/)
 
 ## Contributing ❤️
 
pyproject.toml

@@ -138,7 +138,8 @@ disable_error_code = [
 disallow_untyped_decorators = false
 
 [tool.hatch.version]
-path = "src/strands_evals/__init__.py"
+source = "vcs" # Use git tags for versioning
+
 [tool.pytest.ini_options]
 asyncio_mode = "auto"
 testpaths = ["tests"]
src/strands_evals/__init__.py

@@ -1,5 +1,3 @@
-__version__ = "0.1.0"
-
 from . import evaluators, extractors, generators, simulation, telemetry, types
 from .case import Case
 from .experiment import Experiment
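
With the static `__version__` attribute removed and `[tool.hatch.version]` switched to `source = "vcs"`, the package version is now derived from git tags at build time (this normally relies on the `hatch-vcs` plugin being present in the build requirements, which is not visible in this diff). A minimal sketch of how downstream code might read the version now, assuming the package does not re-export `__version__`:

```python
# Hypothetical usage sketch: read the installed version from distribution
# metadata instead of the removed strands_evals.__version__ attribute.
from importlib.metadata import PackageNotFoundError, version

try:
    # Distribution name taken from the PKG-INFO diff above.
    pkg_version = version("strands-agents-evals")
except PackageNotFoundError:
    # Not installed (e.g. running from a source checkout without metadata).
    pkg_version = "unknown"

print(pkg_version)  # e.g. "0.1.2"
```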
src/strands_evals/experiment.py

@@ -391,8 +391,8 @@ class Experiment(Generic[InputT, OutputT]):
                 "gen_ai.evaluation.case.input": serialize(case.input),
             },
         ) as case_span:
+            # Task execution span - execute once
             try:
-                # Task execution span - execute once
                 with self._tracer.start_as_current_span(
                     "task_execution",
                     attributes={
@@ -414,9 +414,21 @@
                        ),
                    }
                )
-
-                # Evaluate with each evaluator using the same task output
+            except Exception as e:
+                case_span.record_exception(e)
                for evaluator in self._evaluators:
+                    eval_name = evaluator.get_type_name()
+                    evaluator_data[eval_name]["cases"].append(case.model_dump())
+                    evaluator_data[eval_name]["test_passes"].append(False)
+                    evaluator_data[eval_name]["scores"].append(0)
+                    evaluator_data[eval_name]["reasons"].append(f"Task execution error: {str(e)}")
+                    evaluator_data[eval_name]["detailed_results"].append([])
+                continue
+
+            # Evaluate with each evaluator using the same task output
+            for evaluator in self._evaluators:
+                eval_name = evaluator.get_type_name()
+                try:
                    with self._tracer.start_as_current_span(
                        f"evaluator {evaluator.get_type_name()}",
                        attributes={
@@ -436,21 +448,16 @@
                        }
                    )
 
-                    eval_name = evaluator.get_type_name()
                    evaluator_data[eval_name]["cases"].append(evaluation_context.model_dump())
                    evaluator_data[eval_name]["test_passes"].append(aggregate_pass)
                    evaluator_data[eval_name]["scores"].append(aggregate_score)
                    evaluator_data[eval_name]["reasons"].append(aggregate_reason or "")
                    evaluator_data[eval_name]["detailed_results"].append(evaluation_outputs)
-
-            except Exception as e:
-                case_span.record_exception(e)
-                for evaluator in self._evaluators:
-                    eval_name = evaluator.get_type_name()
-                    evaluator_data[eval_name]["cases"].append(case.model_dump())
+                except Exception as e:
+                    evaluator_data[eval_name]["cases"].append(evaluation_context.model_dump())
                    evaluator_data[eval_name]["test_passes"].append(False)
                    evaluator_data[eval_name]["scores"].append(0)
-                    evaluator_data[eval_name]["reasons"].append(f"An error occured : {str(e)}")
+                    evaluator_data[eval_name]["reasons"].append(f"Evaluator error: {str(e)}")
                    evaluator_data[eval_name]["detailed_results"].append([])
 
        reports = []
@@ -577,8 +584,8 @@
 
        file_path.parent.mkdir(parents=True, exist_ok=True)
 
-        with open(file_path, "w") as f:
-            json.dump(self.to_dict(), f, indent=2)
+        with open(file_path, "w", encoding="utf-8") as f:
+            json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)
 
    @classmethod
    def from_dict(cls, data: dict, custom_evaluators: list[type[Evaluator]] | None = None):
@@ -646,7 +653,7 @@
                f"Only .json format is supported. Got file: {path}. Please provide a path with .json extension."
            )
 
-        with open(file_path, "r") as f:
+        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
 
        return cls.from_dict(data, custom_evaluators)
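
Taken together, the experiment.py hunks rework per-case error handling: a task failure records a "Task execution error" for every evaluator and skips to the next case, while each evaluator now runs inside its own try/except so one failing evaluator no longer wipes out the others' results. A self-contained sketch of that control flow, using illustrative names (`run_task`, `GoodEvaluator`, `BrokenEvaluator`, `records`) rather than the actual `Experiment` internals:

```python
# Minimal, self-contained sketch of the error-isolation flow introduced above.
# All names here are illustrative stand-ins, not the Experiment internals.
from collections import defaultdict


def run_task(case: str) -> str:
    return case.upper()


class GoodEvaluator:
    name = "good"

    def evaluate(self, output: str) -> float:
        return 1.0


class BrokenEvaluator:
    name = "broken"

    def evaluate(self, output: str) -> float:
        raise RuntimeError("evaluator exploded")


cases = ["hello", "world"]
evaluators = [GoodEvaluator(), BrokenEvaluator()]
records: dict[str, list] = defaultdict(list)

for case in cases:
    try:
        output = run_task(case)  # the task runs exactly once per case
    except Exception as exc:
        for ev in evaluators:  # a task failure marks every evaluator's record
            records[ev.name].append((case, 0, f"Task execution error: {exc}"))
        continue  # skip evaluation, move on to the next case

    for ev in evaluators:
        try:
            score = ev.evaluate(output)  # a failure here stays local to this evaluator
            records[ev.name].append((case, score, ""))
        except Exception as exc:
            records[ev.name].append((case, 0, f"Evaluator error: {exc}"))

print(dict(records))  # "good" keeps its scores even though "broken" always fails
```

The same behaviour is exercised by the new `test_experiment_run_evaluations_evaluator_error_isolated` test further down.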
src/strands_evals/extractors/tools_use_extractor.py

@@ -33,6 +33,7 @@ def extract_agent_tools_used_from_messages(agent_messages):
            tool_id = tool.get("toolUseId")
            # get the tool result from the next message
            tool_result = None
+            is_error = False
            next_message_i = i + 1
            while next_message_i < len(agent_messages):
                next_message = agent_messages[next_message_i]
@@ -42,13 +43,16 @@
                content = next_message.get("content")
                if content:
                    tool_result_dict = content[0].get("toolResult")
-                    if tool_result_dict.get("toolUseId") == tool_id:
+                    if tool_result_dict and tool_result_dict.get("toolUseId") == tool_id:
                        tool_result_content = tool_result_dict.get("content", [])
                        if len(tool_result_content) > 0:
                            tool_result = tool_result_content[0].get("text")
+                        is_error = tool_result_dict.get("status") == "error"
                        break
 
-            tools_used.append({"name": tool_name, "input": tool_input, "tool_result": tool_result})
+            tools_used.append(
+                {"name": tool_name, "input": tool_input, "tool_result": tool_result, "is_error": is_error}
+            )
    return tools_used
 
 
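For reference, a small usage sketch of the updated extractor: each entry in the returned list now carries an `is_error` flag alongside `name`, `input`, and `tool_result`. The message payloads below mirror the shapes used in the package's own tests rather than any particular agent run:

```python
# Usage sketch for the updated extractor; message shapes mirror the test fixtures below.
from strands_evals.extractors.tools_use_extractor import extract_agent_tools_used_from_messages

messages = [
    {"role": "user", "content": [{"text": "What is 2+2?"}]},
    {
        "role": "assistant",
        "content": [{"toolUse": {"toolUseId": "t1", "name": "calculator", "input": {"expression": "2+2"}}}],
    },
    {
        "role": "user",
        "content": [{"toolResult": {"status": "success", "content": [{"text": "Result: 4"}], "toolUseId": "t1"}}],
    },
]

tools_used = extract_agent_tools_used_from_messages(messages)
# Each entry now includes the error flag, e.g.:
# [{"name": "calculator", "input": {"expression": "2+2"}, "tool_result": "Result: 4", "is_error": False}]
for tool in tools_used:
    if tool["is_error"]:
        print(f"{tool['name']} failed: {tool['tool_result']}")
```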
 
tests/strands_evals/extractors/test_tools_use_extractor.py

@@ -45,6 +45,7 @@ def test_tools_use_extractor_extract_from_messages_with_tools():
    assert result[0]["name"] == "calculator"
    assert result[0]["input"] == {"expression": "2+2"}
    assert result[0]["tool_result"] == "Result: 4"
+    assert result[0]["is_error"] is False
 
 
 def test_tools_use_extractor_extract_from_messages_no_tools():
@@ -59,6 +60,38 @@
    assert result == []
 
 
+def test_tools_use_extractor_extract_from_messages_with_error():
+    """Test extracting tool usage from messages with error status"""
+    messages = [
+        {"role": "user", "content": [{"text": "Calculate invalid"}]},
+        {
+            "role": "assistant",
+            "content": [
+                {"toolUse": {"toolUseId": "tool_123", "name": "calculator", "input": {"expression": "invalid"}}},
+            ],
+        },
+        {
+            "role": "user",
+            "content": [
+                {
+                    "toolResult": {
+                        "status": "error",
+                        "content": [{"text": "Invalid expression"}],
+                        "toolUseId": "tool_123",
+                    }
+                }
+            ],
+        },
+    ]
+
+    result = extract_agent_tools_used_from_messages(messages)
+
+    assert len(result) == 1
+    assert result[0]["name"] == "calculator"
+    assert result[0]["tool_result"] == "Invalid expression"
+    assert result[0]["is_error"] is True
+
+
 def test_tools_use_extractor_extract_from_messages_empty():
    """Test extracting tool usage from empty messages"""
    result = extract_agent_tools_used_from_messages([])
@@ -96,6 +129,7 @@ def test_tools_use_extractor_extract_from_messages_no_tool_result():
    assert result[0]["name"] == "calculator"
    assert result[0]["input"] == {"expression": "2+2"}
    assert result[0]["tool_result"] is None
+    assert result[0]["is_error"] is False
 
 
 def test_tools_use_extractor_extract_from_messages_malformed_tool_result():
@@ -209,3 +243,38 @@ def test_tools_use_extractor_extract_tools_description_empty():
    result = extract_tools_description(mock_agent, is_short=True)
 
    assert result == {}
+
+
+def test_tools_use_extractor_extract_from_messages_user_message_without_tool_result():
+    """Test extracting tool usage when user message content lacks toolResult key."""
+    messages = [
+        {
+            "role": "assistant",
+            "content": [
+                {"toolUse": {"toolUseId": "tool_abc", "name": "calculator", "input": {"expression": "5+5"}}},
+            ],
+        },
+        {
+            "role": "user",
+            "content": [{"text": "Some user text without toolResult"}],  # No toolResult key
+        },
+        {
+            "role": "user",
+            "content": [
+                {
+                    "toolResult": {
+                        "status": "success",
+                        "content": [{"text": "Result: 10"}],
+                        "toolUseId": "tool_abc",
+                    }
+                }
+            ],
+        },
+    ]
+    result = extract_agent_tools_used_from_messages(messages)
+
+    assert len(result) == 1
+    assert result[0]["name"] == "calculator"
+    assert result[0]["input"] == {"expression": "5+5"}
+    assert result[0]["tool_result"] == "Result: 10"
+    assert result[0]["is_error"] is False
tests/strands_evals/test_experiment.py

@@ -34,6 +34,16 @@ class MockEvaluator2(Evaluator[str, str]):
        return [EvaluationOutput(score=0.5, test_pass=True, reason="Async test evaluation 2")]
 
 
+class ThrowingEvaluator(Evaluator[str, str]):
+    """Evaluator that always throws an exception - used to test error isolation"""
+
+    def evaluate(self, evaluation_case: EvaluationData[str, str]) -> list[EvaluationOutput]:
+        raise RuntimeError("Evaluator exploded")
+
+    async def evaluate_async(self, evaluation_case: EvaluationData[str, str]) -> list[EvaluationOutput]:
+        raise RuntimeError("Async evaluator exploded")
+
+
 @pytest.fixture
 def mock_evaluator():
    return MockEvaluator()
@@ -1052,3 +1062,29 @@ def test_experiment_run_evaluations_multiple_cases(mock_span, simple_task):
    assert len(reports) == 1
    assert len(reports[0].scores) == 2
    assert all(score == 1.0 for score in reports[0].scores)
+
+
+def test_experiment_run_evaluations_evaluator_error_isolated():
+    """Test that one evaluator failing doesn't affect other evaluators."""
+    case = Case(name="test", input="hello", expected_output="hello")
+
+    # MockEvaluator succeeds, ThrowingEvaluator fails
+    experiment = Experiment(cases=[case], evaluators=[MockEvaluator(), ThrowingEvaluator()])
+
+    def echo_task(c):
+        return c.input
+
+    reports = experiment.run_evaluations(echo_task)
+
+    assert len(reports) == 2
+
+    # First evaluator (MockEvaluator) should succeed
+    assert reports[0].scores[0] == 1.0
+    assert reports[0].test_passes[0] is True
+    assert reports[0].reasons[0] == "Mock evaluation"
+
+    # Second evaluator (ThrowingEvaluator) should fail with error message
+    assert reports[1].scores[0] == 0
+    assert reports[1].test_passes[0] is False
+    assert "Evaluator error" in reports[1].reasons[0]
+    assert "Evaluator exploded" in reports[1].reasons[0]
tests/test_integration.py

@@ -348,3 +348,45 @@ async def test_async_dataset_with_interactions(interaction_case):
    assert len(report.cases) == 1
    assert report.cases[0].get("actual_interactions") is not None
    assert len(report.cases[0].get("actual_interactions")) == 2
+
+
+def test_integration_tool_error_extraction():
+    """Test that is_error field is correctly extracted from tool execution"""
+    from strands_evals.extractors.tools_use_extractor import extract_agent_tools_used_from_messages
+
+    # Create mock messages simulating tool success and error
+    messages = [
+        {"role": "user", "content": [{"text": "test"}]},
+        {
+            "role": "assistant",
+            "content": [
+                {"toolUse": {"toolUseId": "tool1", "name": "success_tool", "input": {}}},
+            ],
+        },
+        {
+            "role": "user",
+            "content": [
+                {"toolResult": {"status": "success", "content": [{"text": "ok"}], "toolUseId": "tool1"}},
+            ],
+        },
+        {
+            "role": "assistant",
+            "content": [
+                {"toolUse": {"toolUseId": "tool2", "name": "error_tool", "input": {}}},
+            ],
+        },
+        {
+            "role": "user",
+            "content": [
+                {"toolResult": {"status": "error", "content": [{"text": "failed"}], "toolUseId": "tool2"}},
+            ],
+        },
+    ]
+
+    tools_used = extract_agent_tools_used_from_messages(messages)
+
+    assert len(tools_used) == 2
+    assert tools_used[0]["name"] == "success_tool"
+    assert tools_used[0]["is_error"] is False
+    assert tools_used[1]["name"] == "error_tool"
+    assert tools_used[1]["is_error"] is True