strands-agents-evals 0.1.0__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/.github/workflows/pypi-publish-on-release.yml +2 -2
  2. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/PKG-INFO +72 -3
  3. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/README.md +71 -2
  4. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/pyproject.toml +2 -1
  5. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/__init__.py +0 -2
  6. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/experiment.py +3 -3
  7. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/extractors/tools_use_extractor.py +5 -1
  8. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/extractors/test_tools_use_extractor.py +34 -0
  9. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/test_integration.py +42 -0
  10. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  11. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  12. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  13. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  14. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/.github/dependabot.yml +0 -0
  15. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/.github/workflows/integration-test.yml +0 -0
  16. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/.github/workflows/pr-and-push.yml +0 -0
  17. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/.github/workflows/test-lint.yml +0 -0
  18. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/.gitignore +0 -0
  19. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/.pre-commit-config.yaml +0 -0
  20. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/CODE_OF_CONDUCT.md +0 -0
  21. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/CONTRIBUTING.md +0 -0
  22. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/LICENSE +0 -0
  23. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/NOTICE +0 -0
  24. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/STYLE_GUIDE.md +0 -0
  25. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/__init__.py +0 -0
  26. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/case.py +0 -0
  27. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/display/display_console.py +0 -0
  28. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/__init__.py +0 -0
  29. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/evaluator.py +0 -0
  30. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/faithfulness_evaluator.py +0 -0
  31. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/goal_success_rate_evaluator.py +0 -0
  32. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/harmfulness_evaluator.py +0 -0
  33. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/helpfulness_evaluator.py +0 -0
  34. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/interactions_evaluator.py +0 -0
  35. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/output_evaluator.py +0 -0
  36. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/prompt_templates/case_prompt_template.py +0 -0
  37. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +0 -0
  38. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +0 -0
  39. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +0 -0
  40. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +0 -0
  41. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +0 -0
  42. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +0 -0
  43. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +0 -0
  44. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +0 -0
  45. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/prompt_templates/prompt_templates.py +0 -0
  46. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +0 -0
  47. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +0 -0
  48. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +0 -0
  49. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +0 -0
  50. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +0 -0
  51. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/tool_selection_accuracy_evaluator.py +0 -0
  52. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/evaluators/trajectory_evaluator.py +0 -0
  53. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/extractors/__init__.py +0 -0
  54. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/extractors/graph_extractor.py +0 -0
  55. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/extractors/swarm_extractor.py +0 -0
  56. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/extractors/trace_extractor.py +0 -0
  57. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/generators/__init__.py +0 -0
  58. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/generators/experiment_generator.py +0 -0
  59. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/generators/prompt_template/prompt_templates.py +0 -0
  60. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/generators/topic_planner.py +0 -0
  61. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/mappers/__init__.py +0 -0
  62. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/mappers/session_mapper.py +0 -0
  63. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/mappers/strands_in_memory_session_mapper.py +0 -0
  64. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/simulation/README.md +0 -0
  65. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/simulation/__init__.py +0 -0
  66. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/simulation/actor_simulator.py +0 -0
  67. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/simulation/profiles/__init__.py +0 -0
  68. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/simulation/profiles/actor_profile.py +0 -0
  69. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/simulation/prompt_templates/__init__.py +0 -0
  70. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/simulation/prompt_templates/actor_profile_extraction.py +0 -0
  71. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/simulation/prompt_templates/actor_system_prompt.py +0 -0
  72. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/simulation/prompt_templates/goal_completion.py +0 -0
  73. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/simulation/tools/__init__.py +0 -0
  74. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/simulation/tools/goal_completion.py +0 -0
  75. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/telemetry/__init__.py +0 -0
  76. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/telemetry/_cloudwatch_logger.py +0 -0
  77. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/telemetry/config.py +0 -0
  78. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/telemetry/tracer.py +0 -0
  79. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/tools/evaluation_tools.py +0 -0
  80. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/types/__init__.py +0 -0
  81. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/types/evaluation.py +0 -0
  82. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/types/evaluation_report.py +0 -0
  83. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/types/simulation/__init__.py +0 -0
  84. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/types/simulation/actor.py +0 -0
  85. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/src/strands_evals/types/trace.py +0 -0
  86. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/__init__.py +0 -0
  87. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/evaluators/test_evaluator.py +0 -0
  88. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/evaluators/test_faithfulness_evaluator.py +0 -0
  89. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py +0 -0
  90. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/evaluators/test_harmfulness_evaluator.py +0 -0
  91. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/evaluators/test_helpfulness_evaluator.py +0 -0
  92. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/evaluators/test_interactions_evaluator.py +0 -0
  93. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/evaluators/test_output_evaluator.py +0 -0
  94. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/evaluators/test_tool_parameter_accuracy_evaluator.py +0 -0
  95. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/evaluators/test_tool_selection_accuracy_evaluator.py +0 -0
  96. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/evaluators/test_trajectory_evaluator.py +0 -0
  97. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/extractors/test_graph_extractor.py +0 -0
  98. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/extractors/test_swarm_extractor.py +0 -0
  99. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/extractors/test_trace_extractor.py +0 -0
  100. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/generators/test_experiment_generator.py +0 -0
  101. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/generators/test_topic_planner.py +0 -0
  102. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/mappers/__init__.py +0 -0
  103. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/mappers/test_strands_in_memory_mapper.py +0 -0
  104. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/simulation/__init__.py +0 -0
  105. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/simulation/test_actor_simulator.py +0 -0
  106. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/simulation/test_goal_completion.py +0 -0
  107. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/telemetry/test_config.py +0 -0
  108. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/telemetry/test_tracer.py +0 -0
  109. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/test_cases.py +0 -0
  110. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/test_experiment.py +0 -0
  111. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/tools/test_evaluation_tools.py +0 -0
  112. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests/strands_evals/types/test_trace.py +0 -0
  113. {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.1}/tests_integ/test_output_evaluator.py +0 -0
--- a/.github/workflows/pypi-publish-on-release.yml
+++ b/.github/workflows/pypi-publish-on-release.yml
@@ -52,7 +52,7 @@ jobs:
           hatch build
 
       - name: Store the distribution packages
-        uses: actions/upload-artifact@v5
+        uses: actions/upload-artifact@v6
         with:
           name: python-package-distributions
           path: dist/
@@ -74,7 +74,7 @@ jobs:
 
     steps:
       - name: Download all the dists
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v7
        with:
           name: python-package-distributions
           path: dist/
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: strands-agents-evals
-Version: 0.1.0
+Version: 0.1.1
 Summary: Evaluation framework for Strands
 Author-email: AWS <opensource@amazon.com>
 License: Apache-2.0
@@ -68,6 +68,7 @@ Strands Evaluation is a powerful framework for evaluating AI agents and LLM appl
 ## Feature Overview
 
 - **Multiple Evaluation Types**: Output evaluation, trajectory analysis, tool usage assessment, and interaction evaluation
+- **Dynamic Simulators**: Multi-turn conversation simulation with realistic user behavior and goal-oriented interactions
 - **LLM-as-a-Judge**: Built-in evaluators using language models for sophisticated assessment with structured scoring
 - **Trace-based Evaluation**: Analyze agent behavior through OpenTelemetry execution traces
 - **Automated Experiment Generation**: Generate comprehensive test suites from context descriptions
@@ -226,6 +227,73 @@ reports = experiment.run_evaluations(user_task_function)
 reports[0].run_display()
 ```
 
+### Multi-turn Conversation Simulation
+
+Simulate realistic user interactions with dynamic, goal-oriented conversations using ActorSimulator:
+
+```python
+from strands import Agent
+from strands_evals import Case, Experiment, ActorSimulator
+from strands_evals.evaluators import HelpfulnessEvaluator, GoalSuccessRateEvaluator
+from strands_evals.mappers import StrandsInMemorySessionMapper
+from strands_evals.telemetry import StrandsEvalsTelemetry
+
+# Setup telemetry
+telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
+memory_exporter = telemetry.in_memory_exporter
+
+def task_function(case: Case) -> dict:
+    # Create simulator to drive conversation
+    simulator = ActorSimulator.from_case_for_user_simulator(
+        case=case,
+        max_turns=10
+    )
+
+    # Create agent to evaluate
+    agent = Agent(
+        trace_attributes={
+            "gen_ai.conversation.id": case.session_id,
+            "session.id": case.session_id
+        },
+        callback_handler=None
+    )
+
+    # Run multi-turn conversation
+    all_spans = []
+    user_message = case.input
+
+    while simulator.has_next():
+        memory_exporter.clear()
+        agent_response = agent(user_message)
+        turn_spans = list(memory_exporter.get_finished_spans())
+        all_spans.extend(turn_spans)
+
+        user_result = simulator.act(str(agent_response))
+        user_message = str(user_result.structured_output.message)
+
+    # Map to session for evaluation
+    mapper = StrandsInMemorySessionMapper()
+    session = mapper.map_to_session(all_spans, session_id=case.session_id)
+
+    return {"output": str(agent_response), "trajectory": session}
+
+# Use evaluators to assess simulated conversations
+evaluators = [
+    HelpfulnessEvaluator(),
+    GoalSuccessRateEvaluator()
+]
+
+experiment = Experiment(cases=test_cases, evaluators=evaluators)
+reports = experiment.run_evaluations(task_function)
+```
+
+**Key Benefits:**
+- **Dynamic Interactions**: Simulator adapts responses based on agent behavior
+- **Goal-Oriented Testing**: Verify agents can complete user objectives through dialogue
+- **Realistic Conversations**: Generate authentic multi-turn interaction patterns
+- **No Predefined Scripts**: Test agents without hardcoded conversation paths
+- **Comprehensive Evaluation**: Combine with trace-based evaluators for full assessment
+
 ### Automated Experiment Generation
 
 Generate comprehensive test suites automatically from context descriptions:
@@ -388,8 +456,9 @@ reports[0].run_display() # Interactive display with metrics breakdown
 
 For detailed guidance & examples, explore our documentation:
 
-- [User Guide](https://strandsagents.com/latest//user-guide/evals-sdk/quickstart.md)
-- [Evaluator Reference](https://strandsagents.com/latest/user-guide/evals-sdk/evaluators/)
+- [User Guide](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/quickstart/)
+- [Evaluator Reference](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/evaluators/)
+- [Simulators Guide](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/simulators/)
 
 ## Contributing ❤️
 
--- a/README.md
+++ b/README.md
@@ -36,6 +36,7 @@ Strands Evaluation is a powerful framework for evaluating AI agents and LLM appl
 ## Feature Overview
 
 - **Multiple Evaluation Types**: Output evaluation, trajectory analysis, tool usage assessment, and interaction evaluation
+- **Dynamic Simulators**: Multi-turn conversation simulation with realistic user behavior and goal-oriented interactions
 - **LLM-as-a-Judge**: Built-in evaluators using language models for sophisticated assessment with structured scoring
 - **Trace-based Evaluation**: Analyze agent behavior through OpenTelemetry execution traces
 - **Automated Experiment Generation**: Generate comprehensive test suites from context descriptions
@@ -194,6 +195,73 @@ reports = experiment.run_evaluations(user_task_function)
 reports[0].run_display()
 ```
 
+### Multi-turn Conversation Simulation
+
+Simulate realistic user interactions with dynamic, goal-oriented conversations using ActorSimulator:
+
+```python
+from strands import Agent
+from strands_evals import Case, Experiment, ActorSimulator
+from strands_evals.evaluators import HelpfulnessEvaluator, GoalSuccessRateEvaluator
+from strands_evals.mappers import StrandsInMemorySessionMapper
+from strands_evals.telemetry import StrandsEvalsTelemetry
+
+# Setup telemetry
+telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
+memory_exporter = telemetry.in_memory_exporter
+
+def task_function(case: Case) -> dict:
+    # Create simulator to drive conversation
+    simulator = ActorSimulator.from_case_for_user_simulator(
+        case=case,
+        max_turns=10
+    )
+
+    # Create agent to evaluate
+    agent = Agent(
+        trace_attributes={
+            "gen_ai.conversation.id": case.session_id,
+            "session.id": case.session_id
+        },
+        callback_handler=None
+    )
+
+    # Run multi-turn conversation
+    all_spans = []
+    user_message = case.input
+
+    while simulator.has_next():
+        memory_exporter.clear()
+        agent_response = agent(user_message)
+        turn_spans = list(memory_exporter.get_finished_spans())
+        all_spans.extend(turn_spans)
+
+        user_result = simulator.act(str(agent_response))
+        user_message = str(user_result.structured_output.message)
+
+    # Map to session for evaluation
+    mapper = StrandsInMemorySessionMapper()
+    session = mapper.map_to_session(all_spans, session_id=case.session_id)
+
+    return {"output": str(agent_response), "trajectory": session}
+
+# Use evaluators to assess simulated conversations
+evaluators = [
+    HelpfulnessEvaluator(),
+    GoalSuccessRateEvaluator()
+]
+
+experiment = Experiment(cases=test_cases, evaluators=evaluators)
+reports = experiment.run_evaluations(task_function)
+```
+
+**Key Benefits:**
+- **Dynamic Interactions**: Simulator adapts responses based on agent behavior
+- **Goal-Oriented Testing**: Verify agents can complete user objectives through dialogue
+- **Realistic Conversations**: Generate authentic multi-turn interaction patterns
+- **No Predefined Scripts**: Test agents without hardcoded conversation paths
+- **Comprehensive Evaluation**: Combine with trace-based evaluators for full assessment
+
 ### Automated Experiment Generation
 
 Generate comprehensive test suites automatically from context descriptions:
@@ -356,8 +424,9 @@ reports[0].run_display() # Interactive display with metrics breakdown
 
 For detailed guidance & examples, explore our documentation:
 
-- [User Guide](https://strandsagents.com/latest//user-guide/evals-sdk/quickstart.md)
-- [Evaluator Reference](https://strandsagents.com/latest/user-guide/evals-sdk/evaluators/)
+- [User Guide](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/quickstart/)
+- [Evaluator Reference](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/evaluators/)
+- [Simulators Guide](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/simulators/)
 
 ## Contributing ❤️
 
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -138,7 +138,8 @@ disable_error_code = [
 disallow_untyped_decorators = false
 
 [tool.hatch.version]
-path = "src/strands_evals/__init__.py"
+source = "vcs" # Use git tags for versioning
+
 [tool.pytest.ini_options]
 asyncio_mode = "auto"
 testpaths = ["tests"]
--- a/src/strands_evals/__init__.py
+++ b/src/strands_evals/__init__.py
@@ -1,5 +1,3 @@
-__version__ = "0.1.0"
-
 from . import evaluators, extractors, generators, simulation, telemetry, types
 from .case import Case
 from .experiment import Experiment
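
Note on the pair of versioning changes above: with `source = "vcs"` the version is derived from git tags at build time (hatch's vcs source, typically provided by the hatch-vcs plugin), so the hardcoded `__version__` in `__init__.py` becomes redundant and is dropped. Code that still needs the version at runtime can read it from the installed package metadata. A minimal standard-library sketch, assuming the PyPI distribution name from the PKG-INFO above:

```python
# Sketch: recover the installed version now that strands_evals
# no longer defines __version__ itself.
from importlib.metadata import PackageNotFoundError, version

try:
    # Use the distribution name on PyPI, not the import name.
    pkg_version = version("strands-agents-evals")
except PackageNotFoundError:
    # Not installed (e.g., running from a bare source checkout).
    pkg_version = "0.0.0+unknown"

print(pkg_version)
```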
--- a/src/strands_evals/experiment.py
+++ b/src/strands_evals/experiment.py
@@ -577,8 +577,8 @@ class Experiment(Generic[InputT, OutputT]):
 
         file_path.parent.mkdir(parents=True, exist_ok=True)
 
-        with open(file_path, "w") as f:
-            json.dump(self.to_dict(), f, indent=2)
+        with open(file_path, "w", encoding="utf-8") as f:
+            json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)
 
     @classmethod
     def from_dict(cls, data: dict, custom_evaluators: list[type[Evaluator]] | None = None):
@@ -646,7 +646,7 @@ class Experiment(Generic[InputT, OutputT]):
                 f"Only .json format is supported. Got file: {path}. Please provide a path with .json extension."
             )
 
-        with open(file_path, "r") as f:
+        with open(file_path, "r", encoding="utf-8") as f:
             data = json.load(f)
 
         return cls.from_dict(data, custom_evaluators)
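
The `experiment.py` change is about round-tripping non-ASCII content: without `encoding="utf-8"`, `open()` falls back to the platform's locale encoding (e.g. cp1252 on Windows), and without `ensure_ascii=False`, `json.dump` escapes every non-ASCII character as `\uXXXX`. A self-contained sketch of the difference:

```python
import json
import tempfile
from pathlib import Path

data = {"output": "café ✓"}

# Default: non-ASCII characters are escaped to \uXXXX sequences.
print(json.dumps(data))                      # {"output": "caf\u00e9 \u2713"}
# With ensure_ascii=False the serialized text stays human-readable.
print(json.dumps(data, ensure_ascii=False))  # {"output": "café ✓"}

# Writing readable UTF-8 also requires pinning the file encoding,
# since open() otherwise uses the platform's locale encoding.
path = Path(tempfile.gettempdir()) / "experiment.json"
with open(path, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)
with open(path, "r", encoding="utf-8") as f:
    assert json.load(f) == data  # lossless round trip
```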
--- a/src/strands_evals/extractors/tools_use_extractor.py
+++ b/src/strands_evals/extractors/tools_use_extractor.py
@@ -33,6 +33,7 @@ def extract_agent_tools_used_from_messages(agent_messages):
                 tool_id = tool.get("toolUseId")
                 # get the tool result from the next message
                 tool_result = None
+                is_error = False
                 next_message_i = i + 1
                 while next_message_i < len(agent_messages):
                     next_message = agent_messages[next_message_i]
@@ -46,9 +47,12 @@
                         tool_result_content = tool_result_dict.get("content", [])
                         if len(tool_result_content) > 0:
                             tool_result = tool_result_content[0].get("text")
+                            is_error = tool_result_dict.get("status") == "error"
                         break
 
-                tools_used.append({"name": tool_name, "input": tool_input, "tool_result": tool_result})
+                tools_used.append(
+                    {"name": tool_name, "input": tool_input, "tool_result": tool_result, "is_error": is_error}
+                )
     return tools_used
 
 
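With this change, each extracted tool-use entry carries an `is_error` flag taken from the `toolResult` status, so downstream evaluators can separate failed tool calls from successful ones. A short usage sketch; the message fixture below is hypothetical, in the same Bedrock-style format the package's tests use:

```python
from strands_evals.extractors.tools_use_extractor import extract_agent_tools_used_from_messages

# Hypothetical conversation containing one failing tool call.
messages = [
    {"role": "user", "content": [{"text": "Divide 1 by 0"}]},
    {
        "role": "assistant",
        "content": [{"toolUse": {"toolUseId": "t1", "name": "calculator", "input": {"expression": "1/0"}}}],
    },
    {
        "role": "user",
        "content": [{"toolResult": {"status": "error", "content": [{"text": "division by zero"}], "toolUseId": "t1"}}],
    },
]

tools_used = extract_agent_tools_used_from_messages(messages)

# Partition calls by outcome using the new flag.
# Each entry has keys: name, input, tool_result, is_error.
failed = [t for t in tools_used if t["is_error"]]
succeeded = [t for t in tools_used if not t["is_error"]]
print(f"{len(failed)} failed, {len(succeeded)} succeeded")
```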
--- a/tests/strands_evals/extractors/test_tools_use_extractor.py
+++ b/tests/strands_evals/extractors/test_tools_use_extractor.py
@@ -45,6 +45,7 @@ def test_tools_use_extractor_extract_from_messages_with_tools():
     assert result[0]["name"] == "calculator"
     assert result[0]["input"] == {"expression": "2+2"}
     assert result[0]["tool_result"] == "Result: 4"
+    assert result[0]["is_error"] is False
 
 
 def test_tools_use_extractor_extract_from_messages_no_tools():
@@ -59,6 +60,38 @@ def test_tools_use_extractor_extract_from_messages_no_tools():
     assert result == []
 
 
+def test_tools_use_extractor_extract_from_messages_with_error():
+    """Test extracting tool usage from messages with error status"""
+    messages = [
+        {"role": "user", "content": [{"text": "Calculate invalid"}]},
+        {
+            "role": "assistant",
+            "content": [
+                {"toolUse": {"toolUseId": "tool_123", "name": "calculator", "input": {"expression": "invalid"}}},
+            ],
+        },
+        {
+            "role": "user",
+            "content": [
+                {
+                    "toolResult": {
+                        "status": "error",
+                        "content": [{"text": "Invalid expression"}],
+                        "toolUseId": "tool_123",
+                    }
+                }
+            ],
+        },
+    ]
+
+    result = extract_agent_tools_used_from_messages(messages)
+
+    assert len(result) == 1
+    assert result[0]["name"] == "calculator"
+    assert result[0]["tool_result"] == "Invalid expression"
+    assert result[0]["is_error"] is True
+
+
 def test_tools_use_extractor_extract_from_messages_empty():
     """Test extracting tool usage from empty messages"""
     result = extract_agent_tools_used_from_messages([])
@@ -96,6 +129,7 @@ def test_tools_use_extractor_extract_from_messages_no_tool_result():
     assert result[0]["name"] == "calculator"
     assert result[0]["input"] == {"expression": "2+2"}
     assert result[0]["tool_result"] is None
+    assert result[0]["is_error"] is False
 
 
 def test_tools_use_extractor_extract_from_messages_malformed_tool_result():
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -348,3 +348,45 @@ async def test_async_dataset_with_interactions(interaction_case):
     assert len(report.cases) == 1
     assert report.cases[0].get("actual_interactions") is not None
     assert len(report.cases[0].get("actual_interactions")) == 2
+
+
+def test_integration_tool_error_extraction():
+    """Test that is_error field is correctly extracted from tool execution"""
+    from strands_evals.extractors.tools_use_extractor import extract_agent_tools_used_from_messages
+
+    # Create mock messages simulating tool success and error
+    messages = [
+        {"role": "user", "content": [{"text": "test"}]},
+        {
+            "role": "assistant",
+            "content": [
+                {"toolUse": {"toolUseId": "tool1", "name": "success_tool", "input": {}}},
+            ],
+        },
+        {
+            "role": "user",
+            "content": [
+                {"toolResult": {"status": "success", "content": [{"text": "ok"}], "toolUseId": "tool1"}},
+            ],
+        },
+        {
+            "role": "assistant",
+            "content": [
+                {"toolUse": {"toolUseId": "tool2", "name": "error_tool", "input": {}}},
+            ],
+        },
+        {
+            "role": "user",
+            "content": [
+                {"toolResult": {"status": "error", "content": [{"text": "failed"}], "toolUseId": "tool2"}},
+            ],
+        },
+    ]
+
+    tools_used = extract_agent_tools_used_from_messages(messages)
+
+    assert len(tools_used) == 2
+    assert tools_used[0]["name"] == "success_tool"
+    assert tools_used[0]["is_error"] is False
+    assert tools_used[1]["name"] == "error_tool"
+    assert tools_used[1]["is_error"] is True