strands-agents-evals 0.1.1__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/PKG-INFO +1 -1
  2. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/experiment.py +18 -11
  3. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/extractors/tools_use_extractor.py +30 -28
  4. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/extractors/test_tools_use_extractor.py +99 -0
  5. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/test_experiment.py +36 -0
  6. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  7. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  8. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  9. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  10. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/.github/dependabot.yml +0 -0
  11. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/.github/workflows/integration-test.yml +0 -0
  12. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/.github/workflows/pr-and-push.yml +0 -0
  13. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/.github/workflows/pypi-publish-on-release.yml +0 -0
  14. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/.github/workflows/test-lint.yml +0 -0
  15. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/.gitignore +0 -0
  16. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/.pre-commit-config.yaml +0 -0
  17. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/CODE_OF_CONDUCT.md +0 -0
  18. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/CONTRIBUTING.md +0 -0
  19. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/LICENSE +0 -0
  20. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/NOTICE +0 -0
  21. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/README.md +0 -0
  22. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/STYLE_GUIDE.md +0 -0
  23. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/pyproject.toml +0 -0
  24. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/__init__.py +0 -0
  25. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/__init__.py +0 -0
  26. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/case.py +0 -0
  27. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/display/display_console.py +0 -0
  28. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/__init__.py +0 -0
  29. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/evaluator.py +0 -0
  30. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/faithfulness_evaluator.py +0 -0
  31. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/goal_success_rate_evaluator.py +0 -0
  32. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/harmfulness_evaluator.py +0 -0
  33. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/helpfulness_evaluator.py +0 -0
  34. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/interactions_evaluator.py +0 -0
  35. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/output_evaluator.py +0 -0
  36. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/prompt_templates/case_prompt_template.py +0 -0
  37. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +0 -0
  38. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +0 -0
  39. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +0 -0
  40. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +0 -0
  41. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +0 -0
  42. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +0 -0
  43. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +0 -0
  44. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +0 -0
  45. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/prompt_templates/prompt_templates.py +0 -0
  46. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +0 -0
  47. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +0 -0
  48. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +0 -0
  49. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +0 -0
  50. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +0 -0
  51. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/tool_selection_accuracy_evaluator.py +0 -0
  52. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/evaluators/trajectory_evaluator.py +0 -0
  53. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/extractors/__init__.py +0 -0
  54. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/extractors/graph_extractor.py +0 -0
  55. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/extractors/swarm_extractor.py +0 -0
  56. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/extractors/trace_extractor.py +0 -0
  57. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/generators/__init__.py +0 -0
  58. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/generators/experiment_generator.py +0 -0
  59. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/generators/prompt_template/prompt_templates.py +0 -0
  60. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/generators/topic_planner.py +0 -0
  61. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/mappers/__init__.py +0 -0
  62. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/mappers/session_mapper.py +0 -0
  63. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/mappers/strands_in_memory_session_mapper.py +0 -0
  64. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/simulation/README.md +0 -0
  65. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/simulation/__init__.py +0 -0
  66. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/simulation/actor_simulator.py +0 -0
  67. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/simulation/profiles/__init__.py +0 -0
  68. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/simulation/profiles/actor_profile.py +0 -0
  69. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/simulation/prompt_templates/__init__.py +0 -0
  70. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/simulation/prompt_templates/actor_profile_extraction.py +0 -0
  71. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/simulation/prompt_templates/actor_system_prompt.py +0 -0
  72. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/simulation/prompt_templates/goal_completion.py +0 -0
  73. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/simulation/tools/__init__.py +0 -0
  74. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/simulation/tools/goal_completion.py +0 -0
  75. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/telemetry/__init__.py +0 -0
  76. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/telemetry/_cloudwatch_logger.py +0 -0
  77. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/telemetry/config.py +0 -0
  78. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/telemetry/tracer.py +0 -0
  79. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/tools/evaluation_tools.py +0 -0
  80. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/types/__init__.py +0 -0
  81. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/types/evaluation.py +0 -0
  82. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/types/evaluation_report.py +0 -0
  83. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/types/simulation/__init__.py +0 -0
  84. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/types/simulation/actor.py +0 -0
  85. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/src/strands_evals/types/trace.py +0 -0
  86. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/__init__.py +0 -0
  87. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/evaluators/test_evaluator.py +0 -0
  88. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/evaluators/test_faithfulness_evaluator.py +0 -0
  89. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py +0 -0
  90. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/evaluators/test_harmfulness_evaluator.py +0 -0
  91. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/evaluators/test_helpfulness_evaluator.py +0 -0
  92. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/evaluators/test_interactions_evaluator.py +0 -0
  93. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/evaluators/test_output_evaluator.py +0 -0
  94. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/evaluators/test_tool_parameter_accuracy_evaluator.py +0 -0
  95. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/evaluators/test_tool_selection_accuracy_evaluator.py +0 -0
  96. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/evaluators/test_trajectory_evaluator.py +0 -0
  97. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/extractors/test_graph_extractor.py +0 -0
  98. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/extractors/test_swarm_extractor.py +0 -0
  99. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/extractors/test_trace_extractor.py +0 -0
  100. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/generators/test_experiment_generator.py +0 -0
  101. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/generators/test_topic_planner.py +0 -0
  102. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/mappers/__init__.py +0 -0
  103. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/mappers/test_strands_in_memory_mapper.py +0 -0
  104. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/simulation/__init__.py +0 -0
  105. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/simulation/test_actor_simulator.py +0 -0
  106. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/simulation/test_goal_completion.py +0 -0
  107. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/telemetry/test_config.py +0 -0
  108. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/telemetry/test_tracer.py +0 -0
  109. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/test_cases.py +0 -0
  110. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/tools/test_evaluation_tools.py +0 -0
  111. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/strands_evals/types/test_trace.py +0 -0
  112. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests/test_integration.py +0 -0
  113. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.3}/tests_integ/test_output_evaluator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: strands-agents-evals
3
- Version: 0.1.1
3
+ Version: 0.1.3
4
4
  Summary: Evaluation framework for Strands
5
5
  Author-email: AWS <opensource@amazon.com>
6
6
  License: Apache-2.0
@@ -391,8 +391,8 @@ class Experiment(Generic[InputT, OutputT]):
391
391
  "gen_ai.evaluation.case.input": serialize(case.input),
392
392
  },
393
393
  ) as case_span:
394
+ # Task execution span - execute once
394
395
  try:
395
- # Task execution span - execute once
396
396
  with self._tracer.start_as_current_span(
397
397
  "task_execution",
398
398
  attributes={
@@ -414,9 +414,21 @@ class Experiment(Generic[InputT, OutputT]):
414
414
  ),
415
415
  }
416
416
  )
417
-
418
- # Evaluate with each evaluator using the same task output
417
+ except Exception as e:
418
+ case_span.record_exception(e)
419
419
  for evaluator in self._evaluators:
420
+ eval_name = evaluator.get_type_name()
421
+ evaluator_data[eval_name]["cases"].append(case.model_dump())
422
+ evaluator_data[eval_name]["test_passes"].append(False)
423
+ evaluator_data[eval_name]["scores"].append(0)
424
+ evaluator_data[eval_name]["reasons"].append(f"Task execution error: {str(e)}")
425
+ evaluator_data[eval_name]["detailed_results"].append([])
426
+ continue
427
+
428
+ # Evaluate with each evaluator using the same task output
429
+ for evaluator in self._evaluators:
430
+ eval_name = evaluator.get_type_name()
431
+ try:
420
432
  with self._tracer.start_as_current_span(
421
433
  f"evaluator {evaluator.get_type_name()}",
422
434
  attributes={
@@ -436,21 +448,16 @@ class Experiment(Generic[InputT, OutputT]):
436
448
  }
437
449
  )
438
450
 
439
- eval_name = evaluator.get_type_name()
440
451
  evaluator_data[eval_name]["cases"].append(evaluation_context.model_dump())
441
452
  evaluator_data[eval_name]["test_passes"].append(aggregate_pass)
442
453
  evaluator_data[eval_name]["scores"].append(aggregate_score)
443
454
  evaluator_data[eval_name]["reasons"].append(aggregate_reason or "")
444
455
  evaluator_data[eval_name]["detailed_results"].append(evaluation_outputs)
445
-
446
- except Exception as e:
447
- case_span.record_exception(e)
448
- for evaluator in self._evaluators:
449
- eval_name = evaluator.get_type_name()
450
- evaluator_data[eval_name]["cases"].append(case.model_dump())
456
+ except Exception as e:
457
+ evaluator_data[eval_name]["cases"].append(evaluation_context.model_dump())
451
458
  evaluator_data[eval_name]["test_passes"].append(False)
452
459
  evaluator_data[eval_name]["scores"].append(0)
453
- evaluator_data[eval_name]["reasons"].append(f"An error occured : {str(e)}")
460
+ evaluator_data[eval_name]["reasons"].append(f"Evaluator error: {str(e)}")
454
461
  evaluator_data[eval_name]["detailed_results"].append([])
455
462
 
456
463
  reports = []
@@ -22,37 +22,39 @@ def extract_agent_tools_used_from_messages(agent_messages):
22
22
  if message.get("role") == "assistant":
23
23
  message_info = message.get("content")
24
24
  if len(message_info) > 0:
25
- tool = None
25
+ tools = []
26
26
  for message in message_info:
27
27
  if "toolUse" in message:
28
+ tools.append(message.get("toolUse"))
29
+
30
+ for tool in tools:
31
+ if tool:
32
+ tool_name = tool.get("name")
33
+ tool_input = tool.get("input")
34
+ tool_id = tool.get("toolUseId")
35
+ # get the tool result from the next message
36
+ tool_result = None
37
+ is_error = False
38
+ next_message_i = i + 1
39
+ while next_message_i < len(agent_messages):
40
+ next_message = agent_messages[next_message_i]
41
+ next_message_i += 1
42
+
43
+ if next_message.get("role") == "user":
44
+ content = next_message.get("content")
45
+ if content:
46
+ tool_result_dict = content[0].get("toolResult")
47
+ if tool_result_dict and tool_result_dict.get("toolUseId") == tool_id:
48
+ tool_result_content = tool_result_dict.get("content", [])
49
+ if len(tool_result_content) > 0:
50
+ tool_result = tool_result_content[0].get("text")
51
+ is_error = tool_result_dict.get("status") == "error"
52
+ break
53
+
54
+ tools_used.append(
55
+ {"name": tool_name, "input": tool_input, "tool_result": tool_result, "is_error": is_error}
56
+ )
28
57
  tool = message.get("toolUse")
29
-
30
- if tool:
31
- tool_name = tool.get("name")
32
- tool_input = tool.get("input")
33
- tool_id = tool.get("toolUseId")
34
- # get the tool result from the next message
35
- tool_result = None
36
- is_error = False
37
- next_message_i = i + 1
38
- while next_message_i < len(agent_messages):
39
- next_message = agent_messages[next_message_i]
40
- next_message_i += 1
41
-
42
- if next_message.get("role") == "user":
43
- content = next_message.get("content")
44
- if content:
45
- tool_result_dict = content[0].get("toolResult")
46
- if tool_result_dict.get("toolUseId") == tool_id:
47
- tool_result_content = tool_result_dict.get("content", [])
48
- if len(tool_result_content) > 0:
49
- tool_result = tool_result_content[0].get("text")
50
- is_error = tool_result_dict.get("status") == "error"
51
- break
52
-
53
- tools_used.append(
54
- {"name": tool_name, "input": tool_input, "tool_result": tool_result, "is_error": is_error}
55
- )
56
58
  return tools_used
57
59
 
58
60
 
@@ -48,6 +48,70 @@ def test_tools_use_extractor_extract_from_messages_with_tools():
48
48
  assert result[0]["is_error"] is False
49
49
 
50
50
 
51
+ def test_tools_use_extractor_extract_from_messages_with_multiple_tools():
52
+ """Test extracting multiple tool usages from messages"""
53
+ messages = [
54
+ {"role": "user", "content": [{"text": "Calculate 2+2 and search for weather"}]},
55
+ {
56
+ "role": "assistant",
57
+ "content": [
58
+ {"text": "I'll calculate and search for you."},
59
+ {
60
+ "toolUse": {
61
+ "toolUseId": "tool1",
62
+ "name": "calculator",
63
+ "input": {"expression": "2+2"},
64
+ }
65
+ },
66
+ {
67
+ "toolUse": {
68
+ "toolUseId": "tool2",
69
+ "name": "web_search",
70
+ "input": {"query": "current weather"},
71
+ }
72
+ },
73
+ ],
74
+ },
75
+ {
76
+ "role": "user",
77
+ "content": [
78
+ {
79
+ "toolResult": {
80
+ "status": "success",
81
+ "content": [{"text": "Result: 4"}],
82
+ "toolUseId": "tool1",
83
+ }
84
+ }
85
+ ],
86
+ },
87
+ {
88
+ "role": "user",
89
+ "content": [
90
+ {
91
+ "toolResult": {
92
+ "status": "success",
93
+ "content": [{"text": "Sunny, 25°C"}],
94
+ "toolUseId": "tool2",
95
+ }
96
+ }
97
+ ],
98
+ },
99
+ {"role": "assistant", "content": [{"text": "Results: 4 and sunny weather."}]},
100
+ ]
101
+
102
+ result = extract_agent_tools_used_from_messages(messages)
103
+
104
+ assert len(result) == 2
105
+ assert result[0]["name"] == "calculator"
106
+ assert result[0]["input"] == {"expression": "2+2"}
107
+ assert result[0]["tool_result"] == "Result: 4"
108
+ assert result[0]["is_error"] is False
109
+ assert result[1]["name"] == "web_search"
110
+ assert result[1]["input"] == {"query": "current weather"}
111
+ assert result[1]["tool_result"] == "Sunny, 25°C"
112
+ assert result[1]["is_error"] is False
113
+
114
+
51
115
  def test_tools_use_extractor_extract_from_messages_no_tools():
52
116
  """Test extracting tool usage from messages without tool usage"""
53
117
  messages = [
@@ -243,3 +307,38 @@ def test_tools_use_extractor_extract_tools_description_empty():
243
307
  result = extract_tools_description(mock_agent, is_short=True)
244
308
 
245
309
  assert result == {}
310
+
311
+
312
+ def test_tools_use_extractor_extract_from_messages_user_message_without_tool_result():
313
+ """Test extracting tool usage when user message content lacks toolResult key."""
314
+ messages = [
315
+ {
316
+ "role": "assistant",
317
+ "content": [
318
+ {"toolUse": {"toolUseId": "tool_abc", "name": "calculator", "input": {"expression": "5+5"}}},
319
+ ],
320
+ },
321
+ {
322
+ "role": "user",
323
+ "content": [{"text": "Some user text without toolResult"}], # No toolResult key
324
+ },
325
+ {
326
+ "role": "user",
327
+ "content": [
328
+ {
329
+ "toolResult": {
330
+ "status": "success",
331
+ "content": [{"text": "Result: 10"}],
332
+ "toolUseId": "tool_abc",
333
+ }
334
+ }
335
+ ],
336
+ },
337
+ ]
338
+ result = extract_agent_tools_used_from_messages(messages)
339
+
340
+ assert len(result) == 1
341
+ assert result[0]["name"] == "calculator"
342
+ assert result[0]["input"] == {"expression": "5+5"}
343
+ assert result[0]["tool_result"] == "Result: 10"
344
+ assert result[0]["is_error"] is False
@@ -34,6 +34,16 @@ class MockEvaluator2(Evaluator[str, str]):
34
34
  return [EvaluationOutput(score=0.5, test_pass=True, reason="Async test evaluation 2")]
35
35
 
36
36
 
37
+ class ThrowingEvaluator(Evaluator[str, str]):
38
+ """Evaluator that always throws an exception - used to test error isolation"""
39
+
40
+ def evaluate(self, evaluation_case: EvaluationData[str, str]) -> list[EvaluationOutput]:
41
+ raise RuntimeError("Evaluator exploded")
42
+
43
+ async def evaluate_async(self, evaluation_case: EvaluationData[str, str]) -> list[EvaluationOutput]:
44
+ raise RuntimeError("Async evaluator exploded")
45
+
46
+
37
47
  @pytest.fixture
38
48
  def mock_evaluator():
39
49
  return MockEvaluator()
@@ -1052,3 +1062,29 @@ def test_experiment_run_evaluations_multiple_cases(mock_span, simple_task):
1052
1062
  assert len(reports) == 1
1053
1063
  assert len(reports[0].scores) == 2
1054
1064
  assert all(score == 1.0 for score in reports[0].scores)
1065
+
1066
+
1067
+ def test_experiment_run_evaluations_evaluator_error_isolated():
1068
+ """Test that one evaluator failing doesn't affect other evaluators."""
1069
+ case = Case(name="test", input="hello", expected_output="hello")
1070
+
1071
+ # MockEvaluator succeeds, ThrowingEvaluator fails
1072
+ experiment = Experiment(cases=[case], evaluators=[MockEvaluator(), ThrowingEvaluator()])
1073
+
1074
+ def echo_task(c):
1075
+ return c.input
1076
+
1077
+ reports = experiment.run_evaluations(echo_task)
1078
+
1079
+ assert len(reports) == 2
1080
+
1081
+ # First evaluator (MockEvaluator) should succeed
1082
+ assert reports[0].scores[0] == 1.0
1083
+ assert reports[0].test_passes[0] is True
1084
+ assert reports[0].reasons[0] == "Mock evaluation"
1085
+
1086
+ # Second evaluator (ThrowingEvaluator) should fail with error message
1087
+ assert reports[1].scores[0] == 0
1088
+ assert reports[1].test_passes[0] is False
1089
+ assert "Evaluator error" in reports[1].reasons[0]
1090
+ assert "Evaluator exploded" in reports[1].reasons[0]