strands-agents-evals 0.1.1__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/PKG-INFO +1 -1
  2. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/experiment.py +18 -11
  3. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/extractors/tools_use_extractor.py +1 -1
  4. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/extractors/test_tools_use_extractor.py +35 -0
  5. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/test_experiment.py +36 -0
  6. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  7. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  8. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  9. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  10. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/.github/dependabot.yml +0 -0
  11. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/.github/workflows/integration-test.yml +0 -0
  12. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/.github/workflows/pr-and-push.yml +0 -0
  13. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/.github/workflows/pypi-publish-on-release.yml +0 -0
  14. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/.github/workflows/test-lint.yml +0 -0
  15. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/.gitignore +0 -0
  16. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/.pre-commit-config.yaml +0 -0
  17. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/CODE_OF_CONDUCT.md +0 -0
  18. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/CONTRIBUTING.md +0 -0
  19. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/LICENSE +0 -0
  20. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/NOTICE +0 -0
  21. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/README.md +0 -0
  22. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/STYLE_GUIDE.md +0 -0
  23. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/pyproject.toml +0 -0
  24. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/__init__.py +0 -0
  25. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/__init__.py +0 -0
  26. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/case.py +0 -0
  27. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/display/display_console.py +0 -0
  28. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/__init__.py +0 -0
  29. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/evaluator.py +0 -0
  30. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/faithfulness_evaluator.py +0 -0
  31. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/goal_success_rate_evaluator.py +0 -0
  32. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/harmfulness_evaluator.py +0 -0
  33. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/helpfulness_evaluator.py +0 -0
  34. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/interactions_evaluator.py +0 -0
  35. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/output_evaluator.py +0 -0
  36. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/case_prompt_template.py +0 -0
  37. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +0 -0
  38. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +0 -0
  39. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +0 -0
  40. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +0 -0
  41. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +0 -0
  42. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +0 -0
  43. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +0 -0
  44. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +0 -0
  45. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/prompt_templates.py +0 -0
  46. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +0 -0
  47. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +0 -0
  48. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +0 -0
  49. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +0 -0
  50. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +0 -0
  51. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/tool_selection_accuracy_evaluator.py +0 -0
  52. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/trajectory_evaluator.py +0 -0
  53. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/extractors/__init__.py +0 -0
  54. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/extractors/graph_extractor.py +0 -0
  55. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/extractors/swarm_extractor.py +0 -0
  56. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/extractors/trace_extractor.py +0 -0
  57. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/generators/__init__.py +0 -0
  58. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/generators/experiment_generator.py +0 -0
  59. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/generators/prompt_template/prompt_templates.py +0 -0
  60. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/generators/topic_planner.py +0 -0
  61. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/mappers/__init__.py +0 -0
  62. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/mappers/session_mapper.py +0 -0
  63. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/mappers/strands_in_memory_session_mapper.py +0 -0
  64. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/README.md +0 -0
  65. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/__init__.py +0 -0
  66. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/actor_simulator.py +0 -0
  67. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/profiles/__init__.py +0 -0
  68. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/profiles/actor_profile.py +0 -0
  69. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/prompt_templates/__init__.py +0 -0
  70. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/prompt_templates/actor_profile_extraction.py +0 -0
  71. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/prompt_templates/actor_system_prompt.py +0 -0
  72. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/prompt_templates/goal_completion.py +0 -0
  73. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/tools/__init__.py +0 -0
  74. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/tools/goal_completion.py +0 -0
  75. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/telemetry/__init__.py +0 -0
  76. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/telemetry/_cloudwatch_logger.py +0 -0
  77. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/telemetry/config.py +0 -0
  78. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/telemetry/tracer.py +0 -0
  79. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/tools/evaluation_tools.py +0 -0
  80. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/types/__init__.py +0 -0
  81. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/types/evaluation.py +0 -0
  82. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/types/evaluation_report.py +0 -0
  83. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/types/simulation/__init__.py +0 -0
  84. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/types/simulation/actor.py +0 -0
  85. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/src/strands_evals/types/trace.py +0 -0
  86. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/__init__.py +0 -0
  87. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_evaluator.py +0 -0
  88. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_faithfulness_evaluator.py +0 -0
  89. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py +0 -0
  90. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_harmfulness_evaluator.py +0 -0
  91. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_helpfulness_evaluator.py +0 -0
  92. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_interactions_evaluator.py +0 -0
  93. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_output_evaluator.py +0 -0
  94. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_tool_parameter_accuracy_evaluator.py +0 -0
  95. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_tool_selection_accuracy_evaluator.py +0 -0
  96. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_trajectory_evaluator.py +0 -0
  97. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/extractors/test_graph_extractor.py +0 -0
  98. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/extractors/test_swarm_extractor.py +0 -0
  99. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/extractors/test_trace_extractor.py +0 -0
  100. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/generators/test_experiment_generator.py +0 -0
  101. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/generators/test_topic_planner.py +0 -0
  102. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/mappers/__init__.py +0 -0
  103. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/mappers/test_strands_in_memory_mapper.py +0 -0
  104. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/simulation/__init__.py +0 -0
  105. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/simulation/test_actor_simulator.py +0 -0
  106. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/simulation/test_goal_completion.py +0 -0
  107. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/telemetry/test_config.py +0 -0
  108. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/telemetry/test_tracer.py +0 -0
  109. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/test_cases.py +0 -0
  110. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/tools/test_evaluation_tools.py +0 -0
  111. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/strands_evals/types/test_trace.py +0 -0
  112. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests/test_integration.py +0 -0
  113. {strands_agents_evals-0.1.1 → strands_agents_evals-0.1.2}/tests_integ/test_output_evaluator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: strands-agents-evals
3
- Version: 0.1.1
3
+ Version: 0.1.2
4
4
  Summary: Evaluation framework for Strands
5
5
  Author-email: AWS <opensource@amazon.com>
6
6
  License: Apache-2.0
@@ -391,8 +391,8 @@ class Experiment(Generic[InputT, OutputT]):
391
391
  "gen_ai.evaluation.case.input": serialize(case.input),
392
392
  },
393
393
  ) as case_span:
394
+ # Task execution span - execute once
394
395
  try:
395
- # Task execution span - execute once
396
396
  with self._tracer.start_as_current_span(
397
397
  "task_execution",
398
398
  attributes={
@@ -414,9 +414,21 @@ class Experiment(Generic[InputT, OutputT]):
414
414
  ),
415
415
  }
416
416
  )
417
-
418
- # Evaluate with each evaluator using the same task output
417
+ except Exception as e:
418
+ case_span.record_exception(e)
419
419
  for evaluator in self._evaluators:
420
+ eval_name = evaluator.get_type_name()
421
+ evaluator_data[eval_name]["cases"].append(case.model_dump())
422
+ evaluator_data[eval_name]["test_passes"].append(False)
423
+ evaluator_data[eval_name]["scores"].append(0)
424
+ evaluator_data[eval_name]["reasons"].append(f"Task execution error: {str(e)}")
425
+ evaluator_data[eval_name]["detailed_results"].append([])
426
+ continue
427
+
428
+ # Evaluate with each evaluator using the same task output
429
+ for evaluator in self._evaluators:
430
+ eval_name = evaluator.get_type_name()
431
+ try:
420
432
  with self._tracer.start_as_current_span(
421
433
  f"evaluator {evaluator.get_type_name()}",
422
434
  attributes={
@@ -436,21 +448,16 @@ class Experiment(Generic[InputT, OutputT]):
436
448
  }
437
449
  )
438
450
 
439
- eval_name = evaluator.get_type_name()
440
451
  evaluator_data[eval_name]["cases"].append(evaluation_context.model_dump())
441
452
  evaluator_data[eval_name]["test_passes"].append(aggregate_pass)
442
453
  evaluator_data[eval_name]["scores"].append(aggregate_score)
443
454
  evaluator_data[eval_name]["reasons"].append(aggregate_reason or "")
444
455
  evaluator_data[eval_name]["detailed_results"].append(evaluation_outputs)
445
-
446
- except Exception as e:
447
- case_span.record_exception(e)
448
- for evaluator in self._evaluators:
449
- eval_name = evaluator.get_type_name()
450
- evaluator_data[eval_name]["cases"].append(case.model_dump())
456
+ except Exception as e:
457
+ evaluator_data[eval_name]["cases"].append(evaluation_context.model_dump())
451
458
  evaluator_data[eval_name]["test_passes"].append(False)
452
459
  evaluator_data[eval_name]["scores"].append(0)
453
- evaluator_data[eval_name]["reasons"].append(f"An error occured : {str(e)}")
460
+ evaluator_data[eval_name]["reasons"].append(f"Evaluator error: {str(e)}")
454
461
  evaluator_data[eval_name]["detailed_results"].append([])
455
462
 
456
463
  reports = []
@@ -43,7 +43,7 @@ def extract_agent_tools_used_from_messages(agent_messages):
43
43
  content = next_message.get("content")
44
44
  if content:
45
45
  tool_result_dict = content[0].get("toolResult")
46
- if tool_result_dict.get("toolUseId") == tool_id:
46
+ if tool_result_dict and tool_result_dict.get("toolUseId") == tool_id:
47
47
  tool_result_content = tool_result_dict.get("content", [])
48
48
  if len(tool_result_content) > 0:
49
49
  tool_result = tool_result_content[0].get("text")
@@ -243,3 +243,38 @@ def test_tools_use_extractor_extract_tools_description_empty():
243
243
  result = extract_tools_description(mock_agent, is_short=True)
244
244
 
245
245
  assert result == {}
246
+
247
+
248
+ def test_tools_use_extractor_extract_from_messages_user_message_without_tool_result():
249
+ """Test extracting tool usage when user message content lacks toolResult key."""
250
+ messages = [
251
+ {
252
+ "role": "assistant",
253
+ "content": [
254
+ {"toolUse": {"toolUseId": "tool_abc", "name": "calculator", "input": {"expression": "5+5"}}},
255
+ ],
256
+ },
257
+ {
258
+ "role": "user",
259
+ "content": [{"text": "Some user text without toolResult"}], # No toolResult key
260
+ },
261
+ {
262
+ "role": "user",
263
+ "content": [
264
+ {
265
+ "toolResult": {
266
+ "status": "success",
267
+ "content": [{"text": "Result: 10"}],
268
+ "toolUseId": "tool_abc",
269
+ }
270
+ }
271
+ ],
272
+ },
273
+ ]
274
+ result = extract_agent_tools_used_from_messages(messages)
275
+
276
+ assert len(result) == 1
277
+ assert result[0]["name"] == "calculator"
278
+ assert result[0]["input"] == {"expression": "5+5"}
279
+ assert result[0]["tool_result"] == "Result: 10"
280
+ assert result[0]["is_error"] is False
@@ -34,6 +34,16 @@ class MockEvaluator2(Evaluator[str, str]):
34
34
  return [EvaluationOutput(score=0.5, test_pass=True, reason="Async test evaluation 2")]
35
35
 
36
36
 
37
+ class ThrowingEvaluator(Evaluator[str, str]):
38
+ """Evaluator that always throws an exception - used to test error isolation"""
39
+
40
+ def evaluate(self, evaluation_case: EvaluationData[str, str]) -> list[EvaluationOutput]:
41
+ raise RuntimeError("Evaluator exploded")
42
+
43
+ async def evaluate_async(self, evaluation_case: EvaluationData[str, str]) -> list[EvaluationOutput]:
44
+ raise RuntimeError("Async evaluator exploded")
45
+
46
+
37
47
  @pytest.fixture
38
48
  def mock_evaluator():
39
49
  return MockEvaluator()
@@ -1052,3 +1062,29 @@ def test_experiment_run_evaluations_multiple_cases(mock_span, simple_task):
1052
1062
  assert len(reports) == 1
1053
1063
  assert len(reports[0].scores) == 2
1054
1064
  assert all(score == 1.0 for score in reports[0].scores)
1065
+
1066
+
1067
+ def test_experiment_run_evaluations_evaluator_error_isolated():
1068
+ """Test that one evaluator failing doesn't affect other evaluators."""
1069
+ case = Case(name="test", input="hello", expected_output="hello")
1070
+
1071
+ # MockEvaluator succeeds, ThrowingEvaluator fails
1072
+ experiment = Experiment(cases=[case], evaluators=[MockEvaluator(), ThrowingEvaluator()])
1073
+
1074
+ def echo_task(c):
1075
+ return c.input
1076
+
1077
+ reports = experiment.run_evaluations(echo_task)
1078
+
1079
+ assert len(reports) == 2
1080
+
1081
+ # First evaluator (MockEvaluator) should succeed
1082
+ assert reports[0].scores[0] == 1.0
1083
+ assert reports[0].test_passes[0] is True
1084
+ assert reports[0].reasons[0] == "Mock evaluation"
1085
+
1086
+ # Second evaluator (ThrowingEvaluator) should fail with error message
1087
+ assert reports[1].scores[0] == 0
1088
+ assert reports[1].test_passes[0] is False
1089
+ assert "Evaluator error" in reports[1].reasons[0]
1090
+ assert "Evaluator exploded" in reports[1].reasons[0]