strands-agents-evals 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. strands_agents_evals-0.1.0.dist-info/METADATA +408 -0
  2. strands_agents_evals-0.1.0.dist-info/RECORD +68 -0
  3. strands_agents_evals-0.1.0.dist-info/WHEEL +4 -0
  4. strands_agents_evals-0.1.0.dist-info/licenses/LICENSE +175 -0
  5. strands_agents_evals-0.1.0.dist-info/licenses/NOTICE +1 -0
  6. strands_evals/__init__.py +22 -0
  7. strands_evals/case.py +53 -0
  8. strands_evals/display/display_console.py +150 -0
  9. strands_evals/evaluators/__init__.py +23 -0
  10. strands_evals/evaluators/evaluator.py +182 -0
  11. strands_evals/evaluators/faithfulness_evaluator.py +116 -0
  12. strands_evals/evaluators/goal_success_rate_evaluator.py +90 -0
  13. strands_evals/evaluators/harmfulness_evaluator.py +135 -0
  14. strands_evals/evaluators/helpfulness_evaluator.py +148 -0
  15. strands_evals/evaluators/interactions_evaluator.py +244 -0
  16. strands_evals/evaluators/output_evaluator.py +72 -0
  17. strands_evals/evaluators/prompt_templates/case_prompt_template.py +63 -0
  18. strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +11 -0
  19. strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +30 -0
  20. strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +11 -0
  21. strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +17 -0
  22. strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +11 -0
  23. strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +8 -0
  24. strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +11 -0
  25. strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +38 -0
  26. strands_evals/evaluators/prompt_templates/prompt_templates.py +176 -0
  27. strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +11 -0
  28. strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +40 -0
  29. strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +11 -0
  30. strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +23 -0
  31. strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +112 -0
  32. strands_evals/evaluators/tool_selection_accuracy_evaluator.py +112 -0
  33. strands_evals/evaluators/trajectory_evaluator.py +100 -0
  34. strands_evals/experiment.py +652 -0
  35. strands_evals/extractors/__init__.py +3 -0
  36. strands_evals/extractors/graph_extractor.py +30 -0
  37. strands_evals/extractors/swarm_extractor.py +73 -0
  38. strands_evals/extractors/tools_use_extractor.py +164 -0
  39. strands_evals/extractors/trace_extractor.py +166 -0
  40. strands_evals/generators/__init__.py +3 -0
  41. strands_evals/generators/experiment_generator.py +498 -0
  42. strands_evals/generators/prompt_template/prompt_templates.py +75 -0
  43. strands_evals/generators/topic_planner.py +60 -0
  44. strands_evals/mappers/__init__.py +6 -0
  45. strands_evals/mappers/session_mapper.py +27 -0
  46. strands_evals/mappers/strands_in_memory_session_mapper.py +473 -0
  47. strands_evals/simulation/README.md +323 -0
  48. strands_evals/simulation/__init__.py +6 -0
  49. strands_evals/simulation/actor_simulator.py +292 -0
  50. strands_evals/simulation/profiles/__init__.py +5 -0
  51. strands_evals/simulation/profiles/actor_profile.py +26 -0
  52. strands_evals/simulation/prompt_templates/__init__.py +11 -0
  53. strands_evals/simulation/prompt_templates/actor_profile_extraction.py +25 -0
  54. strands_evals/simulation/prompt_templates/actor_system_prompt.py +64 -0
  55. strands_evals/simulation/prompt_templates/goal_completion.py +27 -0
  56. strands_evals/simulation/tools/__init__.py +5 -0
  57. strands_evals/simulation/tools/goal_completion.py +93 -0
  58. strands_evals/telemetry/__init__.py +15 -0
  59. strands_evals/telemetry/_cloudwatch_logger.py +209 -0
  60. strands_evals/telemetry/config.py +207 -0
  61. strands_evals/telemetry/tracer.py +38 -0
  62. strands_evals/tools/evaluation_tools.py +67 -0
  63. strands_evals/types/__init__.py +11 -0
  64. strands_evals/types/evaluation.py +105 -0
  65. strands_evals/types/evaluation_report.py +244 -0
  66. strands_evals/types/simulation/__init__.py +5 -0
  67. strands_evals/types/simulation/actor.py +34 -0
  68. strands_evals/types/trace.py +205 -0
strands_evals/experiment.py
@@ -0,0 +1,652 @@
+ import asyncio
+ import json
+ import logging
+ import os
+ from collections.abc import Callable
+ from pathlib import Path
+
+ from opentelemetry.trace import format_trace_id
+ from typing_extensions import Any, Generic, TypeVar
+
+ from .case import Case
+ from .evaluators.evaluator import Evaluator
+ from .evaluators.interactions_evaluator import InteractionsEvaluator
+ from .evaluators.output_evaluator import OutputEvaluator
+ from .evaluators.trajectory_evaluator import TrajectoryEvaluator
+ from .telemetry import get_tracer, serialize
+ from .telemetry._cloudwatch_logger import _send_to_cloudwatch
+ from .types.evaluation import EvaluationData
+ from .types.evaluation_report import EvaluationReport
+
+ InputT = TypeVar("InputT")
+ OutputT = TypeVar("OutputT")
+
+ logger = logging.getLogger()
+ logger.setLevel(logging.INFO)
+
+
+ def _get_label_from_score(evaluator: Evaluator, score: float) -> str:
+     """
+     Get the label for a score using the evaluator's _score_mapping if available.
+     If no mapping exists, returns "YES" for scores >= 0.5, "NO" otherwise.
+
+     Args:
+         evaluator: The evaluator instance
+         score: The numeric score
+
+     Returns:
+         The label corresponding to the score
+     """
+     if hasattr(evaluator, "_score_mapping") and evaluator._score_mapping:
+         # Create reverse mapping from score to label
+         reverse_mapping = {v: k for k, v in evaluator._score_mapping.items()}
+         # Find the score in the mapping
+         if score in reverse_mapping:
+             return str(reverse_mapping[score])
+
+     # Otherwise, return YES/NO based on score
+     return "YES" if score >= 0.5 else "NO"
+
+
+ class Experiment(Generic[InputT, OutputT]):
+     """
+     An evaluation experiment containing test cases and evaluators.
+
+     Experiment organizes a collection of test cases and evaluates them all with
+     the defined evaluators on some task.
+
+     Attributes:
+         cases: A list of test cases in the experiment.
+         evaluators: The list of evaluators to be used on the test cases.
+
+     Example:
+         experiment = Experiment[str, str](
+             cases=[
+                 Case(name="Simple Knowledge",
+                      input="What is the capital of France?",
+                      expected_output="The capital of France is Paris.",
+                      expected_trajectory=[],
+                      metadata={"category": "knowledge"}),
+                 Case(name="Simple Math",
+                      input="What is 2x2?",
+                      expected_output="2x2 is 4.",
+                      expected_trajectory=["calculator"],
+                      metadata={"category": "math"})
+             ],
+             evaluators=[
+                 OutputEvaluator(
+                     rubric=(
+                         "The output is relevant and complete. 0 if the output is incorrect or irrelevant."
+                     )
+                 )
+             ]
+         )
+     """
+
+     def __init__(
+         self,
+         cases: list[Case[InputT, OutputT]] | None = None,
+         evaluators: list[Evaluator[InputT, OutputT]] | None = None,
+     ):
+         self._cases = cases or []
+         self._evaluators = evaluators or [Evaluator()]
+         self._tracer = get_tracer()
+         # self._logger = get_logger(__name__)
+
+         self._config_id = os.environ.get("EVALUATION_RESULTS_LOG_GROUP", "default-strands-evals")
+
+     @property
+     def cases(self) -> list[Case[InputT, OutputT]]:
+         """
+         Get a deep copy of all test cases in the experiment.
+
+         Returns deep copies to prevent accidental mutation of the original test cases.
+         Users can safely modify the returned cases without affecting the experiment.
+
+         Returns:
+             List of Case objects (deep copies) containing all test cases in the experiment
+         """
+         return [case.model_copy(deep=True) for case in self._cases]
+
+     @property
+     def evaluators(self) -> list[Evaluator[InputT, OutputT]]:
+         """
+         Get the evaluators used for assessing test case performance.
+
+         Returns:
+             The list of evaluator instances configured for this experiment
+         """
+         return self._evaluators
+
+     @cases.setter
+     def cases(self, new_cases: list[Case[InputT, OutputT]]):
+         """
+         Set the test cases for this experiment.
+
+         Args:
+             new_cases: List of Case objects to use as the experiment's test cases
+         """
+         self._cases = new_cases
+
+     @evaluators.setter
+     def evaluators(self, new_evaluators: list[Evaluator[InputT, OutputT]]):
+         """
+         Set the evaluators for assessing test case performance.
+
+         Args:
+             new_evaluators: List of Evaluator instances to use for evaluating test cases
+         """
+         self._evaluators = new_evaluators
+
+     def _run_task(
+         self, task: Callable[[Case[InputT, OutputT]], OutputT | dict[str, Any]], case: Case[InputT, OutputT]
+     ) -> EvaluationData[InputT, OutputT]:
+         """
+         Run the task with the inputs from the test case.
+
+         Args:
+             task: The task to run the test case on. This function should take in a Case and return either
+                 OutputT or {"output": OutputT, "trajectory": ...}.
+             case: The test case containing the necessary information to run the task
+
+         Return:
+             An EvaluationData record containing the input, actual output, name, expected output, and metadata.
+         """
+         if asyncio.iscoroutinefunction(task):
+             raise ValueError("Async task is not supported. Please use run_evaluations_async instead.")
+
+         evaluation_context = EvaluationData(
+             name=case.name,
+             input=case.input,
+             expected_output=case.expected_output,
+             expected_trajectory=case.expected_trajectory,
+             expected_interactions=case.expected_interactions,
+             metadata=case.metadata,
+         )
+         task_output = task(case)
+         if isinstance(task_output, dict):  # could be evaluating the trajectory as well
+             evaluation_context.actual_output = task_output.get("output")
+             evaluation_context.actual_trajectory = task_output.get("trajectory")
+             evaluation_context.actual_interactions = task_output.get("interactions")
+             new_input = task_output.get("input", None)  # allows the user to update the input in the task function
+             if new_input is not None:
+                 evaluation_context.input = new_input
+         else:  # evaluating only the output
+             evaluation_context.actual_output = task_output
+         return evaluation_context
+
+     async def _run_task_async(
+         self, task: Callable[[Case[InputT, OutputT]], OutputT | dict[str, Any]], case: Case[InputT, OutputT]
+     ) -> EvaluationData[InputT, OutputT]:
+         """
+         Run the task with the inputs from the test case asynchronously.
+
+         Args:
+             task: The task to run the test case on. This function should take in a Case and return either
+                 OutputT or {"output": OutputT, "trajectory": ...}. The task can either run synchronously
+                 or asynchronously.
+             case: The test case containing the necessary information to run the task
+
+         Return:
+             An EvaluationData record containing the input, actual output, name, expected output, and metadata.
+         """
+         # Create evaluation context
+         evaluation_context = EvaluationData(
+             name=case.name,
+             input=case.input,
+             expected_output=case.expected_output,
+             expected_trajectory=case.expected_trajectory,
+             expected_interactions=case.expected_interactions,
+             metadata=case.metadata,
+         )
+
+         # Handle both async and sync tasks
+         if asyncio.iscoroutinefunction(task):
+             task_output = await task(case)
+         else:
+             # Run sync function in separate thread to avoid blocking
+             task_output = await asyncio.to_thread(task, case)
+
+         if isinstance(task_output, dict):
+             evaluation_context.actual_output = task_output.get("output")
+             evaluation_context.actual_trajectory = task_output.get("trajectory")
+             evaluation_context.actual_interactions = task_output.get("interactions")
+             # allows the user to update the input in the task function
+             new_input = task_output.get("input", None)
+             if new_input is not None:
+                 evaluation_context.input = new_input
+         else:
+             evaluation_context.actual_output = task_output
+
+         return evaluation_context
+
+     async def _worker(self, queue: asyncio.Queue, task: Callable, results: list):
+         """
+         Worker that processes cases from the queue and runs evaluation on the task.
+
+         Args:
+             queue: Queue containing cases to process
+             task: Task function to run on each case
+             results: List to store results
+         """
+         while True:
+             try:
+                 case = queue.get_nowait()
+             except asyncio.QueueEmpty:
+                 break
+
+             case_name = case.name or f"case_{len(results)}"
+             trace_id = None
+
+             try:
+                 with self._tracer.start_as_current_span(
+                     f"execute_case {case_name}",
+                 ) as case_span:
+                     evaluation_context = await self._run_task_async(task, case)
+                     case_span.set_attributes(
+                         {
+                             "gen_ai.evaluation.data.input": serialize(evaluation_context.input),
+                             "gen_ai.evaluation.data.expected_output": serialize(evaluation_context.expected_output),
+                             "gen_ai.evaluation.data.actual_output": serialize(evaluation_context.actual_output),
+                             "gen_ai.evaluation.data.has_trajectory": (evaluation_context.actual_trajectory is not None),
+                             "gen_ai.evaluation.data.has_interactions": (
+                                 evaluation_context.actual_interactions is not None
+                             ),
+                         }
+                     )
+                     trace_id = format_trace_id(case_span.get_span_context().trace_id)
+
+                     # Evaluate with each evaluator
+                     evaluator_results = []
+                     for evaluator in self._evaluators:
+                         with self._tracer.start_as_current_span(
+                             f"evaluator {evaluator.get_type_name()}",
+                         ) as eval_span:
+                             evaluation_outputs = await evaluator.evaluate_async(evaluation_context)
+                             (aggregate_score, aggregate_pass, aggregate_reason) = evaluator.aggregator(evaluation_outputs)
+
+                             try:
+                                 label = _get_label_from_score(evaluator, aggregate_score)
+                             except Exception:
+                                 label = "UNKNOWN"
+
+                             eval_span.set_attributes(
+                                 {
+                                     "gen_ai.evaluation.score.label": label,
+                                     "gen_ai.evaluation.score.value": str(aggregate_score),
+                                     "gen_ai.evaluation.test_pass": aggregate_pass,
+                                     "gen_ai.evaluation.explanation": aggregate_reason or "",
+                                 }
+                             )
+
+                             evaluator_results.append(
+                                 {
+                                     "evaluator_name": evaluator.get_type_name(),
+                                     "test_pass": aggregate_pass,
+                                     "score": aggregate_score,
+                                     "reason": aggregate_reason or "",
+                                     "detailed_results": evaluation_outputs,
+                                 }
+                             )
+
+                             # CloudWatch logging for this evaluator
+                             try:
+                                 evaluator_full_name = f"Custom.{evaluator.get_type_name()}"
+                                 region = os.environ.get("AWS_REGION", "us-east-1")
+                                 _config_arn = f"arn:aws:strands:{region}::strands-evaluation-empty-config/{self._config_id}"
+                                 _evaluator_arn = f"arn:aws:strands-evals:::evaluator/{evaluator_full_name}"
+
+                                 log_data = {
+                                     "gen_ai.evaluation.name": evaluator_full_name,
+                                     "gen_ai.evaluation.score.value": str(aggregate_score),
+                                     "gen_ai.evaluation.explanation": aggregate_reason or "",
+                                     "gen_ai.evaluation.score.label": label,
+                                     "gen_ai.response.id": trace_id,
+                                     "aws.bedrock_agentcore.evaluator.rating_scale": "Numerical",
+                                     "aws.bedrock_agentcore.evaluation_level": evaluator.evaluation_level or "Trace",
+                                     "event.name": "gen_ai.evaluation.result",
+                                     "aws.bedrock_agentcore.online_evaluation_config.arn": _config_arn,
+                                     "aws.bedrock_agentcore.online_evaluation_config.name": "strands-local-evaluation",
+                                     "aws.bedrock_agentcore.evaluator.arn": _evaluator_arn,
+                                     "session.id": case.session_id,
+                                 }
+
+                                 agent_observability_enabled = os.environ.get("AGENT_OBSERVABILITY_ENABLED", "")
+                                 if agent_observability_enabled:
+                                     _send_to_cloudwatch(
+                                         message="gen_ai.evaluation.result",
+                                         log_data=log_data,
+                                         trace_id=trace_id,
+                                         evaluator_name=evaluator_full_name,
+                                         score=aggregate_score,
+                                         config_id=self._config_id,
+                                         label=label,
+                                     )
+                             except Exception as e:
+                                 logger.debug(f"Skipping CloudWatch logging: {str(e)}")
+
+                     # Store results
+                     results.append(
+                         {
+                             "case": evaluation_context.model_dump(),
+                             "evaluator_results": evaluator_results,
+                         }
+                     )
+
+             except Exception as e:
+                 # Handle task execution errors
+                 evaluator_results = []
+                 for evaluator in self._evaluators:
+                     evaluator_results.append(
+                         {
+                             "evaluator_name": evaluator.get_type_name(),
+                             "test_pass": False,
+                             "score": 0,
+                             "reason": f"An error occurred: {str(e)}",
+                             "detailed_results": [],
+                         }
+                     )
+                 results.append(
+                     {
+                         "case": case.model_dump(),
+                         "evaluator_results": evaluator_results,
+                     }
+                 )
+             finally:
+                 queue.task_done()
+
+     def run_evaluations(
+         self, task: Callable[[Case[InputT, OutputT]], OutputT | dict[str, Any]]
+     ) -> list[EvaluationReport]:
+         """
+         Run the evaluations for all of the test cases with all evaluators.
+
+         Args:
+             task: The task to run the test case on. This function should take in a Case and return either
+                 OutputT or {"output": OutputT, "trajectory": ...}.
+
+         Return:
+             A list of EvaluationReport objects, one for each evaluator, containing the overall score,
+             individual case results, and basic feedback for each test case.
+         """
+         evaluator_data: dict[str, dict[str, list]] = {
+             evaluator.get_type_name(): {
+                 "scores": [],
+                 "test_passes": [],
+                 "cases": [],
+                 "reasons": [],
+                 "detailed_results": [],
+             }
+             for evaluator in self._evaluators
+         }
+
+         for case in self._cases:
+             case_name = case.name or f"case_{len(evaluator_data[self._evaluators[0].get_type_name()]['cases'])}"
+
+             with self._tracer.start_as_current_span(
+                 f"eval_case {case_name}",
+                 attributes={
+                     "gen_ai.evaluation.case.name": case_name,
+                     "gen_ai.evaluation.case.input": serialize(case.input),
+                 },
+             ) as case_span:
+                 try:
+                     # Task execution span - execute once
+                     with self._tracer.start_as_current_span(
+                         "task_execution",
+                         attributes={
+                             "gen_ai.evaluation.task.type": "agent_task",
+                             "gen_ai.evaluation.case.name": case_name,
+                         },
+                     ) as task_span:
+                         evaluation_context = self._run_task(task, case)
+                         task_span.set_attributes(
+                             {
+                                 "gen_ai.evaluation.data.input": serialize(evaluation_context.input),
+                                 "gen_ai.evaluation.data.expected_output": serialize(evaluation_context.expected_output),
+                                 "gen_ai.evaluation.data.actual_output": serialize(evaluation_context.actual_output),
+                                 "gen_ai.evaluation.data.has_trajectory": (
+                                     evaluation_context.actual_trajectory is not None
+                                 ),
+                                 "gen_ai.evaluation.data.has_interactions": (
+                                     evaluation_context.actual_interactions is not None
+                                 ),
+                             }
+                         )
+
+                     # Evaluate with each evaluator using the same task output
+                     for evaluator in self._evaluators:
+                         with self._tracer.start_as_current_span(
+                             f"evaluator {evaluator.get_type_name()}",
+                             attributes={
+                                 "gen_ai.evaluation.name": evaluator.get_type_name(),
+                                 "gen_ai.evaluation.case.name": case_name,
+                             },
+                         ) as eval_span:
+                             evaluation_outputs = evaluator.evaluate(evaluation_context)
+                             (aggregate_score, aggregate_pass, aggregate_reason) = evaluator.aggregator(
+                                 evaluation_outputs
+                             )
+                             eval_span.set_attributes(
+                                 {
+                                     "gen_ai.evaluation.score.value": aggregate_score,
+                                     "gen_ai.evaluation.test_pass": aggregate_pass,
+                                     "gen_ai.evaluation.explanation": aggregate_reason or "",
+                                 }
+                             )
+
+                             eval_name = evaluator.get_type_name()
+                             evaluator_data[eval_name]["cases"].append(evaluation_context.model_dump())
+                             evaluator_data[eval_name]["test_passes"].append(aggregate_pass)
+                             evaluator_data[eval_name]["scores"].append(aggregate_score)
+                             evaluator_data[eval_name]["reasons"].append(aggregate_reason or "")
+                             evaluator_data[eval_name]["detailed_results"].append(evaluation_outputs)
+
+                 except Exception as e:
+                     case_span.record_exception(e)
+                     for evaluator in self._evaluators:
+                         eval_name = evaluator.get_type_name()
+                         evaluator_data[eval_name]["cases"].append(case.model_dump())
+                         evaluator_data[eval_name]["test_passes"].append(False)
+                         evaluator_data[eval_name]["scores"].append(0)
+                         evaluator_data[eval_name]["reasons"].append(f"An error occurred: {str(e)}")
+                         evaluator_data[eval_name]["detailed_results"].append([])
+
+         reports = []
+         for evaluator in self._evaluators:
+             eval_name = evaluator.get_type_name()
+             data = evaluator_data[eval_name]
+             report = EvaluationReport(
+                 overall_score=sum(data["scores"]) / len(data["scores"]) if len(data["scores"]) else 0,
+                 scores=data["scores"],
+                 test_passes=data["test_passes"],
+                 cases=data["cases"],
+                 reasons=data["reasons"],
+                 detailed_results=data["detailed_results"],
+             )
+             reports.append(report)
+
+         return reports
+
+     async def run_evaluations_async(self, task: Callable, max_workers: int = 10) -> list[EvaluationReport]:
+         """
+         Run evaluations asynchronously using a queue for parallel processing.
+
+         Args:
+             task: The task function to run on each case. This function should take in a Case and return
+                 either OutputT or {"output": OutputT, "trajectory": ...}. The task can either run
+                 synchronously or asynchronously.
+             max_workers: Maximum number of parallel workers (default: 10)
+
+         Returns:
+             List of EvaluationReport objects, one for each evaluator, containing evaluation results
+         """
+         queue: asyncio.Queue[Case[InputT, OutputT]] = asyncio.Queue()
+         results: list[Any] = []
+
+         for case in self._cases:
+             queue.put_nowait(case)
+
+         num_workers = min(max_workers, len(self._cases))
+
+         workers = [asyncio.create_task(self._worker(queue, task, results)) for _ in range(num_workers)]
+
+         await queue.join()
+         for worker in workers:
+             worker.cancel()
+         await asyncio.gather(*workers, return_exceptions=True)
+
+         # Organize results by evaluator
+         evaluator_data: dict[str, dict[str, list]] = {
+             evaluator.get_type_name(): {
+                 "scores": [],
+                 "test_passes": [],
+                 "cases": [],
+                 "reasons": [],
+                 "detailed_results": [],
+             }
+             for evaluator in self._evaluators
+         }
+
+         for result in results:
+             case_data = result["case"]
+             for eval_result in result["evaluator_results"]:
+                 eval_name = eval_result["evaluator_name"]
+                 evaluator_data[eval_name]["cases"].append(case_data)
+                 evaluator_data[eval_name]["scores"].append(eval_result["score"])
+                 evaluator_data[eval_name]["test_passes"].append(eval_result["test_pass"])
+                 evaluator_data[eval_name]["reasons"].append(eval_result["reason"])
+                 evaluator_data[eval_name]["detailed_results"].append(eval_result["detailed_results"])
+
+         reports = []
+         for evaluator in self._evaluators:
+             eval_name = evaluator.get_type_name()
+             data = evaluator_data[eval_name]
+             scores = data["scores"]
+             report = EvaluationReport(
+                 overall_score=sum(scores) / len(scores) if scores else 0,
+                 scores=scores,
+                 test_passes=data["test_passes"],
+                 cases=data["cases"],
+                 reasons=data["reasons"],
+                 detailed_results=data["detailed_results"],
+             )
+             reports.append(report)
+
+         return reports
+
+     def to_dict(self) -> dict:
+         """
+         Convert the experiment to a dictionary.
+
+         Return:
+             A dictionary representation of the experiment.
+         """
+         return {
+             "cases": [case.model_dump() for case in self._cases],
+             "evaluators": [evaluator.to_dict() for evaluator in self._evaluators],
+         }
+
+     def to_file(self, path: str):
+         """
+         Write the experiment to a JSON file.
+
+         Args:
+             path: The file path where the experiment will be saved. Can be:
+                 - A filename only (e.g., "foo.json" or "foo") - saves in the current working directory
+                 - A relative path (e.g., "relative_path/foo.json") - saves relative to the current working directory
+                 - An absolute path (e.g., "/path/to/dir/foo.json") - saves in that exact directory
+
+                 If no extension is provided, ".json" will be added automatically.
+                 Only .json format is supported.
+
+         Raises:
+             ValueError: If the path has a non-JSON extension.
+         """
+         file_path = Path(path)
+
+         if file_path.suffix:
+             if file_path.suffix != ".json":
+                 raise ValueError(
+                     f"Only .json format is supported. Got path with extension: {path}. "
+                     f"Please use a .json extension or provide a path without an extension."
+                 )
+         else:
+             file_path = file_path.with_suffix(".json")
+
+         file_path.parent.mkdir(parents=True, exist_ok=True)
+
+         with open(file_path, "w") as f:
+             json.dump(self.to_dict(), f, indent=2)
+
+     @classmethod
+     def from_dict(cls, data: dict, custom_evaluators: list[type[Evaluator]] | None = None):
+         """
+         Create an experiment from a dictionary.
+
+         Args:
+             data: A dictionary representation of the experiment.
+             custom_evaluators: A list of relevant custom evaluators.
+
+         Return:
+             An Experiment object.
+         """
+         custom_evaluators = custom_evaluators or []
+         cases: list[Case] = [Case.model_validate(case_data) for case_data in data["cases"]]
+         default_evaluators: dict[str, type[Evaluator]] = {
+             "Evaluator": Evaluator,
+             "OutputEvaluator": OutputEvaluator,
+             "TrajectoryEvaluator": TrajectoryEvaluator,
+             "InteractionsEvaluator": InteractionsEvaluator,
+         }
+         all_evaluators: dict[str, type[Evaluator]] = {
+             **default_evaluators,
+             **{v.get_type_name(): v for v in custom_evaluators},
+         }
+
+         evaluators = []
+         for evaluator_dict in data["evaluators"]:
+             evaluator_type = evaluator_dict["evaluator_type"]
+             evaluator_args = {k: v for k, v in evaluator_dict.items() if k != "evaluator_type"}
+
+             if "model_id" in evaluator_args:
+                 evaluator_args["model"] = evaluator_args.pop("model_id")
+
+             if evaluator_type in all_evaluators:
+                 evaluator = all_evaluators[evaluator_type](**evaluator_args)
+                 evaluators.append(evaluator)
+             else:
+                 raise Exception(
+                     f"Cannot find {evaluator_type}. Make sure the evaluator type is spelled correctly and "
+                     f"all relevant custom evaluators are passed in."
+                 )
+
+         return cls(cases=cases, evaluators=evaluators)
+
+     @classmethod
+     def from_file(cls, path: str, custom_evaluators: list[type[Evaluator]] | None = None):
+         """
+         Create an experiment from a JSON file.
+
+         Args:
+             path: Path to the JSON file.
+             custom_evaluators: A list of relevant custom evaluators.
+
+         Return:
+             An Experiment object.
+
+         Raises:
+             ValueError: If the file does not have a .json extension.
+         """
+         file_path = Path(path)
+
+         if file_path.suffix != ".json":
+             raise ValueError(
+                 f"Only .json format is supported. Got file: {path}. Please provide a path with .json extension."
+             )
+
+         with open(file_path, "r") as f:
+             data = json.load(f)
+
+         return cls.from_dict(data, custom_evaluators)
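
To make the execution flow above concrete, here is a minimal usage sketch (not part of the packaged code). It assumes Case, Experiment, and OutputEvaluator are importable as shown in the class docstring example; the answer_task function, its canned answer, the rubric string, and the file name are purely illustrative.

# Minimal usage sketch (assumption: Case, Experiment, and OutputEvaluator are exposed by the
# strands_evals package root and strands_evals.evaluators, mirroring the docstring example above).
from strands_evals import Case, Experiment
from strands_evals.evaluators import OutputEvaluator


def answer_task(case: Case[str, str]) -> dict:
    # A real task would invoke an agent here; the canned answer is a placeholder.
    answer = f"You asked: {case.input}"
    # Returning a dict lets evaluators also see a trajectory, as _run_task documents.
    return {"output": answer, "trajectory": []}


experiment = Experiment[str, str](
    cases=[
        Case(
            name="Simple Knowledge",
            input="What is the capital of France?",
            expected_output="The capital of France is Paris.",
            expected_trajectory=[],
        )
    ],
    evaluators=[OutputEvaluator(rubric="1 if the output answers the question, 0 otherwise.")],
)

reports = experiment.run_evaluations(answer_task)
for report in reports:
    print(report.overall_score, report.test_passes, report.reasons)

# Experiments round-trip through JSON via the to_file/from_file methods shown above.
experiment.to_file("simple_experiment.json")
restored = Experiment.from_file("simple_experiment.json")

The async variant follows the same shape: awaiting experiment.run_evaluations_async(answer_task, max_workers=4) fans the same cases out across worker coroutines via the queue-based _worker shown above.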
strands_evals/extractors/__init__.py
@@ -0,0 +1,3 @@
+ from .trace_extractor import TraceExtractor
+
+ __all__ = ["TraceExtractor"]
strands_evals/extractors/graph_extractor.py
@@ -0,0 +1,30 @@
+ from typing import Any
+
+ from strands.multiagent import GraphResult
+
+
+ def extract_graph_interactions(graph_result: GraphResult):
+     """
+     Extract interaction information from graph execution results.
+
+     Args:
+         graph_result: Result object from graph execution
+
+     Returns:
+         list: Interactions with node names, dependencies, and messages
+             [{node_name: str, dependencies: list[str], messages: list[str]}]
+     """
+     message_info: list[dict[str, Any]] = []
+     for node in graph_result.execution_order:
+         # Skip nodes without results
+         if node.result is None:
+             continue
+         # Skip if result doesn't have the expected structure
+         if not hasattr(node.result, "result") or not hasattr(node.result.result, "message"):
+             continue
+
+         node_name = node.node_id
+         node_messages = [m["text"] for m in node.result.result.message["content"]]
+         dependencies = [n.node_id for n in node.dependencies]
+         message_info.append({"node_name": node_name, "dependencies": dependencies, "messages": node_messages})
+     return message_info
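
For reference, a short consumption sketch for the extractor above. The print_graph_interactions helper is hypothetical, and the GraphResult is assumed to come from executing a Strands multi-agent graph, which is outside this diff; only the returned entry shape is taken from the function's docstring.

from strands.multiagent import GraphResult

from strands_evals.extractors.graph_extractor import extract_graph_interactions


def print_graph_interactions(graph_result: GraphResult) -> None:
    # Sketch only: graph_result would be produced by running a multi-agent graph elsewhere.
    for entry in extract_graph_interactions(graph_result):
        # Each entry follows the documented shape: node_name, dependencies, messages.
        print(entry["node_name"], "depends on", entry["dependencies"])
        for message in entry["messages"]:
            print("  ", message)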