strands-agents-evals 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- strands_agents_evals-0.1.0.dist-info/METADATA +408 -0
- strands_agents_evals-0.1.0.dist-info/RECORD +68 -0
- strands_agents_evals-0.1.0.dist-info/WHEEL +4 -0
- strands_agents_evals-0.1.0.dist-info/licenses/LICENSE +175 -0
- strands_agents_evals-0.1.0.dist-info/licenses/NOTICE +1 -0
- strands_evals/__init__.py +22 -0
- strands_evals/case.py +53 -0
- strands_evals/display/display_console.py +150 -0
- strands_evals/evaluators/__init__.py +23 -0
- strands_evals/evaluators/evaluator.py +182 -0
- strands_evals/evaluators/faithfulness_evaluator.py +116 -0
- strands_evals/evaluators/goal_success_rate_evaluator.py +90 -0
- strands_evals/evaluators/harmfulness_evaluator.py +135 -0
- strands_evals/evaluators/helpfulness_evaluator.py +148 -0
- strands_evals/evaluators/interactions_evaluator.py +244 -0
- strands_evals/evaluators/output_evaluator.py +72 -0
- strands_evals/evaluators/prompt_templates/case_prompt_template.py +63 -0
- strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +30 -0
- strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +17 -0
- strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +8 -0
- strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +38 -0
- strands_evals/evaluators/prompt_templates/prompt_templates.py +176 -0
- strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +40 -0
- strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +23 -0
- strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +112 -0
- strands_evals/evaluators/tool_selection_accuracy_evaluator.py +112 -0
- strands_evals/evaluators/trajectory_evaluator.py +100 -0
- strands_evals/experiment.py +652 -0
- strands_evals/extractors/__init__.py +3 -0
- strands_evals/extractors/graph_extractor.py +30 -0
- strands_evals/extractors/swarm_extractor.py +73 -0
- strands_evals/extractors/tools_use_extractor.py +164 -0
- strands_evals/extractors/trace_extractor.py +166 -0
- strands_evals/generators/__init__.py +3 -0
- strands_evals/generators/experiment_generator.py +498 -0
- strands_evals/generators/prompt_template/prompt_templates.py +75 -0
- strands_evals/generators/topic_planner.py +60 -0
- strands_evals/mappers/__init__.py +6 -0
- strands_evals/mappers/session_mapper.py +27 -0
- strands_evals/mappers/strands_in_memory_session_mapper.py +473 -0
- strands_evals/simulation/README.md +323 -0
- strands_evals/simulation/__init__.py +6 -0
- strands_evals/simulation/actor_simulator.py +292 -0
- strands_evals/simulation/profiles/__init__.py +5 -0
- strands_evals/simulation/profiles/actor_profile.py +26 -0
- strands_evals/simulation/prompt_templates/__init__.py +11 -0
- strands_evals/simulation/prompt_templates/actor_profile_extraction.py +25 -0
- strands_evals/simulation/prompt_templates/actor_system_prompt.py +64 -0
- strands_evals/simulation/prompt_templates/goal_completion.py +27 -0
- strands_evals/simulation/tools/__init__.py +5 -0
- strands_evals/simulation/tools/goal_completion.py +93 -0
- strands_evals/telemetry/__init__.py +15 -0
- strands_evals/telemetry/_cloudwatch_logger.py +209 -0
- strands_evals/telemetry/config.py +207 -0
- strands_evals/telemetry/tracer.py +38 -0
- strands_evals/tools/evaluation_tools.py +67 -0
- strands_evals/types/__init__.py +11 -0
- strands_evals/types/evaluation.py +105 -0
- strands_evals/types/evaluation_report.py +244 -0
- strands_evals/types/simulation/__init__.py +5 -0
- strands_evals/types/simulation/actor.py +34 -0
- strands_evals/types/trace.py +205 -0
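The listing covers the core `Experiment` and `Case` types, a family of evaluators (faithfulness, goal success rate, harmfulness, helpfulness, tool selection and tool parameter accuracy, plus generic output/trajectory/interactions evaluators), extractors for tool use and multi-agent results, experiment generators, actor simulation utilities, and telemetry helpers. As orientation for the code shown below, here is a minimal usage sketch: the import paths follow the file layout above, while the echo task, the rubric text, and the attribute access on the report are illustrative assumptions rather than anything confirmed by this diff.

```python
from strands_evals.case import Case
from strands_evals.evaluators.output_evaluator import OutputEvaluator
from strands_evals.experiment import Experiment


def task(case: Case) -> dict:
    # Placeholder for a real agent/model invocation (illustrative only).
    answer = f"Echo: {case.input}"
    return {"output": answer, "trajectory": []}


experiment = Experiment[str, str](
    cases=[
        Case(
            name="Simple Knowledge",
            input="What is the capital of France?",
            expected_output="The capital of France is Paris.",
            expected_trajectory=[],
            metadata={"category": "knowledge"},
        )
    ],
    evaluators=[
        OutputEvaluator(rubric="1 if the output answers the question correctly, 0 otherwise.")
    ],
)

reports = experiment.run_evaluations(task)  # one EvaluationReport per evaluator
print(reports[0].overall_score)  # assumes EvaluationReport exposes its fields as attributes
```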
strands_evals/experiment.py
@@ -0,0 +1,652 @@
import asyncio
import json
import logging
import os
from collections.abc import Callable
from pathlib import Path

from opentelemetry.trace import format_trace_id
from typing_extensions import Any, Generic, TypeVar

from .case import Case
from .evaluators.evaluator import Evaluator
from .evaluators.interactions_evaluator import InteractionsEvaluator
from .evaluators.output_evaluator import OutputEvaluator
from .evaluators.trajectory_evaluator import TrajectoryEvaluator
from .telemetry import get_tracer, serialize
from .telemetry._cloudwatch_logger import _send_to_cloudwatch
from .types.evaluation import EvaluationData
from .types.evaluation_report import EvaluationReport

InputT = TypeVar("InputT")
OutputT = TypeVar("OutputT")

logger = logging.getLogger()
logger.setLevel(logging.INFO)


def _get_label_from_score(evaluator: Evaluator, score: float) -> str:
    """
    Get the label from score using evaluator's _score_mapping if available.
    If no mapping exists, returns "YES" for scores >= 0.5, "NO" otherwise.

    Args:
        evaluator: The evaluator instance
        score: The numeric score

    Returns:
        The label corresponding to the score
    """
    if hasattr(evaluator, "_score_mapping") and evaluator._score_mapping:
        # Create reverse mapping from score to label
        reverse_mapping = {v: k for k, v in evaluator._score_mapping.items()}
        # Find the score in the mapping
        if score in reverse_mapping:
            return str(reverse_mapping[score])

    # Otherwise, return YES/NO based on score
    return "YES" if score >= 0.5 else "NO"


class Experiment(Generic[InputT, OutputT]):
    """
    An evaluation experiment containing test cases and evaluators.

    Experiment organizes a collection of test cases and evaluates them all with
    the defined evaluators on some task.

    Attributes:
        cases: A list of test cases in the experiment.
        evaluators: The list of evaluators to be used on the test cases.

    Example:
        experiment = Experiment[str, str](
            cases=[
                Case(name="Simple Knowledge",
                     input="What is the capital of France?",
                     expected_output="The capital of France is Paris.",
                     expected_trajectory=[],
                     metadata={"category": "knowledge"}),
                Case(name="Simple Math",
                     input="What is 2x2?",
                     expected_output="2x2 is 4.",
                     expected_trajectory=["calculator"],
                     metadata={"category": "math"})
            ],
            evaluators=[
                OutputEvaluator(
                    rubric=(
                        "The output is relevant and complete. 0 if the output is incorrect or irrelevant."
                    )
                )
            ]
        )
    """

    def __init__(
        self,
        cases: list[Case[InputT, OutputT]] | None = None,
        evaluators: list[Evaluator[InputT, OutputT]] | None = None,
    ):
        self._cases = cases or []
        self._evaluators = evaluators or [Evaluator()]
        self._tracer = get_tracer()
        # self._logger = get_logger(__name__)

        self._config_id = os.environ.get("EVALUATION_RESULTS_LOG_GROUP", "default-strands-evals")

    @property
    def cases(self) -> list[Case[InputT, OutputT]]:
        """
        Get a deep copy of all test cases in the experiment.

        Returns deep copies to prevent accidental mutation of the original test cases.
        Users can safely modify the returned cases without affecting the experiment.

        Returns:
            List of Case objects (deep copies) containing all test cases in the experiment
        """
        return [case.model_copy(deep=True) for case in self._cases]

    @property
    def evaluators(self) -> list[Evaluator[InputT, OutputT]]:
        """
        Get the evaluators used for assessing test case performance.

        Returns:
            The list of evaluator instances configured for this experiment
        """
        return self._evaluators

    @cases.setter
    def cases(self, new_cases: list[Case[InputT, OutputT]]):
        """
        Set the test cases for this experiment.

        Args:
            new_cases: List of Case objects to use as the experiment's test cases
        """
        self._cases = new_cases

    @evaluators.setter
    def evaluators(self, new_evaluators: list[Evaluator[InputT, OutputT]]):
        """
        Set the evaluators for assessing test case performance.

        Args:
            new_evaluators: List of Evaluator instances to use for evaluating test cases
        """
        self._evaluators = new_evaluators

    def _run_task(
        self, task: Callable[[Case[InputT, OutputT]], OutputT | dict[str, Any]], case: Case[InputT, OutputT]
    ) -> EvaluationData[InputT, OutputT]:
        """
        Run the task with the inputs from the test case.

        Args:
            task: The task to run the test case on. This function should take in InputT and return either
                OutputT or {"output": OutputT, "trajectory": ...}.
            case: The test case containing necessary information to run the task

        Returns:
            An EvaluationData record containing the input and actual output, name, expected output, and metadata.
        """
        if asyncio.iscoroutinefunction(task):
            raise ValueError("Async task is not supported. Please use run_evaluations_async instead.")

        evaluation_context = EvaluationData(
            name=case.name,
            input=case.input,
            expected_output=case.expected_output,
            expected_trajectory=case.expected_trajectory,
            expected_interactions=case.expected_interactions,
            metadata=case.metadata,
        )
        task_output = task(case)
        if isinstance(task_output, dict):  # could be evaluating the trajectory as well
            evaluation_context.actual_output = task_output.get("output")
            evaluation_context.actual_trajectory = task_output.get("trajectory")
            evaluation_context.actual_interactions = task_output.get("interactions")
            new_input = task_output.get("input", None)  # allows the user to update the input in the task function
            if new_input is not None:
                evaluation_context.input = new_input
        else:  # evaluating only the output
            evaluation_context.actual_output = task_output
        return evaluation_context

    async def _run_task_async(
        self, task: Callable[[Case[InputT, OutputT]], OutputT | dict[str, Any]], case: Case[InputT, OutputT]
    ) -> EvaluationData[InputT, OutputT]:
        """
        Run the task with the inputs from the test case asynchronously.

        Args:
            task: The task to run the test case on. This function should take in InputT and return either
                OutputT or {"output": OutputT, "trajectory": ...}. The task can run either synchronously
                or asynchronously.
            case: The test case containing necessary information to run the task

        Returns:
            An EvaluationData record containing the input and actual output, name, expected output, and metadata.
        """
        # Create evaluation context
        evaluation_context = EvaluationData(
            name=case.name,
            input=case.input,
            expected_output=case.expected_output,
            expected_trajectory=case.expected_trajectory,
            expected_interactions=case.expected_interactions,
            metadata=case.metadata,
        )

        # Handle both async and sync tasks
        if asyncio.iscoroutinefunction(task):
            task_output = await task(case)
        else:
            # Run sync function in separate thread to avoid blocking
            task_output = await asyncio.to_thread(task, case)

        if isinstance(task_output, dict):
            evaluation_context.actual_output = task_output.get("output")
            evaluation_context.actual_trajectory = task_output.get("trajectory")
            evaluation_context.actual_interactions = task_output.get("interactions")
            # allows the user to update the input in the task function
            new_input = task_output.get("input", None)
            if new_input is not None:
                evaluation_context.input = new_input
        else:
            evaluation_context.actual_output = task_output

        return evaluation_context

    async def _worker(self, queue: asyncio.Queue, task: Callable, results: list):
        """
        Worker that processes cases from the queue and runs the evaluation on the task.

        Args:
            queue: Queue containing cases to process
            task: Task function to run on each case
            results: List to store results
        """
        while True:
            try:
                case = queue.get_nowait()
            except asyncio.QueueEmpty:
                break

            case_name = case.name or f"case_{len(results)}"
            trace_id = None

            try:
                with self._tracer.start_as_current_span(
                    f"execute_case {case_name}",
                ) as case_span:
                    evaluation_context = await self._run_task_async(task, case)
                    case_span.set_attributes(
                        {
                            "gen_ai.evaluation.data.input": serialize(evaluation_context.input),
                            "gen_ai.evaluation.data.expected_output": serialize(evaluation_context.expected_output),
                            "gen_ai.evaluation.data.actual_output": serialize(evaluation_context.actual_output),
                            "gen_ai.evaluation.data.has_trajectory": (evaluation_context.actual_trajectory is not None),
                            "gen_ai.evaluation.data.has_interactions": (
                                evaluation_context.actual_interactions is not None
                            ),
                        }
                    )
                    trace_id = format_trace_id(case_span.get_span_context().trace_id)

                    # Evaluate with each evaluator
                    evaluator_results = []
                    for evaluator in self._evaluators:
                        with self._tracer.start_as_current_span(
                            f"evaluator {evaluator.get_type_name()}",
                        ) as eval_span:
                            evaluation_outputs = await evaluator.evaluate_async(evaluation_context)
                            (aggregate_score, aggregate_pass, aggregate_reason) = evaluator.aggregator(evaluation_outputs)

                            try:
                                label = _get_label_from_score(evaluator, aggregate_score)
                            except Exception:
                                label = "UNKNOWN"

                            eval_span.set_attributes(
                                {
                                    "gen_ai.evaluation.score.label": label,
                                    "gen_ai.evaluation.score.value": str(aggregate_score),
                                    "gen_ai.evaluation.test_pass": aggregate_pass,
                                    "gen_ai.evaluation.explanation": aggregate_reason or "",
                                }
                            )

                            evaluator_results.append(
                                {
                                    "evaluator_name": evaluator.get_type_name(),
                                    "test_pass": aggregate_pass,
                                    "score": aggregate_score,
                                    "reason": aggregate_reason or "",
                                    "detailed_results": evaluation_outputs,
                                }
                            )

                            # CloudWatch logging for this evaluator
                            try:
                                evaluator_full_name = f"Custom.{evaluator.get_type_name()}"
                                region = os.environ.get("AWS_REGION", "us-east-1")
                                _config_arn = f"arn:aws:strands:{region}::strands-evaluation-empty-config/{self._config_id}"
                                _evaluator_arn = f"arn:aws:strands-evals:::evaluator/{evaluator_full_name}"

                                log_data = {
                                    "gen_ai.evaluation.name": evaluator_full_name,
                                    "gen_ai.evaluation.score.value": str(aggregate_score),
                                    "gen_ai.evaluation.explanation": aggregate_reason or "",
                                    "gen_ai.evaluation.score.label": label,
                                    "gen_ai.response.id": trace_id,
                                    "aws.bedrock_agentcore.evaluator.rating_scale": "Numerical",
                                    "aws.bedrock_agentcore.evaluation_level": evaluator.evaluation_level or "Trace",
                                    "event.name": "gen_ai.evaluation.result",
                                    "aws.bedrock_agentcore.online_evaluation_config.arn": _config_arn,
                                    "aws.bedrock_agentcore.online_evaluation_config.name": "strands-local-evaluation",
                                    "aws.bedrock_agentcore.evaluator.arn": _evaluator_arn,
                                    "session.id": case.session_id,
                                }

                                agent_observability_enabled = os.environ.get("AGENT_OBSERVABILITY_ENABLED", "")
                                if agent_observability_enabled:
                                    _send_to_cloudwatch(
                                        message="gen_ai.evaluation.result",
                                        log_data=log_data,
                                        trace_id=trace_id,
                                        evaluator_name=evaluator_full_name,
                                        score=aggregate_score,
                                        config_id=self._config_id,
                                        label=label,
                                    )
                            except Exception as e:
                                logger.debug(f"Skipping CloudWatch logging: {str(e)}")

                    # Store results
                    results.append(
                        {
                            "case": evaluation_context.model_dump(),
                            "evaluator_results": evaluator_results,
                        }
                    )

            except Exception as e:
                # Handle task execution errors
                evaluator_results = []
                for evaluator in self._evaluators:
                    evaluator_results.append(
                        {
                            "evaluator_name": evaluator.get_type_name(),
                            "test_pass": False,
                            "score": 0,
                            "reason": f"An error occurred: {str(e)}",
                            "detailed_results": [],
                        }
                    )
                results.append(
                    {
                        "case": case.model_dump(),
                        "evaluator_results": evaluator_results,
                    }
                )
            finally:
                queue.task_done()

    def run_evaluations(
        self, task: Callable[[Case[InputT, OutputT]], OutputT | dict[str, Any]]
    ) -> list[EvaluationReport]:
        """
        Run the evaluations for all of the test cases with all evaluators.

        Args:
            task: The task to run the test case on. This function should take in InputT and return either
                OutputT or {"output": OutputT, "trajectory": ...}.

        Returns:
            A list of EvaluationReport objects, one for each evaluator, containing the overall score,
            individual case results, and basic feedback for each test case.
        """
        evaluator_data: dict[str, dict[str, list]] = {
            evaluator.get_type_name(): {
                "scores": [],
                "test_passes": [],
                "cases": [],
                "reasons": [],
                "detailed_results": [],
            }
            for evaluator in self._evaluators
        }

        for case in self._cases:
            case_name = case.name or f"case_{len(evaluator_data[self._evaluators[0].get_type_name()]['cases'])}"

            with self._tracer.start_as_current_span(
                f"eval_case {case_name}",
                attributes={
                    "gen_ai.evaluation.case.name": case_name,
                    "gen_ai.evaluation.case.input": serialize(case.input),
                },
            ) as case_span:
                try:
                    # Task execution span - execute once
                    with self._tracer.start_as_current_span(
                        "task_execution",
                        attributes={
                            "gen_ai.evaluation.task.type": "agent_task",
                            "gen_ai.evaluation.case.name": case_name,
                        },
                    ) as task_span:
                        evaluation_context = self._run_task(task, case)
                        task_span.set_attributes(
                            {
                                "gen_ai.evaluation.data.input": serialize(evaluation_context.input),
                                "gen_ai.evaluation.data.expected_output": serialize(evaluation_context.expected_output),
                                "gen_ai.evaluation.data.actual_output": serialize(evaluation_context.actual_output),
                                "gen_ai.evaluation.data.has_trajectory": (
                                    evaluation_context.actual_trajectory is not None
                                ),
                                "gen_ai.evaluation.data.has_interactions": (
                                    evaluation_context.actual_interactions is not None
                                ),
                            }
                        )

                    # Evaluate with each evaluator using the same task output
                    for evaluator in self._evaluators:
                        with self._tracer.start_as_current_span(
                            f"evaluator {evaluator.get_type_name()}",
                            attributes={
                                "gen_ai.evaluation.name": evaluator.get_type_name(),
                                "gen_ai.evaluation.case.name": case_name,
                            },
                        ) as eval_span:
                            evaluation_outputs = evaluator.evaluate(evaluation_context)
                            (aggregate_score, aggregate_pass, aggregate_reason) = evaluator.aggregator(
                                evaluation_outputs
                            )
                            eval_span.set_attributes(
                                {
                                    "gen_ai.evaluation.score.value": aggregate_score,
                                    "gen_ai.evaluation.test_pass": aggregate_pass,
                                    "gen_ai.evaluation.explanation": aggregate_reason or "",
                                }
                            )

                            eval_name = evaluator.get_type_name()
                            evaluator_data[eval_name]["cases"].append(evaluation_context.model_dump())
                            evaluator_data[eval_name]["test_passes"].append(aggregate_pass)
                            evaluator_data[eval_name]["scores"].append(aggregate_score)
                            evaluator_data[eval_name]["reasons"].append(aggregate_reason or "")
                            evaluator_data[eval_name]["detailed_results"].append(evaluation_outputs)

                except Exception as e:
                    case_span.record_exception(e)
                    for evaluator in self._evaluators:
                        eval_name = evaluator.get_type_name()
                        evaluator_data[eval_name]["cases"].append(case.model_dump())
                        evaluator_data[eval_name]["test_passes"].append(False)
                        evaluator_data[eval_name]["scores"].append(0)
                        evaluator_data[eval_name]["reasons"].append(f"An error occurred: {str(e)}")
                        evaluator_data[eval_name]["detailed_results"].append([])

        reports = []
        for evaluator in self._evaluators:
            eval_name = evaluator.get_type_name()
            data = evaluator_data[eval_name]
            report = EvaluationReport(
                overall_score=sum(data["scores"]) / len(data["scores"]) if len(data["scores"]) else 0,
                scores=data["scores"],
                test_passes=data["test_passes"],
                cases=data["cases"],
                reasons=data["reasons"],
                detailed_results=data["detailed_results"],
            )
            reports.append(report)

        return reports

    async def run_evaluations_async(self, task: Callable, max_workers: int = 10) -> list[EvaluationReport]:
        """
        Run evaluations asynchronously using a queue for parallel processing.

        Args:
            task: The task function to run on each case. This function should take in InputT and return
                either OutputT or {"output": OutputT, "trajectory": ...}. The task can run either
                synchronously or asynchronously.
            max_workers: Maximum number of parallel workers (default: 10)

        Returns:
            List of EvaluationReport objects, one for each evaluator, containing evaluation results
        """
        queue: asyncio.Queue[Case[InputT, OutputT]] = asyncio.Queue()
        results: list[Any] = []

        for case in self._cases:
            queue.put_nowait(case)

        num_workers = min(max_workers, len(self._cases))

        workers = [asyncio.create_task(self._worker(queue, task, results)) for _ in range(num_workers)]

        await queue.join()
        for worker in workers:
            worker.cancel()
        await asyncio.gather(*workers, return_exceptions=True)

        # Organize results by evaluator
        evaluator_data: dict[str, dict[str, list]] = {
            evaluator.get_type_name(): {
                "scores": [],
                "test_passes": [],
                "cases": [],
                "reasons": [],
                "detailed_results": [],
            }
            for evaluator in self._evaluators
        }

        for result in results:
            case_data = result["case"]
            for eval_result in result["evaluator_results"]:
                eval_name = eval_result["evaluator_name"]
                evaluator_data[eval_name]["cases"].append(case_data)
                evaluator_data[eval_name]["scores"].append(eval_result["score"])
                evaluator_data[eval_name]["test_passes"].append(eval_result["test_pass"])
                evaluator_data[eval_name]["reasons"].append(eval_result["reason"])
                evaluator_data[eval_name]["detailed_results"].append(eval_result["detailed_results"])

        reports = []
        for evaluator in self._evaluators:
            eval_name = evaluator.get_type_name()
            data = evaluator_data[eval_name]
            scores = data["scores"]
            report = EvaluationReport(
                overall_score=sum(scores) / len(scores) if scores else 0,
                scores=scores,
                test_passes=data["test_passes"],
                cases=data["cases"],
                reasons=data["reasons"],
                detailed_results=data["detailed_results"],
            )
            reports.append(report)

        return reports

    def to_dict(self) -> dict:
        """
        Convert the experiment to a dictionary.

        Returns:
            A dictionary representation of the experiment.
        """
        return {
            "cases": [case.model_dump() for case in self._cases],
            "evaluators": [evaluator.to_dict() for evaluator in self._evaluators],
        }

    def to_file(self, path: str):
        """
        Write the experiment to a JSON file.

        Args:
            path: The file path where the experiment will be saved. Can be:
                - A filename only (e.g., "foo.json" or "foo") - saves in current working directory
                - A relative path (e.g., "relative_path/foo.json") - saves relative to current working directory
                - An absolute path (e.g., "/path/to/dir/foo.json") - saves in exact directory

            If no extension is provided, ".json" will be added automatically.
            Only .json format is supported.

        Raises:
            ValueError: If the path has a non-JSON extension.
        """
        file_path = Path(path)

        if file_path.suffix:
            if file_path.suffix != ".json":
                raise ValueError(
                    f"Only .json format is supported. Got path with extension: {path}. "
                    f"Please use a .json extension or provide a path without an extension."
                )
        else:
            file_path = file_path.with_suffix(".json")

        file_path.parent.mkdir(parents=True, exist_ok=True)

        with open(file_path, "w") as f:
            json.dump(self.to_dict(), f, indent=2)

    @classmethod
    def from_dict(cls, data: dict, custom_evaluators: list[type[Evaluator]] | None = None):
        """
        Create an experiment from a dictionary.

        Args:
            data: A dictionary representation of the experiment.
            custom_evaluators: A list of relevant custom evaluators.

        Returns:
            An Experiment object.
        """
        custom_evaluators = custom_evaluators or []
        cases: list[Case] = [Case.model_validate(case_data) for case_data in data["cases"]]
        default_evaluators: dict[str, type[Evaluator]] = {
            "Evaluator": Evaluator,
            "OutputEvaluator": OutputEvaluator,
            "TrajectoryEvaluator": TrajectoryEvaluator,
            "InteractionsEvaluator": InteractionsEvaluator,
        }
        all_evaluators: dict[str, type[Evaluator]] = {
            **default_evaluators,
            **{v.get_type_name(): v for v in custom_evaluators},
        }

        evaluators = []
        for evaluator_dict in data["evaluators"]:
            evaluator_type = evaluator_dict["evaluator_type"]
            evaluator_args = {k: v for k, v in evaluator_dict.items() if k != "evaluator_type"}

            if "model_id" in evaluator_args:
                evaluator_args["model"] = evaluator_args.pop("model_id")

            if evaluator_type in all_evaluators:
                evaluator = all_evaluators[evaluator_type](**evaluator_args)
                evaluators.append(evaluator)
            else:
                raise Exception(
                    f"Cannot find {evaluator_type}. Make sure the evaluator type is spelled correctly and "
                    f"all relevant custom evaluators are passed in."
                )

        return cls(cases=cases, evaluators=evaluators)

    @classmethod
    def from_file(cls, path: str, custom_evaluators: list[type[Evaluator]] | None = None):
        """
        Create an experiment from a JSON file.

        Args:
            path: Path to the JSON file.
            custom_evaluators: A list of relevant custom evaluators.

        Returns:
            An Experiment object.

        Raises:
            ValueError: If the file does not have a .json extension.
        """
        file_path = Path(path)

        if file_path.suffix != ".json":
            raise ValueError(
                f"Only .json format is supported. Got file: {path}. Please provide a path with .json extension."
            )

        with open(file_path, "r") as f:
            data = json.load(f)

        return cls.from_dict(data, custom_evaluators)
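Two things in experiment.py are easy to miss in the raw listing: `run_evaluations_async` accepts both sync and async task functions and fans cases out across up to `max_workers` worker coroutines, and `to_file`/`from_file` round-trip the experiment definition through JSON, appending ".json" when no extension is given. A minimal sketch of how those pieces compose, assuming an illustrative async task, an illustrative rubric, and that `EvaluationReport` exposes its fields (`overall_score`, `test_passes`, ...) as attributes:

```python
import asyncio

from strands_evals.case import Case
from strands_evals.evaluators.output_evaluator import OutputEvaluator
from strands_evals.experiment import Experiment


async def task(case: Case) -> dict:
    # Placeholder for a real async agent/model call (illustrative only).
    await asyncio.sleep(0)
    return {"output": "2x2 is 4.", "trajectory": ["calculator"]}


experiment = Experiment(
    cases=[
        Case(
            name="Simple Math",
            input="What is 2x2?",
            expected_output="2x2 is 4.",
            expected_trajectory=["calculator"],
            metadata={"category": "math"},
        )
    ],
    evaluators=[OutputEvaluator(rubric="1 if the output states the correct product, 0 otherwise.")],
)

experiment.to_file("math_experiment")  # ".json" is appended automatically
reloaded = Experiment.from_file("math_experiment.json")

reports = asyncio.run(reloaded.run_evaluations_async(task, max_workers=4))
for report in reports:
    print(report.overall_score, report.test_passes)
```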
strands_evals/extractors/graph_extractor.py
@@ -0,0 +1,30 @@
from typing import Any

from strands.multiagent import GraphResult


def extract_graph_interactions(graph_result: GraphResult):
    """
    Extract interaction information from graph execution results.

    Args:
        graph_result: Result object from graph execution

    Returns:
        list: Interactions with node names, dependencies, and messages
            [{node_name: str, dependencies: list[str], messages: list[str]}]
    """
    message_info: list[dict[str, Any]] = []
    for node in graph_result.execution_order:
        # Skip nodes without results
        if node.result is None:
            continue
        # Skip if result doesn't have the expected structure
        if not hasattr(node.result, "result") or not hasattr(node.result.result, "message"):
            continue

        node_name = node.node_id
        node_messages = [m["text"] for m in node.result.result.message["content"]]
        dependencies = [n.node_id for n in node.dependencies]
        message_info.append({"node_name": node_name, "dependencies": dependencies, "messages": node_messages})
    return message_info