strands-agents-evals 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- strands_agents_evals-0.1.0.dist-info/METADATA +408 -0
- strands_agents_evals-0.1.0.dist-info/RECORD +68 -0
- strands_agents_evals-0.1.0.dist-info/WHEEL +4 -0
- strands_agents_evals-0.1.0.dist-info/licenses/LICENSE +175 -0
- strands_agents_evals-0.1.0.dist-info/licenses/NOTICE +1 -0
- strands_evals/__init__.py +22 -0
- strands_evals/case.py +53 -0
- strands_evals/display/display_console.py +150 -0
- strands_evals/evaluators/__init__.py +23 -0
- strands_evals/evaluators/evaluator.py +182 -0
- strands_evals/evaluators/faithfulness_evaluator.py +116 -0
- strands_evals/evaluators/goal_success_rate_evaluator.py +90 -0
- strands_evals/evaluators/harmfulness_evaluator.py +135 -0
- strands_evals/evaluators/helpfulness_evaluator.py +148 -0
- strands_evals/evaluators/interactions_evaluator.py +244 -0
- strands_evals/evaluators/output_evaluator.py +72 -0
- strands_evals/evaluators/prompt_templates/case_prompt_template.py +63 -0
- strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +30 -0
- strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +17 -0
- strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +8 -0
- strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +38 -0
- strands_evals/evaluators/prompt_templates/prompt_templates.py +176 -0
- strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +40 -0
- strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +23 -0
- strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +112 -0
- strands_evals/evaluators/tool_selection_accuracy_evaluator.py +112 -0
- strands_evals/evaluators/trajectory_evaluator.py +100 -0
- strands_evals/experiment.py +652 -0
- strands_evals/extractors/__init__.py +3 -0
- strands_evals/extractors/graph_extractor.py +30 -0
- strands_evals/extractors/swarm_extractor.py +73 -0
- strands_evals/extractors/tools_use_extractor.py +164 -0
- strands_evals/extractors/trace_extractor.py +166 -0
- strands_evals/generators/__init__.py +3 -0
- strands_evals/generators/experiment_generator.py +498 -0
- strands_evals/generators/prompt_template/prompt_templates.py +75 -0
- strands_evals/generators/topic_planner.py +60 -0
- strands_evals/mappers/__init__.py +6 -0
- strands_evals/mappers/session_mapper.py +27 -0
- strands_evals/mappers/strands_in_memory_session_mapper.py +473 -0
- strands_evals/simulation/README.md +323 -0
- strands_evals/simulation/__init__.py +6 -0
- strands_evals/simulation/actor_simulator.py +292 -0
- strands_evals/simulation/profiles/__init__.py +5 -0
- strands_evals/simulation/profiles/actor_profile.py +26 -0
- strands_evals/simulation/prompt_templates/__init__.py +11 -0
- strands_evals/simulation/prompt_templates/actor_profile_extraction.py +25 -0
- strands_evals/simulation/prompt_templates/actor_system_prompt.py +64 -0
- strands_evals/simulation/prompt_templates/goal_completion.py +27 -0
- strands_evals/simulation/tools/__init__.py +5 -0
- strands_evals/simulation/tools/goal_completion.py +93 -0
- strands_evals/telemetry/__init__.py +15 -0
- strands_evals/telemetry/_cloudwatch_logger.py +209 -0
- strands_evals/telemetry/config.py +207 -0
- strands_evals/telemetry/tracer.py +38 -0
- strands_evals/tools/evaluation_tools.py +67 -0
- strands_evals/types/__init__.py +11 -0
- strands_evals/types/evaluation.py +105 -0
- strands_evals/types/evaluation_report.py +244 -0
- strands_evals/types/simulation/__init__.py +5 -0
- strands_evals/types/simulation/actor.py +34 -0
- strands_evals/types/trace.py +205 -0

strands_evals/generators/experiment_generator.py
@@ -0,0 +1,498 @@
+import asyncio
+import logging
+import math
+from textwrap import dedent
+
+from pydantic import create_model
+from strands import Agent
+from typing_extensions import Any, Generic, TypeVar
+
+from strands_evals.evaluators import Evaluator, InteractionsEvaluator, OutputEvaluator, TrajectoryEvaluator
+
+from ..case import Case
+from ..experiment import Experiment
+from ..types.evaluation import Interaction
+from .prompt_template.prompt_templates import generate_case_template as CASE_SYSTEM_PROMPT
+from .prompt_template.prompt_templates import generate_rubric_template as RUBRIC_SYSTEM_PROMPT
+from .topic_planner import TopicPlanner
+
+logger = logging.getLogger(__name__)
+
+InputT = TypeVar("InputT")
+OutputT = TypeVar("OutputT")
+
+
+class ExperimentGenerator(Generic[InputT, OutputT]):
+    """
+    Generates evaluation experiments with test cases and rubrics for LLM-based evaluators for agent assessment.
+
+    This class creates structured test cases and evaluation rubrics tailored to specific tasks
+    and domains, enabling comprehensive evaluation of agents' performance.
+    """
+
+    _default_evaluators = {
+        OutputEvaluator: (
+            "evaluates only the output response, don't include information about trajectory "
+            "nor interactions even if provided"
+        ),
+        TrajectoryEvaluator: (
+            "evaluates the trajectory and output if provided, don't include info about interactions even if provided"
+        ),
+        InteractionsEvaluator: (
+            "evaluates the interactions and output if provided, don't include info about trajectory even if provided"
+        ),
+    }
+
+    def __init__(
+        self,
+        input_type: type,
+        output_type: type,
+        trajectory_type: type | None = None,
+        include_expected_output: bool = True,
+        include_expected_trajectory: bool = False,
+        include_expected_interactions: bool = False,
+        include_metadata: bool = False,
+        model: str | None = None,
+        max_parallel_num_cases: int = 10,
+        rubric_system_prompt: str = RUBRIC_SYSTEM_PROMPT,
+        case_system_prompt: str = CASE_SYSTEM_PROMPT,
+    ):
+        """
+        Initialize the experiment generator with configuration for test case structure.
+
+        Args:
+            input_type: Type of input data for test cases (e.g., str, dict)
+            output_type: Type of expected output data (e.g., str, int)
+            trajectory_type: Type for trajectory elements, defaults to Any if None
+            include_expected_output: Whether to include expected outputs in test cases
+            include_expected_trajectory: Whether to include expected tool/action trajectories
+            include_expected_interactions: Whether to include expected interaction sequences
+            include_metadata: Whether to include metadata fields in test cases
+            model: Model identifier for the generation agent, defaults to strands' default model.
+            max_parallel_num_cases: Maximum number of test cases to generate in parallel asynchronously
+            rubric_system_prompt: System prompt for rubric generation, defaults to one of the available templates.
+            case_system_prompt: System prompt for test case generation, defaults to one of the available templates.
+        """
+        self.model = model
+        self.input_type = input_type
+        self.output_type = output_type
+        self.include_expected_output = include_expected_output
+        self.include_expected_trajectory = include_expected_trajectory
+        self.include_expected_interactions = include_expected_interactions
+        self.include_metadata = include_metadata
+        self.max_parallel_num_cases = max_parallel_num_cases
+
+        self.rubric_system_prompt = rubric_system_prompt
+        self.case_system_prompt = case_system_prompt
+
+        # Create class structure for Case with stricter/literal types, excluding any fields not needed
+        fields: dict[str, Any] = {"name": (str, ...), "input": (self.input_type, ...)}
+        if self.include_expected_output:
+            fields["expected_output"] = (self.output_type, ...)
+        if self.include_expected_trajectory:
+            # Use Any for trajectory type since we can't use runtime variables as types
+            fields["expected_trajectory"] = (list[Any], ...)
+        if self.include_expected_interactions:
+            fields["expected_interactions"] = (list[Interaction], ...)
+        if self.include_metadata:
+            fields["metadata"] = (dict[str, Any], ...)
+        self._Case = create_model("_Case", **fields)
+
+    async def _case_worker(self, queue: asyncio.Queue, prompt: str, message_history: list | None, results: list):
+        """
+        Worker that generates cases from the queue.
+
+        Args:
+            queue: Queue containing cases to process
+            prompt: Generation prompt describing the test case requirements
+            message_history: Optional conversation history to provide context to the generation agent
+            results: List to store results
+
+        """
+        case_generator = Agent(
+            model=self.model,
+            system_prompt=self.case_system_prompt,
+            callback_handler=None,
+            messages=message_history if message_history else [],
+        )
+
+        while True:
+            try:
+                difficulty = queue.get_nowait()
+            except asyncio.QueueEmpty:
+                break
+
+            try:
+                full_prompt = prompt + f"Ensure that the test case has a difficulty level of {difficulty}."
+                gen_case = await case_generator.structured_output_async(self._Case, full_prompt)
+                results.append(Case(**gen_case.model_dump()))
+            except Exception as e:
+                logger.exception(f"Error generating case: {e}")
+            finally:
+                queue.task_done()
+
+    async def generate_cases_async(
+        self, prompt: str, num_cases: int = 5, message_history: list | None = None, num_topics: int | None = None
+    ) -> list[Case]:
+        """
+        Generate test cases asynchronously using parallel workers.
+
+        Args:
+            prompt: Generation prompt describing the test case requirements
+            num_cases: Number of test cases to generate
+            message_history: Optional conversation history to provide context to the generation agent
+            num_topics: Optional number of topics for diverse coverage.
+                If None, generates all cases from the single prompt.
+                If specified, expands prompt into multiple topic-specific prompts.
+
+        Returns:
+            List of generated Case objects matching the configured schema
+        """
+        prompt_specs = await self._prepare_generation_prompts(
+            base_prompt=prompt, num_cases=num_cases, num_topics=num_topics
+        )
+
+        generated_cases: list = []
+        for prompt_text, cases_for_prompt in prompt_specs:
+            cases = await self._generate_batch(
+                prompt=prompt_text, num_cases=cases_for_prompt, message_history=message_history
+            )
+            generated_cases.extend(cases)
+
+        return generated_cases
+
+    async def _prepare_generation_prompts(
+        self, base_prompt: str, num_cases: int, num_topics: int | None = None
+    ) -> list[tuple[str, int]]:
+        """
+        Prepare generation prompts, optionally expanding via topic planning.
+
+        Returns:
+            List of (prompt, num_cases) tuples. Always returns at least one prompt.
+        """
+        if num_topics is None:
+            return [(base_prompt, num_cases)]
+
+        topic_planner = TopicPlanner(model=self.model)
+
+        try:
+            topic_plan = await topic_planner.plan_topics_async(
+                context=base_prompt, task_description="", num_topics=num_topics, num_cases=num_cases
+            )
+        except Exception as e:
+            logger.warning(f"Topic planning failed: {e}. Using single prompt.")
+            return [(base_prompt, num_cases)]
+
+        # Distribute cases across topics
+        cases_per_topic = math.ceil(num_cases / len(topic_plan.topics))
+        prompt_specs: list[tuple[str, int]] = []
+
+        num_generated_cases = 0
+        for topic in topic_plan.topics:
+            remaining = num_cases - num_generated_cases
+            if remaining <= 0:
+                break
+
+            topic_cases = min(cases_per_topic, remaining)
+            topic_prompt = dedent(f"""
+                {base_prompt}
+                Focus on this topic:
+                - {topic.title}: {topic.description}
+                - Key aspects: {", ".join(topic.key_aspects)}
+            """)
+
+            prompt_specs.append((topic_prompt, topic_cases))
+            num_generated_cases += topic_cases
+
+        return prompt_specs
+
+    async def _generate_batch(self, prompt: str, num_cases: int, message_history: list | None = None) -> list[Case]:
+        """Generate a batch of cases using the existing worker pattern."""
+        queue: asyncio.Queue[str] = asyncio.Queue()
+        generated_cases: list = []
+
+        for i in range(num_cases):
+            difficulty = "medium"
+            if i < num_cases * 0.3:
+                difficulty = "easy"
+            elif i > num_cases * 0.8:
+                difficulty = "hard"
+            queue.put_nowait(difficulty)
+
+        num_workers = min(self.max_parallel_num_cases, num_cases)
+        workers = [
+            asyncio.create_task(self._case_worker(queue, prompt, message_history, generated_cases))
+            for _ in range(num_workers)
+        ]
+
+        await queue.join()
+        for worker in workers:
+            worker.cancel()
+        await asyncio.gather(*workers, return_exceptions=True)
+
+        return generated_cases
+
+    async def construct_evaluator_async(
+        self, prompt: str, evaluator: Evaluator, message_history: list | None = None
+    ) -> Evaluator:
+        """
+        Create an evaluator instance with a generated rubric.
+
+        Currently supports default evaluators: OutputEvaluator, TrajectoryEvaluator,
+        and InteractionsEvaluator. Generates task-specific rubrics for evaluation.
+
+        Args:
+            prompt: Prompt describing the evaluation context and requirements
+            evaluator: Evaluator class to instantiate (must be a default evaluator)
+            message_history: Optional conversation history to provide context to the rubric generation agent
+
+        Returns:
+            Configured evaluator instance with generated rubric
+
+        Raises:
+            ValueError: If evaluator is not one of the supported default evaluators
+        """
+        if evaluator not in self._default_evaluators:
+            raise ValueError(
+                f"{evaluator} is not a default evaluator that needs a rubric. Please use one of the "
+                f"default evaluators: {list(self._default_evaluators.keys())}."
+            )
+
+        rubric_generator_agent = Agent(
+            model=self.model,
+            system_prompt=self.rubric_system_prompt,
+            callback_handler=None,
+            messages=message_history if message_history else [],
+        )
+        evaluator_name = evaluator.get_type_name()
+        evaluator_desc = self._default_evaluators[evaluator]
+        evaluator_info = f"""The evaluator selected is {evaluator_name}. This evaluator {evaluator_desc}."""
+        final_prompt = (
+            prompt
+            + evaluator_info
+            + """
+IMPORTANT: Your response must be ONLY a few sentences describing how to evaluate the test cases."""
+        )
+
+        rubric = await rubric_generator_agent.invoke_async(final_prompt)
+        return evaluator(rubric=str(rubric))
+
+    async def from_scratch_async(
+        self, topics: list[str], task_description: str, num_cases: int = 5, evaluator: Evaluator = None
+    ) -> Experiment:
+        """
+        Generate an experiment from scratch based on specified topics and task description.
+
+        Creates diverse test cases covering the given topics for the specified task,
+        with optional evaluator and rubric generation.
+
+        Args:
+            topics: List of topics/domains to cover in test cases
+            task_description: Description of the task the AI system will perform
+            num_cases: Number of test cases to generate
+            evaluator: Optional evaluator class for assessment (generates rubric if provided).
+
+        Returns:
+            Experiment containing generated test cases and evaluator. Use the generic Evaluator as placeholder
+            if no evaluator is passed in.
+        """
+        topics_str = " ".join(topics)
+        case_prompt = (
+            f"""Create test cases for the following topics: {topics_str} for this task: """ f"""{task_description}."""
+        )
+        cases = await self.generate_cases_async(case_prompt, num_cases)
+        if evaluator:
+            rubric_prompt = (
+                f"""Create a rubric for the following topics: {topics_str} for this task: """ f"""{task_description}."""
+            )
+            _evaluator = await self.construct_evaluator_async(
+                prompt=rubric_prompt,
+                evaluator=evaluator,
+            )
+            return Experiment(cases=cases, evaluators=[_evaluator])
+        else:
+            return Experiment(cases=cases)
+
+    async def from_context_async(
+        self,
+        context: str,
+        task_description: str,
+        num_cases: int = 5,
+        evaluator: Evaluator = None,
+        num_topics: int | None = None,
+    ) -> Experiment:
+        """
+        Generate an experiment based on specific context that test cases should reference.
+
+        Creates test cases that can be answered using the provided context,
+        useful for testing knowledge retrieval, context understanding, or domain-specific tasks.
+
+        Args:
+            context: Specific context/information that test cases should reference. If there's any tools
+                they need to use, specify them here too. Be sure to include as much information as you can
+                about tools or sub-agents for generating interaction and/or trajectory.
+            task_description: Description of the task the AI system will perform
+            num_cases: Number of test cases to generate
+            evaluator: Optional evaluator class for assessment (generates rubric if provided), use Evaluator()
+                as a placeholder.
+            num_topics: Optional number of topics for diverse coverage
+
+        Returns:
+            Experiment containing context-based test cases and evaluator. Use the generic Evaluator as placeholder
+            if no evaluator is passed in.
+        """
+        cases = await self.generate_cases_async(
+            f"""Create test cases with the following context: {context}. Ensure that the questions can be """
+            f"""answer using the provided context for this task: {task_description} """,
+            num_cases=num_cases,
+            num_topics=num_topics,
+        )
+        if evaluator:
+            _evaluator = await self.construct_evaluator_async(
+                prompt=f"""Create a rubric with the following context: {context} for this task: """
+                f"""{task_description} """,
+                evaluator=evaluator,
+            )
+            return Experiment(cases=cases, evaluators=[_evaluator])
+        else:
+            return Experiment(cases=cases)
+
+    async def from_experiment_async(
+        self,
+        source_experiment: Experiment,
+        task_description: str,
+        num_cases: int = 5,
+        extra_information: str | None = None,
+    ) -> Experiment:
+        """
+        Generate a new experiment using an existing experiment as reference.
+
+        Creates new test cases that are similar in style and structure to the source experiment,
+        while adapting them for the specified task. If the source experiment uses a default
+        evaluator with a rubric, generates a new rubric based on the original.
+
+        Args:
+            source_experiment: Original experiment to use as reference for generating new test cases
+            task_description: Description of the task the AI system will perform
+            num_cases: Number of test cases to generate
+            extra_information: Optional additional context or requirements for the new test cases and rubric,
+                be sure to include as much information as you can about tools or sub-agents
+                for generating interaction and/or trajectory.
+
+        Returns:
+            A new Experiment containing test cases inspired by the source experiment but adapted
+            for the new task. Uses an updated evaluator with new rubric if the source
+            evaluator is a default type, otherwise uses generic Evaluator.
+        """
+        source_cases = source_experiment.cases
+        source_evaluators = source_experiment.evaluators
+
+        # construct messages to initialize the agent with context about the previous test cases
+        messages = [{"role": "user", "content": [{"text": "Here are the reference test cases: "}]}]
+        cases_string_list = []
+        for i, case in enumerate(source_cases):
+            cases_string_list.append({"text": f"{i}. {case.model_dump()}"})
+        messages.append({"role": "user", "content": cases_string_list})
+        new_cases = await self.generate_cases_async(
+            prompt=(
+                f"Create new test cases similar to the reference cases. Ensure that the input and output "
+                f"are relevant for this task: {task_description}. Here are some extra information: "
+                f"{extra_information}."
+            ),
+            num_cases=num_cases,
+            message_history=messages,
+        )
+        new_evaluators = []
+        for source_evaluator in source_evaluators:
+            if type(source_evaluator) in self._default_evaluators:
+                source_rubric = source_evaluator.rubric
+                new_evaluator = await self.construct_evaluator_async(
+                    prompt=(
+                        f"Create a new rubric based on the reference rubric. Ensure that the rubric is relevant "
+                        f"for this task: {task_description}. Here are some extra information: {extra_information}."
+                    ),
+                    evaluator=type(source_evaluator),
+                    message_history=[{"role": "user", "content": [{"text": source_rubric}]}],
+                )
+                new_evaluators.append(new_evaluator)
+            else:
+                new_evaluators.append(Evaluator())
+
+        return Experiment(cases=new_cases, evaluators=new_evaluators if new_evaluators else [Evaluator()])
+
+    async def update_current_experiment_async(
+        self,
+        source_experiment: Experiment,
+        task_description: str,
+        num_cases: int = 5,
+        context: str | None = None,
+        add_new_cases: bool = True,
+        add_new_rubric: bool = True,
+        new_evaluator_type: type | None = None,
+    ) -> Experiment:
+        """
+        Update an existing experiment by adding new test cases and/or updating the evaluator.
+
+        Extends the source experiment with additional test cases that complement the existing ones,
+        and optionally updates the evaluation rubric. Useful for iteratively improving experiments
+        or adapting them to new requirements while preserving the original test cases.
+
+        Args:
+            source_experiment: Original experiment to extend and update
+            task_description: Description of the task the AI system will perform
+            num_cases: Number of new test cases to add (if add_new_cases is True)
+            context: Additional context or requirements for new test cases and rubric,
+                be sure to include as much information as you can about tools or sub-agents
+                for generating interaction and/or trajectory.
+            add_new_cases: Whether to generate and add new test cases to the experiment
+            add_new_rubric: Whether to generate a new evaluation rubric
+            new_evaluator_type: Optional new evaluator type to use instead of the source evaluator type
+
+        Returns:
+            Updated Experiment containing original cases plus new cases (if requested) and
+            updated evaluator with new rubric (if requested and evaluator supports it).
+        """
+        source_cases = source_experiment.cases
+        source_evaluators = source_experiment.evaluators
+
+        if add_new_cases:
+            # construct messages to initialize the agent with context about the previous test cases
+            messages = [{"role": "user", "content": [{"text": "Here are the current test cases: "}]}]
+            cases_string_list = []
+            for i, case in enumerate(source_cases):
+                cases_string_list.append({"text": f"{i}. {case.model_dump()}"})
+            messages.append({"role": "user", "content": cases_string_list})
+            new_cases = await self.generate_cases_async(
+                prompt=(
+                    f"Create new test cases, expanding on previous cases for the following context: {context}. "
+                    f"Ensure that the input and output are relevant for this task: {task_description}."
+                ),
+                num_cases=num_cases,
+                message_history=messages,
+            )
+
+        if add_new_rubric:
+            new_evaluators = []
+            for source_evaluator in source_evaluators:
+                evaluator_type = new_evaluator_type if new_evaluator_type else type(source_evaluator)
+
+                if evaluator_type in self._default_evaluators:
+                    source_rubric = (
+                        source_evaluator.rubric if type(source_evaluator) in self._default_evaluators else None
+                    )
+                    new_evaluator = await self.construct_evaluator_async(
+                        prompt=(
+                            f"Create a new rubric based on the reference rubric if provided for the following "
+                            f"context: {context}. Ensure that the rubric is relevant for this task: {task_description}."
+                        ),
+                        evaluator=evaluator_type,
+                        message_history=[{"role": "user", "content": [{"text": source_rubric}]}],
+                    )
+                    new_evaluators.append(new_evaluator)
+                else:
+                    new_evaluators.append(source_evaluator)
+
+        return Experiment(
+            cases=source_cases + new_cases if add_new_cases else source_cases,
+            evaluators=new_evaluators if add_new_rubric else source_evaluators,
+        )
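
For orientation, here is a minimal usage sketch of the generator added above. It is not part of the package diff: the topics, task description, and printing are illustrative, and `model=None` falls back to strands' default model as documented in `__init__`.

import asyncio

from strands_evals.evaluators import OutputEvaluator
from strands_evals.generators.experiment_generator import ExperimentGenerator


async def main():
    # str-in/str-out test cases; OutputEvaluator receives an auto-generated rubric.
    generator = ExperimentGenerator(input_type=str, output_type=str)
    experiment = await generator.from_scratch_async(
        topics=["refund policies", "order tracking"],  # illustrative topics
        task_description="a customer-support agent for an online store",
        num_cases=5,
        evaluator=OutputEvaluator,  # pass the evaluator class; a rubric is generated for it
    )
    for case in experiment.cases:
        print(case)


asyncio.run(main())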

strands_evals/generators/prompt_template/prompt_templates.py
@@ -0,0 +1,75 @@
+DEFAULT_PLANNING_SYSTEM_PROMPT = """You are a test scenario planner for AI agents.
+Your role is to analyze agent configurations and generate strategic topic plans
+that comprehensively evaluate agent capabilities.
+
+Your topics should:
+- Cover different aspects of the agent's capabilities
+- Test edge cases and common scenarios
+- Vary in complexity and scope
+- Ensure comprehensive coverage of available tools and features
+- Be diverse and non-overlapping"""
+
+generate_case_template = """
+You are an expert test case generator for AI evaluation datasets. Your role is to create high-quality, diverse test cases that thoroughly evaluate AI systems across different domains and capabilities.
+
+When given a task description, you will generate test cases specifically designed to evaluate how well an AI system can perform that task.
+
+CORE PRINCIPLES:
+- Generate realistic, practical test cases that reflect real-world usage patterns for the given task
+- Ensure comprehensive coverage of the task requirements and potential challenges
+- Create test cases that are specific, unambiguous, and measurable within the task context
+- Balance difficulty levels to assess different capability thresholds for the task
+- Include edge cases, corner scenarios, and potential failure modes relevant to the task
+
+TEST CASE DESIGN:
+- Easy Level (30%): Basic task functionality, straightforward scenarios, common use cases
+- Medium Level (50%): Multi-step reasoning, moderate complexity, realistic task challenges
+- Hard Level (20%): Complex task scenarios, edge cases, advanced reasoning, error handling
+
+QUALITY STANDARDS:
+- Each test case should have a clear, well-defined input relevant to the task
+- Expected outputs should be accurate, complete, and verifiable for the task
+- Test cases should be independent and not rely on previous context
+- Avoid repetitive or overly similar scenarios within the task scope
+- Ensure cultural sensitivity and avoid biased content
+
+TASK-SPECIFIC CONSIDERATIONS:
+When creating test cases, consider:
+- What inputs will the AI system receive for this task?
+- What outputs should it produce?
+- What tools or capabilities might it need to use?
+- What are the success criteria for this task?
+- What could go wrong or be challenging about this task?
+
+Remember: You are creating evaluation data to measure AI performance on specific tasks. Quality and diversity are paramount for meaningful assessment.
+"""
+
+generate_rubric_template = """
+You are an expert evaluation specialist focused on creating concise, actionable rubrics for AI agent system assessment.
+
+When given a task description, you will create a rubric that captures the essential criteria for evaluating
+how well an AI agent system performs that specific task for a particular information type (eg. output, trajectory, and/or interactions).
+
+RUBRIC REQUIREMENTS:
+- Should be clear, comprehensive, and easy to understand for the specific task
+- Focus on what makes a response high-quality when performing the given task
+- Include key evaluation dimensions relevant to the task (accuracy, completeness, clarity, tool usage, etc.)
+- Be specific enough to guide evaluation but general enough to apply across test cases for the task
+- Consider the task's success criteria and potential failure modes
+- Avoid mentioning specific test case details or examples
+
+TASK-AWARE EVALUATION:
+When creating rubrics, consider:
+- What does successful task completion look like?
+- What are the key quality indicators for this task?
+- What tools, reasoning, or capabilities should be demonstrated?
+- What are common failure modes or errors for this task?
+- How should edge cases or complex scenarios be handled?
+
+FORMAT:
+- Use active, measurable criteria specific to the task
+- Keep concise but comprehensive
+- Focus on observable, evaluable qualities
+
+Focus on creating a rubric that evaluators can consistently apply to measure how well AI systems perform the given task. Starts with "Scoring should ..."
+"""
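
The templates above are plain module-level strings, so a caller can swap them out through the generator's constructor parameters shown earlier. A hypothetical override (the custom prompt string is illustrative):

from strands_evals.generators.experiment_generator import ExperimentGenerator
from strands_evals.generators.prompt_template.prompt_templates import generate_rubric_template

# Hypothetical custom system prompt for case generation.
TERSE_CASE_PROMPT = "You generate terse, single-sentence test cases for a billing agent."

generator = ExperimentGenerator(
    input_type=str,
    output_type=str,
    case_system_prompt=TERSE_CASE_PROMPT,           # replaces generate_case_template
    rubric_system_prompt=generate_rubric_template,  # keeps the packaged default
)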

strands_evals/generators/topic_planner.py
@@ -0,0 +1,60 @@
+import math
+from textwrap import dedent
+
+from pydantic import BaseModel, Field
+from strands import Agent
+
+from strands_evals.generators.prompt_template.prompt_templates import DEFAULT_PLANNING_SYSTEM_PROMPT
+
+
+class Topic(BaseModel):
+    """Represents a single topic for test case generation."""
+
+    title: str = Field(..., description="Brief descriptive title for the topic")
+    description: str = Field(..., description="Short description explaining the topic")
+    key_aspects: list[str] = Field(..., description="2-5 key aspects that test cases should explore for this topic")
+
+
+class TopicPlan(BaseModel):
+    """Represents a complete topic plan with multiple topics."""
+
+    topics: list[Topic] = Field(..., description="List of diverse topics for comprehensive test coverage")
+
+
+class TopicPlanner:
+    """Plans diverse topics for test case generation based on agent context."""
+
+    def __init__(self, model: str | None = None, planning_prompt: str | None = None):
+        self.model = model
+        self.planning_prompt = planning_prompt or DEFAULT_PLANNING_SYSTEM_PROMPT
+
+    async def plan_topics_async(
+        self, context: str, task_description: str, num_topics: int, num_cases: int
+    ) -> TopicPlan:
+        """Generate a strategic plan of diverse topics for test case generation."""
+        cases_per_topic = math.ceil(num_cases / num_topics)
+
+        planning_agent = Agent(model=self.model, system_prompt=self.planning_prompt, callback_handler=None)
+
+        prompt = dedent(f"""
+            Generate {num_topics} diverse topics for creating {num_cases} test cases.
+
+            Agent Context:
+            {context}
+
+            Task Description:
+            {task_description}
+
+            Requirements:
+            - Create exactly {num_topics} distinct topics
+            - Each topic will generate approximately {cases_per_topic} test cases
+            - Include 2-5 key aspects per topic that test cases should explore
+            - Ensure topics span different complexity levels and use cases
+            - Make topics diverse and non-overlapping""")
+
+        topic_plan = await planning_agent.structured_output_async(TopicPlan, prompt)
+
+        if len(topic_plan.topics) > num_topics:
+            topic_plan.topics = topic_plan.topics[:num_topics]
+
+        return topic_plan
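
A standalone sketch of the planner, separate from the diff above; the context, task, and counts are illustrative, and the same structured-output call shown in the file drives the topic plan.

import asyncio

from strands_evals.generators.topic_planner import TopicPlanner


async def main():
    planner = TopicPlanner()  # model=None falls back to the default strands model
    plan = await planner.plan_topics_async(
        context="A travel-booking agent with flight-search and hotel-search tools",
        task_description="plan and book multi-leg trips",
        num_topics=3,
        num_cases=9,  # roughly 3 cases per topic, per math.ceil(num_cases / num_topics)
    )
    for topic in plan.topics:
        print(topic.title, "-", ", ".join(topic.key_aspects))


asyncio.run(main())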

strands_evals/mappers/__init__.py
@@ -0,0 +1,6 @@
+"""Converters for transforming telemetry data to Session format."""
+
+from .session_mapper import SessionMapper
+from .strands_in_memory_session_mapper import GenAIConventionVersion, StrandsInMemorySessionMapper
+
+__all__ = ["GenAIConventionVersion", "SessionMapper", "StrandsInMemorySessionMapper"]

strands_evals/mappers/session_mapper.py
@@ -0,0 +1,27 @@
+"""
+SessionMapper - Base class for mapping telemetry data to Session format
+"""
+
+from abc import ABC, abstractmethod
+
+from typing_extensions import Any
+
+from ..types.trace import Session
+
+
+class SessionMapper(ABC):
+    """Base class for mapping telemetry data to Session format for evaluation."""
+
+    @abstractmethod
+    def map_to_session(self, spans: list[Any], session_id: str) -> Session:
+        """
+        Map spans to Session format.
+
+        Args:
+            spans: List of span objects
+            session_id: Session identifier
+
+        Returns:
+            Session object ready for evaluation
+        """
+        pass
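
StrandsInMemorySessionMapper (listed in the RECORD above, not shown in this section) is the packaged implementation of this interface; a custom mapper only needs to implement map_to_session. The sketch below is a hypothetical subclass, and the Session constructor arguments are placeholders, since strands_evals/types/trace.py is not included here.

from typing_extensions import Any

from strands_evals.mappers import SessionMapper
from strands_evals.types.trace import Session


class PassthroughSessionMapper(SessionMapper):
    """Hypothetical mapper that forwards collected spans to a Session as-is."""

    def map_to_session(self, spans: list[Any], session_id: str) -> Session:
        # Placeholder field names; adapt to the actual Session model in types/trace.py.
        return Session(session_id=session_id, spans=spans)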