strands-agents-evals 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. strands_agents_evals-0.1.0.dist-info/METADATA +408 -0
  2. strands_agents_evals-0.1.0.dist-info/RECORD +68 -0
  3. strands_agents_evals-0.1.0.dist-info/WHEEL +4 -0
  4. strands_agents_evals-0.1.0.dist-info/licenses/LICENSE +175 -0
  5. strands_agents_evals-0.1.0.dist-info/licenses/NOTICE +1 -0
  6. strands_evals/__init__.py +22 -0
  7. strands_evals/case.py +53 -0
  8. strands_evals/display/display_console.py +150 -0
  9. strands_evals/evaluators/__init__.py +23 -0
  10. strands_evals/evaluators/evaluator.py +182 -0
  11. strands_evals/evaluators/faithfulness_evaluator.py +116 -0
  12. strands_evals/evaluators/goal_success_rate_evaluator.py +90 -0
  13. strands_evals/evaluators/harmfulness_evaluator.py +135 -0
  14. strands_evals/evaluators/helpfulness_evaluator.py +148 -0
  15. strands_evals/evaluators/interactions_evaluator.py +244 -0
  16. strands_evals/evaluators/output_evaluator.py +72 -0
  17. strands_evals/evaluators/prompt_templates/case_prompt_template.py +63 -0
  18. strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +11 -0
  19. strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +30 -0
  20. strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +11 -0
  21. strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +17 -0
  22. strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +11 -0
  23. strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +8 -0
  24. strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +11 -0
  25. strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +38 -0
  26. strands_evals/evaluators/prompt_templates/prompt_templates.py +176 -0
  27. strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +11 -0
  28. strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +40 -0
  29. strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +11 -0
  30. strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +23 -0
  31. strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +112 -0
  32. strands_evals/evaluators/tool_selection_accuracy_evaluator.py +112 -0
  33. strands_evals/evaluators/trajectory_evaluator.py +100 -0
  34. strands_evals/experiment.py +652 -0
  35. strands_evals/extractors/__init__.py +3 -0
  36. strands_evals/extractors/graph_extractor.py +30 -0
  37. strands_evals/extractors/swarm_extractor.py +73 -0
  38. strands_evals/extractors/tools_use_extractor.py +164 -0
  39. strands_evals/extractors/trace_extractor.py +166 -0
  40. strands_evals/generators/__init__.py +3 -0
  41. strands_evals/generators/experiment_generator.py +498 -0
  42. strands_evals/generators/prompt_template/prompt_templates.py +75 -0
  43. strands_evals/generators/topic_planner.py +60 -0
  44. strands_evals/mappers/__init__.py +6 -0
  45. strands_evals/mappers/session_mapper.py +27 -0
  46. strands_evals/mappers/strands_in_memory_session_mapper.py +473 -0
  47. strands_evals/simulation/README.md +323 -0
  48. strands_evals/simulation/__init__.py +6 -0
  49. strands_evals/simulation/actor_simulator.py +292 -0
  50. strands_evals/simulation/profiles/__init__.py +5 -0
  51. strands_evals/simulation/profiles/actor_profile.py +26 -0
  52. strands_evals/simulation/prompt_templates/__init__.py +11 -0
  53. strands_evals/simulation/prompt_templates/actor_profile_extraction.py +25 -0
  54. strands_evals/simulation/prompt_templates/actor_system_prompt.py +64 -0
  55. strands_evals/simulation/prompt_templates/goal_completion.py +27 -0
  56. strands_evals/simulation/tools/__init__.py +5 -0
  57. strands_evals/simulation/tools/goal_completion.py +93 -0
  58. strands_evals/telemetry/__init__.py +15 -0
  59. strands_evals/telemetry/_cloudwatch_logger.py +209 -0
  60. strands_evals/telemetry/config.py +207 -0
  61. strands_evals/telemetry/tracer.py +38 -0
  62. strands_evals/tools/evaluation_tools.py +67 -0
  63. strands_evals/types/__init__.py +11 -0
  64. strands_evals/types/evaluation.py +105 -0
  65. strands_evals/types/evaluation_report.py +244 -0
  66. strands_evals/types/simulation/__init__.py +5 -0
  67. strands_evals/types/simulation/actor.py +34 -0
  68. strands_evals/types/trace.py +205 -0
strands_evals/generators/experiment_generator.py
@@ -0,0 +1,498 @@
+ import asyncio
+ import logging
+ import math
+ from textwrap import dedent
+
+ from pydantic import create_model
+ from strands import Agent
+ from typing_extensions import Any, Generic, TypeVar
+
+ from strands_evals.evaluators import Evaluator, InteractionsEvaluator, OutputEvaluator, TrajectoryEvaluator
+
+ from ..case import Case
+ from ..experiment import Experiment
+ from ..types.evaluation import Interaction
+ from .prompt_template.prompt_templates import generate_case_template as CASE_SYSTEM_PROMPT
+ from .prompt_template.prompt_templates import generate_rubric_template as RUBRIC_SYSTEM_PROMPT
+ from .topic_planner import TopicPlanner
+
+ logger = logging.getLogger(__name__)
+
+ InputT = TypeVar("InputT")
+ OutputT = TypeVar("OutputT")
+
+
+ class ExperimentGenerator(Generic[InputT, OutputT]):
+     """
+     Generates evaluation experiments with test cases and rubrics for LLM-based evaluators used in agent assessment.
+
+     This class creates structured test cases and evaluation rubrics tailored to specific tasks
+     and domains, enabling comprehensive evaluation of agents' performance.
+     """
+
+     _default_evaluators = {
+         OutputEvaluator: (
+             "evaluates only the output response, don't include information about trajectory "
+             "nor interactions even if provided"
+         ),
+         TrajectoryEvaluator: (
+             "evaluates the trajectory and output if provided, don't include info about interactions even if provided"
+         ),
+         InteractionsEvaluator: (
+             "evaluates the interactions and output if provided, don't include info about trajectory even if provided"
+         ),
+     }
+
+     def __init__(
+         self,
+         input_type: type,
+         output_type: type,
+         trajectory_type: type | None = None,
+         include_expected_output: bool = True,
+         include_expected_trajectory: bool = False,
+         include_expected_interactions: bool = False,
+         include_metadata: bool = False,
+         model: str | None = None,
+         max_parallel_num_cases: int = 10,
+         rubric_system_prompt: str = RUBRIC_SYSTEM_PROMPT,
+         case_system_prompt: str = CASE_SYSTEM_PROMPT,
+     ):
+         """
+         Initialize the experiment generator with configuration for test case structure.
+
+         Args:
+             input_type: Type of input data for test cases (e.g., str, dict)
+             output_type: Type of expected output data (e.g., str, int)
+             trajectory_type: Type for trajectory elements, defaults to Any if None
+             include_expected_output: Whether to include expected outputs in test cases
+             include_expected_trajectory: Whether to include expected tool/action trajectories
+             include_expected_interactions: Whether to include expected interaction sequences
+             include_metadata: Whether to include metadata fields in test cases
+             model: Model identifier for the generation agent, defaults to strands' default model.
+             max_parallel_num_cases: Maximum number of test cases to generate in parallel asynchronously
+             rubric_system_prompt: System prompt for rubric generation, defaults to one of the available templates.
+             case_system_prompt: System prompt for test case generation, defaults to one of the available templates.
+         """
+         self.model = model
+         self.input_type = input_type
+         self.output_type = output_type
+         self.include_expected_output = include_expected_output
+         self.include_expected_trajectory = include_expected_trajectory
+         self.include_expected_interactions = include_expected_interactions
+         self.include_metadata = include_metadata
+         self.max_parallel_num_cases = max_parallel_num_cases
+
+         self.rubric_system_prompt = rubric_system_prompt
+         self.case_system_prompt = case_system_prompt
+
+         # Create class structure for Case with stricter/literal types, excluding any fields not needed
+         fields: dict[str, Any] = {"name": (str, ...), "input": (self.input_type, ...)}
+         if self.include_expected_output:
+             fields["expected_output"] = (self.output_type, ...)
+         if self.include_expected_trajectory:
+             # Use Any for trajectory type since we can't use runtime variables as types
+             fields["expected_trajectory"] = (list[Any], ...)
+         if self.include_expected_interactions:
+             fields["expected_interactions"] = (list[Interaction], ...)
+         if self.include_metadata:
+             fields["metadata"] = (dict[str, Any], ...)
+         self._Case = create_model("_Case", **fields)
+
+     async def _case_worker(self, queue: asyncio.Queue, prompt: str, message_history: list | None, results: list):
+         """
+         Worker that generates cases from the queue.
+
+         Args:
+             queue: Queue of difficulty levels, one per case to generate
+             prompt: Generation prompt describing the test case requirements
+             message_history: Optional conversation history to provide context to the generation agent
+             results: List to store results
+
+         """
+         case_generator = Agent(
+             model=self.model,
+             system_prompt=self.case_system_prompt,
+             callback_handler=None,
+             messages=message_history if message_history else [],
+         )
+
+         while True:
+             try:
+                 difficulty = queue.get_nowait()
+             except asyncio.QueueEmpty:
+                 break
+
+             try:
+                 full_prompt = prompt + f"Ensure that the test case has a difficulty level of {difficulty}."
+                 gen_case = await case_generator.structured_output_async(self._Case, full_prompt)
+                 results.append(Case(**gen_case.model_dump()))
+             except Exception as e:
+                 logger.exception(f"Error generating case: {e}")
+             finally:
+                 queue.task_done()
+
+     async def generate_cases_async(
+         self, prompt: str, num_cases: int = 5, message_history: list | None = None, num_topics: int | None = None
+     ) -> list[Case]:
+         """
+         Generate test cases asynchronously using parallel workers.
+
+         Args:
+             prompt: Generation prompt describing the test case requirements
+             num_cases: Number of test cases to generate
+             message_history: Optional conversation history to provide context to the generation agent
+             num_topics: Optional number of topics for diverse coverage.
+                 If None, generates all cases from the single prompt.
+                 If specified, expands prompt into multiple topic-specific prompts.
+
+         Returns:
+             List of generated Case objects matching the configured schema
+         """
+         prompt_specs = await self._prepare_generation_prompts(
+             base_prompt=prompt, num_cases=num_cases, num_topics=num_topics
+         )
+
+         generated_cases: list = []
+         for prompt_text, cases_for_prompt in prompt_specs:
+             cases = await self._generate_batch(
+                 prompt=prompt_text, num_cases=cases_for_prompt, message_history=message_history
+             )
+             generated_cases.extend(cases)
+
+         return generated_cases
+
+     async def _prepare_generation_prompts(
+         self, base_prompt: str, num_cases: int, num_topics: int | None = None
+     ) -> list[tuple[str, int]]:
+         """
+         Prepare generation prompts, optionally expanding via topic planning.
+
+         Returns:
+             List of (prompt, num_cases) tuples. Always returns at least one prompt.
+         """
+         if num_topics is None:
+             return [(base_prompt, num_cases)]
+
+         topic_planner = TopicPlanner(model=self.model)
+
+         try:
+             topic_plan = await topic_planner.plan_topics_async(
+                 context=base_prompt, task_description="", num_topics=num_topics, num_cases=num_cases
+             )
+         except Exception as e:
+             logger.warning(f"Topic planning failed: {e}. Using single prompt.")
+             return [(base_prompt, num_cases)]
+
+         # Distribute cases across topics
+         cases_per_topic = math.ceil(num_cases / len(topic_plan.topics))
+         prompt_specs: list[tuple[str, int]] = []
+
+         num_generated_cases = 0
+         for topic in topic_plan.topics:
+             remaining = num_cases - num_generated_cases
+             if remaining <= 0:
+                 break
+
+             topic_cases = min(cases_per_topic, remaining)
+             topic_prompt = dedent(f"""
+                 {base_prompt}
+                 Focus on this topic:
+                 - {topic.title}: {topic.description}
+                 - Key aspects: {", ".join(topic.key_aspects)}
+                 """)
+
+             prompt_specs.append((topic_prompt, topic_cases))
+             num_generated_cases += topic_cases
+
+         return prompt_specs
+
+     async def _generate_batch(self, prompt: str, num_cases: int, message_history: list | None = None) -> list[Case]:
+         """Generate a batch of cases using the existing worker pattern."""
+         queue: asyncio.Queue[str] = asyncio.Queue()
+         generated_cases: list = []
+
+         for i in range(num_cases):
+             difficulty = "medium"
+             if i < num_cases * 0.3:
+                 difficulty = "easy"
+             elif i > num_cases * 0.8:
+                 difficulty = "hard"
+             queue.put_nowait(difficulty)
+
+         num_workers = min(self.max_parallel_num_cases, num_cases)
+         workers = [
+             asyncio.create_task(self._case_worker(queue, prompt, message_history, generated_cases))
+             for _ in range(num_workers)
+         ]
+
+         await queue.join()
+         for worker in workers:
+             worker.cancel()
+         await asyncio.gather(*workers, return_exceptions=True)
+
+         return generated_cases
+
+     async def construct_evaluator_async(
+         self, prompt: str, evaluator: Evaluator, message_history: list | None = None
+     ) -> Evaluator:
+         """
+         Create an evaluator instance with a generated rubric.
+
+         Currently supports default evaluators: OutputEvaluator, TrajectoryEvaluator,
+         and InteractionsEvaluator. Generates task-specific rubrics for evaluation.
+
+         Args:
+             prompt: Prompt describing the evaluation context and requirements
+             evaluator: Evaluator class to instantiate (must be a default evaluator)
+             message_history: Optional conversation history to provide context to the rubric generation agent
+
+         Returns:
+             Configured evaluator instance with generated rubric
+
+         Raises:
+             ValueError: If evaluator is not one of the supported default evaluators
+         """
+         if evaluator not in self._default_evaluators:
+             raise ValueError(
+                 f"{evaluator} is not a default evaluator that needs a rubric. Please use one of the "
+                 f"default evaluators: {list(self._default_evaluators.keys())}."
+             )
+
+         rubric_generator_agent = Agent(
+             model=self.model,
+             system_prompt=self.rubric_system_prompt,
+             callback_handler=None,
+             messages=message_history if message_history else [],
+         )
+         evaluator_name = evaluator.get_type_name()
+         evaluator_desc = self._default_evaluators[evaluator]
+         evaluator_info = f"""The evaluator selected is {evaluator_name}. This evaluator {evaluator_desc}."""
+         final_prompt = (
+             prompt
+             + evaluator_info
+             + """
+ IMPORTANT: Your response must be ONLY a few sentences describing how to evaluate the test cases."""
+         )
+
+         rubric = await rubric_generator_agent.invoke_async(final_prompt)
+         return evaluator(rubric=str(rubric))
+
+     async def from_scratch_async(
+         self, topics: list[str], task_description: str, num_cases: int = 5, evaluator: Evaluator = None
+     ) -> Experiment:
+         """
+         Generate an experiment from scratch based on specified topics and task description.
+
+         Creates diverse test cases covering the given topics for the specified task,
+         with optional evaluator and rubric generation.
+
+         Args:
+             topics: List of topics/domains to cover in test cases
+             task_description: Description of the task the AI system will perform
+             num_cases: Number of test cases to generate
+             evaluator: Optional evaluator class for assessment (generates rubric if provided).
+
+         Returns:
+             Experiment containing generated test cases and evaluator. Uses the generic Evaluator as a placeholder
+             if no evaluator is passed in.
+         """
+         topics_str = " ".join(topics)
+         case_prompt = (
+             f"""Create test cases for the following topics: {topics_str} for this task: """ f"""{task_description}."""
+         )
+         cases = await self.generate_cases_async(case_prompt, num_cases)
+         if evaluator:
+             rubric_prompt = (
+                 f"""Create a rubric for the following topics: {topics_str} for this task: """ f"""{task_description}."""
+             )
+             _evaluator = await self.construct_evaluator_async(
+                 prompt=rubric_prompt,
+                 evaluator=evaluator,
+             )
+             return Experiment(cases=cases, evaluators=[_evaluator])
+         else:
+             return Experiment(cases=cases)
+
+     async def from_context_async(
+         self,
+         context: str,
+         task_description: str,
+         num_cases: int = 5,
+         evaluator: Evaluator = None,
+         num_topics: int | None = None,
+     ) -> Experiment:
+         """
+         Generate an experiment based on specific context that test cases should reference.
+
+         Creates test cases that can be answered using the provided context,
+         useful for testing knowledge retrieval, context understanding, or domain-specific tasks.
+
+         Args:
+             context: Specific context/information that test cases should reference. If there are any tools
+                 they need to use, specify them here too. Be sure to include as much information as you can
+                 about tools or sub-agents for generating interactions and/or trajectories.
+             task_description: Description of the task the AI system will perform
+             num_cases: Number of test cases to generate
+             evaluator: Optional evaluator class for assessment (generates rubric if provided), use Evaluator()
+                 as a placeholder.
+             num_topics: Optional number of topics for diverse coverage
+
+         Returns:
+             Experiment containing context-based test cases and evaluator. Uses the generic Evaluator as a placeholder
+             if no evaluator is passed in.
+         """
+         cases = await self.generate_cases_async(
+             f"""Create test cases with the following context: {context}. Ensure that the questions can be """
+             f"""answered using the provided context for this task: {task_description} """,
+             num_cases=num_cases,
+             num_topics=num_topics,
+         )
+         if evaluator:
+             _evaluator = await self.construct_evaluator_async(
+                 prompt=f"""Create a rubric with the following context: {context} for this task: """
+                 f"""{task_description} """,
+                 evaluator=evaluator,
+             )
+             return Experiment(cases=cases, evaluators=[_evaluator])
+         else:
+             return Experiment(cases=cases)
+
+     async def from_experiment_async(
+         self,
+         source_experiment: Experiment,
+         task_description: str,
+         num_cases: int = 5,
+         extra_information: str | None = None,
+     ) -> Experiment:
+         """
+         Generate a new experiment using an existing experiment as reference.
+
+         Creates new test cases that are similar in style and structure to the source experiment,
+         while adapting them for the specified task. If the source experiment uses a default
+         evaluator with a rubric, generates a new rubric based on the original.
+
+         Args:
+             source_experiment: Original experiment to use as reference for generating new test cases
+             task_description: Description of the task the AI system will perform
+             num_cases: Number of test cases to generate
+             extra_information: Optional additional context or requirements for the new test cases and rubric;
+                 be sure to include as much information as you can about tools or sub-agents
+                 for generating interactions and/or trajectories.
+
+         Returns:
+             A new Experiment containing test cases inspired by the source experiment but adapted
+             for the new task. Uses an updated evaluator with a new rubric if the source
+             evaluator is a default type, otherwise uses the generic Evaluator.
+         """
+         source_cases = source_experiment.cases
+         source_evaluators = source_experiment.evaluators
+
+         # construct messages to initialize the agent with context about the previous test cases
+         messages = [{"role": "user", "content": [{"text": "Here are the reference test cases: "}]}]
+         cases_string_list = []
+         for i, case in enumerate(source_cases):
+             cases_string_list.append({"text": f"{i}. {case.model_dump()}"})
+         messages.append({"role": "user", "content": cases_string_list})
+         new_cases = await self.generate_cases_async(
+             prompt=(
+                 f"Create new test cases similar to the reference cases. Ensure that the input and output "
+                 f"are relevant for this task: {task_description}. Here is some extra information: "
+                 f"{extra_information}."
+             ),
+             num_cases=num_cases,
+             message_history=messages,
+         )
+         new_evaluators = []
+         for source_evaluator in source_evaluators:
+             if type(source_evaluator) in self._default_evaluators:
+                 source_rubric = source_evaluator.rubric
+                 new_evaluator = await self.construct_evaluator_async(
+                     prompt=(
+                         f"Create a new rubric based on the reference rubric. Ensure that the rubric is relevant "
+                         f"for this task: {task_description}. Here is some extra information: {extra_information}."
+                     ),
+                     evaluator=type(source_evaluator),
+                     message_history=[{"role": "user", "content": [{"text": source_rubric}]}],
+                 )
+                 new_evaluators.append(new_evaluator)
+             else:
+                 new_evaluators.append(Evaluator())
+
+         return Experiment(cases=new_cases, evaluators=new_evaluators if new_evaluators else [Evaluator()])
+
+     async def update_current_experiment_async(
+         self,
+         source_experiment: Experiment,
+         task_description: str,
+         num_cases: int = 5,
+         context: str | None = None,
+         add_new_cases: bool = True,
+         add_new_rubric: bool = True,
+         new_evaluator_type: type | None = None,
+     ) -> Experiment:
+         """
+         Update an existing experiment by adding new test cases and/or updating the evaluator.
+
+         Extends the source experiment with additional test cases that complement the existing ones,
+         and optionally updates the evaluation rubric. Useful for iteratively improving experiments
+         or adapting them to new requirements while preserving the original test cases.
+
+         Args:
+             source_experiment: Original experiment to extend and update
+             task_description: Description of the task the AI system will perform
+             num_cases: Number of new test cases to add (if add_new_cases is True)
+             context: Additional context or requirements for new test cases and rubric;
+                 be sure to include as much information as you can about tools or sub-agents
+                 for generating interactions and/or trajectories.
+             add_new_cases: Whether to generate and add new test cases to the experiment
+             add_new_rubric: Whether to generate a new evaluation rubric
+             new_evaluator_type: Optional new evaluator type to use instead of the source evaluator type
+
+         Returns:
+             Updated Experiment containing original cases plus new cases (if requested) and
+             updated evaluator with new rubric (if requested and evaluator supports it).
+         """
+         source_cases = source_experiment.cases
+         source_evaluators = source_experiment.evaluators
+
+         if add_new_cases:
+             # construct messages to initialize the agent with context about the previous test cases
+             messages = [{"role": "user", "content": [{"text": "Here are the current test cases: "}]}]
+             cases_string_list = []
+             for i, case in enumerate(source_cases):
+                 cases_string_list.append({"text": f"{i}. {case.model_dump()}"})
+             messages.append({"role": "user", "content": cases_string_list})
+             new_cases = await self.generate_cases_async(
+                 prompt=(
+                     f"Create new test cases, expanding on previous cases for the following context: {context}. "
+                     f"Ensure that the input and output are relevant for this task: {task_description}."
+                 ),
+                 num_cases=num_cases,
+                 message_history=messages,
+             )
+
+         if add_new_rubric:
+             new_evaluators = []
+             for source_evaluator in source_evaluators:
+                 evaluator_type = new_evaluator_type if new_evaluator_type else type(source_evaluator)
+
+                 if evaluator_type in self._default_evaluators:
+                     source_rubric = (
+                         source_evaluator.rubric if type(source_evaluator) in self._default_evaluators else None
+                     )
+                     new_evaluator = await self.construct_evaluator_async(
+                         prompt=(
+                             f"Create a new rubric based on the reference rubric if provided for the following "
+                             f"context: {context}. Ensure that the rubric is relevant for this task: {task_description}."
+                         ),
+                         evaluator=evaluator_type,
+                         message_history=[{"role": "user", "content": [{"text": source_rubric}]}],
+                     )
+                     new_evaluators.append(new_evaluator)
+                 else:
+                     new_evaluators.append(source_evaluator)
+
+         return Experiment(
+             cases=source_cases + new_cases if add_new_cases else source_cases,
+             evaluators=new_evaluators if add_new_rubric else source_evaluators,
+         )
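For orientation, here is a minimal usage sketch of the generator above. It is an illustrative sketch, not part of the package: the import path mirrors the module location in the file list, OutputEvaluator comes from the evaluators package imported above, and actually running it assumes whatever default model configuration the strands Agent expects.

    import asyncio

    from strands_evals.evaluators import OutputEvaluator
    from strands_evals.generators.experiment_generator import ExperimentGenerator


    async def main() -> None:
        # Cases carry a string input and a string expected output.
        generator = ExperimentGenerator(input_type=str, output_type=str, max_parallel_num_cases=5)
        # Spread six context-grounded cases across three planned topics and
        # attach an OutputEvaluator configured with a generated rubric.
        experiment = await generator.from_context_async(
            context="A billing FAQ for a SaaS product; the agent has refund and invoice-lookup tools.",
            task_description="Answer customer billing questions using the available tools.",
            num_cases=6,
            evaluator=OutputEvaluator,
            num_topics=3,
        )
        print(f"Generated {len(experiment.cases)} cases")


    asyncio.run(main())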
strands_evals/generators/prompt_template/prompt_templates.py
@@ -0,0 +1,75 @@
+ DEFAULT_PLANNING_SYSTEM_PROMPT = """You are a test scenario planner for AI agents.
+ Your role is to analyze agent configurations and generate strategic topic plans
+ that comprehensively evaluate agent capabilities.
+
+ Your topics should:
+ - Cover different aspects of the agent's capabilities
+ - Test edge cases and common scenarios
+ - Vary in complexity and scope
+ - Ensure comprehensive coverage of available tools and features
+ - Be diverse and non-overlapping"""
+
+ generate_case_template = """
+ You are an expert test case generator for AI evaluation datasets. Your role is to create high-quality, diverse test cases that thoroughly evaluate AI systems across different domains and capabilities.
+
+ When given a task description, you will generate test cases specifically designed to evaluate how well an AI system can perform that task.
+
+ CORE PRINCIPLES:
+ - Generate realistic, practical test cases that reflect real-world usage patterns for the given task
+ - Ensure comprehensive coverage of the task requirements and potential challenges
+ - Create test cases that are specific, unambiguous, and measurable within the task context
+ - Balance difficulty levels to assess different capability thresholds for the task
+ - Include edge cases, corner scenarios, and potential failure modes relevant to the task
+
+ TEST CASE DESIGN:
+ - Easy Level (30%): Basic task functionality, straightforward scenarios, common use cases
+ - Medium Level (50%): Multi-step reasoning, moderate complexity, realistic task challenges
+ - Hard Level (20%): Complex task scenarios, edge cases, advanced reasoning, error handling
+
+ QUALITY STANDARDS:
+ - Each test case should have a clear, well-defined input relevant to the task
+ - Expected outputs should be accurate, complete, and verifiable for the task
+ - Test cases should be independent and not rely on previous context
+ - Avoid repetitive or overly similar scenarios within the task scope
+ - Ensure cultural sensitivity and avoid biased content
+
+ TASK-SPECIFIC CONSIDERATIONS:
+ When creating test cases, consider:
+ - What inputs will the AI system receive for this task?
+ - What outputs should it produce?
+ - What tools or capabilities might it need to use?
+ - What are the success criteria for this task?
+ - What could go wrong or be challenging about this task?
+
+ Remember: You are creating evaluation data to measure AI performance on specific tasks. Quality and diversity are paramount for meaningful assessment.
+ """
+
+ generate_rubric_template = """
+ You are an expert evaluation specialist focused on creating concise, actionable rubrics for AI agent system assessment.
+
+ When given a task description, you will create a rubric that captures the essential criteria for evaluating
+ how well an AI agent system performs that specific task for a particular information type (e.g., output, trajectory, and/or interactions).
+
+ RUBRIC REQUIREMENTS:
+ - Should be clear, comprehensive, and easy to understand for the specific task
+ - Focus on what makes a response high-quality when performing the given task
+ - Include key evaluation dimensions relevant to the task (accuracy, completeness, clarity, tool usage, etc.)
+ - Be specific enough to guide evaluation but general enough to apply across test cases for the task
+ - Consider the task's success criteria and potential failure modes
+ - Avoid mentioning specific test case details or examples
+
+ TASK-AWARE EVALUATION:
+ When creating rubrics, consider:
+ - What does successful task completion look like?
+ - What are the key quality indicators for this task?
+ - What tools, reasoning, or capabilities should be demonstrated?
+ - What are common failure modes or errors for this task?
+ - How should edge cases or complex scenarios be handled?
+
+ FORMAT:
+ - Use active, measurable criteria specific to the task
+ - Keep concise but comprehensive
+ - Focus on observable, evaluable qualities
+
+ Focus on creating a rubric that evaluators can consistently apply to measure how well AI systems perform the given task. Start with "Scoring should ..."
+ """
strands_evals/generators/topic_planner.py
@@ -0,0 +1,60 @@
+ import math
+ from textwrap import dedent
+
+ from pydantic import BaseModel, Field
+ from strands import Agent
+
+ from strands_evals.generators.prompt_template.prompt_templates import DEFAULT_PLANNING_SYSTEM_PROMPT
+
+
+ class Topic(BaseModel):
+     """Represents a single topic for test case generation."""
+
+     title: str = Field(..., description="Brief descriptive title for the topic")
+     description: str = Field(..., description="Short description explaining the topic")
+     key_aspects: list[str] = Field(..., description="2-5 key aspects that test cases should explore for this topic")
+
+
+ class TopicPlan(BaseModel):
+     """Represents a complete topic plan with multiple topics."""
+
+     topics: list[Topic] = Field(..., description="List of diverse topics for comprehensive test coverage")
+
+
+ class TopicPlanner:
+     """Plans diverse topics for test case generation based on agent context."""
+
+     def __init__(self, model: str | None = None, planning_prompt: str | None = None):
+         self.model = model
+         self.planning_prompt = planning_prompt or DEFAULT_PLANNING_SYSTEM_PROMPT
+
+     async def plan_topics_async(
+         self, context: str, task_description: str, num_topics: int, num_cases: int
+     ) -> TopicPlan:
+         """Generate a strategic plan of diverse topics for test case generation."""
+         cases_per_topic = math.ceil(num_cases / num_topics)
+
+         planning_agent = Agent(model=self.model, system_prompt=self.planning_prompt, callback_handler=None)
+
+         prompt = dedent(f"""
+             Generate {num_topics} diverse topics for creating {num_cases} test cases.
+
+             Agent Context:
+             {context}
+
+             Task Description:
+             {task_description}
+
+             Requirements:
+             - Create exactly {num_topics} distinct topics
+             - Each topic will generate approximately {cases_per_topic} test cases
+             - Include 2-5 key aspects per topic that test cases should explore
+             - Ensure topics span different complexity levels and use cases
+             - Make topics diverse and non-overlapping""")
+
+         topic_plan = await planning_agent.structured_output_async(TopicPlan, prompt)
+
+         if len(topic_plan.topics) > num_topics:
+             topic_plan.topics = topic_plan.topics[:num_topics]
+
+         return topic_plan
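As a complement, a small sketch of driving the planner directly; ExperimentGenerator normally calls it through _prepare_generation_prompts, and the same default-model assumption as the earlier sketch applies.

    import asyncio

    from strands_evals.generators.topic_planner import TopicPlanner


    async def main() -> None:
        planner = TopicPlanner()  # model=None falls back to the strands default model
        plan = await planner.plan_topics_async(
            context="An agent with a weather tool and a calendar tool.",
            task_description="Schedule outdoor events around the forecast.",
            num_topics=3,
            num_cases=9,
        )
        for topic in plan.topics:
            print(f"{topic.title}: {', '.join(topic.key_aspects)}")


    asyncio.run(main())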
strands_evals/mappers/__init__.py
@@ -0,0 +1,6 @@
+ """Converters for transforming telemetry data to Session format."""
+
+ from .session_mapper import SessionMapper
+ from .strands_in_memory_session_mapper import GenAIConventionVersion, StrandsInMemorySessionMapper
+
+ __all__ = ["GenAIConventionVersion", "SessionMapper", "StrandsInMemorySessionMapper"]
strands_evals/mappers/session_mapper.py
@@ -0,0 +1,27 @@
+ """
+ SessionMapper - Base class for mapping telemetry data to Session format
+ """
+
+ from abc import ABC, abstractmethod
+
+ from typing_extensions import Any
+
+ from ..types.trace import Session
+
+
+ class SessionMapper(ABC):
+     """Base class for mapping telemetry data to Session format for evaluation."""
+
+     @abstractmethod
+     def map_to_session(self, spans: list[Any], session_id: str) -> Session:
+         """
+         Map spans to Session format.
+
+         Args:
+             spans: List of span objects
+             session_id: Session identifier
+
+         Returns:
+             Session object ready for evaluation
+         """
+         pass
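To illustrate how the abstract base is meant to be extended, a hedged sketch of a custom mapper follows. The delegation to StrandsInMemorySessionMapper and its no-argument constructor are assumptions made for illustration; the shipped concrete implementation and the Session schema live in strands_in_memory_session_mapper.py and types/trace.py respectively, and should be consulted before relying on either.

    from typing_extensions import Any

    from strands_evals.mappers import SessionMapper, StrandsInMemorySessionMapper
    from strands_evals.types.trace import Session


    class FilteringSessionMapper(SessionMapper):
        """Hypothetical mapper that drops spans without attributes before delegating."""

        def __init__(self) -> None:
            # Assumed default construction; check the shipped mapper for required arguments.
            self._inner = StrandsInMemorySessionMapper()

        def map_to_session(self, spans: list[Any], session_id: str) -> Session:
            # Keep only spans that carry attributes, then reuse the in-memory mapping logic.
            kept = [span for span in spans if getattr(span, "attributes", None)]
            return self._inner.map_to_session(kept, session_id)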