strands-agents-evals 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {strands_agents_evals-0.1.0.dist-info → strands_agents_evals-0.1.1.dist-info}/METADATA +72 -3
- {strands_agents_evals-0.1.0.dist-info → strands_agents_evals-0.1.1.dist-info}/RECORD +8 -8
- strands_evals/__init__.py +0 -2
- strands_evals/experiment.py +3 -3
- strands_evals/extractors/tools_use_extractor.py +5 -1
- {strands_agents_evals-0.1.0.dist-info → strands_agents_evals-0.1.1.dist-info}/WHEEL +0 -0
- {strands_agents_evals-0.1.0.dist-info → strands_agents_evals-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {strands_agents_evals-0.1.0.dist-info → strands_agents_evals-0.1.1.dist-info}/licenses/NOTICE +0 -0
{strands_agents_evals-0.1.0.dist-info → strands_agents_evals-0.1.1.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: strands-agents-evals
-Version: 0.1.0
+Version: 0.1.1
 Summary: Evaluation framework for Strands
 Author-email: AWS <opensource@amazon.com>
 License: Apache-2.0
@@ -68,6 +68,7 @@ Strands Evaluation is a powerful framework for evaluating AI agents and LLM appl
 ## Feature Overview
 
 - **Multiple Evaluation Types**: Output evaluation, trajectory analysis, tool usage assessment, and interaction evaluation
+- **Dynamic Simulators**: Multi-turn conversation simulation with realistic user behavior and goal-oriented interactions
 - **LLM-as-a-Judge**: Built-in evaluators using language models for sophisticated assessment with structured scoring
 - **Trace-based Evaluation**: Analyze agent behavior through OpenTelemetry execution traces
 - **Automated Experiment Generation**: Generate comprehensive test suites from context descriptions
@@ -226,6 +227,73 @@ reports = experiment.run_evaluations(user_task_function)
 reports[0].run_display()
 ```
 
+### Multi-turn Conversation Simulation
+
+Simulate realistic user interactions with dynamic, goal-oriented conversations using ActorSimulator:
+
+```python
+from strands import Agent
+from strands_evals import Case, Experiment, ActorSimulator
+from strands_evals.evaluators import HelpfulnessEvaluator, GoalSuccessRateEvaluator
+from strands_evals.mappers import StrandsInMemorySessionMapper
+from strands_evals.telemetry import StrandsEvalsTelemetry
+
+# Setup telemetry
+telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
+memory_exporter = telemetry.in_memory_exporter
+
+def task_function(case: Case) -> dict:
+    # Create simulator to drive conversation
+    simulator = ActorSimulator.from_case_for_user_simulator(
+        case=case,
+        max_turns=10
+    )
+
+    # Create agent to evaluate
+    agent = Agent(
+        trace_attributes={
+            "gen_ai.conversation.id": case.session_id,
+            "session.id": case.session_id
+        },
+        callback_handler=None
+    )
+
+    # Run multi-turn conversation
+    all_spans = []
+    user_message = case.input
+
+    while simulator.has_next():
+        memory_exporter.clear()
+        agent_response = agent(user_message)
+        turn_spans = list(memory_exporter.get_finished_spans())
+        all_spans.extend(turn_spans)
+
+        user_result = simulator.act(str(agent_response))
+        user_message = str(user_result.structured_output.message)
+
+    # Map to session for evaluation
+    mapper = StrandsInMemorySessionMapper()
+    session = mapper.map_to_session(all_spans, session_id=case.session_id)
+
+    return {"output": str(agent_response), "trajectory": session}
+
+# Use evaluators to assess simulated conversations
+evaluators = [
+    HelpfulnessEvaluator(),
+    GoalSuccessRateEvaluator()
+]
+
+experiment = Experiment(cases=test_cases, evaluators=evaluators)
+reports = experiment.run_evaluations(task_function)
+```
+
+**Key Benefits:**
+- **Dynamic Interactions**: Simulator adapts responses based on agent behavior
+- **Goal-Oriented Testing**: Verify agents can complete user objectives through dialogue
+- **Realistic Conversations**: Generate authentic multi-turn interaction patterns
+- **No Predefined Scripts**: Test agents without hardcoded conversation paths
+- **Comprehensive Evaluation**: Combine with trace-based evaluators for full assessment
+
 ### Automated Experiment Generation
 
 Generate comprehensive test suites automatically from context descriptions:
@@ -388,8 +456,9 @@ reports[0].run_display()  # Interactive display with metrics breakdown
 
 For detailed guidance & examples, explore our documentation:
 
-- [User Guide](https://strandsagents.com/latest
-- [Evaluator Reference](https://strandsagents.com/latest/user-guide/evals-sdk/evaluators/)
+- [User Guide](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/quickstart/)
+- [Evaluator Reference](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/evaluators/)
+- [Simulators Guide](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/simulators/)
 
 ## Contributing ❤️
 
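The simulation example above reads only `case.input` and `case.session_id`, and leaves `test_cases` undefined. A minimal sketch of how such cases might be constructed — the constructor keywords and the goal wording are assumptions, not verified against case.py:

```python
from strands_evals import Case

# Hypothetical cases for the ActorSimulator example. `input` and `session_id`
# are the only fields the README code reads; any extra configuration (such as
# the goal the simulated user pursues) is omitted here.
test_cases = [
    Case(input="Help me find a laptop under $1000", session_id="session-001"),
    Case(input="I want to return an order from last week", session_id="session-002"),
]
```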
{strands_agents_evals-0.1.0.dist-info → strands_agents_evals-0.1.1.dist-info}/RECORD CHANGED

@@ -1,6 +1,6 @@
-strands_evals/__init__.py,sha256=
+strands_evals/__init__.py,sha256=WnYsQGtkatrCKM8v_i_oCtBHNJfPaTrOg2ThUlf55Pk,485
 strands_evals/case.py,sha256=KWAL947NkmNzg9FFdTsL6KI9AFLQ8IcFjaOjcs9x5to,2131
-strands_evals/experiment.py,sha256=
+strands_evals/experiment.py,sha256=6gARs-JiGMSoeqC7-sjLGfL6hbEcHH5YJ4ABH0Qf3cM,28239
 strands_evals/display/display_console.py,sha256=bOTr6RepgnifALz2DgXnnk3c4Jjxu_mA68-pFr7xry0,5932
 strands_evals/evaluators/__init__.py,sha256=OfZU5RkYewHOAnEjPKdxiEvPnfOOWNZc_9nQpAfARfI,887
 strands_evals/evaluators/evaluator.py,sha256=XEesDeT83H93B1X_w8s0Nsb1KKHy26QO8b99Hi6vKbc,7466

@@ -30,7 +30,7 @@ strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection
 strands_evals/extractors/__init__.py,sha256=Jmlrk-m8sSS_LwmCVSloIkg3BjOgRzNEezjaAGMw5rw,74
 strands_evals/extractors/graph_extractor.py,sha256=TeT-58JB9roqSvy2ELz1kg8WF5YO-cfLlGZTO0F9s_4,1105
 strands_evals/extractors/swarm_extractor.py,sha256=Sm1XFCkAGVdF3XDyO3iF-20I8C6sAQ8JPNP5fgotOFU,2682
-strands_evals/extractors/tools_use_extractor.py,sha256=
+strands_evals/extractors/tools_use_extractor.py,sha256=3WngKFdTz9XYeD0eXn90Dr1eGuM8egbOJT0w0LYxWhk,6388
 strands_evals/extractors/trace_extractor.py,sha256=l7gk5rUFoUcxQduPJz49OX66SdgeK1MLt81aF1yr4Lc,6653
 strands_evals/generators/__init__.py,sha256=B1F30DAIf0kPyBdE4PAZvSby-dTelqb_7hFJoATqVb0,89
 strands_evals/generators/experiment_generator.py,sha256=6wLTL0iG2b0YAiu0w8dDiaBxOIy7p_Fs7l3hCjgQc0w,22655

@@ -61,8 +61,8 @@ strands_evals/types/evaluation_report.py,sha256=vT86zO4Qn9CQbULo3aziGMdG-1qWLdcB
 strands_evals/types/trace.py,sha256=BFoEylzAlENyPH702T5MDz-_H21-Wfx-FFTSXX1tDfY,4844
 strands_evals/types/simulation/__init__.py,sha256=-mz5lW6qFfIMm4dJGaP9pXY3xeiefLbB0XevjdFykkU,133
 strands_evals/types/simulation/actor.py,sha256=ESTV8165c3Ad5QT4yYmjm-A-oZdwZ0Rf0Lq7zokjTPo,1163
-strands_agents_evals-0.1.
-strands_agents_evals-0.1.
-strands_agents_evals-0.1.
-strands_agents_evals-0.1.
-strands_agents_evals-0.1.
+strands_agents_evals-0.1.1.dist-info/METADATA,sha256=W8UdHTxX2zsjd4F3jVuK5t2e0toxbRKBx9whF34ZjFc,17721
+strands_agents_evals-0.1.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+strands_agents_evals-0.1.1.dist-info/licenses/LICENSE,sha256=yIWKWnZEC7ykhOE0z330Y4XQkN0YssWOZQ-TkliALls,10141
+strands_agents_evals-0.1.1.dist-info/licenses/NOTICE,sha256=Eg13ogOmcI7JpMjxniFnKG81vwU3X8X7P_IlpvVg5RU,66
+strands_agents_evals-0.1.1.dist-info/RECORD,,
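Each RECORD entry above has the form `path,sha256=<digest>,<size>`: the digest is the URL-safe base64-encoded SHA-256 of the file with trailing `=` padding stripped, and the size is in bytes, per the wheel RECORD convention. A sketch of how an entry from this diff could be checked; the helper and the local wheel filename are illustrative, not part of the package:

```python
import base64
import hashlib
import zipfile

def verify_record_entry(wheel_path: str, member: str, expected: str) -> bool:
    """Illustrative helper: check one wheel RECORD hash."""
    with zipfile.ZipFile(wheel_path) as whl:
        data = whl.read(member)
    # RECORD digests are URL-safe base64 SHA-256 without '=' padding.
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return digest.decode("ascii") == expected

# e.g. against an unchanged entry from this diff:
# verify_record_entry(
#     "strands_agents_evals-0.1.1-py3-none-any.whl",
#     "strands_evals/case.py",
#     "KWAL947NkmNzg9FFdTsL6KI9AFLQ8IcFjaOjcs9x5to",
# )
```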
strands_evals/__init__.py CHANGED

strands_evals/experiment.py CHANGED
@@ -577,8 +577,8 @@ class Experiment(Generic[InputT, OutputT]):
 
         file_path.parent.mkdir(parents=True, exist_ok=True)
 
-        with open(file_path, "w") as f:
-            json.dump(self.to_dict(), f, indent=2)
+        with open(file_path, "w", encoding="utf-8") as f:
+            json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)
 
     @classmethod
     def from_dict(cls, data: dict, custom_evaluators: list[type[Evaluator]] | None = None):
@@ -646,7 +646,7 @@ class Experiment(Generic[InputT, OutputT]):
                 f"Only .json format is supported. Got file: {path}. Please provide a path with .json extension."
             )
 
-        with open(file_path, "r") as f:
+        with open(file_path, "r", encoding="utf-8") as f:
             data = json.load(f)
 
         return cls.from_dict(data, custom_evaluators)
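Both experiment.py hunks make JSON persistence encoding-stable: save and load now pin UTF-8 rather than relying on the platform's locale encoding, and `ensure_ascii=False` writes non-ASCII case data verbatim instead of as `\uXXXX` escapes. A standard-library sketch of the difference, independent of strands-evals:

```python
import json

data = {"input": "Grüße, 世界"}

# Default behavior: every non-ASCII character is escaped.
print(json.dumps(data))                      # {"input": "Gr\u00fc\u00dfe, \u4e16\u754c"}

# With ensure_ascii=False the characters are written verbatim; pairing this
# with open(..., encoding="utf-8") keeps the file readable on platforms
# whose default encoding is not UTF-8.
print(json.dumps(data, ensure_ascii=False))  # {"input": "Grüße, 世界"}
```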
strands_evals/extractors/tools_use_extractor.py CHANGED

@@ -33,6 +33,7 @@ def extract_agent_tools_used_from_messages(agent_messages):
             tool_id = tool.get("toolUseId")
             # get the tool result from the next message
             tool_result = None
+            is_error = False
             next_message_i = i + 1
             while next_message_i < len(agent_messages):
                 next_message = agent_messages[next_message_i]
@@ -46,9 +47,12 @@ def extract_agent_tools_used_from_messages(agent_messages):
                     tool_result_content = tool_result_dict.get("content", [])
                     if len(tool_result_content) > 0:
                         tool_result = tool_result_content[0].get("text")
+                    is_error = tool_result_dict.get("status") == "error"
                 break
 
-            tools_used.append(
+            tools_used.append(
+                {"name": tool_name, "input": tool_input, "tool_result": tool_result, "is_error": is_error}
+            )
     return tools_used
 
 
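The tools_use_extractor.py change threads an `is_error` flag into each extracted tool record, derived from the `status` field of the matching `toolResult` block. A sketch of the message shape the extractor walks and the output expected after this change — the tool name and content values are illustrative, assuming the Bedrock-converse-style messages Strands agents produce:

```python
from strands_evals.extractors.tools_use_extractor import extract_agent_tools_used_from_messages

agent_messages = [
    {   # assistant turn containing a tool call
        "role": "assistant",
        "content": [{
            "toolUse": {
                "toolUseId": "tool-1",
                "name": "get_weather",          # illustrative tool
                "input": {"city": "Seattle"},
            }
        }],
    },
    {   # following turn carrying the matching result, flagged as an error
        "role": "user",
        "content": [{
            "toolResult": {
                "toolUseId": "tool-1",
                "status": "error",              # drives the new is_error flag
                "content": [{"text": "city not found"}],
            }
        }],
    },
]

tools = extract_agent_tools_used_from_messages(agent_messages)
# Expected: [{"name": "get_weather", "input": {"city": "Seattle"},
#             "tool_result": "city not found", "is_error": True}]
```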
{strands_agents_evals-0.1.0.dist-info → strands_agents_evals-0.1.1.dist-info}/WHEEL RENAMED

File without changes
{strands_agents_evals-0.1.0.dist-info → strands_agents_evals-0.1.1.dist-info}/licenses/LICENSE RENAMED

File without changes
{strands_agents_evals-0.1.0.dist-info → strands_agents_evals-0.1.1.dist-info}/licenses/NOTICE RENAMED

File without changes