strands-agents-evals 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: strands-agents-evals
- Version: 0.1.0
+ Version: 0.1.1
  Summary: Evaluation framework for Strands
  Author-email: AWS <opensource@amazon.com>
  License: Apache-2.0
@@ -68,6 +68,7 @@ Strands Evaluation is a powerful framework for evaluating AI agents and LLM appl
  ## Feature Overview

  - **Multiple Evaluation Types**: Output evaluation, trajectory analysis, tool usage assessment, and interaction evaluation
+ - **Dynamic Simulators**: Multi-turn conversation simulation with realistic user behavior and goal-oriented interactions
  - **LLM-as-a-Judge**: Built-in evaluators using language models for sophisticated assessment with structured scoring
  - **Trace-based Evaluation**: Analyze agent behavior through OpenTelemetry execution traces
  - **Automated Experiment Generation**: Generate comprehensive test suites from context descriptions
@@ -226,6 +227,73 @@ reports = experiment.run_evaluations(user_task_function)
  reports[0].run_display()
  ```

+ ### Multi-turn Conversation Simulation
+
+ Simulate realistic user interactions with dynamic, goal-oriented conversations using ActorSimulator:
+
+ ```python
+ from strands import Agent
+ from strands_evals import Case, Experiment, ActorSimulator
+ from strands_evals.evaluators import HelpfulnessEvaluator, GoalSuccessRateEvaluator
+ from strands_evals.mappers import StrandsInMemorySessionMapper
+ from strands_evals.telemetry import StrandsEvalsTelemetry
+
+ # Set up telemetry
+ telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
+ memory_exporter = telemetry.in_memory_exporter
+
+ def task_function(case: Case) -> dict:
+     # Create simulator to drive conversation
+     simulator = ActorSimulator.from_case_for_user_simulator(
+         case=case,
+         max_turns=10
+     )
+
+     # Create agent to evaluate
+     agent = Agent(
+         trace_attributes={
+             "gen_ai.conversation.id": case.session_id,
+             "session.id": case.session_id
+         },
+         callback_handler=None
+     )
+
+     # Run multi-turn conversation
+     all_spans = []
+     user_message = case.input
+
+     while simulator.has_next():
+         memory_exporter.clear()
+         agent_response = agent(user_message)
+         turn_spans = list(memory_exporter.get_finished_spans())
+         all_spans.extend(turn_spans)
+
+         user_result = simulator.act(str(agent_response))
+         user_message = str(user_result.structured_output.message)
+
+     # Map to session for evaluation
+     mapper = StrandsInMemorySessionMapper()
+     session = mapper.map_to_session(all_spans, session_id=case.session_id)
+
+     return {"output": str(agent_response), "trajectory": session}
+
+ # Use evaluators to assess simulated conversations
+ evaluators = [
+     HelpfulnessEvaluator(),
+     GoalSuccessRateEvaluator()
+ ]
+
+ experiment = Experiment(cases=test_cases, evaluators=evaluators)
+ reports = experiment.run_evaluations(task_function)
+ ```
+
+ **Key Benefits:**
+ - **Dynamic Interactions**: Simulator adapts responses based on agent behavior
+ - **Goal-Oriented Testing**: Verify agents can complete user objectives through dialogue
+ - **Realistic Conversations**: Generate authentic multi-turn interaction patterns
+ - **No Predefined Scripts**: Test agents without hardcoded conversation paths
+ - **Comprehensive Evaluation**: Combine with trace-based evaluators for full assessment
+
  ### Automated Experiment Generation

  Generate comprehensive test suites automatically from context descriptions:
@@ -388,8 +456,9 @@ reports[0].run_display() # Interactive display with metrics breakdown

  For detailed guidance & examples, explore our documentation:

- - [User Guide](https://strandsagents.com/latest//user-guide/evals-sdk/quickstart.md)
- - [Evaluator Reference](https://strandsagents.com/latest/user-guide/evals-sdk/evaluators/)
+ - [User Guide](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/quickstart/)
+ - [Evaluator Reference](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/evaluators/)
+ - [Simulators Guide](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/simulators/)

  ## Contributing ❤️

@@ -1,6 +1,6 @@
- strands_evals/__init__.py,sha256=LzUchWCMqfkXf4ca1p-S8YGBG5yTvy8ePTbdoVlqO8E,508
+ strands_evals/__init__.py,sha256=WnYsQGtkatrCKM8v_i_oCtBHNJfPaTrOg2ThUlf55Pk,485
  strands_evals/case.py,sha256=KWAL947NkmNzg9FFdTsL6KI9AFLQ8IcFjaOjcs9x5to,2131
- strands_evals/experiment.py,sha256=H_XJnOPufqg_3ZhSnjFFMFEzo4CqRtoHbHdFqLXR4iQ,28183
+ strands_evals/experiment.py,sha256=6gARs-JiGMSoeqC7-sjLGfL6hbEcHH5YJ4ABH0Qf3cM,28239
  strands_evals/display/display_console.py,sha256=bOTr6RepgnifALz2DgXnnk3c4Jjxu_mA68-pFr7xry0,5932
  strands_evals/evaluators/__init__.py,sha256=OfZU5RkYewHOAnEjPKdxiEvPnfOOWNZc_9nQpAfARfI,887
  strands_evals/evaluators/evaluator.py,sha256=XEesDeT83H93B1X_w8s0Nsb1KKHy26QO8b99Hi6vKbc,7466
@@ -30,7 +30,7 @@ strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection
  strands_evals/extractors/__init__.py,sha256=Jmlrk-m8sSS_LwmCVSloIkg3BjOgRzNEezjaAGMw5rw,74
  strands_evals/extractors/graph_extractor.py,sha256=TeT-58JB9roqSvy2ELz1kg8WF5YO-cfLlGZTO0F9s_4,1105
  strands_evals/extractors/swarm_extractor.py,sha256=Sm1XFCkAGVdF3XDyO3iF-20I8C6sAQ8JPNP5fgotOFU,2682
- strands_evals/extractors/tools_use_extractor.py,sha256=XTD7dDIWMiuaIYjUCRnZP45ly629n2XtkrbBwMlKU-s,6190
+ strands_evals/extractors/tools_use_extractor.py,sha256=3WngKFdTz9XYeD0eXn90Dr1eGuM8egbOJT0w0LYxWhk,6388
  strands_evals/extractors/trace_extractor.py,sha256=l7gk5rUFoUcxQduPJz49OX66SdgeK1MLt81aF1yr4Lc,6653
  strands_evals/generators/__init__.py,sha256=B1F30DAIf0kPyBdE4PAZvSby-dTelqb_7hFJoATqVb0,89
  strands_evals/generators/experiment_generator.py,sha256=6wLTL0iG2b0YAiu0w8dDiaBxOIy7p_Fs7l3hCjgQc0w,22655
@@ -61,8 +61,8 @@ strands_evals/types/evaluation_report.py,sha256=vT86zO4Qn9CQbULo3aziGMdG-1qWLdcB
  strands_evals/types/trace.py,sha256=BFoEylzAlENyPH702T5MDz-_H21-Wfx-FFTSXX1tDfY,4844
  strands_evals/types/simulation/__init__.py,sha256=-mz5lW6qFfIMm4dJGaP9pXY3xeiefLbB0XevjdFykkU,133
  strands_evals/types/simulation/actor.py,sha256=ESTV8165c3Ad5QT4yYmjm-A-oZdwZ0Rf0Lq7zokjTPo,1163
- strands_agents_evals-0.1.0.dist-info/METADATA,sha256=M51KkWtamOJjOpBeVocDA2A1np_X17BZmmWyLGTHvUE,15145
- strands_agents_evals-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- strands_agents_evals-0.1.0.dist-info/licenses/LICENSE,sha256=yIWKWnZEC7ykhOE0z330Y4XQkN0YssWOZQ-TkliALls,10141
- strands_agents_evals-0.1.0.dist-info/licenses/NOTICE,sha256=Eg13ogOmcI7JpMjxniFnKG81vwU3X8X7P_IlpvVg5RU,66
- strands_agents_evals-0.1.0.dist-info/RECORD,,
+ strands_agents_evals-0.1.1.dist-info/METADATA,sha256=W8UdHTxX2zsjd4F3jVuK5t2e0toxbRKBx9whF34ZjFc,17721
+ strands_agents_evals-0.1.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ strands_agents_evals-0.1.1.dist-info/licenses/LICENSE,sha256=yIWKWnZEC7ykhOE0z330Y4XQkN0YssWOZQ-TkliALls,10141
+ strands_agents_evals-0.1.1.dist-info/licenses/NOTICE,sha256=Eg13ogOmcI7JpMjxniFnKG81vwU3X8X7P_IlpvVg5RU,66
+ strands_agents_evals-0.1.1.dist-info/RECORD,,
strands_evals/__init__.py CHANGED
@@ -1,5 +1,3 @@
- __version__ = "0.1.0"
-
  from . import evaluators, extractors, generators, simulation, telemetry, types
  from .case import Case
  from .experiment import Experiment
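
The removed `__version__` constant is not replaced by anything visible in this diff. A hypothetical sketch of one common way packages expose the version after dropping the hardcoded string, assuming it is read from installed distribution metadata (not confirmed here):

```python
# Hypothetical, not shown in this diff: derive the version from the
# installed distribution's metadata instead of a hardcoded string.
from importlib.metadata import version

__version__ = version("strands-agents-evals")  # "0.1.1" once this wheel is installed
```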
strands_evals/experiment.py CHANGED
@@ -577,8 +577,8 @@ class Experiment(Generic[InputT, OutputT]):

          file_path.parent.mkdir(parents=True, exist_ok=True)

-         with open(file_path, "w") as f:
-             json.dump(self.to_dict(), f, indent=2)
+         with open(file_path, "w", encoding="utf-8") as f:
+             json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)

      @classmethod
      def from_dict(cls, data: dict, custom_evaluators: list[type[Evaluator]] | None = None):
@@ -646,7 +646,7 @@ class Experiment(Generic[InputT, OutputT]):
                  f"Only .json format is supported. Got file: {path}. Please provide a path with .json extension."
              )

-         with open(file_path, "r") as f:
+         with open(file_path, "r", encoding="utf-8") as f:
              data = json.load(f)

          return cls.from_dict(data, custom_evaluators)
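
Both `Experiment` serialization paths now pin UTF-8 explicitly. A minimal standalone sketch (illustrative, not package code) of why `ensure_ascii=False` makes the explicit `encoding="utf-8"` necessary:

```python
import json

data = {"note": "café ☕"}

# Default: ensure_ascii=True escapes non-ASCII, so any file encoding works.
print(json.dumps(data))                      # {"note": "caf\u00e9 \u2615"}

# With ensure_ascii=False the characters are written verbatim, so the file
# must be opened as UTF-8; a platform default like cp1252 could fail to
# encode them or corrupt the round-trip.
print(json.dumps(data, ensure_ascii=False))  # {"note": "café ☕"}
```

Opening the file with `encoding="utf-8"` on read keeps the round-trip symmetric regardless of the platform locale.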
strands_evals/extractors/tools_use_extractor.py CHANGED
@@ -33,6 +33,7 @@ def extract_agent_tools_used_from_messages(agent_messages):
              tool_id = tool.get("toolUseId")
              # get the tool result from the next message
              tool_result = None
+             is_error = False
              next_message_i = i + 1
              while next_message_i < len(agent_messages):
                  next_message = agent_messages[next_message_i]
@@ -46,9 +47,12 @@ def extract_agent_tools_used_from_messages(agent_messages):
                          tool_result_content = tool_result_dict.get("content", [])
                          if len(tool_result_content) > 0:
                              tool_result = tool_result_content[0].get("text")
+                             is_error = tool_result_dict.get("status") == "error"
                          break

-             tools_used.append({"name": tool_name, "input": tool_input, "tool_result": tool_result})
+             tools_used.append(
+                 {"name": tool_name, "input": tool_input, "tool_result": tool_result, "is_error": is_error}
+             )
      return tools_used

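
The extractor now flags errored tool calls. An illustrative sketch of the message content this function appears to walk; the key names are inferred from the diff and follow the Bedrock Converse-style `toolResult` block, so treat the exact shape as an assumption:

```python
# Hypothetical toolResult content matching the keys the extractor reads.
tool_result_block = {
    "toolResult": {
        "toolUseId": "tooluse_abc123",                    # pairs with a prior toolUse
        "content": [{"text": "HTTP 500 from upstream"}],  # first text item -> tool_result
        "status": "error",                                # new: surfaced as is_error=True
    }
}

# Each tools_used entry now carries the flag, e.g.:
# {"name": "fetch_page", "input": {...}, "tool_result": "HTTP 500 from upstream", "is_error": True}
```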