strands-agents-evals 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: strands-agents-evals
3
- Version: 0.1.1
3
+ Version: 0.1.2
4
4
  Summary: Evaluation framework for Strands
5
5
  Author-email: AWS <opensource@amazon.com>
6
6
  License: Apache-2.0
@@ -1,6 +1,6 @@
1
1
  strands_evals/__init__.py,sha256=WnYsQGtkatrCKM8v_i_oCtBHNJfPaTrOg2ThUlf55Pk,485
2
2
  strands_evals/case.py,sha256=KWAL947NkmNzg9FFdTsL6KI9AFLQ8IcFjaOjcs9x5to,2131
3
- strands_evals/experiment.py,sha256=6gARs-JiGMSoeqC7-sjLGfL6hbEcHH5YJ4ABH0Qf3cM,28239
3
+ strands_evals/experiment.py,sha256=yySXFW5p9xkDSvkxHBBDncxXKiuj0aDFY7iKoUyprwc,28745
4
4
  strands_evals/display/display_console.py,sha256=bOTr6RepgnifALz2DgXnnk3c4Jjxu_mA68-pFr7xry0,5932
5
5
  strands_evals/evaluators/__init__.py,sha256=OfZU5RkYewHOAnEjPKdxiEvPnfOOWNZc_9nQpAfARfI,887
6
6
  strands_evals/evaluators/evaluator.py,sha256=XEesDeT83H93B1X_w8s0Nsb1KKHy26QO8b99Hi6vKbc,7466
@@ -30,7 +30,7 @@ strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection
30
30
  strands_evals/extractors/__init__.py,sha256=Jmlrk-m8sSS_LwmCVSloIkg3BjOgRzNEezjaAGMw5rw,74
31
31
  strands_evals/extractors/graph_extractor.py,sha256=TeT-58JB9roqSvy2ELz1kg8WF5YO-cfLlGZTO0F9s_4,1105
32
32
  strands_evals/extractors/swarm_extractor.py,sha256=Sm1XFCkAGVdF3XDyO3iF-20I8C6sAQ8JPNP5fgotOFU,2682
33
- strands_evals/extractors/tools_use_extractor.py,sha256=3WngKFdTz9XYeD0eXn90Dr1eGuM8egbOJT0w0LYxWhk,6388
33
+ strands_evals/extractors/tools_use_extractor.py,sha256=o2e9ZuPqQ_hdcrDkl1Rw9h7Ipfc-CsLNkWemxcRvglg,6409
34
34
  strands_evals/extractors/trace_extractor.py,sha256=l7gk5rUFoUcxQduPJz49OX66SdgeK1MLt81aF1yr4Lc,6653
35
35
  strands_evals/generators/__init__.py,sha256=B1F30DAIf0kPyBdE4PAZvSby-dTelqb_7hFJoATqVb0,89
36
36
  strands_evals/generators/experiment_generator.py,sha256=6wLTL0iG2b0YAiu0w8dDiaBxOIy7p_Fs7l3hCjgQc0w,22655
@@ -61,8 +61,8 @@ strands_evals/types/evaluation_report.py,sha256=vT86zO4Qn9CQbULo3aziGMdG-1qWLdcB
61
61
  strands_evals/types/trace.py,sha256=BFoEylzAlENyPH702T5MDz-_H21-Wfx-FFTSXX1tDfY,4844
62
62
  strands_evals/types/simulation/__init__.py,sha256=-mz5lW6qFfIMm4dJGaP9pXY3xeiefLbB0XevjdFykkU,133
63
63
  strands_evals/types/simulation/actor.py,sha256=ESTV8165c3Ad5QT4yYmjm-A-oZdwZ0Rf0Lq7zokjTPo,1163
64
- strands_agents_evals-0.1.1.dist-info/METADATA,sha256=W8UdHTxX2zsjd4F3jVuK5t2e0toxbRKBx9whF34ZjFc,17721
65
- strands_agents_evals-0.1.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
66
- strands_agents_evals-0.1.1.dist-info/licenses/LICENSE,sha256=yIWKWnZEC7ykhOE0z330Y4XQkN0YssWOZQ-TkliALls,10141
67
- strands_agents_evals-0.1.1.dist-info/licenses/NOTICE,sha256=Eg13ogOmcI7JpMjxniFnKG81vwU3X8X7P_IlpvVg5RU,66
68
- strands_agents_evals-0.1.1.dist-info/RECORD,,
64
+ strands_agents_evals-0.1.2.dist-info/METADATA,sha256=eO-nDiGzAJxDmmwBwPVvNfTdds0408trSL4cKqLR8b4,17721
65
+ strands_agents_evals-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
66
+ strands_agents_evals-0.1.2.dist-info/licenses/LICENSE,sha256=yIWKWnZEC7ykhOE0z330Y4XQkN0YssWOZQ-TkliALls,10141
67
+ strands_agents_evals-0.1.2.dist-info/licenses/NOTICE,sha256=Eg13ogOmcI7JpMjxniFnKG81vwU3X8X7P_IlpvVg5RU,66
68
+ strands_agents_evals-0.1.2.dist-info/RECORD,,
@@ -391,8 +391,8 @@ class Experiment(Generic[InputT, OutputT]):
391
391
  "gen_ai.evaluation.case.input": serialize(case.input),
392
392
  },
393
393
  ) as case_span:
394
+ # Task execution span - execute once
394
395
  try:
395
- # Task execution span - execute once
396
396
  with self._tracer.start_as_current_span(
397
397
  "task_execution",
398
398
  attributes={
@@ -414,9 +414,21 @@ class Experiment(Generic[InputT, OutputT]):
414
414
  ),
415
415
  }
416
416
  )
417
-
418
- # Evaluate with each evaluator using the same task output
417
+ except Exception as e:
418
+ case_span.record_exception(e)
419
419
  for evaluator in self._evaluators:
420
+ eval_name = evaluator.get_type_name()
421
+ evaluator_data[eval_name]["cases"].append(case.model_dump())
422
+ evaluator_data[eval_name]["test_passes"].append(False)
423
+ evaluator_data[eval_name]["scores"].append(0)
424
+ evaluator_data[eval_name]["reasons"].append(f"Task execution error: {str(e)}")
425
+ evaluator_data[eval_name]["detailed_results"].append([])
426
+ continue
427
+
428
+ # Evaluate with each evaluator using the same task output
429
+ for evaluator in self._evaluators:
430
+ eval_name = evaluator.get_type_name()
431
+ try:
420
432
  with self._tracer.start_as_current_span(
421
433
  f"evaluator {evaluator.get_type_name()}",
422
434
  attributes={
@@ -436,21 +448,16 @@ class Experiment(Generic[InputT, OutputT]):
436
448
  }
437
449
  )
438
450
 
439
- eval_name = evaluator.get_type_name()
440
451
  evaluator_data[eval_name]["cases"].append(evaluation_context.model_dump())
441
452
  evaluator_data[eval_name]["test_passes"].append(aggregate_pass)
442
453
  evaluator_data[eval_name]["scores"].append(aggregate_score)
443
454
  evaluator_data[eval_name]["reasons"].append(aggregate_reason or "")
444
455
  evaluator_data[eval_name]["detailed_results"].append(evaluation_outputs)
445
-
446
- except Exception as e:
447
- case_span.record_exception(e)
448
- for evaluator in self._evaluators:
449
- eval_name = evaluator.get_type_name()
450
- evaluator_data[eval_name]["cases"].append(case.model_dump())
456
+ except Exception as e:
457
+ evaluator_data[eval_name]["cases"].append(evaluation_context.model_dump())
451
458
  evaluator_data[eval_name]["test_passes"].append(False)
452
459
  evaluator_data[eval_name]["scores"].append(0)
453
- evaluator_data[eval_name]["reasons"].append(f"An error occured : {str(e)}")
460
+ evaluator_data[eval_name]["reasons"].append(f"Evaluator error: {str(e)}")
454
461
  evaluator_data[eval_name]["detailed_results"].append([])
455
462
 
456
463
  reports = []
@@ -43,7 +43,7 @@ def extract_agent_tools_used_from_messages(agent_messages):
43
43
  content = next_message.get("content")
44
44
  if content:
45
45
  tool_result_dict = content[0].get("toolResult")
46
- if tool_result_dict.get("toolUseId") == tool_id:
46
+ if tool_result_dict and tool_result_dict.get("toolUseId") == tool_id:
47
47
  tool_result_content = tool_result_dict.get("content", [])
48
48
  if len(tool_result_content) > 0:
49
49
  tool_result = tool_result_content[0].get("text")