strands-agents-evals 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {strands_agents_evals-0.1.1.dist-info → strands_agents_evals-0.1.3.dist-info}/METADATA +1 -1
- {strands_agents_evals-0.1.1.dist-info → strands_agents_evals-0.1.3.dist-info}/RECORD +7 -7
- strands_evals/experiment.py +18 -11
- strands_evals/extractors/tools_use_extractor.py +30 -28
- {strands_agents_evals-0.1.1.dist-info → strands_agents_evals-0.1.3.dist-info}/WHEEL +0 -0
- {strands_agents_evals-0.1.1.dist-info → strands_agents_evals-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {strands_agents_evals-0.1.1.dist-info → strands_agents_evals-0.1.3.dist-info}/licenses/NOTICE +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
strands_evals/__init__.py,sha256=WnYsQGtkatrCKM8v_i_oCtBHNJfPaTrOg2ThUlf55Pk,485
|
|
2
2
|
strands_evals/case.py,sha256=KWAL947NkmNzg9FFdTsL6KI9AFLQ8IcFjaOjcs9x5to,2131
|
|
3
|
-
strands_evals/experiment.py,sha256=
|
|
3
|
+
strands_evals/experiment.py,sha256=yySXFW5p9xkDSvkxHBBDncxXKiuj0aDFY7iKoUyprwc,28745
|
|
4
4
|
strands_evals/display/display_console.py,sha256=bOTr6RepgnifALz2DgXnnk3c4Jjxu_mA68-pFr7xry0,5932
|
|
5
5
|
strands_evals/evaluators/__init__.py,sha256=OfZU5RkYewHOAnEjPKdxiEvPnfOOWNZc_9nQpAfARfI,887
|
|
6
6
|
strands_evals/evaluators/evaluator.py,sha256=XEesDeT83H93B1X_w8s0Nsb1KKHy26QO8b99Hi6vKbc,7466
|
|
@@ -30,7 +30,7 @@ strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection
|
|
|
30
30
|
strands_evals/extractors/__init__.py,sha256=Jmlrk-m8sSS_LwmCVSloIkg3BjOgRzNEezjaAGMw5rw,74
|
|
31
31
|
strands_evals/extractors/graph_extractor.py,sha256=TeT-58JB9roqSvy2ELz1kg8WF5YO-cfLlGZTO0F9s_4,1105
|
|
32
32
|
strands_evals/extractors/swarm_extractor.py,sha256=Sm1XFCkAGVdF3XDyO3iF-20I8C6sAQ8JPNP5fgotOFU,2682
|
|
33
|
-
strands_evals/extractors/tools_use_extractor.py,sha256=
|
|
33
|
+
strands_evals/extractors/tools_use_extractor.py,sha256=emLL63LKldL2IA2u5wZL0ZhklZJqX0KLr5xFRt-S4i4,6600
|
|
34
34
|
strands_evals/extractors/trace_extractor.py,sha256=l7gk5rUFoUcxQduPJz49OX66SdgeK1MLt81aF1yr4Lc,6653
|
|
35
35
|
strands_evals/generators/__init__.py,sha256=B1F30DAIf0kPyBdE4PAZvSby-dTelqb_7hFJoATqVb0,89
|
|
36
36
|
strands_evals/generators/experiment_generator.py,sha256=6wLTL0iG2b0YAiu0w8dDiaBxOIy7p_Fs7l3hCjgQc0w,22655
|
|
@@ -61,8 +61,8 @@ strands_evals/types/evaluation_report.py,sha256=vT86zO4Qn9CQbULo3aziGMdG-1qWLdcB
|
|
|
61
61
|
strands_evals/types/trace.py,sha256=BFoEylzAlENyPH702T5MDz-_H21-Wfx-FFTSXX1tDfY,4844
|
|
62
62
|
strands_evals/types/simulation/__init__.py,sha256=-mz5lW6qFfIMm4dJGaP9pXY3xeiefLbB0XevjdFykkU,133
|
|
63
63
|
strands_evals/types/simulation/actor.py,sha256=ESTV8165c3Ad5QT4yYmjm-A-oZdwZ0Rf0Lq7zokjTPo,1163
|
|
64
|
-
strands_agents_evals-0.1.
|
|
65
|
-
strands_agents_evals-0.1.
|
|
66
|
-
strands_agents_evals-0.1.
|
|
67
|
-
strands_agents_evals-0.1.
|
|
68
|
-
strands_agents_evals-0.1.
|
|
64
|
+
strands_agents_evals-0.1.3.dist-info/METADATA,sha256=GnFR1FmK9no2J7NWeuV8efHNvw2eoMzl8VbSJHeRdLg,17721
|
|
65
|
+
strands_agents_evals-0.1.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
66
|
+
strands_agents_evals-0.1.3.dist-info/licenses/LICENSE,sha256=yIWKWnZEC7ykhOE0z330Y4XQkN0YssWOZQ-TkliALls,10141
|
|
67
|
+
strands_agents_evals-0.1.3.dist-info/licenses/NOTICE,sha256=Eg13ogOmcI7JpMjxniFnKG81vwU3X8X7P_IlpvVg5RU,66
|
|
68
|
+
strands_agents_evals-0.1.3.dist-info/RECORD,,
|
strands_evals/experiment.py
CHANGED
|
@@ -391,8 +391,8 @@ class Experiment(Generic[InputT, OutputT]):
|
|
|
391
391
|
"gen_ai.evaluation.case.input": serialize(case.input),
|
|
392
392
|
},
|
|
393
393
|
) as case_span:
|
|
394
|
+
# Task execution span - execute once
|
|
394
395
|
try:
|
|
395
|
-
# Task execution span - execute once
|
|
396
396
|
with self._tracer.start_as_current_span(
|
|
397
397
|
"task_execution",
|
|
398
398
|
attributes={
|
|
@@ -414,9 +414,21 @@ class Experiment(Generic[InputT, OutputT]):
|
|
|
414
414
|
),
|
|
415
415
|
}
|
|
416
416
|
)
|
|
417
|
-
|
|
418
|
-
|
|
417
|
+
except Exception as e:
|
|
418
|
+
case_span.record_exception(e)
|
|
419
419
|
for evaluator in self._evaluators:
|
|
420
|
+
eval_name = evaluator.get_type_name()
|
|
421
|
+
evaluator_data[eval_name]["cases"].append(case.model_dump())
|
|
422
|
+
evaluator_data[eval_name]["test_passes"].append(False)
|
|
423
|
+
evaluator_data[eval_name]["scores"].append(0)
|
|
424
|
+
evaluator_data[eval_name]["reasons"].append(f"Task execution error: {str(e)}")
|
|
425
|
+
evaluator_data[eval_name]["detailed_results"].append([])
|
|
426
|
+
continue
|
|
427
|
+
|
|
428
|
+
# Evaluate with each evaluator using the same task output
|
|
429
|
+
for evaluator in self._evaluators:
|
|
430
|
+
eval_name = evaluator.get_type_name()
|
|
431
|
+
try:
|
|
420
432
|
with self._tracer.start_as_current_span(
|
|
421
433
|
f"evaluator {evaluator.get_type_name()}",
|
|
422
434
|
attributes={
|
|
@@ -436,21 +448,16 @@ class Experiment(Generic[InputT, OutputT]):
|
|
|
436
448
|
}
|
|
437
449
|
)
|
|
438
450
|
|
|
439
|
-
eval_name = evaluator.get_type_name()
|
|
440
451
|
evaluator_data[eval_name]["cases"].append(evaluation_context.model_dump())
|
|
441
452
|
evaluator_data[eval_name]["test_passes"].append(aggregate_pass)
|
|
442
453
|
evaluator_data[eval_name]["scores"].append(aggregate_score)
|
|
443
454
|
evaluator_data[eval_name]["reasons"].append(aggregate_reason or "")
|
|
444
455
|
evaluator_data[eval_name]["detailed_results"].append(evaluation_outputs)
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
case_span.record_exception(e)
|
|
448
|
-
for evaluator in self._evaluators:
|
|
449
|
-
eval_name = evaluator.get_type_name()
|
|
450
|
-
evaluator_data[eval_name]["cases"].append(case.model_dump())
|
|
456
|
+
except Exception as e:
|
|
457
|
+
evaluator_data[eval_name]["cases"].append(evaluation_context.model_dump())
|
|
451
458
|
evaluator_data[eval_name]["test_passes"].append(False)
|
|
452
459
|
evaluator_data[eval_name]["scores"].append(0)
|
|
453
|
-
evaluator_data[eval_name]["reasons"].append(f"
|
|
460
|
+
evaluator_data[eval_name]["reasons"].append(f"Evaluator error: {str(e)}")
|
|
454
461
|
evaluator_data[eval_name]["detailed_results"].append([])
|
|
455
462
|
|
|
456
463
|
reports = []
|
|
@@ -22,37 +22,39 @@ def extract_agent_tools_used_from_messages(agent_messages):
|
|
|
22
22
|
if message.get("role") == "assistant":
|
|
23
23
|
message_info = message.get("content")
|
|
24
24
|
if len(message_info) > 0:
|
|
25
|
-
|
|
25
|
+
tools = []
|
|
26
26
|
for message in message_info:
|
|
27
27
|
if "toolUse" in message:
|
|
28
|
+
tools.append(message.get("toolUse"))
|
|
29
|
+
|
|
30
|
+
for tool in tools:
|
|
31
|
+
if tool:
|
|
32
|
+
tool_name = tool.get("name")
|
|
33
|
+
tool_input = tool.get("input")
|
|
34
|
+
tool_id = tool.get("toolUseId")
|
|
35
|
+
# get the tool result from the next message
|
|
36
|
+
tool_result = None
|
|
37
|
+
is_error = False
|
|
38
|
+
next_message_i = i + 1
|
|
39
|
+
while next_message_i < len(agent_messages):
|
|
40
|
+
next_message = agent_messages[next_message_i]
|
|
41
|
+
next_message_i += 1
|
|
42
|
+
|
|
43
|
+
if next_message.get("role") == "user":
|
|
44
|
+
content = next_message.get("content")
|
|
45
|
+
if content:
|
|
46
|
+
tool_result_dict = content[0].get("toolResult")
|
|
47
|
+
if tool_result_dict and tool_result_dict.get("toolUseId") == tool_id:
|
|
48
|
+
tool_result_content = tool_result_dict.get("content", [])
|
|
49
|
+
if len(tool_result_content) > 0:
|
|
50
|
+
tool_result = tool_result_content[0].get("text")
|
|
51
|
+
is_error = tool_result_dict.get("status") == "error"
|
|
52
|
+
break
|
|
53
|
+
|
|
54
|
+
tools_used.append(
|
|
55
|
+
{"name": tool_name, "input": tool_input, "tool_result": tool_result, "is_error": is_error}
|
|
56
|
+
)
|
|
28
57
|
tool = message.get("toolUse")
|
|
29
|
-
|
|
30
|
-
if tool:
|
|
31
|
-
tool_name = tool.get("name")
|
|
32
|
-
tool_input = tool.get("input")
|
|
33
|
-
tool_id = tool.get("toolUseId")
|
|
34
|
-
# get the tool result from the next message
|
|
35
|
-
tool_result = None
|
|
36
|
-
is_error = False
|
|
37
|
-
next_message_i = i + 1
|
|
38
|
-
while next_message_i < len(agent_messages):
|
|
39
|
-
next_message = agent_messages[next_message_i]
|
|
40
|
-
next_message_i += 1
|
|
41
|
-
|
|
42
|
-
if next_message.get("role") == "user":
|
|
43
|
-
content = next_message.get("content")
|
|
44
|
-
if content:
|
|
45
|
-
tool_result_dict = content[0].get("toolResult")
|
|
46
|
-
if tool_result_dict.get("toolUseId") == tool_id:
|
|
47
|
-
tool_result_content = tool_result_dict.get("content", [])
|
|
48
|
-
if len(tool_result_content) > 0:
|
|
49
|
-
tool_result = tool_result_content[0].get("text")
|
|
50
|
-
is_error = tool_result_dict.get("status") == "error"
|
|
51
|
-
break
|
|
52
|
-
|
|
53
|
-
tools_used.append(
|
|
54
|
-
{"name": tool_name, "input": tool_input, "tool_result": tool_result, "is_error": is_error}
|
|
55
|
-
)
|
|
56
58
|
return tools_used
|
|
57
59
|
|
|
58
60
|
|
|
File without changes
|
{strands_agents_evals-0.1.1.dist-info → strands_agents_evals-0.1.3.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
{strands_agents_evals-0.1.1.dist-info → strands_agents_evals-0.1.3.dist-info}/licenses/NOTICE
RENAMED
|
File without changes
|