strands-agents-evals 0.1.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- strands_agents_evals-0.1.0.dist-info/METADATA +408 -0
- strands_agents_evals-0.1.0.dist-info/RECORD +68 -0
- strands_agents_evals-0.1.0.dist-info/WHEEL +4 -0
- strands_agents_evals-0.1.0.dist-info/licenses/LICENSE +175 -0
- strands_agents_evals-0.1.0.dist-info/licenses/NOTICE +1 -0
- strands_evals/__init__.py +22 -0
- strands_evals/case.py +53 -0
- strands_evals/display/display_console.py +150 -0
- strands_evals/evaluators/__init__.py +23 -0
- strands_evals/evaluators/evaluator.py +182 -0
- strands_evals/evaluators/faithfulness_evaluator.py +116 -0
- strands_evals/evaluators/goal_success_rate_evaluator.py +90 -0
- strands_evals/evaluators/harmfulness_evaluator.py +135 -0
- strands_evals/evaluators/helpfulness_evaluator.py +148 -0
- strands_evals/evaluators/interactions_evaluator.py +244 -0
- strands_evals/evaluators/output_evaluator.py +72 -0
- strands_evals/evaluators/prompt_templates/case_prompt_template.py +63 -0
- strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +30 -0
- strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +17 -0
- strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +8 -0
- strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +38 -0
- strands_evals/evaluators/prompt_templates/prompt_templates.py +176 -0
- strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +40 -0
- strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +23 -0
- strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +112 -0
- strands_evals/evaluators/tool_selection_accuracy_evaluator.py +112 -0
- strands_evals/evaluators/trajectory_evaluator.py +100 -0
- strands_evals/experiment.py +652 -0
- strands_evals/extractors/__init__.py +3 -0
- strands_evals/extractors/graph_extractor.py +30 -0
- strands_evals/extractors/swarm_extractor.py +73 -0
- strands_evals/extractors/tools_use_extractor.py +164 -0
- strands_evals/extractors/trace_extractor.py +166 -0
- strands_evals/generators/__init__.py +3 -0
- strands_evals/generators/experiment_generator.py +498 -0
- strands_evals/generators/prompt_template/prompt_templates.py +75 -0
- strands_evals/generators/topic_planner.py +60 -0
- strands_evals/mappers/__init__.py +6 -0
- strands_evals/mappers/session_mapper.py +27 -0
- strands_evals/mappers/strands_in_memory_session_mapper.py +473 -0
- strands_evals/simulation/README.md +323 -0
- strands_evals/simulation/__init__.py +6 -0
- strands_evals/simulation/actor_simulator.py +292 -0
- strands_evals/simulation/profiles/__init__.py +5 -0
- strands_evals/simulation/profiles/actor_profile.py +26 -0
- strands_evals/simulation/prompt_templates/__init__.py +11 -0
- strands_evals/simulation/prompt_templates/actor_profile_extraction.py +25 -0
- strands_evals/simulation/prompt_templates/actor_system_prompt.py +64 -0
- strands_evals/simulation/prompt_templates/goal_completion.py +27 -0
- strands_evals/simulation/tools/__init__.py +5 -0
- strands_evals/simulation/tools/goal_completion.py +93 -0
- strands_evals/telemetry/__init__.py +15 -0
- strands_evals/telemetry/_cloudwatch_logger.py +209 -0
- strands_evals/telemetry/config.py +207 -0
- strands_evals/telemetry/tracer.py +38 -0
- strands_evals/tools/evaluation_tools.py +67 -0
- strands_evals/types/__init__.py +11 -0
- strands_evals/types/evaluation.py +105 -0
- strands_evals/types/evaluation_report.py +244 -0
- strands_evals/types/simulation/__init__.py +5 -0
- strands_evals/types/simulation/actor.py +34 -0
- strands_evals/types/trace.py +205 -0
strands_evals/tools/evaluation_tools.py
@@ -0,0 +1,67 @@
+from strands import tool
+
+
+@tool
+def exact_match_scorer(actual_trajectory: list, expected_trajectory: list) -> float:
+    """
+    Score the trajectory based on exact match. A score of 0 indicates 0 steps matched and 1
+    indicates a perfect match.
+
+    Args:
+        actual_trajectory: The actual trajectory.
+        expected_trajectory: The expected trajectory.
+
+    Returns:
+        The score of the trajectory.
+    """
+    correct = 0
+    for actual, expected in zip(actual_trajectory, expected_trajectory, strict=False):
+        if actual == expected:
+            correct += 1
+
+    return correct / len(expected_trajectory)
+
+
+@tool
+def in_order_match_scorer(actual_trajectory: list, expected_trajectory: list) -> float:
+    """
+    Score based on correct actions in correct order, allows extra actions.
+
+    Args:
+        actual_trajectory: The actual trajectory.
+        expected_trajectory: The expected trajectory.
+
+    Returns:
+        The score of the trajectory.
+    """
+    if not expected_trajectory:
+        return 1.0
+
+    expected_idx = 0
+    for action in actual_trajectory:
+        if expected_idx < len(expected_trajectory) and action == expected_trajectory[expected_idx]:
+            expected_idx += 1
+
+    return expected_idx / len(expected_trajectory)
+
+
+@tool
+def any_order_match_scorer(actual_trajectory: list, expected_trajectory: list) -> float:
+    """
+    Score based on correct actions in any order, allows extra actions.
+
+    Args:
+        actual_trajectory: The actual trajectory.
+        expected_trajectory: The expected trajectory.
+
+    Returns:
+        The score of the trajectory.
+    """
+    if not expected_trajectory:
+        return 1.0
+
+    expected_set = set(expected_trajectory)
+    actual_set = set(actual_trajectory)
+    matched = len(expected_set.intersection(actual_set))
+
+    return matched / len(expected_trajectory)
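Reviewer note: a minimal sketch of what these scorers return, assuming the @tool-decorated functions remain directly callable as plain Python functions (an assumption about the strands decorator). The trajectories are made-up values, not taken from the package:

from strands_evals.tools.evaluation_tools import (
    any_order_match_scorer,
    exact_match_scorer,
    in_order_match_scorer,
)

# Assumption: the @tool wrapper still allows direct invocation with positional arguments.
actual = ["search_docs", "summarize", "respond"]
expected = ["search_docs", "respond"]

# Position-by-position comparison: only index 0 lines up -> 1/2 = 0.5
print(exact_match_scorer(actual, expected))

# Order-preserving match that tolerates extra steps: "search_docs" then "respond" both appear in order -> 2/2 = 1.0
print(in_order_match_scorer(actual, expected))

# Set-intersection match: both expected actions occur somewhere -> 2/2 = 1.0
print(any_order_match_scorer(actual, expected))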
strands_evals/types/__init__.py
@@ -0,0 +1,11 @@
+from .evaluation import EvaluationData, EvaluationOutput, Interaction, TaskOutput
+from .simulation import ActorProfile, ActorResponse
+
+__all__ = [
+    "Interaction",
+    "TaskOutput",
+    "EvaluationData",
+    "EvaluationOutput",
+    "ActorProfile",
+    "ActorResponse",
+]
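Reviewer note: this __init__ re-exports the core data types, so downstream code can import them from the package namespace directly, e.g.:

from strands_evals.types import ActorProfile, EvaluationData, EvaluationOutput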
strands_evals/types/evaluation.py
@@ -0,0 +1,105 @@
+from pydantic import BaseModel
+from typing_extensions import Any, Generic, TypedDict, TypeVar, Union
+
+from .trace import Session
+
+InputT = TypeVar("InputT")
+OutputT = TypeVar("OutputT")
+
+
+class Interaction(TypedDict, total=False):
+    """
+    Represents a single interaction in a multi-agent or multi-step system.
+
+    Used to capture the communication flow and dependencies between different
+    components (agents, tools, or processing nodes) during task execution.
+    All fields are optional to accommodate different interaction patterns.
+
+    Attributes:
+        node_name: Identifier for the agent, tool, or component involved in this interaction
+        dependencies: List of other nodes/components this interaction depends on or references
+        messages: Sequence of messages, responses, or communication exchanged during this interaction
+
+    Example:
+        interaction = {
+            "node_name": "calculator_agent",
+            "dependencies": ["input_parser", "math_validator"],
+            "messages": ["Calculate 2+2"]
+        }
+    """
+
+    node_name: str
+    dependencies: list
+    messages: list
+
+
+class TaskOutput(TypedDict, total=False):
+    """
+    Structured output format for task functions that return complex results.
+
+    Used when task functions need to return more than just the output response,
+    such as trajectory or interaction history. All fields are optional
+    to support different task complexity levels.
+
+    Attributes:
+        output: The primary response or result from the task
+        trajectory: Sequence of steps, tools, or actions taken during task execution
+        interactions: Communication flow between agents or components during execution
+        input: A new input to replace the original in the evaluation, will not mutate the original test case
+
+    Example:
+        task_result = {
+            "output": "The answer is 42",
+            "trajectory": ["calculator", "validator"],
+            "interactions": [{"node_name": "math_agent", "messages": ["Computing..."]}]
+        }
+    """
+
+    output: Any
+    trajectory: list[Any]
+    interactions: list[Interaction]
+    input: Any
+
+
+class EvaluationData(BaseModel, Generic[InputT, OutputT]):
+    """
+    A record of all of the context for the evaluator to evaluate a test case.
+
+    Attributes:
+        input: The input to the task. eg. the query to the agent
+        actual_output: The actual response given the input.
+        expected_output: The expected response given the input.
+        actual_trajectory: The actual trajectory of a task given the input.
+        expected_trajectory: The expected trajectory of a task given the input.
+        name: The name of the test case. This will be used to identify the test in the summary report.
+        metadata: Additional information about the test case.
+        actual_interactions: The actual interaction sequence given the input.
+        expected_interactions: The expected interaction sequence given the input.
+    """
+
+    input: InputT
+    actual_output: OutputT | None = None
+    name: str | None = None
+    expected_output: OutputT | None = None
+    expected_trajectory: Union[list[Any], Session, None] = None
+    actual_trajectory: Union[list[Any], Session, None] = None
+    metadata: dict[str, Any] | None = None
+    actual_interactions: list[Interaction] | None = None
+    expected_interactions: list[Interaction] | None = None
+
+
+class EvaluationOutput(BaseModel):
+    """
+    Structured output for LLM-based judge.
+
+    Attributes:
+        score: The score of the test case.
+        test_pass: Whether the test pass or fail.
+        reason: The reason for the score for each test case.
+        label: The categorical label corresponding to the score.
+    """
+
+    score: float
+    test_pass: bool
+    reason: str | None = None
+    label: str | None = None
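Reviewer note: a minimal sketch of constructing these models with hypothetical values (EvaluationData is a generic pydantic model, used unparameterized here):

from strands_evals.types.evaluation import EvaluationData, EvaluationOutput

# A test case pairing an input with expected/actual results for an evaluator to judge.
case = EvaluationData(
    input="What is 2 + 2?",
    expected_output="4",
    actual_output="4",
    expected_trajectory=["calculator"],
    actual_trajectory=["calculator"],
    name="simple-math",
)

# The structured verdict an LLM judge would be asked to produce for that case.
judgement = EvaluationOutput(score=1.0, test_pass=True, reason="Answer matches the expected output.")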
strands_evals/types/evaluation_report.py
@@ -0,0 +1,244 @@
+import json
+from pathlib import Path
+
+from pydantic import BaseModel
+from typing_extensions import TypeVar
+
+from ..display.display_console import CollapsibleTableReportDisplay
+from ..types.evaluation import EvaluationOutput
+
+InputT = TypeVar("InputT")
+OutputT = TypeVar("OutputT")
+
+
+class EvaluationReport(BaseModel):
+    """
+    A report of the evaluation of a task.
+
+    Attributes:
+        overall_score: The overall score of the task.
+        scores: A list of the score for each test case in order.
+        cases: A list of records for each test case.
+        test_passes: A list of booleans indicating whether the test pass or fail.
+        reasons: A list of reason for each test case.
+    """
+
+    overall_score: float
+    scores: list[float]
+    cases: list[dict]
+    test_passes: list[bool]
+    reasons: list[str] = []
+    detailed_results: list[list[EvaluationOutput]] = []
+
+    def _display(
+        self,
+        static: bool = True,
+        include_input: bool = True,
+        include_actual_output: bool = False,
+        include_expected_output: bool = False,
+        include_expected_trajectory: bool = False,
+        include_actual_trajectory: bool = False,
+        include_actual_interactions: bool = False,
+        include_expected_interactions: bool = False,
+        include_meta: bool = False,
+    ):
+        """
+        Render an interface of the report with as much details as configured using Rich.
+
+        Args:
+            static: Whether to render the interface as interactive or static.
+            include_input (Defaults to True): Include the input in the display.
+            include_actual_output (Defaults to False): Include the actual output in the display.
+            include_expected_output (Defaults to False): Include the expected output in the display.
+            include_expected_trajectory (Defaults to False): Include the expected trajectory in the display.
+            include_actual_trajectory (Defaults to False): Include the actual trajectory in the display.
+            include_actual_interactions (Defaults to False): Include the actual interactions in the display.
+            include_expected_interactions (Defaults to False): Include the expected interactions in the display.
+            include_meta (Defaults to False): Include metadata in the display.
+
+        Note:
+            This method provides an interactive console interface where users can expand or collapse
+            individual test cases to view more or less detail.
+        """
+        report_data = {}
+        for i in range(len(self.scores)):
+            name = self.cases[i].get("name", f"Test {i + 1}")
+            reason = self.reasons[i] if i < len(self.reasons) else "N/A"
+            details_dict = {
+                "name": name,
+                "score": f"{self.scores[i]:.2f}",
+                "test_pass": self.test_passes[i],
+                "reason": reason,
+            }
+            if include_input:
+                details_dict["input"] = str(self.cases[i].get("input"))
+            if include_actual_output:
+                details_dict["actual_output"] = str(self.cases[i].get("actual_output"))
+            if include_expected_output:
+                details_dict["expected_output"] = str(self.cases[i].get("expected_output"))
+            if include_actual_trajectory:
+                details_dict["actual_trajectory"] = str(self.cases[i].get("actual_trajectory"))
+            if include_expected_trajectory:
+                details_dict["expected_trajectory"] = str(self.cases[i].get("expected_trajectory"))
+            if include_actual_interactions:
+                details_dict["actual_interactions"] = str(self.cases[i].get("actual_interactions"))
+            if include_expected_interactions:
+                details_dict["expected_interactions"] = str(self.cases[i].get("expected_interactions"))
+            if include_meta:
+                details_dict["metadata"] = str(self.cases[i].get("metadata"))
+
+            report_data[str(i)] = {
+                "details": details_dict,
+                "detailed_results": self.detailed_results[i] if i < len(self.detailed_results) else [],  # NEW
+                "expanded": False,
+            }
+
+        display_console = CollapsibleTableReportDisplay(items=report_data, overall_score=self.overall_score)
+        display_console.run(static=static)
+
+    def display(
+        self,
+        include_input: bool = True,
+        include_actual_output: bool = False,
+        include_expected_output: bool = False,
+        include_expected_trajectory: bool = False,
+        include_actual_trajectory: bool = False,
+        include_actual_interactions: bool = False,
+        include_expected_interactions: bool = False,
+        include_meta: bool = False,
+    ):
+        """
+        Render the report with as much details as configured using Rich. Use run_display if want
+        to interact with the table.
+
+        Args:
+            include_input: Whether to include the input in the display. Defaults to True.
+            include_actual_output (Defaults to False): Include the actual output in the display.
+            include_expected_output (Defaults to False): Include the expected output in the display.
+            include_expected_trajectory (Defaults to False): Include the expected trajectory in the display.
+            include_actual_trajectory (Defaults to False): Include the actual trajectory in the display.
+            include_actual_interactions (Defaults to False): Include the actual interactions in the display.
+            include_expected_interactions (Defaults to False): Include the expected interactions in the display.
+            include_meta (Defaults to False): Include metadata in the display.
+        """
+        self._display(
+            static=True,
+            include_input=include_input,
+            include_actual_output=include_actual_output,
+            include_expected_output=include_expected_output,
+            include_expected_trajectory=include_expected_trajectory,
+            include_actual_trajectory=include_actual_trajectory,
+            include_actual_interactions=include_actual_interactions,
+            include_expected_interactions=include_expected_interactions,
+            include_meta=include_meta,
+        )
+
+    def run_display(
+        self,
+        include_input: bool = True,
+        include_actual_output: bool = False,
+        include_expected_output: bool = False,
+        include_expected_trajectory: bool = False,
+        include_actual_trajectory: bool = False,
+        include_actual_interactions: bool = False,
+        include_expected_interactions: bool = False,
+        include_meta: bool = False,
+    ):
+        """
+        Render the report interactively with as much details as configured using Rich.
+
+        Args:
+            include_input: Whether to include the input in the display. Defaults to True.
+            include_actual_output (Defaults to False): Include the actual output in the display.
+            include_expected_output (Defaults to False): Include the expected output in the display.
+            include_expected_trajectory (Defaults to False): Include the expected trajectory in the display.
+            include_actual_trajectory (Defaults to False): Include the actual trajectory in the display.
+            include_actual_interactions (Defaults to False): Include the actual interactions in the display.
+            include_expected_interactions (Defaults to False): Include the expected interactions in the display.
+            include_meta (Defaults to False): Include metadata in the display.
+        """
+        self._display(
+            static=False,
+            include_input=include_input,
+            include_actual_output=include_actual_output,
+            include_expected_output=include_expected_output,
+            include_expected_trajectory=include_expected_trajectory,
+            include_actual_trajectory=include_actual_trajectory,
+            include_actual_interactions=include_actual_interactions,
+            include_expected_interactions=include_expected_interactions,
+            include_meta=include_meta,
+        )
+
+    def to_dict(self):
+        """
+        Returns a dictionary representation of the report.
+        """
+        return self.model_dump()
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        """
+        Create an EvaluationReport instance from a dictionary.
+
+        Args:
+            data: A dictionary containing the report data.
+        """
+        return cls.model_validate(data)
+
+    def to_file(self, path: str):
+        """
+        Write the report to a JSON file.
+
+        Args:
+            path: The file path where the report will be saved. Can be:
+                - A filename only (e.g., "foo.json" or "foo") - saves in current working directory
+                - A relative path (e.g., "relative_path/foo.json") - saves relative to current working directory
+                - An absolute path (e.g., "/path/to/dir/foo.json") - saves in exact directory
+
+            If no extension is provided, ".json" will be added automatically.
+            Only .json format is supported.
+
+        Raises:
+            ValueError: If the path has a non-JSON extension.
+        """
+        file_path = Path(path)
+
+        if file_path.suffix:
+            if file_path.suffix != ".json":
+                raise ValueError(
+                    f"Only .json format is supported. Got path with extension: {path}. "
+                    f"Please use a .json extension or provide a path without an extension."
+                )
+        else:
+            file_path = file_path.with_suffix(".json")
+
+        file_path.parent.mkdir(parents=True, exist_ok=True)
+
+        with open(file_path, "w") as f:
+            json.dump(self.to_dict(), f, indent=2)
+
+    @classmethod
+    def from_file(cls, path: str):
+        """
+        Create an EvaluationReport instance from a JSON file.
+
+        Args:
+            path: Path to the JSON file.
+
+        Return:
+            An EvaluationReport object.
+
+        Raises:
+            ValueError: If the file does not have a .json extension.
+        """
+        file_path = Path(path)
+
+        if file_path.suffix != ".json":
+            raise ValueError(
+                f"Only .json format is supported. Got file: {path}. Please provide a path with .json extension."
+            )
+
+        with open(file_path, "r") as f:
+            data = json.load(f)
+
+        return cls.from_dict(data)
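Reviewer note: a minimal sketch of building and persisting a report, with hypothetical values; per the code above, to_file appends the .json suffix when missing and creates parent directories:

from strands_evals.types.evaluation_report import EvaluationReport

report = EvaluationReport(
    overall_score=0.75,
    scores=[1.0, 0.5],
    cases=[{"name": "case-1", "input": "What is 2 + 2?"}, {"name": "case-2", "input": "Summarize the doc"}],
    test_passes=[True, False],
    reasons=["Exact match.", "Missing one trajectory step."],
)

report.display(include_actual_output=True)  # static Rich rendering
report.to_file("results/run-1")             # writes results/run-1.json
restored = EvaluationReport.from_file("results/run-1.json")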
strands_evals/types/simulation/actor.py
@@ -0,0 +1,34 @@
+from pydantic import BaseModel, Field
+from typing_extensions import Any
+
+
+class ActorProfile(BaseModel):
+    """
+    Profile for actor simulation.
+
+    Attributes:
+        traits: Dictionary of actor characteristics and attributes.
+        context: Supplementary background information about the actor.
+        actor_goal: What the actor ultimately wants to achieve in the interaction.
+    """
+
+    traits: dict[str, Any] = Field(..., description="Actor traits for simulation")
+    context: str = Field(..., description="Supplementary actor background details")
+    actor_goal: str = Field(
+        ...,
+        description="What the actor ultimately wants to achieve in this interaction - "
+        "should be specific, actionable, and written from the actor's perspective",
+    )
+
+
+class ActorResponse(BaseModel):
+    """
+    Structured response from an actor.
+
+    Attributes:
+        reasoning: Internal reasoning process for the response.
+        message: The actual message content from the actor.
+    """
+
+    reasoning: str = Field(..., description="Reasoning for the actor's response")
+    message: str = Field(..., description="Message from the actor")
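Reviewer note: a minimal sketch of the simulation types with hypothetical values (both models are re-exported from strands_evals.types per the __init__ shown earlier):

from strands_evals.types import ActorProfile, ActorResponse

# A simulated user persona that an actor simulator could role-play against the agent.
profile = ActorProfile(
    traits={"patience": "low", "expertise": "novice"},
    context="A first-time user setting up the product on their laptop.",
    actor_goal="Get a working configuration without reading the full manual.",
)

# One structured turn the simulated actor might emit.
reply = ActorResponse(
    reasoning="The previous step failed, so ask the agent for a simpler alternative.",
    message="That didn't work. Is there a quicker way to set this up?",
)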
strands_evals/types/trace.py
@@ -0,0 +1,205 @@
+"""
+Generic trace types for agent observability.
+
+These types represent standard observability primitives for agents.
+"""
+
+from datetime import datetime, timezone
+from enum import Enum
+
+from pydantic import BaseModel, field_serializer
+from typing_extensions import Mapping, Sequence, TypeAlias, Union
+
+
+class Role(str, Enum):
+    USER = "user"
+    ASSISTANT = "assistant"
+
+
+class ContentType(str, Enum):
+    TEXT = "text"
+    TOOL_USE = "tool_use"
+    TOOL_RESULT = "tool_result"
+
+
+class SpanType(str, Enum):
+    INFERENCE = "inference"
+    TOOL_EXECUTION = "execute_tool"
+    AGENT_INVOCATION = "invoke_agent"
+
+
+class EvaluationLevel(str, Enum):
+    """Type of evaluation based on trace granularity."""
+
+    SESSION_LEVEL = "Session"
+    TRACE_LEVEL = "Trace"
+    TOOL_LEVEL = "ToolCall"
+
+
+class ToolCall(BaseModel):
+    name: str
+    arguments: dict
+    tool_call_id: str | None = None
+
+
+class ToolResult(BaseModel):
+    content: str
+    error: str | None = None
+    tool_call_id: str | None = None
+
+
+class ToolConfig(BaseModel):
+    name: str
+    description: str | None = None
+    parameters: dict | None = None
+
+
+class TextContent(BaseModel):
+    content_type: ContentType = ContentType.TEXT
+    text: str
+
+
+class ToolCallContent(ToolCall):
+    content_type: ContentType = ContentType.TOOL_USE
+
+
+class ToolResultContent(ToolResult):
+    content_type: ContentType = ContentType.TOOL_RESULT
+
+
+class UserMessage(BaseModel):
+    role: Role = Role.USER
+    content: list[Union[TextContent, ToolResultContent]]
+
+
+class AssistantMessage(BaseModel):
+    role: Role = Role.ASSISTANT
+    content: list[Union[TextContent, ToolCallContent]]
+
+
+class SpanInfo(BaseModel):
+    trace_id: str | None = None
+    span_id: str | None = None
+    session_id: str
+    parent_span_id: str | None = None
+    start_time: datetime
+    end_time: datetime
+
+    @field_serializer("start_time", "end_time")
+    def serialize_datetime_utc(self, dt: datetime) -> str:
+        """Serialize datetime fields in UTC timezone with ISO format."""
+        # Convert to UTC if timezone-aware, otherwise assume it's already UTC
+        if dt.tzinfo is not None:
+            utc_dt = dt.astimezone(timezone.utc)
+        else:
+            utc_dt = dt.replace(tzinfo=timezone.utc)
+        # Return ISO format string with 'Z' suffix for UTC
+        return utc_dt.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
+
+
+class BaseSpan(BaseModel):
+    span_info: SpanInfo
+    metadata: dict | None = {}
+
+
+class InferenceSpan(BaseSpan):
+    span_type: SpanType = SpanType.INFERENCE
+    messages: list[Union[UserMessage, AssistantMessage]]
+
+
+class ToolExecutionSpan(BaseSpan):
+    span_type: SpanType = SpanType.TOOL_EXECUTION
+    tool_call: ToolCall
+    tool_result: ToolResult
+
+
+class AgentInvocationSpan(BaseSpan):
+    span_type: SpanType = SpanType.AGENT_INVOCATION
+    user_prompt: str
+    agent_response: str
+    available_tools: list[ToolConfig]
+
+
+SpanUnion: TypeAlias = Union[InferenceSpan, ToolExecutionSpan, AgentInvocationSpan]
+
+
+class Trace(BaseModel):
+    spans: list[SpanUnion]
+    trace_id: str
+    session_id: str
+
+
+class Session(BaseModel):
+    traces: list[Trace]
+    session_id: str
+
+
+class BaseEvaluationInput(BaseModel):
+    """Base class for all evaluation inputs"""
+
+    span_info: SpanInfo
+
+
+class ToolExecution(BaseModel):
+    tool_call: ToolCall
+    tool_result: ToolResult
+
+
+class Context(BaseModel):
+    user_prompt: TextContent
+    agent_response: TextContent
+    tool_execution_history: list[ToolExecution] | None = None
+
+
+class SessionLevelInput(BaseEvaluationInput):
+    """Input for session-level evaluators"""
+
+    session_history: list[Context]
+    available_tools: list[ToolConfig] | None = None
+
+
+class TraceLevelInput(BaseEvaluationInput):
+    """Input for trace-level evaluators"""
+
+    agent_response: TextContent
+    session_history: list[Union[UserMessage, list[ToolExecution], AssistantMessage]]
+
+
+class ToolLevelInput(BaseEvaluationInput):
+    """Input for tool-level evaluators"""
+
+    available_tools: list[ToolConfig]
+    tool_execution_details: ToolExecutionSpan
+    session_history: list[Union[UserMessage, list[ToolExecution], AssistantMessage]]
+
+
+class EvaluatorScore(BaseModel):
+    explanation: str
+    value: Union[int, float] | None = None
+    error: str | None = None
+
+
+class TokenUsage(BaseModel):
+    cache_read_input_tokens: int
+    cache_creation_input_tokens: int
+    input_tokens: int
+    output_tokens: int
+    total_tokens: int
+
+
+class EvaluatorResult(BaseModel):
+    span_info: SpanInfo
+    evaluator_name: str
+    score: EvaluatorScore
+    token_usage: TokenUsage | None = None
+
+
+class EvaluationResponse(BaseModel):
+    evaluator_results: list[EvaluatorResult]
+
+
+AttributeValue = Mapping[
+    str, str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]
+]
+
+Attributes = Mapping[str, AttributeValue] | None
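Reviewer note: a minimal sketch of assembling these trace primitives into a session, with hypothetical identifiers and timestamps:

from datetime import datetime, timezone

from strands_evals.types.trace import (
    Session,
    SpanInfo,
    ToolCall,
    ToolExecutionSpan,
    ToolResult,
    Trace,
)

span_info = SpanInfo(
    session_id="session-1",
    trace_id="trace-1",
    span_id="span-1",
    start_time=datetime.now(timezone.utc),
    end_time=datetime.now(timezone.utc),
)

# One tool execution captured as a span; span_type defaults to SpanType.TOOL_EXECUTION.
tool_span = ToolExecutionSpan(
    span_info=span_info,
    tool_call=ToolCall(name="calculator", arguments={"expression": "2 + 2"}),
    tool_result=ToolResult(content="4"),
)

session = Session(
    session_id="session-1",
    traces=[Trace(trace_id="trace-1", session_id="session-1", spans=[tool_span])],
)

# Serializing emits start/end times as UTC ISO strings via the field serializer above.
print(session.model_dump_json(indent=2))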