strands-agents-evals 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. strands_agents_evals-0.1.0.dist-info/METADATA +408 -0
  2. strands_agents_evals-0.1.0.dist-info/RECORD +68 -0
  3. strands_agents_evals-0.1.0.dist-info/WHEEL +4 -0
  4. strands_agents_evals-0.1.0.dist-info/licenses/LICENSE +175 -0
  5. strands_agents_evals-0.1.0.dist-info/licenses/NOTICE +1 -0
  6. strands_evals/__init__.py +22 -0
  7. strands_evals/case.py +53 -0
  8. strands_evals/display/display_console.py +150 -0
  9. strands_evals/evaluators/__init__.py +23 -0
  10. strands_evals/evaluators/evaluator.py +182 -0
  11. strands_evals/evaluators/faithfulness_evaluator.py +116 -0
  12. strands_evals/evaluators/goal_success_rate_evaluator.py +90 -0
  13. strands_evals/evaluators/harmfulness_evaluator.py +135 -0
  14. strands_evals/evaluators/helpfulness_evaluator.py +148 -0
  15. strands_evals/evaluators/interactions_evaluator.py +244 -0
  16. strands_evals/evaluators/output_evaluator.py +72 -0
  17. strands_evals/evaluators/prompt_templates/case_prompt_template.py +63 -0
  18. strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +11 -0
  19. strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +30 -0
  20. strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +11 -0
  21. strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +17 -0
  22. strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +11 -0
  23. strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +8 -0
  24. strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +11 -0
  25. strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +38 -0
  26. strands_evals/evaluators/prompt_templates/prompt_templates.py +176 -0
  27. strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +11 -0
  28. strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +40 -0
  29. strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +11 -0
  30. strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +23 -0
  31. strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +112 -0
  32. strands_evals/evaluators/tool_selection_accuracy_evaluator.py +112 -0
  33. strands_evals/evaluators/trajectory_evaluator.py +100 -0
  34. strands_evals/experiment.py +652 -0
  35. strands_evals/extractors/__init__.py +3 -0
  36. strands_evals/extractors/graph_extractor.py +30 -0
  37. strands_evals/extractors/swarm_extractor.py +73 -0
  38. strands_evals/extractors/tools_use_extractor.py +164 -0
  39. strands_evals/extractors/trace_extractor.py +166 -0
  40. strands_evals/generators/__init__.py +3 -0
  41. strands_evals/generators/experiment_generator.py +498 -0
  42. strands_evals/generators/prompt_template/prompt_templates.py +75 -0
  43. strands_evals/generators/topic_planner.py +60 -0
  44. strands_evals/mappers/__init__.py +6 -0
  45. strands_evals/mappers/session_mapper.py +27 -0
  46. strands_evals/mappers/strands_in_memory_session_mapper.py +473 -0
  47. strands_evals/simulation/README.md +323 -0
  48. strands_evals/simulation/__init__.py +6 -0
  49. strands_evals/simulation/actor_simulator.py +292 -0
  50. strands_evals/simulation/profiles/__init__.py +5 -0
  51. strands_evals/simulation/profiles/actor_profile.py +26 -0
  52. strands_evals/simulation/prompt_templates/__init__.py +11 -0
  53. strands_evals/simulation/prompt_templates/actor_profile_extraction.py +25 -0
  54. strands_evals/simulation/prompt_templates/actor_system_prompt.py +64 -0
  55. strands_evals/simulation/prompt_templates/goal_completion.py +27 -0
  56. strands_evals/simulation/tools/__init__.py +5 -0
  57. strands_evals/simulation/tools/goal_completion.py +93 -0
  58. strands_evals/telemetry/__init__.py +15 -0
  59. strands_evals/telemetry/_cloudwatch_logger.py +209 -0
  60. strands_evals/telemetry/config.py +207 -0
  61. strands_evals/telemetry/tracer.py +38 -0
  62. strands_evals/tools/evaluation_tools.py +67 -0
  63. strands_evals/types/__init__.py +11 -0
  64. strands_evals/types/evaluation.py +105 -0
  65. strands_evals/types/evaluation_report.py +244 -0
  66. strands_evals/types/simulation/__init__.py +5 -0
  67. strands_evals/types/simulation/actor.py +34 -0
  68. strands_evals/types/trace.py +205 -0
@@ -0,0 +1,67 @@
+ from strands import tool
+
+
+ @tool
+ def exact_match_scorer(actual_trajectory: list, expected_trajectory: list) -> float:
+     """
+     Score the trajectory based on exact match. A score of 0 indicates 0 steps matched and 1
+     indicates a perfect match.
+
+     Args:
+         actual_trajectory: The actual trajectory.
+         expected_trajectory: The expected trajectory.
+
+     Returns:
+         The score of the trajectory.
+     """
+     correct = 0
+     for actual, expected in zip(actual_trajectory, expected_trajectory, strict=False):
+         if actual == expected:
+             correct += 1
+
+     return correct / len(expected_trajectory)
+
+
+ @tool
+ def in_order_match_scorer(actual_trajectory: list, expected_trajectory: list) -> float:
+     """
+     Score based on correct actions in correct order, allows extra actions.
+
+     Args:
+         actual_trajectory: The actual trajectory.
+         expected_trajectory: The expected trajectory.
+
+     Returns:
+         The score of the trajectory.
+     """
+     if not expected_trajectory:
+         return 1.0
+
+     expected_idx = 0
+     for action in actual_trajectory:
+         if expected_idx < len(expected_trajectory) and action == expected_trajectory[expected_idx]:
+             expected_idx += 1
+
+     return expected_idx / len(expected_trajectory)
+
+
+ @tool
+ def any_order_match_scorer(actual_trajectory: list, expected_trajectory: list) -> float:
+     """
+     Score based on correct actions in any order, allows extra actions.
+
+     Args:
+         actual_trajectory: The actual trajectory.
+         expected_trajectory: The expected trajectory.
+
+     Returns:
+         The score of the trajectory.
+     """
+     if not expected_trajectory:
+         return 1.0
+
+     expected_set = set(expected_trajectory)
+     actual_set = set(actual_trajectory)
+     matched = len(expected_set.intersection(actual_set))
+
+     return matched / len(expected_trajectory)
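
A quick way to read the three scorers above is to run them on a pair of toy trajectories. The sketch below uses invented trajectory values and assumes the @tool-decorated functions remain directly callable as plain Python functions, as strands function tools generally are; it is illustrative only.

from strands_evals.tools.evaluation_tools import (
    any_order_match_scorer,
    exact_match_scorer,
    in_order_match_scorer,
)

# Invented example trajectories, not taken from the package.
expected = ["search", "calculate", "respond"]
actual = ["search", "lookup", "calculate", "respond"]

# Position-by-position comparison: only "search" lines up -> 1/3.
print(exact_match_scorer(actual, expected))      # ~0.33

# Ordered-subsequence match: every expected step appears in order -> 1.0.
print(in_order_match_scorer(actual, expected))   # 1.0

# Set intersection: every expected step appears somewhere -> 1.0.
print(any_order_match_scorer(actual, expected))  # 1.0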
@@ -0,0 +1,11 @@
+ from .evaluation import EvaluationData, EvaluationOutput, Interaction, TaskOutput
+ from .simulation import ActorProfile, ActorResponse
+
+ __all__ = [
+     "Interaction",
+     "TaskOutput",
+     "EvaluationData",
+     "EvaluationOutput",
+     "ActorProfile",
+     "ActorResponse",
+ ]
@@ -0,0 +1,105 @@
+ from pydantic import BaseModel
+ from typing_extensions import Any, Generic, TypedDict, TypeVar, Union
+
+ from .trace import Session
+
+ InputT = TypeVar("InputT")
+ OutputT = TypeVar("OutputT")
+
+
+ class Interaction(TypedDict, total=False):
+     """
+     Represents a single interaction in a multi-agent or multi-step system.
+
+     Used to capture the communication flow and dependencies between different
+     components (agents, tools, or processing nodes) during task execution.
+     All fields are optional to accommodate different interaction patterns.
+
+     Attributes:
+         node_name: Identifier for the agent, tool, or component involved in this interaction
+         dependencies: List of other nodes/components this interaction depends on or references
+         messages: Sequence of messages, responses, or communication exchanged during this interaction
+
+     Example:
+         interaction = {
+             "node_name": "calculator_agent",
+             "dependencies": ["input_parser", "math_validator"],
+             "messages": ["Calculate 2+2"]
+         }
+     """
+
+     node_name: str
+     dependencies: list
+     messages: list
+
+
+ class TaskOutput(TypedDict, total=False):
+     """
+     Structured output format for task functions that return complex results.
+
+     Used when task functions need to return more than just the output response,
+     such as trajectory or interaction history. All fields are optional
+     to support different task complexity levels.
+
+     Attributes:
+         output: The primary response or result from the task
+         trajectory: Sequence of steps, tools, or actions taken during task execution
+         interactions: Communication flow between agents or components during execution
+         input: A new input to replace the original in the evaluation; does not mutate the original test case
+
+     Example:
+         task_result = {
+             "output": "The answer is 42",
+             "trajectory": ["calculator", "validator"],
+             "interactions": [{"node_name": "math_agent", "messages": ["Computing..."]}]
+         }
+     """
+
+     output: Any
+     trajectory: list[Any]
+     interactions: list[Interaction]
+     input: Any
+
+
+ class EvaluationData(BaseModel, Generic[InputT, OutputT]):
+     """
+     A record of all of the context the evaluator needs to evaluate a test case.
+
+     Attributes:
+         input: The input to the task, e.g. the query to the agent.
+         actual_output: The actual response given the input.
+         expected_output: The expected response given the input.
+         actual_trajectory: The actual trajectory of a task given the input.
+         expected_trajectory: The expected trajectory of a task given the input.
+         name: The name of the test case. This will be used to identify the test in the summary report.
+         metadata: Additional information about the test case.
+         actual_interactions: The actual interaction sequence given the input.
+         expected_interactions: The expected interaction sequence given the input.
+     """
+
+     input: InputT
+     actual_output: OutputT | None = None
+     name: str | None = None
+     expected_output: OutputT | None = None
+     expected_trajectory: Union[list[Any], Session, None] = None
+     actual_trajectory: Union[list[Any], Session, None] = None
+     metadata: dict[str, Any] | None = None
+     actual_interactions: list[Interaction] | None = None
+     expected_interactions: list[Interaction] | None = None
+
+
+ class EvaluationOutput(BaseModel):
+     """
+     Structured output for an LLM-based judge.
+
+     Attributes:
+         score: The score of the test case.
+         test_pass: Whether the test passed or failed.
+         reason: The reason for the score for each test case.
+         label: The categorical label corresponding to the score.
+     """
+
+     score: float
+     test_pass: bool
+     reason: str | None = None
+     label: str | None = None
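
For orientation, the following sketch builds the data-exchange objects defined above. The field values are invented, and the import path relies on the strands_evals.types __init__ shown earlier in this diff.

from strands_evals.types import EvaluationData, EvaluationOutput, TaskOutput

# A task function can return a TaskOutput dict instead of a bare string.
task_result: TaskOutput = {
    "output": "The answer is 42",
    "trajectory": ["calculator", "validator"],
}

# EvaluationData bundles everything an evaluator sees for one test case.
data = EvaluationData[str, str](
    input="What is 6 x 7?",
    actual_output=task_result["output"],
    expected_output="42",
    expected_trajectory=["calculator"],
    name="multiplication-check",
)

# EvaluationOutput is the structured verdict an LLM judge produces.
judgement = EvaluationOutput(score=0.9, test_pass=True, reason="Answer matches expected output.")
print(data.model_dump(), judgement.model_dump())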
@@ -0,0 +1,244 @@
+ import json
+ from pathlib import Path
+
+ from pydantic import BaseModel
+ from typing_extensions import TypeVar
+
+ from ..display.display_console import CollapsibleTableReportDisplay
+ from ..types.evaluation import EvaluationOutput
+
+ InputT = TypeVar("InputT")
+ OutputT = TypeVar("OutputT")
+
+
+ class EvaluationReport(BaseModel):
+     """
+     A report of the evaluation of a task.
+
+     Attributes:
+         overall_score: The overall score of the task.
+         scores: A list of the score for each test case, in order.
+         cases: A list of records for each test case.
+         test_passes: A list of booleans indicating whether each test passed or failed.
+         reasons: A list of reasons, one for each test case.
+     """
+
+     overall_score: float
+     scores: list[float]
+     cases: list[dict]
+     test_passes: list[bool]
+     reasons: list[str] = []
+     detailed_results: list[list[EvaluationOutput]] = []
+
+     def _display(
+         self,
+         static: bool = True,
+         include_input: bool = True,
+         include_actual_output: bool = False,
+         include_expected_output: bool = False,
+         include_expected_trajectory: bool = False,
+         include_actual_trajectory: bool = False,
+         include_actual_interactions: bool = False,
+         include_expected_interactions: bool = False,
+         include_meta: bool = False,
+     ):
+         """
+         Render an interface of the report with as much detail as configured, using Rich.
+
+         Args:
+             static: Whether to render the interface as interactive or static.
+             include_input (Defaults to True): Include the input in the display.
+             include_actual_output (Defaults to False): Include the actual output in the display.
+             include_expected_output (Defaults to False): Include the expected output in the display.
+             include_expected_trajectory (Defaults to False): Include the expected trajectory in the display.
+             include_actual_trajectory (Defaults to False): Include the actual trajectory in the display.
+             include_actual_interactions (Defaults to False): Include the actual interactions in the display.
+             include_expected_interactions (Defaults to False): Include the expected interactions in the display.
+             include_meta (Defaults to False): Include metadata in the display.
+
+         Note:
+             This method provides an interactive console interface where users can expand or collapse
+             individual test cases to view more or less detail.
+         """
+         report_data = {}
+         for i in range(len(self.scores)):
+             name = self.cases[i].get("name", f"Test {i + 1}")
+             reason = self.reasons[i] if i < len(self.reasons) else "N/A"
+             details_dict = {
+                 "name": name,
+                 "score": f"{self.scores[i]:.2f}",
+                 "test_pass": self.test_passes[i],
+                 "reason": reason,
+             }
+             if include_input:
+                 details_dict["input"] = str(self.cases[i].get("input"))
+             if include_actual_output:
+                 details_dict["actual_output"] = str(self.cases[i].get("actual_output"))
+             if include_expected_output:
+                 details_dict["expected_output"] = str(self.cases[i].get("expected_output"))
+             if include_actual_trajectory:
+                 details_dict["actual_trajectory"] = str(self.cases[i].get("actual_trajectory"))
+             if include_expected_trajectory:
+                 details_dict["expected_trajectory"] = str(self.cases[i].get("expected_trajectory"))
+             if include_actual_interactions:
+                 details_dict["actual_interactions"] = str(self.cases[i].get("actual_interactions"))
+             if include_expected_interactions:
+                 details_dict["expected_interactions"] = str(self.cases[i].get("expected_interactions"))
+             if include_meta:
+                 details_dict["metadata"] = str(self.cases[i].get("metadata"))
+
+             report_data[str(i)] = {
+                 "details": details_dict,
+                 "detailed_results": self.detailed_results[i] if i < len(self.detailed_results) else [],  # NEW
+                 "expanded": False,
+             }
+
+         display_console = CollapsibleTableReportDisplay(items=report_data, overall_score=self.overall_score)
+         display_console.run(static=static)
+
+     def display(
+         self,
+         include_input: bool = True,
+         include_actual_output: bool = False,
+         include_expected_output: bool = False,
+         include_expected_trajectory: bool = False,
+         include_actual_trajectory: bool = False,
+         include_actual_interactions: bool = False,
+         include_expected_interactions: bool = False,
+         include_meta: bool = False,
+     ):
+         """
+         Render the report with as much detail as configured, using Rich. Use run_display if you want
+         to interact with the table.
+
+         Args:
+             include_input: Whether to include the input in the display. Defaults to True.
+             include_actual_output (Defaults to False): Include the actual output in the display.
+             include_expected_output (Defaults to False): Include the expected output in the display.
+             include_expected_trajectory (Defaults to False): Include the expected trajectory in the display.
+             include_actual_trajectory (Defaults to False): Include the actual trajectory in the display.
+             include_actual_interactions (Defaults to False): Include the actual interactions in the display.
+             include_expected_interactions (Defaults to False): Include the expected interactions in the display.
+             include_meta (Defaults to False): Include metadata in the display.
+         """
+         self._display(
+             static=True,
+             include_input=include_input,
+             include_actual_output=include_actual_output,
+             include_expected_output=include_expected_output,
+             include_expected_trajectory=include_expected_trajectory,
+             include_actual_trajectory=include_actual_trajectory,
+             include_actual_interactions=include_actual_interactions,
+             include_expected_interactions=include_expected_interactions,
+             include_meta=include_meta,
+         )
+
+     def run_display(
+         self,
+         include_input: bool = True,
+         include_actual_output: bool = False,
+         include_expected_output: bool = False,
+         include_expected_trajectory: bool = False,
+         include_actual_trajectory: bool = False,
+         include_actual_interactions: bool = False,
+         include_expected_interactions: bool = False,
+         include_meta: bool = False,
+     ):
+         """
+         Render the report interactively with as much detail as configured, using Rich.
+
+         Args:
+             include_input: Whether to include the input in the display. Defaults to True.
+             include_actual_output (Defaults to False): Include the actual output in the display.
+             include_expected_output (Defaults to False): Include the expected output in the display.
+             include_expected_trajectory (Defaults to False): Include the expected trajectory in the display.
+             include_actual_trajectory (Defaults to False): Include the actual trajectory in the display.
+             include_actual_interactions (Defaults to False): Include the actual interactions in the display.
+             include_expected_interactions (Defaults to False): Include the expected interactions in the display.
+             include_meta (Defaults to False): Include metadata in the display.
+         """
+         self._display(
+             static=False,
+             include_input=include_input,
+             include_actual_output=include_actual_output,
+             include_expected_output=include_expected_output,
+             include_expected_trajectory=include_expected_trajectory,
+             include_actual_trajectory=include_actual_trajectory,
+             include_actual_interactions=include_actual_interactions,
+             include_expected_interactions=include_expected_interactions,
+             include_meta=include_meta,
+         )
+
+     def to_dict(self):
+         """
+         Returns a dictionary representation of the report.
+         """
+         return self.model_dump()
+
+     @classmethod
+     def from_dict(cls, data: dict):
+         """
+         Create an EvaluationReport instance from a dictionary.
+
+         Args:
+             data: A dictionary containing the report data.
+         """
+         return cls.model_validate(data)
+
+     def to_file(self, path: str):
+         """
+         Write the report to a JSON file.
+
+         Args:
+             path: The file path where the report will be saved. Can be:
+                 - A filename only (e.g., "foo.json" or "foo") - saves in the current working directory
+                 - A relative path (e.g., "relative_path/foo.json") - saves relative to the current working directory
+                 - An absolute path (e.g., "/path/to/dir/foo.json") - saves in the exact directory
+
+             If no extension is provided, ".json" will be added automatically.
+             Only .json format is supported.
+
+         Raises:
+             ValueError: If the path has a non-JSON extension.
+         """
+         file_path = Path(path)
+
+         if file_path.suffix:
+             if file_path.suffix != ".json":
+                 raise ValueError(
+                     f"Only .json format is supported. Got path with extension: {path}. "
+                     f"Please use a .json extension or provide a path without an extension."
+                 )
+         else:
+             file_path = file_path.with_suffix(".json")
+
+         file_path.parent.mkdir(parents=True, exist_ok=True)
+
+         with open(file_path, "w") as f:
+             json.dump(self.to_dict(), f, indent=2)
+
+     @classmethod
+     def from_file(cls, path: str):
+         """
+         Create an EvaluationReport instance from a JSON file.
+
+         Args:
+             path: Path to the JSON file.
+
+         Returns:
+             An EvaluationReport object.
+
+         Raises:
+             ValueError: If the file does not have a .json extension.
+         """
+         file_path = Path(path)
+
+         if file_path.suffix != ".json":
+             raise ValueError(
+                 f"Only .json format is supported. Got file: {path}. Please provide a path with .json extension."
+             )
+
+         with open(file_path, "r") as f:
+             data = json.load(f)
+
+         return cls.from_dict(data)
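
The report class above is a plain pydantic model, so its lifecycle is easy to sketch. The scores, cases, and file name below are invented example data rather than output from a real evaluation run; only the methods shown in the diff are used.

from strands_evals.types.evaluation_report import EvaluationReport

report = EvaluationReport(
    overall_score=0.5,
    scores=[1.0, 0.0],
    cases=[{"name": "greeting", "input": "hi"}, {"name": "math", "input": "2+2"}],
    test_passes=[True, False],
    reasons=["Matched expected output.", "Wrong tool selected."],
)

report.to_file("eval_report")                 # ".json" is appended automatically
loaded = EvaluationReport.from_file("eval_report.json")
loaded.display(include_actual_output=True)    # static Rich table; run_display() is the interactive variant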
@@ -0,0 +1,5 @@
+ """Data models for actor simulation."""
+
+ from .actor import ActorProfile, ActorResponse
+
+ __all__ = ["ActorProfile", "ActorResponse"]
@@ -0,0 +1,34 @@
+ from pydantic import BaseModel, Field
+ from typing_extensions import Any
+
+
+ class ActorProfile(BaseModel):
+     """
+     Profile for actor simulation.
+
+     Attributes:
+         traits: Dictionary of actor characteristics and attributes.
+         context: Supplementary background information about the actor.
+         actor_goal: What the actor ultimately wants to achieve in the interaction.
+     """
+
+     traits: dict[str, Any] = Field(..., description="Actor traits for simulation")
+     context: str = Field(..., description="Supplementary actor background details")
+     actor_goal: str = Field(
+         ...,
+         description="What the actor ultimately wants to achieve in this interaction - "
+         "should be specific, actionable, and written from the actor's perspective",
+     )
+
+
+ class ActorResponse(BaseModel):
+     """
+     Structured response from an actor.
+
+     Attributes:
+         reasoning: Internal reasoning process for the response.
+         message: The actual message content from the actor.
+     """
+
+     reasoning: str = Field(..., description="Reasoning for the actor's response")
+     message: str = Field(..., description="Message from the actor")
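
As a point of reference, both simulation models can be constructed directly; the trait names, context, and goal below are invented examples, and the imports rely on the re-exports in the __init__ files shown above.

from strands_evals.types import ActorProfile, ActorResponse

profile = ActorProfile(
    traits={"tone": "impatient", "expertise": "novice"},
    context="First-time user of the billing portal.",
    actor_goal="Get a duplicate charge refunded without escalating to a manager.",
)

reply = ActorResponse(
    reasoning="The agent asked for an order number, so provide it and restate the goal.",
    message="My order number is 1042 - I just want the duplicate charge refunded.",
)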
@@ -0,0 +1,205 @@
+ """
+ Generic trace types for agent observability.
+
+ These types represent standard observability primitives for agents.
+ """
+
+ from datetime import datetime, timezone
+ from enum import Enum
+
+ from pydantic import BaseModel, field_serializer
+ from typing_extensions import Mapping, Sequence, TypeAlias, Union
+
+
+ class Role(str, Enum):
+     USER = "user"
+     ASSISTANT = "assistant"
+
+
+ class ContentType(str, Enum):
+     TEXT = "text"
+     TOOL_USE = "tool_use"
+     TOOL_RESULT = "tool_result"
+
+
+ class SpanType(str, Enum):
+     INFERENCE = "inference"
+     TOOL_EXECUTION = "execute_tool"
+     AGENT_INVOCATION = "invoke_agent"
+
+
+ class EvaluationLevel(str, Enum):
+     """Type of evaluation based on trace granularity."""
+
+     SESSION_LEVEL = "Session"
+     TRACE_LEVEL = "Trace"
+     TOOL_LEVEL = "ToolCall"
+
+
+ class ToolCall(BaseModel):
+     name: str
+     arguments: dict
+     tool_call_id: str | None = None
+
+
+ class ToolResult(BaseModel):
+     content: str
+     error: str | None = None
+     tool_call_id: str | None = None
+
+
+ class ToolConfig(BaseModel):
+     name: str
+     description: str | None = None
+     parameters: dict | None = None
+
+
+ class TextContent(BaseModel):
+     content_type: ContentType = ContentType.TEXT
+     text: str
+
+
+ class ToolCallContent(ToolCall):
+     content_type: ContentType = ContentType.TOOL_USE
+
+
+ class ToolResultContent(ToolResult):
+     content_type: ContentType = ContentType.TOOL_RESULT
+
+
+ class UserMessage(BaseModel):
+     role: Role = Role.USER
+     content: list[Union[TextContent, ToolResultContent]]
+
+
+ class AssistantMessage(BaseModel):
+     role: Role = Role.ASSISTANT
+     content: list[Union[TextContent, ToolCallContent]]
+
+
+ class SpanInfo(BaseModel):
+     trace_id: str | None = None
+     span_id: str | None = None
+     session_id: str
+     parent_span_id: str | None = None
+     start_time: datetime
+     end_time: datetime
+
+     @field_serializer("start_time", "end_time")
+     def serialize_datetime_utc(self, dt: datetime) -> str:
+         """Serialize datetime fields in UTC timezone with ISO format."""
+         # Convert to UTC if timezone-aware, otherwise assume it's already UTC
+         if dt.tzinfo is not None:
+             utc_dt = dt.astimezone(timezone.utc)
+         else:
+             utc_dt = dt.replace(tzinfo=timezone.utc)
+         # Return ISO format string with 'Z' suffix for UTC
+         return utc_dt.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
+
+
+ class BaseSpan(BaseModel):
+     span_info: SpanInfo
+     metadata: dict | None = {}
+
+
+ class InferenceSpan(BaseSpan):
+     span_type: SpanType = SpanType.INFERENCE
+     messages: list[Union[UserMessage, AssistantMessage]]
+
+
+ class ToolExecutionSpan(BaseSpan):
+     span_type: SpanType = SpanType.TOOL_EXECUTION
+     tool_call: ToolCall
+     tool_result: ToolResult
+
+
+ class AgentInvocationSpan(BaseSpan):
+     span_type: SpanType = SpanType.AGENT_INVOCATION
+     user_prompt: str
+     agent_response: str
+     available_tools: list[ToolConfig]
+
+
+ SpanUnion: TypeAlias = Union[InferenceSpan, ToolExecutionSpan, AgentInvocationSpan]
+
+
+ class Trace(BaseModel):
+     spans: list[SpanUnion]
+     trace_id: str
+     session_id: str
+
+
+ class Session(BaseModel):
+     traces: list[Trace]
+     session_id: str
+
+
+ class BaseEvaluationInput(BaseModel):
+     """Base class for all evaluation inputs"""
+
+     span_info: SpanInfo
+
+
+ class ToolExecution(BaseModel):
+     tool_call: ToolCall
+     tool_result: ToolResult
+
+
+ class Context(BaseModel):
+     user_prompt: TextContent
+     agent_response: TextContent
+     tool_execution_history: list[ToolExecution] | None = None
+
+
+ class SessionLevelInput(BaseEvaluationInput):
+     """Input for session-level evaluators"""
+
+     session_history: list[Context]
+     available_tools: list[ToolConfig] | None = None
+
+
+ class TraceLevelInput(BaseEvaluationInput):
+     """Input for trace-level evaluators"""
+
+     agent_response: TextContent
+     session_history: list[Union[UserMessage, list[ToolExecution], AssistantMessage]]
+
+
+ class ToolLevelInput(BaseEvaluationInput):
+     """Input for tool-level evaluators"""
+
+     available_tools: list[ToolConfig]
+     tool_execution_details: ToolExecutionSpan
+     session_history: list[Union[UserMessage, list[ToolExecution], AssistantMessage]]
+
+
+ class EvaluatorScore(BaseModel):
+     explanation: str
+     value: Union[int, float] | None = None
+     error: str | None = None
+
+
+ class TokenUsage(BaseModel):
+     cache_read_input_tokens: int
+     cache_creation_input_tokens: int
+     input_tokens: int
+     output_tokens: int
+     total_tokens: int
+
+
+ class EvaluatorResult(BaseModel):
+     span_info: SpanInfo
+     evaluator_name: str
+     score: EvaluatorScore
+     token_usage: TokenUsage | None = None
+
+
+ class EvaluationResponse(BaseModel):
+     evaluator_results: list[EvaluatorResult]
+
+
+ AttributeValue = Mapping[
+     str, str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]
+ ]
+
+ Attributes = Mapping[str, AttributeValue] | None
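
The trace types nest as Session -> Trace -> spans. The hand-built example below shows that shape; the IDs, prompt, and tool arguments are placeholders chosen for illustration, not values the package produces.

from datetime import datetime, timezone

from strands_evals.types.trace import (
    AgentInvocationSpan,
    Session,
    SpanInfo,
    ToolCall,
    ToolConfig,
    ToolExecutionSpan,
    ToolResult,
    Trace,
)

now = datetime.now(timezone.utc)
info = SpanInfo(session_id="session-1", trace_id="trace-1", start_time=now, end_time=now)

# One agent-invocation span plus the tool call it triggered.
agent_span = AgentInvocationSpan(
    span_info=info,
    user_prompt="What is 6 x 7?",
    agent_response="42",
    available_tools=[ToolConfig(name="calculator")],
)
tool_span = ToolExecutionSpan(
    span_info=info,
    tool_call=ToolCall(name="calculator", arguments={"expression": "6*7"}),
    tool_result=ToolResult(content="42"),
)

session = Session(
    traces=[Trace(spans=[agent_span, tool_span], trace_id="trace-1", session_id="session-1")],
    session_id="session-1",
)
print(session.model_dump_json(indent=2))  # datetimes serialize as UTC ISO strings via the field_serializer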