agent-trace-eval 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_trace_eval-0.1.0/.gitignore +13 -0
- agent_trace_eval-0.1.0/LICENSE +21 -0
- agent_trace_eval-0.1.0/PKG-INFO +148 -0
- agent_trace_eval-0.1.0/README.md +122 -0
- agent_trace_eval-0.1.0/pyproject.toml +55 -0
- agent_trace_eval-0.1.0/src/agent_trace_eval/__init__.py +19 -0
- agent_trace_eval-0.1.0/src/agent_trace_eval/case.py +40 -0
- agent_trace_eval-0.1.0/src/agent_trace_eval/cli.py +72 -0
- agent_trace_eval-0.1.0/src/agent_trace_eval/loader.py +37 -0
- agent_trace_eval-0.1.0/src/agent_trace_eval/matchers/__init__.py +13 -0
- agent_trace_eval-0.1.0/src/agent_trace_eval/matchers/arguments.py +81 -0
- agent_trace_eval-0.1.0/src/agent_trace_eval/matchers/handoffs.py +70 -0
- agent_trace_eval-0.1.0/src/agent_trace_eval/matchers/ordering.py +70 -0
- agent_trace_eval-0.1.0/src/agent_trace_eval/matchers/recovery.py +74 -0
- agent_trace_eval-0.1.0/src/agent_trace_eval/matchers/tools.py +67 -0
- agent_trace_eval-0.1.0/src/agent_trace_eval/models.py +112 -0
- agent_trace_eval-0.1.0/src/agent_trace_eval/report.py +40 -0
- agent_trace_eval-0.1.0/src/agent_trace_eval/result.py +62 -0
- agent_trace_eval-0.1.0/src/agent_trace_eval/runner.py +75 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Yifan Cheng
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agent-trace-eval
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Golden trace regression evaluation for tool-using AI agents
|
|
5
|
+
Project-URL: Homepage, https://github.com/yfccyf/agent-trace-eval
|
|
6
|
+
Project-URL: Documentation, https://github.com/yfccyf/agent-trace-eval#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/yfccyf/agent-trace-eval
|
|
8
|
+
Author: Yifan Cheng
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: ai-agents,evaluation,observability,regression-testing,tool-calls
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Software Development :: Testing
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Requires-Dist: pyyaml>=6.0
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
24
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# agent-trace-eval
|
|
28
|
+
|
|
29
|
+
[CI](https://github.com/yfccyf/agent-trace-eval/actions/workflows/ci.yml)
|
|
30
|
+
[PyPI](https://pypi.org/project/agent-trace-eval/)
|
|
31
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
32
|
+
|
|
33
|
+
Golden trace regression evaluation for tool-using AI agents.
|
|
34
|
+
|
|
35
|
+
`agent-trace-eval` checks agent execution traces against declarative expectations for:
|
|
36
|
+
|
|
37
|
+
- tool selection (required / forbidden tools)
|
|
38
|
+
- tool-call arguments
|
|
39
|
+
- tool-call ordering
|
|
40
|
+
- multi-agent handoffs
|
|
41
|
+
- recovery decisions (retry / fallback / escalate)
|
|
42
|
+
|
|
43
|
+
It is designed as a small, employer-neutral library you can use in CI to gate agent workflow changes.
|
|
44
|
+
|
|
45
|
+
## Install
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install agent-trace-eval
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
For local development:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install -e ".[dev]"
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Quick start
|
|
58
|
+
|
|
59
|
+
Run the bundled examples:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
agent-trace-eval \
|
|
63
|
+
--cases examples/cases \
|
|
64
|
+
--traces-dir examples/traces \
|
|
65
|
+
--report reports/example-report.md
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Trace format
|
|
69
|
+
|
|
70
|
+
Traces are JSON or YAML documents with an `events` list. Supported event types include:
|
|
71
|
+
|
|
72
|
+
- `tool_call`
|
|
73
|
+
- `handoff`
|
|
74
|
+
- `recovery_decision`
|
|
75
|
+
- `final_answer`
|
|
76
|
+
|
|
77
|
+
Example:
|
|
78
|
+
|
|
79
|
+
```json
|
|
80
|
+
{
|
|
81
|
+
"case_id": "refund_lookup",
|
|
82
|
+
"events": [
|
|
83
|
+
{
|
|
84
|
+
"type": "tool_call",
|
|
85
|
+
"name": "lookup_order",
|
|
86
|
+
"arguments": { "order_id": "12345" }
|
|
87
|
+
},
|
|
88
|
+
{
|
|
89
|
+
"type": "tool_call",
|
|
90
|
+
"name": "issue_refund",
|
|
91
|
+
"arguments": { "order_id": "12345" }
|
|
92
|
+
}
|
|
93
|
+
]
|
|
94
|
+
}
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Case format
|
|
98
|
+
|
|
99
|
+
Cases are YAML or JSON files with `id`, `description`, `input`, and `expect` sections:
|
|
100
|
+
|
|
101
|
+
```yaml
|
|
102
|
+
id: refund_lookup
|
|
103
|
+
description: Agent should look up an order before issuing a refund.
|
|
104
|
+
expect:
|
|
105
|
+
tools:
|
|
106
|
+
required: [lookup_order, issue_refund]
|
|
107
|
+
forbidden: [delete_account]
|
|
108
|
+
ordering:
|
|
109
|
+
before:
|
|
110
|
+
- first: lookup_order
|
|
111
|
+
second: issue_refund
|
|
112
|
+
arguments:
|
|
113
|
+
issue_refund:
|
|
114
|
+
order_id: "12345"
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## Python API
|
|
118
|
+
|
|
119
|
+
```python
|
|
120
|
+
from agent_trace_eval import RegressionRunner, render_markdown_report
|
|
121
|
+
from agent_trace_eval.loader import load_case, load_trace
|
|
122
|
+
from agent_trace_eval.result import SuiteResult
|
|
123
|
+
|
|
124
|
+
case = load_case("examples/cases/refund_lookup.yaml")
|
|
125
|
+
trace = load_trace("examples/traces/refund_lookup.json")
|
|
126
|
+
|
|
127
|
+
runner = RegressionRunner()
|
|
128
|
+
result = runner.run_case(case, trace)
|
|
129
|
+
report = render_markdown_report(SuiteResult(case_results=[result]))
|
|
130
|
+
print(report)
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## Related writing
|
|
134
|
+
|
|
135
|
+
This project complements a series on agent regression testing and release gates:
|
|
136
|
+
|
|
137
|
+
- [Final Answers Are Not Enough (Part 1)](https://yfccyf.github.io/writing/final-answers-are-not-enough-golden-trace-regression-testing-part-1/)
|
|
138
|
+
- [From Golden Traces to Release Gates (Part 2)](https://yfccyf.github.io/writing/from-golden-traces-to-release-gates-building-an-agent-regression-harness-part-2/)
|
|
139
|
+
|
|
140
|
+
## Development
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
pytest
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
## License
|
|
147
|
+
|
|
148
|
+
MIT
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# agent-trace-eval
|
|
2
|
+
|
|
3
|
+
[CI](https://github.com/yfccyf/agent-trace-eval/actions/workflows/ci.yml)
|
|
4
|
+
[PyPI](https://pypi.org/project/agent-trace-eval/)
|
|
5
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
6
|
+
|
|
7
|
+
Golden trace regression evaluation for tool-using AI agents.
|
|
8
|
+
|
|
9
|
+
`agent-trace-eval` checks agent execution traces against declarative expectations for:
|
|
10
|
+
|
|
11
|
+
- tool selection (required / forbidden tools)
|
|
12
|
+
- tool-call arguments
|
|
13
|
+
- tool-call ordering
|
|
14
|
+
- multi-agent handoffs
|
|
15
|
+
- recovery decisions (retry / fallback / escalate)
|
|
16
|
+
|
|
17
|
+
It is designed as a small, employer-neutral library you can use in CI to gate agent workflow changes.
|
|
18
|
+
|
|
19
|
+
## Install
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install agent-trace-eval
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
For local development:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install -e ".[dev]"
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Quick start
|
|
32
|
+
|
|
33
|
+
Run the bundled examples:
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
agent-trace-eval \
|
|
37
|
+
--cases examples/cases \
|
|
38
|
+
--traces-dir examples/traces \
|
|
39
|
+
--report reports/example-report.md
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Trace format
|
|
43
|
+
|
|
44
|
+
Traces are JSON or YAML documents with an `events` list. Supported event types include:
|
|
45
|
+
|
|
46
|
+
- `tool_call`
|
|
47
|
+
- `handoff`
|
|
48
|
+
- `recovery_decision`
|
|
49
|
+
- `final_answer`
|
|
50
|
+
|
|
51
|
+
Example:
|
|
52
|
+
|
|
53
|
+
```json
|
|
54
|
+
{
|
|
55
|
+
"case_id": "refund_lookup",
|
|
56
|
+
"events": [
|
|
57
|
+
{
|
|
58
|
+
"type": "tool_call",
|
|
59
|
+
"name": "lookup_order",
|
|
60
|
+
"arguments": { "order_id": "12345" }
|
|
61
|
+
},
|
|
62
|
+
{
|
|
63
|
+
"type": "tool_call",
|
|
64
|
+
"name": "issue_refund",
|
|
65
|
+
"arguments": { "order_id": "12345" }
|
|
66
|
+
}
|
|
67
|
+
]
|
|
68
|
+
}
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Case format
|
|
72
|
+
|
|
73
|
+
Cases are YAML or JSON files with `id`, `description`, `input`, and `expect` sections:
|
|
74
|
+
|
|
75
|
+
```yaml
|
|
76
|
+
id: refund_lookup
|
|
77
|
+
description: Agent should look up an order before issuing a refund.
|
|
78
|
+
expect:
|
|
79
|
+
tools:
|
|
80
|
+
required: [lookup_order, issue_refund]
|
|
81
|
+
forbidden: [delete_account]
|
|
82
|
+
ordering:
|
|
83
|
+
before:
|
|
84
|
+
- first: lookup_order
|
|
85
|
+
second: issue_refund
|
|
86
|
+
arguments:
|
|
87
|
+
issue_refund:
|
|
88
|
+
order_id: "12345"
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Python API
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from agent_trace_eval import RegressionRunner, render_markdown_report
|
|
95
|
+
from agent_trace_eval.loader import load_case, load_trace
|
|
96
|
+
from agent_trace_eval.result import SuiteResult
|
|
97
|
+
|
|
98
|
+
case = load_case("examples/cases/refund_lookup.yaml")
|
|
99
|
+
trace = load_trace("examples/traces/refund_lookup.json")
|
|
100
|
+
|
|
101
|
+
runner = RegressionRunner()
|
|
102
|
+
result = runner.run_case(case, trace)
|
|
103
|
+
report = render_markdown_report(SuiteResult(case_results=[result]))
|
|
104
|
+
print(report)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Related writing
|
|
108
|
+
|
|
109
|
+
This project complements a series on agent regression testing and release gates:
|
|
110
|
+
|
|
111
|
+
- [Final Answers Are Not Enough (Part 1)](https://yfccyf.github.io/writing/final-answers-are-not-enough-golden-trace-regression-testing-part-1/)
|
|
112
|
+
- [From Golden Traces to Release Gates (Part 2)](https://yfccyf.github.io/writing/from-golden-traces-to-release-gates-building-an-agent-regression-harness-part-2/)
|
|
113
|
+
|
|
114
|
+
## Development
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
pytest
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## License
|
|
121
|
+
|
|
122
|
+
MIT
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "agent-trace-eval"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Golden trace regression evaluation for tool-using AI agents"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Yifan Cheng" }]
|
|
13
|
+
keywords = ["ai-agents", "evaluation", "regression-testing", "tool-calls", "observability"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.10",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Topic :: Software Development :: Testing",
|
|
23
|
+
]
|
|
24
|
+
dependencies = [
|
|
25
|
+
"pyyaml>=6.0",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[project.optional-dependencies]
|
|
29
|
+
dev = [
|
|
30
|
+
"pytest>=8.0",
|
|
31
|
+
"pytest-cov>=5.0",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[project.scripts]
|
|
35
|
+
agent-trace-eval = "agent_trace_eval.cli:main"
|
|
36
|
+
|
|
37
|
+
[project.urls]
|
|
38
|
+
Homepage = "https://github.com/yfccyf/agent-trace-eval"
|
|
39
|
+
Documentation = "https://github.com/yfccyf/agent-trace-eval#readme"
|
|
40
|
+
Repository = "https://github.com/yfccyf/agent-trace-eval"
|
|
41
|
+
|
|
42
|
+
[tool.hatch.build.targets.wheel]
|
|
43
|
+
packages = ["src/agent_trace_eval"]
|
|
44
|
+
|
|
45
|
+
[tool.hatch.build.targets.sdist]
|
|
46
|
+
include = [
|
|
47
|
+
"src/agent_trace_eval",
|
|
48
|
+
"README.md",
|
|
49
|
+
"LICENSE",
|
|
50
|
+
"pyproject.toml",
|
|
51
|
+
]
|
|
52
|
+
|
|
53
|
+
[tool.pytest.ini_options]
|
|
54
|
+
testpaths = ["tests"]
|
|
55
|
+
pythonpath = ["src"]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Golden trace regression evaluation for tool-using AI agents."""
|
|
2
|
+
|
|
3
|
+
from agent_trace_eval.case import EvalCase
|
|
4
|
+
from agent_trace_eval.models import AgentTrace
|
|
5
|
+
from agent_trace_eval.report import render_markdown_report
|
|
6
|
+
from agent_trace_eval.result import CaseResult, MatchResult, Severity
|
|
7
|
+
from agent_trace_eval.runner import RegressionRunner
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"AgentTrace",
|
|
11
|
+
"CaseResult",
|
|
12
|
+
"MatchResult",
|
|
13
|
+
"RegressionRunner",
|
|
14
|
+
"Severity",
|
|
15
|
+
"EvalCase",
|
|
16
|
+
"render_markdown_report",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from agent_trace_eval.result import Severity
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class EvalCase:
    """One evaluation case: an identifier plus free-form input and expectations.

    ``expect`` is also the mapping that :meth:`severity` searches for
    per-check severity overrides.
    """

    id: str
    description: str = ""
    input: dict[str, Any] = field(default_factory=dict)
    expect: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "EvalCase":
        """Build a case from a parsed mapping.

        The identifier is read from ``id`` and falls back to ``case_id``.
        ``input`` and ``expect`` are shallow-copied; missing or falsy values
        become empty dicts.

        Raises:
            ValueError: If neither ``id`` nor ``case_id`` holds a truthy value.
        """
        identifier = data.get("id") or data.get("case_id")
        if identifier:
            return cls(
                id=str(identifier),
                description=str(data.get("description", "")),
                input=dict(data.get("input") or {}),
                expect=dict(data.get("expect") or {}),
            )
        raise ValueError("Test case must include an 'id' or 'case_id' field.")

    def severity(self, key: str, default: Severity = Severity.MEDIUM) -> Severity:
        """Return the severity stored under dotted ``key`` in ``expect``.

        ``default`` is returned when the key is absent or its value is ``None``;
        otherwise the value is stringified and passed to ``Severity``.
        """
        configured = self._lookup(self.expect, key)
        return default if configured is None else Severity(str(configured))

    def _lookup(self, data: dict[str, Any], dotted_key: str) -> Any:
        """Walk ``data`` along the ``.``-separated ``dotted_key``.

        Returns ``None`` as soon as a segment is missing or the current node
        is not a dict.
        """
        node: Any = data
        for segment in dotted_key.split("."):
            if isinstance(node, dict) and segment in node:
                node = node[segment]
            else:
                return None
        return node
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from agent_trace_eval.loader import load_cases
|
|
8
|
+
from agent_trace_eval.report import render_markdown_report
|
|
9
|
+
from agent_trace_eval.result import SuiteResult
|
|
10
|
+
from agent_trace_eval.runner import RegressionRunner
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the ``agent-trace-eval`` command.

    ``--cases`` and ``--traces-dir`` are required; ``--report`` is optional.
    """
    cli = argparse.ArgumentParser(
        prog="agent-trace-eval",
        description="Evaluate agent execution traces against golden expectations.",
    )
    # (flag, keyword arguments) pairs, registered in declaration order.
    option_specs = (
        (
            "--cases",
            {
                "required": True,
                "help": "Path to a YAML/JSON case file or directory of case files.",
            },
        ),
        (
            "--traces-dir",
            {
                "required": True,
                "help": "Directory containing trace files named <case_id>.json or .yaml.",
            },
        ),
        (
            "--report",
            {"help": "Optional path to write a Markdown report."},
        ),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _collect_case_files(path: Path) -> list[Path]:
|
|
36
|
+
if path.is_file():
|
|
37
|
+
return [path]
|
|
38
|
+
return sorted(path.glob("*.yml")) + sorted(path.glob("*.yaml")) + sorted(path.glob("*.json"))
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the command-line evaluation and return a process exit code.

    Every case found under ``--cases`` is run through a ``RegressionRunner``
    configured with ``--traces-dir``. The Markdown report is printed to stdout
    and, when ``--report`` is given, also written to that path (parent
    directories are created as needed).

    Returns:
        ``2`` when no case files are found, ``0`` when the suite passed,
        ``1`` otherwise.
    """
    args = build_parser().parse_args(argv)

    cases_path = Path(args.cases)
    runner = RegressionRunner(traces_dir=Path(args.traces_dir))

    case_files = _collect_case_files(cases_path)
    if not case_files:
        print(f"No case files found at {cases_path}", file=sys.stderr)
        return 2

    # A single file may hold several cases; results keep file-then-case order.
    case_results = [
        runner.run_case_file(case)
        for case_file in case_files
        for case in load_cases(case_file)
    ]

    suite = SuiteResult(case_results=case_results)
    report = render_markdown_report(suite)
    print(report)

    if args.report:
        destination = Path(args.report)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(report, encoding="utf-8")

    if suite.passed:
        return 0
    return 1
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
|
|
9
|
+
from agent_trace_eval.case import EvalCase
|
|
10
|
+
from agent_trace_eval.models import AgentTrace
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def load_trace(path: str | Path) -> AgentTrace:
    """Parse the YAML/JSON file at ``path`` and build an ``AgentTrace`` from it."""
    return AgentTrace.from_dict(_load_data(path))
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def load_case(path: str | Path) -> EvalCase:
    """Parse the YAML/JSON file at ``path`` and build a single ``EvalCase`` from it."""
    return EvalCase.from_dict(_load_data(path))
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def load_cases(path: str | Path) -> list[EvalCase]:
    """Load every case contained in the file at ``path``.

    Three document layouts are accepted:

    * a top-level list of case mappings,
    * a mapping with a ``cases`` key holding such a list,
    * a single case mapping.
    """
    document = _load_data(path)
    if isinstance(document, list):
        entries = document
    elif "cases" in document:
        entries = document["cases"]
    else:
        entries = [document]
    return [EvalCase.from_dict(entry) for entry in entries]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _load_data(path: str | Path) -> dict[str, Any] | list[dict[str, Any]]:
|
|
33
|
+
file_path = Path(path)
|
|
34
|
+
text = file_path.read_text(encoding="utf-8")
|
|
35
|
+
if file_path.suffix in {".yaml", ".yml"}:
|
|
36
|
+
return yaml.safe_load(text)
|
|
37
|
+
return json.loads(text)
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from agent_trace_eval.matchers.arguments import match_arguments
|
|
2
|
+
from agent_trace_eval.matchers.handoffs import match_handoffs
|
|
3
|
+
from agent_trace_eval.matchers.ordering import match_ordering
|
|
4
|
+
from agent_trace_eval.matchers.recovery import match_recovery
|
|
5
|
+
from agent_trace_eval.matchers.tools import match_tools
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"match_arguments",
|
|
9
|
+
"match_handoffs",
|
|
10
|
+
"match_ordering",
|
|
11
|
+
"match_recovery",
|
|
12
|
+
"match_tools",
|
|
13
|
+
]
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from agent_trace_eval.case import EvalCase
|
|
6
|
+
from agent_trace_eval.models import AgentTrace
|
|
7
|
+
from agent_trace_eval.result import MatchResult, Severity
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def match_arguments(trace: AgentTrace, case: EvalCase) -> list[MatchResult]:
    """Check tool-call arguments in ``trace`` against ``case.expect["arguments"]``.

    The expectation maps a tool name to a mapping of argument rules; entries
    whose rules are not a dict are skipped. Only the first call returned by
    ``trace.find_tool_calls`` is inspected. A rule of the form
    ``{"any_of": [...]}`` is a membership check; any other value is compared
    for equality. A tool with rules but no calls yields one failing
    ``arguments.tool_present`` result.

    Severity is read from ``severity.arguments`` and defaults to ``HIGH``.
    """
    expected_by_tool = case.expect.get("arguments", {})
    if not expected_by_tool:
        return []

    outcomes: list[MatchResult] = []
    for tool_name, rules in expected_by_tool.items():
        if not isinstance(rules, dict):
            continue

        recorded = trace.find_tool_calls(tool_name)
        if recorded:
            first_call = recorded[0]
            for arg_name, expected in rules.items():
                uses_any_of = isinstance(expected, dict) and "any_of" in expected
                if uses_any_of:
                    outcome = _match_any_of(
                        tool_name, arg_name, first_call.arguments, expected["any_of"]
                    )
                else:
                    outcome = _match_exact(
                        tool_name, arg_name, first_call.arguments, expected
                    )
                outcome.severity = case.severity("severity.arguments", Severity.HIGH)
                outcomes.append(outcome)
        else:
            outcomes.append(
                MatchResult(
                    name="arguments.tool_present",
                    passed=False,
                    message=f"Cannot evaluate arguments for '{tool_name}' because it was not called.",
                    severity=case.severity("severity.arguments", Severity.HIGH),
                    details={"tool": tool_name},
                )
            )

    return outcomes
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _match_exact(
    tool_name: str,
    arg_name: str,
    arguments: dict[str, Any],
    expected: Any,
) -> MatchResult:
    """Compare ``arguments[arg_name]`` with ``expected`` using ``==``.

    A missing argument is read as ``None``. Expected and actual values are
    both recorded in the result details.
    """
    actual = arguments.get(arg_name)
    matches = actual == expected
    if matches:
        message = f"Argument '{arg_name}' for '{tool_name}' matches expected value."
    else:
        message = f"Argument '{arg_name}' for '{tool_name}' does not match expected value."
    return MatchResult(
        name="arguments.exact",
        passed=matches,
        message=message,
        details={"tool": tool_name, "argument": arg_name, "expected": expected, "actual": actual},
    )
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _match_any_of(
    tool_name: str,
    arg_name: str,
    arguments: dict[str, Any],
    options: list[Any],
) -> MatchResult:
    """Check that ``arguments[arg_name]`` is a member of ``options``.

    A missing argument is read as ``None``. The allowed options and the actual
    value are both recorded in the result details.
    """
    actual = arguments.get(arg_name)
    allowed = actual in options
    if allowed:
        message = f"Argument '{arg_name}' for '{tool_name}' is one of the allowed values."
    else:
        message = f"Argument '{arg_name}' for '{tool_name}' is not one of the allowed values."
    return MatchResult(
        name="arguments.any_of",
        passed=allowed,
        message=message,
        details={"tool": tool_name, "argument": arg_name, "options": options, "actual": actual},
    )
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from agent_trace_eval.case import EvalCase
|
|
4
|
+
from agent_trace_eval.models import AgentTrace
|
|
5
|
+
from agent_trace_eval.result import MatchResult, Severity
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def match_handoffs(trace: AgentTrace, case: EvalCase) -> list[MatchResult]:
    """Check the handoffs in ``trace`` against ``case.expect["handoffs"]``.

    One result is produced per rule: first every ``required`` rule (passes
    when a matching handoff exists), then every ``forbidden`` rule (passes
    when none exists). Severity is read from ``severity.handoffs`` and
    defaults to ``HIGH``. Each result's details carry the rule and a summary
    of all observed handoffs.
    """
    expect = case.expect.get("handoffs", {})
    if not expect:
        return []

    observed_events = trace.handoffs()
    outcomes: list[MatchResult] = []

    for rule in expect.get("required", []):
        seen = _has_handoff(observed_events, rule)
        if seen:
            message = "Required handoff was observed."
        else:
            message = "Required handoff was not observed."
        outcomes.append(
            MatchResult(
                name="handoffs.required",
                passed=seen,
                message=message,
                severity=case.severity("severity.handoffs", Severity.HIGH),
                details={
                    "rule": rule,
                    "observed": list(map(_handoff_summary, observed_events)),
                },
            )
        )

    for rule in expect.get("forbidden", []):
        absent = not _has_handoff(observed_events, rule)
        if absent:
            message = "Forbidden handoff was not observed."
        else:
            message = "Forbidden handoff was observed."
        outcomes.append(
            MatchResult(
                name="handoffs.forbidden",
                passed=absent,
                message=message,
                severity=case.severity("severity.handoffs", Severity.HIGH),
                details={
                    "rule": rule,
                    "observed": list(map(_handoff_summary, observed_events)),
                },
            )
        )

    return outcomes
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _has_handoff(handoffs: list, rule: dict) -> bool:
|
|
54
|
+
for event in handoffs:
|
|
55
|
+
if rule.get("from") and event.from_agent != rule.get("from"):
|
|
56
|
+
continue
|
|
57
|
+
if rule.get("to") and event.to_agent != rule.get("to"):
|
|
58
|
+
continue
|
|
59
|
+
if rule.get("reason") and event.reason != rule.get("reason"):
|
|
60
|
+
continue
|
|
61
|
+
return True
|
|
62
|
+
return False
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _handoff_summary(event) -> dict:
|
|
66
|
+
return {
|
|
67
|
+
"from": event.from_agent,
|
|
68
|
+
"to": event.to_agent,
|
|
69
|
+
"reason": event.reason,
|
|
70
|
+
}
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from agent_trace_eval.case import EvalCase
|
|
4
|
+
from agent_trace_eval.models import AgentTrace
|
|
5
|
+
from agent_trace_eval.result import MatchResult, Severity
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def match_ordering(trace: AgentTrace, case: EvalCase) -> list[MatchResult]:
    """Check relative ordering rules from ``case.expect["ordering"]``.

    The observed sequence is ``trace.event_names()`` with ``None`` entries
    dropped. ``before`` rules are evaluated first, then ``after`` rules; a
    rule missing ``first`` or ``second`` is skipped. Severity is read from
    ``severity.ordering`` and defaults to ``HIGH``.
    """
    expect = case.expect.get("ordering", {})
    if not expect:
        return []

    observed = [name for name in trace.event_names() if name is not None]
    outcomes: list[MatchResult] = []

    for rule in expect.get("before", []):
        first, second = rule.get("first"), rule.get("second")
        if first and second:
            holds = _appears_before(observed, first, second)
            if holds:
                message = f"'{first}' appears before '{second}'."
            else:
                message = f"'{first}' does not appear before '{second}'."
            outcomes.append(
                MatchResult(
                    name="ordering.before",
                    passed=holds,
                    message=message,
                    severity=case.severity("severity.ordering", Severity.HIGH),
                    details={"first": first, "second": second, "observed": observed},
                )
            )

    for rule in expect.get("after", []):
        first, second = rule.get("first"), rule.get("second")
        if first and second:
            holds = _appears_after(observed, first, second)
            if holds:
                message = f"'{first}' appears after '{second}'."
            else:
                message = f"'{first}' does not appear after '{second}'."
            outcomes.append(
                MatchResult(
                    name="ordering.after",
                    passed=holds,
                    message=message,
                    severity=case.severity("severity.ordering", Severity.HIGH),
                    details={"first": first, "second": second, "observed": observed},
                )
            )

    return outcomes
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _appears_before(sequence: list[str], first: str, second: str) -> bool:
|
|
62
|
+
if first not in sequence or second not in sequence:
|
|
63
|
+
return False
|
|
64
|
+
return sequence.index(first) < sequence.index(second)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _appears_after(sequence: list[str], first: str, second: str) -> bool:
|
|
68
|
+
if first not in sequence or second not in sequence:
|
|
69
|
+
return False
|
|
70
|
+
return sequence.index(first) > sequence.index(second)
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from agent_trace_eval.case import EvalCase
|
|
4
|
+
from agent_trace_eval.models import AgentTrace
|
|
5
|
+
from agent_trace_eval.result import MatchResult, Severity
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def match_recovery(trace: AgentTrace, case: EvalCase) -> list[MatchResult]:
    """Check recovery decisions in ``trace`` against ``case.expect["recovery"]``.

    One result is produced per rule: first every ``required`` rule (passes
    when a matching decision exists), then every ``forbidden`` rule (passes
    when none exists). Severity is read from ``severity.recovery`` and
    defaults to ``HIGH``. Each result's details carry the rule and a summary
    of all observed decisions.
    """
    expect = case.expect.get("recovery", {})
    if not expect:
        return []

    observed_decisions = trace.recovery_decisions()
    outcomes: list[MatchResult] = []

    for rule in expect.get("required", []):
        seen = _has_recovery(observed_decisions, rule)
        if seen:
            message = "Required recovery decision was observed."
        else:
            message = "Required recovery decision was not observed."
        outcomes.append(
            MatchResult(
                name="recovery.required",
                passed=seen,
                message=message,
                severity=case.severity("severity.recovery", Severity.HIGH),
                details={
                    "rule": rule,
                    "observed": list(map(_recovery_summary, observed_decisions)),
                },
            )
        )

    for rule in expect.get("forbidden", []):
        absent = not _has_recovery(observed_decisions, rule)
        if absent:
            message = "Forbidden recovery decision was not observed."
        else:
            message = "Forbidden recovery decision was observed."
        outcomes.append(
            MatchResult(
                name="recovery.forbidden",
                passed=absent,
                message=message,
                severity=case.severity("severity.recovery", Severity.HIGH),
                details={
                    "rule": rule,
                    "observed": list(map(_recovery_summary, observed_decisions)),
                },
            )
        )

    return outcomes
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _has_recovery(decisions: list, rule: dict) -> bool:
|
|
60
|
+
for event in decisions:
|
|
61
|
+
if rule.get("decision") and event.decision != rule.get("decision"):
|
|
62
|
+
continue
|
|
63
|
+
if rule.get("error_class") and event.error_class != rule.get("error_class"):
|
|
64
|
+
continue
|
|
65
|
+
return True
|
|
66
|
+
return False
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _recovery_summary(event) -> dict:
|
|
70
|
+
return {
|
|
71
|
+
"decision": event.decision,
|
|
72
|
+
"error_class": event.error_class,
|
|
73
|
+
"reason": event.reason,
|
|
74
|
+
}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from agent_trace_eval.case import EvalCase
|
|
4
|
+
from agent_trace_eval.models import AgentTrace
|
|
5
|
+
from agent_trace_eval.result import MatchResult, Severity
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def match_tools(trace: AgentTrace, case: EvalCase) -> list[MatchResult]:
    """Check the tool calls of ``trace`` against the ``tools`` expectations.

    The ``tools`` section of ``case.expect`` may contain:

    * ``required``: tool names that must each appear among the calls.
    * ``forbidden``: tool names that must not appear among the calls.
    * ``exact_sequence``: the complete, ordered list of expected tool names.

    One ``MatchResult`` is produced per required tool, per forbidden tool,
    and one for ``exact_sequence`` when it is given. An absent or empty
    ``tools`` section yields no results.
    """
    expect = case.expect.get("tools", {})
    if not expect:
        return []

    results: list[MatchResult] = []
    # Tool calls recorded without a name cannot match any expectation.
    tool_names = [name for name in trace.event_names() if name is not None]

    # ``or []`` tolerates a key that is present but null (for example an
    # empty key in a YAML case file), which would otherwise raise TypeError.
    required = expect.get("required") or []
    for tool_name in required:
        passed = tool_name in tool_names
        results.append(
            MatchResult(
                name="tools.required",
                passed=passed,
                message=(
                    f"Required tool '{tool_name}' was called."
                    if passed
                    else f"Required tool '{tool_name}' was not called."
                ),
                severity=case.severity("severity.tools", Severity.HIGH),
                details={"tool": tool_name, "observed": tool_names},
            )
        )

    forbidden = expect.get("forbidden") or []
    for tool_name in forbidden:
        passed = tool_name not in tool_names
        results.append(
            MatchResult(
                name="tools.forbidden",
                passed=passed,
                message=(
                    f"Forbidden tool '{tool_name}' was not called."
                    if passed
                    else f"Forbidden tool '{tool_name}' was called."
                ),
                severity=case.severity("severity.tools", Severity.HIGH),
                details={"tool": tool_name, "observed": tool_names},
            )
        )

    # ``None`` means "not configured"; an empty sequence is a real
    # expectation that no named tool was called.
    exact_sequence = expect.get("exact_sequence")
    if exact_sequence is not None:
        passed = tool_names == list(exact_sequence)
        results.append(
            MatchResult(
                name="tools.exact_sequence",
                passed=passed,
                message=(
                    "Tool call sequence matches expected order."
                    if passed
                    else "Tool call sequence does not match expected order."
                ),
                # Sequence mismatches are looked up under the ordering key.
                severity=case.severity("severity.ordering", Severity.HIGH),
                details={"expected": exact_sequence, "observed": tool_names},
            )
        )

    return results
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class TraceEvent:
    """A single event recorded in an agent trace."""

    # Event kind, e.g. "tool_call", "handoff", "recovery_decision".
    type: str
    name: str | None = None
    arguments: dict[str, Any] = field(default_factory=dict)
    result: dict[str, Any] = field(default_factory=dict)
    payload: dict[str, Any] = field(default_factory=dict)
    # Populated from the "from" / "to" keys of the raw event.
    from_agent: str | None = None
    to_agent: str | None = None
    reason: str | None = None
    decision: str | None = None
    error_class: str | None = None
    text: str | None = None
    status: str | None = None
    source: str | None = None
    # Every raw key that is not mapped onto one of the fields above.
    metadata: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "TraceEvent":
        """Build an event from its raw dictionary form.

        A missing ``type`` becomes ``"unknown"``; missing or null
        ``arguments``/``result``/``payload`` become empty dicts (the dicts
        are shallow-copied). Unrecognised keys are kept in ``metadata``.
        """
        known_keys = frozenset(
            (
                "type",
                "name",
                "arguments",
                "result",
                "payload",
                "from",
                "to",
                "reason",
                "decision",
                "error_class",
                "text",
                "status",
                "source",
            )
        )
        extras = {key: value for key, value in data.items() if key not in known_keys}
        read = data.get
        return cls(
            type=read("type", "unknown"),
            name=read("name"),
            arguments=dict(read("arguments") or {}),
            result=dict(read("result") or {}),
            payload=dict(read("payload") or {}),
            from_agent=read("from"),
            to_agent=read("to"),
            reason=read("reason"),
            decision=read("decision"),
            error_class=read("error_class"),
            text=read("text"),
            status=_event_status(data),
            source=read("source"),
            metadata=extras,
        )
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@dataclass
class AgentTrace:
    """A recorded agent run: identifying fields plus its ordered events."""

    case_id: str | None = None
    run_id: str | None = None
    input: str | None = None
    events: list[TraceEvent] = field(default_factory=list)
    # Every raw top-level key other than case_id, run_id, input and events.
    metadata: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "AgentTrace":
        """Build a trace from its raw dictionary form.

        A missing or null ``events`` entry yields a trace with no events.
        Unrecognised top-level keys are kept in ``metadata``.
        """
        # ``or []`` also covers an explicit null, which ``get(..., [])``
        # would pass through and fail to iterate.
        raw_events = data.get("events") or []
        events = [TraceEvent.from_dict(event) for event in raw_events]
        return cls(
            case_id=data.get("case_id"),
            run_id=data.get("run_id"),
            input=data.get("input"),
            events=events,
            metadata={
                key: value
                for key, value in data.items()
                if key not in {"case_id", "run_id", "input", "events"}
            },
        )

    def _events_of_type(self, event_type: str) -> list[TraceEvent]:
        """Return the events whose ``type`` equals ``event_type``, in order."""
        return [event for event in self.events if event.type == event_type]

    def tool_calls(self) -> list[TraceEvent]:
        """Return all ``tool_call`` events in trace order."""
        return self._events_of_type("tool_call")

    def handoffs(self) -> list[TraceEvent]:
        """Return all ``handoff`` events in trace order."""
        return self._events_of_type("handoff")

    def recovery_decisions(self) -> list[TraceEvent]:
        """Return all ``recovery_decision`` events in trace order."""
        return self._events_of_type("recovery_decision")

    def final_answers(self) -> list[TraceEvent]:
        """Return all ``final_answer`` events in trace order."""
        return self._events_of_type("final_answer")

    def event_names(self) -> list[str | None]:
        """Return the names of the ``tool_call`` events, in trace order.

        Despite the method name, only tool calls are considered. A call
        recorded without a name contributes ``None``.
        """
        return [event.name for event in self.tool_calls()]

    def find_tool_calls(self, tool_name: str) -> list[TraceEvent]:
        """Return the ``tool_call`` events whose name equals ``tool_name``."""
        return [event for event in self.tool_calls() if event.name == tool_name]
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _event_status(data: dict[str, Any]) -> str | None:
|
|
107
|
+
if "status" in data and data["status"] is not None:
|
|
108
|
+
return str(data["status"])
|
|
109
|
+
result = data.get("result")
|
|
110
|
+
if isinstance(result, dict) and "status" in result:
|
|
111
|
+
return str(result["status"])
|
|
112
|
+
return None
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from agent_trace_eval.result import CaseResult, SuiteResult
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def render_markdown_report(suite: SuiteResult) -> str:
    """Render a suite result as a Markdown document.

    The report opens with total/passed/failed counts, followed by one
    ``##`` section per case listing each match as a checkbox item. The
    box is ticked for matches that failed. The returned text ends with
    exactly one newline.
    """
    passed_count = suite.total - suite.failed_count
    out = [
        "# Agent Trace Regression Report",
        "",
        f"- Total cases: {suite.total}",
        f"- Passed: {passed_count}",
        f"- Failed: {suite.failed_count}",
        "",
    ]

    for case_result in suite.case_results:
        verdict = "PASS" if case_result.passed else "FAIL"
        out.append(f"## {verdict}: {case_result.case_id}")
        if case_result.description:
            out.append(case_result.description)
        if case_result.trace_path:
            out.append(f"Trace: `{case_result.trace_path}`")
        out.append("")

        if case_result.matches:
            out.extend(
                f"- [{'x' if item.failed else ' '}] **{item.name}** "
                f"({item.severity.value}): {item.message}"
                for item in case_result.matches
            )
        else:
            out.append("_No matchers configured._")
        out.append("")

    # Drop the trailing blank line(s) and terminate with a single newline.
    return "\n".join(out).rstrip() + "\n"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def render_case_summary(result: CaseResult) -> str:
    """Return a one-line PASS/FAIL summary with the failure count of a case."""
    failure_count = len(result.failures)
    verdict = "FAIL" if not result.passed else "PASS"
    return f"{verdict} {result.case_id}: {failure_count} failure(s)"
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Severity(str, Enum):
    """Severity attached to a match result.

    Members are also ``str`` instances, so each compares equal to its
    lowercase value. They are declared from least to most severe.
    """

    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class MatchResult:
    """Outcome of a single matcher check."""

    # Dotted check identifier, e.g. "tools.required".
    name: str
    passed: bool
    # Human-readable explanation of the outcome.
    message: str
    severity: Severity = Severity.MEDIUM
    # Structured context supplied by the matcher that produced the result.
    details: dict[str, Any] = field(default_factory=dict)

    @property
    def failed(self) -> bool:
        """Logical negation of ``passed``."""
        return not self.passed
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class CaseResult:
    """Aggregated outcome of evaluating one case against one trace."""

    case_id: str
    description: str
    passed: bool
    matches: list[MatchResult] = field(default_factory=list)
    # Location of the evaluated trace, when it was loaded from a file.
    trace_path: str | None = None

    @property
    def failures(self) -> list[MatchResult]:
        """The matches that failed, in their original order."""
        return [item for item in self.matches if item.failed]

    @property
    def highest_severity(self) -> Severity | None:
        """Severity of the worst failure, or ``None`` when nothing failed.

        When several failures share the top severity, the earliest one
        determines the returned value.
        """
        failing = self.failures
        if not failing:
            return None
        # Position in this tuple defines the ranking, least severe first.
        ranking = (Severity.LOW, Severity.MEDIUM, Severity.HIGH, Severity.CRITICAL)
        worst = max(failing, key=lambda item: ranking.index(item.severity))
        return worst.severity
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class SuiteResult:
    """Collected case results for one suite run."""

    case_results: list[CaseResult] = field(default_factory=list)

    @property
    def passed(self) -> bool:
        """True when no case failed; an empty suite therefore passes."""
        return not any(not outcome.passed for outcome in self.case_results)

    @property
    def total(self) -> int:
        """Number of case results in the suite."""
        return len(self.case_results)

    @property
    def failed_count(self) -> int:
        """Number of case results that did not pass."""
        return len([outcome for outcome in self.case_results if not outcome.passed])
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from agent_trace_eval.case import EvalCase
|
|
6
|
+
from agent_trace_eval.loader import load_case, load_trace
|
|
7
|
+
from agent_trace_eval.matchers import (
|
|
8
|
+
match_arguments,
|
|
9
|
+
match_handoffs,
|
|
10
|
+
match_ordering,
|
|
11
|
+
match_recovery,
|
|
12
|
+
match_tools,
|
|
13
|
+
)
|
|
14
|
+
from agent_trace_eval.models import AgentTrace
|
|
15
|
+
from agent_trace_eval.result import CaseResult, SuiteResult
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class RegressionRunner:
    """Run evaluation cases against agent traces and collect the results."""

    def __init__(self, traces_dir: str | Path | None = None) -> None:
        """Remember an optional directory used to locate traces by case id."""
        self.traces_dir = Path(traces_dir) if traces_dir else None

    def run_case(self, case: EvalCase, trace: AgentTrace, trace_path: str | None = None) -> CaseResult:
        """Apply every matcher to ``trace`` and aggregate the outcome.

        The case passes when all produced matches pass. A case that
        configures no expectations produces no matches and therefore passes.
        """
        matchers = (
            match_tools,
            match_arguments,
            match_ordering,
            match_handoffs,
            match_recovery,
        )
        matches = []
        for matcher in matchers:
            matches.extend(matcher(trace, case))
        return CaseResult(
            case_id=case.id,
            description=case.description,
            passed=all(match.passed for match in matches),
            matches=matches,
            trace_path=trace_path,
        )

    def run_case_file(
        self,
        case_or_path: EvalCase | str | Path,
        trace_path: str | Path | None = None,
    ) -> CaseResult:
        """Run one case given either a loaded case or a path to a case file.

        When ``trace_path`` is not given, the trace is looked up in
        ``traces_dir`` by case id.

        Raises:
            ValueError: no ``trace_path`` was given and ``traces_dir`` is unset.
            FileNotFoundError: no trace file for the case exists in
                ``traces_dir``.
        """
        if isinstance(case_or_path, EvalCase):
            case = case_or_path
        else:
            case = load_case(case_or_path)
        resolved_trace = trace_path or self._default_trace_path(case.id)
        trace = load_trace(resolved_trace)
        return self.run_case(case, trace, str(resolved_trace))

    def run_suite(self, cases: list[EvalCase], traces: dict[str, AgentTrace]) -> SuiteResult:
        """Run every case against the trace registered under its id.

        A case with no entry in ``traces`` is reported as failed and carries
        a single failing ``trace.missing`` match, so reports state why the
        case failed instead of showing a failure with no failing checks.
        """
        # Imported here because this module's top-level imports only bring
        # in CaseResult and SuiteResult from the result module.
        from agent_trace_eval.result import MatchResult, Severity

        results = []
        for case in cases:
            if case.id not in traces:
                results.append(
                    CaseResult(
                        case_id=case.id,
                        description=case.description,
                        passed=False,
                        matches=[
                            MatchResult(
                                name="trace.missing",
                                passed=False,
                                message=f"No trace was provided for case '{case.id}'.",
                                severity=Severity.HIGH,
                                details={"case_id": case.id},
                            )
                        ],
                        trace_path=None,
                    )
                )
                continue
            results.append(self.run_case(case, traces[case.id]))
        return SuiteResult(case_results=results)

    def _default_trace_path(self, case_id: str) -> Path:
        """Locate ``<case_id>.json``, ``.yaml`` or ``.yml`` in ``traces_dir``.

        Suffixes are tried in that order and the first existing file wins.

        Raises:
            ValueError: ``traces_dir`` is not configured.
            FileNotFoundError: none of the candidate files exists.
        """
        if not self.traces_dir:
            raise ValueError("trace_path is required when traces_dir is not configured.")
        for suffix in (".json", ".yaml", ".yml"):
            candidate = self.traces_dir / f"{case_id}{suffix}"
            if candidate.exists():
                return candidate
        raise FileNotFoundError(f"No trace file found for case '{case_id}' in {self.traces_dir}")
|