agent-trace-eval 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ .eggs/
5
+ dist/
6
+ build/
7
+ .pytest_cache/
8
+ .coverage
9
+ htmlcov/
10
+ .venv/
11
+ venv/
12
+ .DS_Store
13
+ reports/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Yifan Cheng
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,148 @@
1
+ Metadata-Version: 2.4
2
+ Name: agent-trace-eval
3
+ Version: 0.1.0
4
+ Summary: Golden trace regression evaluation for tool-using AI agents
5
+ Project-URL: Homepage, https://github.com/yfccyf/agent-trace-eval
6
+ Project-URL: Documentation, https://github.com/yfccyf/agent-trace-eval#readme
7
+ Project-URL: Repository, https://github.com/yfccyf/agent-trace-eval
8
+ Author: Yifan Cheng
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: ai-agents,evaluation,observability,regression-testing,tool-calls
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Software Development :: Testing
20
+ Requires-Python: >=3.10
21
+ Requires-Dist: pyyaml>=6.0
22
+ Provides-Extra: dev
23
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
24
+ Requires-Dist: pytest>=8.0; extra == 'dev'
25
+ Description-Content-Type: text/markdown
26
+
27
+ # agent-trace-eval
28
+
29
+ [![CI](https://github.com/yfccyf/agent-trace-eval/actions/workflows/ci.yml/badge.svg)](https://github.com/yfccyf/agent-trace-eval/actions/workflows/ci.yml)
30
+ [![PyPI](https://img.shields.io/pypi/v/agent-trace-eval)](https://pypi.org/project/agent-trace-eval/)
31
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
32
+
33
+ Golden trace regression evaluation for tool-using AI agents.
34
+
35
+ `agent-trace-eval` checks agent execution traces against declarative expectations for:
36
+
37
+ - tool selection (required / forbidden tools)
38
+ - tool-call arguments
39
+ - tool-call ordering
40
+ - multi-agent handoffs
41
+ - recovery decisions (retry / fallback / escalate)
42
+
43
+ It is designed as a small, employer-neutral library you can use in CI to gate agent workflow changes.
44
+
45
+ ## Install
46
+
47
+ ```bash
48
+ pip install agent-trace-eval
49
+ ```
50
+
51
+ For local development:
52
+
53
+ ```bash
54
+ pip install -e ".[dev]"
55
+ ```
56
+
57
+ ## Quick start
58
+
59
+ Run the bundled examples from a checkout of the repository (the `examples/` directory is not shipped in the PyPI package):
60
+
61
+ ```bash
62
+ agent-trace-eval \
63
+ --cases examples/cases \
64
+ --traces-dir examples/traces \
65
+ --report reports/example-report.md
66
+ ```
67
+
68
+ ## Trace format
69
+
70
+ Traces are JSON or YAML documents with an `events` list. Supported event types include:
71
+
72
+ - `tool_call`
73
+ - `handoff`
74
+ - `recovery_decision`
75
+ - `final_answer`
76
+
77
+ Example:
78
+
79
+ ```json
80
+ {
81
+ "case_id": "refund_lookup",
82
+ "events": [
83
+ {
84
+ "type": "tool_call",
85
+ "name": "lookup_order",
86
+ "arguments": { "order_id": "12345" }
87
+ },
88
+ {
89
+ "type": "tool_call",
90
+ "name": "issue_refund",
91
+ "arguments": { "order_id": "12345" }
92
+ }
93
+ ]
94
+ }
95
+ ```
96
+
97
+ ## Case format
98
+
99
+ Cases are YAML or JSON files with `id`, `description`, `input`, and `expect` sections:
100
+
101
+ ```yaml
102
+ id: refund_lookup
103
+ description: Agent should look up an order before issuing a refund.
104
+ expect:
105
+ tools:
106
+ required: [lookup_order, issue_refund]
107
+ forbidden: [delete_account]
108
+ ordering:
109
+ before:
110
+ - first: lookup_order
111
+ second: issue_refund
112
+ arguments:
113
+ issue_refund:
114
+ order_id: "12345"
115
+ ```
116
+
117
+ ## Python API
118
+
119
+ ```python
120
+ from agent_trace_eval import RegressionRunner, render_markdown_report
121
+ from agent_trace_eval.loader import load_case, load_trace
122
+ from agent_trace_eval.result import SuiteResult
123
+
124
+ case = load_case("examples/cases/refund_lookup.yaml")
125
+ trace = load_trace("examples/traces/refund_lookup.json")
126
+
127
+ runner = RegressionRunner()
128
+ result = runner.run_case(case, trace)
129
+ report = render_markdown_report(SuiteResult(case_results=[result]))
130
+ print(report)
131
+ ```
132
+
133
+ ## Related writing
134
+
135
+ This project complements a series on agent regression testing and release gates:
136
+
137
+ - [Final Answers Are Not Enough (Part 1)](https://yfccyf.github.io/writing/final-answers-are-not-enough-golden-trace-regression-testing-part-1/)
138
+ - [From Golden Traces to Release Gates (Part 2)](https://yfccyf.github.io/writing/from-golden-traces-to-release-gates-building-an-agent-regression-harness-part-2/)
139
+
140
+ ## Development
141
+
142
+ ```bash
143
+ pytest
144
+ ```
145
+
146
+ ## License
147
+
148
+ MIT
@@ -0,0 +1,122 @@
1
+ # agent-trace-eval
2
+
3
+ [![CI](https://github.com/yfccyf/agent-trace-eval/actions/workflows/ci.yml/badge.svg)](https://github.com/yfccyf/agent-trace-eval/actions/workflows/ci.yml)
4
+ [![PyPI](https://img.shields.io/pypi/v/agent-trace-eval)](https://pypi.org/project/agent-trace-eval/)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
6
+
7
+ Golden trace regression evaluation for tool-using AI agents.
8
+
9
+ `agent-trace-eval` checks agent execution traces against declarative expectations for:
10
+
11
+ - tool selection (required / forbidden tools)
12
+ - tool-call arguments
13
+ - tool-call ordering
14
+ - multi-agent handoffs
15
+ - recovery decisions (retry / fallback / escalate)
16
+
17
+ It is designed as a small, employer-neutral library you can use in CI to gate agent workflow changes.
18
+
19
+ ## Install
20
+
21
+ ```bash
22
+ pip install agent-trace-eval
23
+ ```
24
+
25
+ For local development:
26
+
27
+ ```bash
28
+ pip install -e ".[dev]"
29
+ ```
30
+
31
+ ## Quick start
32
+
33
+ Run the bundled examples from a checkout of the repository (the `examples/` directory is not shipped in the PyPI package):
34
+
35
+ ```bash
36
+ agent-trace-eval \
37
+ --cases examples/cases \
38
+ --traces-dir examples/traces \
39
+ --report reports/example-report.md
40
+ ```
41
+
42
+ ## Trace format
43
+
44
+ Traces are JSON or YAML documents with an `events` list. Supported event types include:
45
+
46
+ - `tool_call`
47
+ - `handoff`
48
+ - `recovery_decision`
49
+ - `final_answer`
50
+
51
+ Example:
52
+
53
+ ```json
54
+ {
55
+ "case_id": "refund_lookup",
56
+ "events": [
57
+ {
58
+ "type": "tool_call",
59
+ "name": "lookup_order",
60
+ "arguments": { "order_id": "12345" }
61
+ },
62
+ {
63
+ "type": "tool_call",
64
+ "name": "issue_refund",
65
+ "arguments": { "order_id": "12345" }
66
+ }
67
+ ]
68
+ }
69
+ ```
70
+
71
+ ## Case format
72
+
73
+ Cases are YAML or JSON files with `id`, `description`, `input`, and `expect` sections:
74
+
75
+ ```yaml
76
+ id: refund_lookup
77
+ description: Agent should look up an order before issuing a refund.
78
+ expect:
79
+ tools:
80
+ required: [lookup_order, issue_refund]
81
+ forbidden: [delete_account]
82
+ ordering:
83
+ before:
84
+ - first: lookup_order
85
+ second: issue_refund
86
+ arguments:
87
+ issue_refund:
88
+ order_id: "12345"
89
+ ```
90
+
91
+ ## Python API
92
+
93
+ ```python
94
+ from agent_trace_eval import RegressionRunner, render_markdown_report
95
+ from agent_trace_eval.loader import load_case, load_trace
96
+ from agent_trace_eval.result import SuiteResult
97
+
98
+ case = load_case("examples/cases/refund_lookup.yaml")
99
+ trace = load_trace("examples/traces/refund_lookup.json")
100
+
101
+ runner = RegressionRunner()
102
+ result = runner.run_case(case, trace)
103
+ report = render_markdown_report(SuiteResult(case_results=[result]))
104
+ print(report)
105
+ ```
106
+
107
+ ## Related writing
108
+
109
+ This project complements a series on agent regression testing and release gates:
110
+
111
+ - [Final Answers Are Not Enough (Part 1)](https://yfccyf.github.io/writing/final-answers-are-not-enough-golden-trace-regression-testing-part-1/)
112
+ - [From Golden Traces to Release Gates (Part 2)](https://yfccyf.github.io/writing/from-golden-traces-to-release-gates-building-an-agent-regression-harness-part-2/)
113
+
114
+ ## Development
115
+
116
+ ```bash
117
+ pytest
118
+ ```
119
+
120
+ ## License
121
+
122
+ MIT
@@ -0,0 +1,55 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "agent-trace-eval"
7
+ version = "0.1.0"
8
+ description = "Golden trace regression evaluation for tool-using AI agents"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "Yifan Cheng" }]
13
+ keywords = ["ai-agents", "evaluation", "regression-testing", "tool-calls", "observability"]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Intended Audience :: Developers",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3.10",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Topic :: Software Development :: Testing",
23
+ ]
24
+ dependencies = [
25
+ "pyyaml>=6.0",
26
+ ]
27
+
28
+ [project.optional-dependencies]
29
+ dev = [
30
+ "pytest>=8.0",
31
+ "pytest-cov>=5.0",
32
+ ]
33
+
34
+ [project.scripts]
35
+ agent-trace-eval = "agent_trace_eval.cli:main"
36
+
37
+ [project.urls]
38
+ Homepage = "https://github.com/yfccyf/agent-trace-eval"
39
+ Documentation = "https://github.com/yfccyf/agent-trace-eval#readme"
40
+ Repository = "https://github.com/yfccyf/agent-trace-eval"
41
+
42
+ [tool.hatch.build.targets.wheel]
43
+ packages = ["src/agent_trace_eval"]
44
+
45
+ [tool.hatch.build.targets.sdist]
46
+ include = [
47
+ "src/agent_trace_eval",
48
+ "README.md",
49
+ "LICENSE",
50
+ "pyproject.toml",
51
+ ]
52
+
53
+ [tool.pytest.ini_options]
54
+ testpaths = ["tests"]
55
+ pythonpath = ["src"]
@@ -0,0 +1,19 @@
1
+ """Golden trace regression evaluation for tool-using AI agents."""
2
+
3
+ from agent_trace_eval.case import EvalCase
4
+ from agent_trace_eval.models import AgentTrace
5
+ from agent_trace_eval.report import render_markdown_report
6
+ from agent_trace_eval.result import CaseResult, MatchResult, Severity
7
+ from agent_trace_eval.runner import RegressionRunner
8
+
9
+ __all__ = [
10
+ "AgentTrace",
11
+ "CaseResult",
12
+ "MatchResult",
13
+ "RegressionRunner",
14
+ "Severity",
15
+ "EvalCase",
16
+ "render_markdown_report",
17
+ ]
18
+
19
+ __version__ = "0.1.0"
@@ -0,0 +1,40 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Any
5
+
6
+ from agent_trace_eval.result import Severity
7
+
8
+
9
@dataclass
class EvalCase:
    """A declarative test case: an identifier plus the expectations a trace must meet.

    ``expect`` holds the matcher configuration (for example ``tools``,
    ``ordering`` or ``arguments``) and may carry a ``severity`` mapping that
    overrides the default severity of a matcher group.
    """

    id: str
    description: str = ""
    input: dict[str, Any] = field(default_factory=dict)
    expect: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "EvalCase":
        """Build a case from a parsed YAML/JSON mapping.

        Args:
            data: Mapping with an ``id`` (or ``case_id``) key and optional
                ``description``, ``input`` and ``expect`` keys.

        Raises:
            ValueError: If neither ``id`` nor ``case_id`` holds a truthy value.
        """
        case_id = data.get("id") or data.get("case_id")
        if not case_id:
            raise ValueError("Test case must include an 'id' or 'case_id' field.")
        return cls(
            id=str(case_id),
            description=str(data.get("description", "")),
            # ``or {}`` tolerates keys that are present but null.
            input=dict(data.get("input") or {}),
            expect=dict(data.get("expect") or {}),
        )

    def severity(self, key: str, default: Severity = Severity.MEDIUM) -> Severity:
        """Return the severity configured under the dotted ``key`` inside ``expect``.

        Args:
            key: Dotted path such as ``"severity.tools"``.
            default: Value returned when the path is absent or null.

        Raises:
            ValueError: If the configured value is not a known severity name.
        """
        raw = self._lookup(self.expect, key)
        if raw is None:
            return default
        # Cases built in Python may already hold enum members; str() of a
        # member is "Severity.HIGH", which the value lookup would reject.
        if isinstance(raw, Severity):
            return raw
        # Accept "High" / " high " as well as the canonical lowercase value.
        return Severity(str(raw).strip().lower())

    def _lookup(self, data: dict[str, Any], dotted_key: str) -> Any:
        """Walk ``data`` along ``dotted_key``; return ``None`` if any step is missing."""
        current: Any = data
        for part in dotted_key.split("."):
            if not isinstance(current, dict) or part not in current:
                return None
            current = current[part]
        return current
@@ -0,0 +1,72 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ from agent_trace_eval.loader import load_cases
8
+ from agent_trace_eval.report import render_markdown_report
9
+ from agent_trace_eval.result import SuiteResult
10
+ from agent_trace_eval.runner import RegressionRunner
11
+
12
+
13
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the ``agent-trace-eval`` command."""
    cli = argparse.ArgumentParser(
        prog="agent-trace-eval",
        description="Evaluate agent execution traces against golden expectations.",
    )
    # (flag, add_argument keywords), registered in the order shown by --help.
    options = (
        (
            "--cases",
            {
                "required": True,
                "help": "Path to a YAML/JSON case file or directory of case files.",
            },
        ),
        (
            "--traces-dir",
            {
                "required": True,
                "help": "Directory containing trace files named <case_id>.json or .yaml.",
            },
        ),
        (
            "--report",
            {"help": "Optional path to write a Markdown report."},
        ),
    )
    for flag, settings in options:
        cli.add_argument(flag, **settings)
    return cli
33
+
34
+
35
+ def _collect_case_files(path: Path) -> list[Path]:
36
+ if path.is_file():
37
+ return [path]
38
+ return sorted(path.glob("*.yml")) + sorted(path.glob("*.yaml")) + sorted(path.glob("*.json"))
39
+
40
+
41
def main(argv: list[str] | None = None) -> int:
    """Run the command-line entry point and return a process exit code.

    Returns:
        ``2`` when no case files are found, ``0`` when the suite passed,
        ``1`` otherwise.
    """
    options = build_parser().parse_args(argv)

    case_source = Path(options.cases)
    runner = RegressionRunner(traces_dir=Path(options.traces_dir))

    case_files = _collect_case_files(case_source)
    if not case_files:
        print(f"No case files found at {case_source}", file=sys.stderr)
        return 2

    # One file may hold several cases; flatten them into a single result list.
    outcomes = [
        runner.run_case_file(case)
        for case_file in case_files
        for case in load_cases(case_file)
    ]

    suite = SuiteResult(case_results=outcomes)
    markdown = render_markdown_report(suite)
    print(markdown)

    if options.report:
        destination = Path(options.report)
        # Create missing parent directories so the report can be written.
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(markdown, encoding="utf-8")

    return 0 if suite.passed else 1
69
+
70
+
71
+ if __name__ == "__main__":
72
+ raise SystemExit(main())
@@ -0,0 +1,37 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ import yaml
8
+
9
+ from agent_trace_eval.case import EvalCase
10
+ from agent_trace_eval.models import AgentTrace
11
+
12
+
13
def load_trace(path: str | Path) -> AgentTrace:
    """Read the file at ``path`` and build an ``AgentTrace`` from its contents."""
    return AgentTrace.from_dict(_load_data(path))
16
+
17
+
18
def load_case(path: str | Path) -> EvalCase:
    """Read the file at ``path`` and build a single ``EvalCase`` from its contents."""
    return EvalCase.from_dict(_load_data(path))
21
+
22
+
23
def load_cases(path: str | Path) -> list[EvalCase]:
    """Load every case stored in the file at ``path``.

    Three layouts are accepted: a top-level list of cases, a mapping with a
    ``cases`` list, or a mapping that is itself a single case.

    Raises:
        ValueError: If the document is neither a mapping nor a list (for
            example an empty YAML file, which parses to ``None``).
    """
    data = _load_data(path)
    if isinstance(data, list):
        items = data
    elif isinstance(data, dict):
        # ``cases: null`` is treated as an empty list instead of crashing.
        items = (data["cases"] or []) if "cases" in data else [data]
    else:
        raise ValueError(
            f"Case file {path} must contain a mapping or a list of cases, "
            f"got {type(data).__name__}."
        )
    return [EvalCase.from_dict(item) for item in items]
30
+
31
+
32
+ def _load_data(path: str | Path) -> dict[str, Any] | list[dict[str, Any]]:
33
+ file_path = Path(path)
34
+ text = file_path.read_text(encoding="utf-8")
35
+ if file_path.suffix in {".yaml", ".yml"}:
36
+ return yaml.safe_load(text)
37
+ return json.loads(text)
@@ -0,0 +1,13 @@
1
+ from agent_trace_eval.matchers.arguments import match_arguments
2
+ from agent_trace_eval.matchers.handoffs import match_handoffs
3
+ from agent_trace_eval.matchers.ordering import match_ordering
4
+ from agent_trace_eval.matchers.recovery import match_recovery
5
+ from agent_trace_eval.matchers.tools import match_tools
6
+
7
+ __all__ = [
8
+ "match_arguments",
9
+ "match_handoffs",
10
+ "match_ordering",
11
+ "match_recovery",
12
+ "match_tools",
13
+ ]
@@ -0,0 +1,81 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from agent_trace_eval.case import EvalCase
6
+ from agent_trace_eval.models import AgentTrace
7
+ from agent_trace_eval.result import MatchResult, Severity
8
+
9
+
10
def match_arguments(trace: AgentTrace, case: EvalCase) -> list[MatchResult]:
    """Check tool-call arguments against the case's ``arguments`` expectations.

    For each tool listed under ``expect.arguments`` only the first matching
    call in the trace is inspected. A tool that was never called produces a
    single failing ``arguments.tool_present`` result.
    """
    rules_by_tool = case.expect.get("arguments", {})
    if not rules_by_tool:
        return []

    outcomes: list[MatchResult] = []
    for tool_name, rules in rules_by_tool.items():
        if not isinstance(rules, dict):
            # Entries that are not mappings carry no argument rules; ignore them.
            continue

        matching_calls = trace.find_tool_calls(tool_name)
        if matching_calls:
            first_call = matching_calls[0]
            for arg_name, expected in rules.items():
                uses_any_of = isinstance(expected, dict) and "any_of" in expected
                if uses_any_of:
                    outcome = _match_any_of(
                        tool_name, arg_name, first_call.arguments, expected["any_of"]
                    )
                else:
                    outcome = _match_exact(
                        tool_name, arg_name, first_call.arguments, expected
                    )
                outcome.severity = case.severity("severity.arguments", Severity.HIGH)
                outcomes.append(outcome)
            continue

        outcomes.append(
            MatchResult(
                name="arguments.tool_present",
                passed=False,
                message=f"Cannot evaluate arguments for '{tool_name}' because it was not called.",
                severity=case.severity("severity.arguments", Severity.HIGH),
                details={"tool": tool_name},
            )
        )

    return outcomes
42
+
43
+
44
def _match_exact(
    tool_name: str,
    arg_name: str,
    arguments: dict[str, Any],
    expected: Any,
) -> MatchResult:
    """Compare one argument of a tool call to ``expected`` using ``==``.

    A missing argument is read as ``None``.
    """
    observed = arguments.get(arg_name)
    is_equal = observed == expected
    if is_equal:
        text = f"Argument '{arg_name}' for '{tool_name}' matches expected value."
    else:
        text = f"Argument '{arg_name}' for '{tool_name}' does not match expected value."
    report_details = {
        "tool": tool_name,
        "argument": arg_name,
        "expected": expected,
        "actual": observed,
    }
    return MatchResult(
        name="arguments.exact",
        passed=is_equal,
        message=text,
        details=report_details,
    )
62
+
63
+
64
def _match_any_of(
    tool_name: str,
    arg_name: str,
    arguments: dict[str, Any],
    options: list[Any],
) -> MatchResult:
    """Check that one argument of a tool call is a member of ``options``.

    A missing argument is read as ``None``.
    """
    observed = arguments.get(arg_name)
    is_allowed = observed in options
    if is_allowed:
        text = f"Argument '{arg_name}' for '{tool_name}' is one of the allowed values."
    else:
        text = f"Argument '{arg_name}' for '{tool_name}' is not one of the allowed values."
    report_details = {
        "tool": tool_name,
        "argument": arg_name,
        "options": options,
        "actual": observed,
    }
    return MatchResult(
        name="arguments.any_of",
        passed=is_allowed,
        message=text,
        details=report_details,
    )
@@ -0,0 +1,70 @@
1
+ from __future__ import annotations
2
+
3
+ from agent_trace_eval.case import EvalCase
4
+ from agent_trace_eval.models import AgentTrace
5
+ from agent_trace_eval.result import MatchResult, Severity
6
+
7
+
8
def match_handoffs(trace: AgentTrace, case: EvalCase) -> list[MatchResult]:
    """Check the trace's handoff events against ``expect.handoffs``.

    Every rule under ``required`` must match at least one handoff; every
    rule under ``forbidden`` must match none. One result is produced per rule.
    """
    expect = case.expect.get("handoffs", {})
    if not expect:
        return []

    outcomes: list[MatchResult] = []
    observed_events = trace.handoffs()

    # (expect key, must a matching event exist?, pass message, fail message)
    modes = (
        (
            "required",
            True,
            "Required handoff was observed.",
            "Required handoff was not observed.",
        ),
        (
            "forbidden",
            False,
            "Forbidden handoff was not observed.",
            "Forbidden handoff was observed.",
        ),
    )
    for key, must_exist, pass_text, fail_text in modes:
        for rule in expect.get(key, []):
            ok = _has_handoff(observed_events, rule) == must_exist
            outcomes.append(
                MatchResult(
                    name=f"handoffs.{key}",
                    passed=ok,
                    message=pass_text if ok else fail_text,
                    severity=case.severity("severity.handoffs", Severity.HIGH),
                    details={
                        "rule": rule,
                        "observed": [_handoff_summary(item) for item in observed_events],
                    },
                )
            )

    return outcomes
51
+
52
+
53
+ def _has_handoff(handoffs: list, rule: dict) -> bool:
54
+ for event in handoffs:
55
+ if rule.get("from") and event.from_agent != rule.get("from"):
56
+ continue
57
+ if rule.get("to") and event.to_agent != rule.get("to"):
58
+ continue
59
+ if rule.get("reason") and event.reason != rule.get("reason"):
60
+ continue
61
+ return True
62
+ return False
63
+
64
+
65
+ def _handoff_summary(event) -> dict:
66
+ return {
67
+ "from": event.from_agent,
68
+ "to": event.to_agent,
69
+ "reason": event.reason,
70
+ }
@@ -0,0 +1,70 @@
1
+ from __future__ import annotations
2
+
3
+ from agent_trace_eval.case import EvalCase
4
+ from agent_trace_eval.models import AgentTrace
5
+ from agent_trace_eval.result import MatchResult, Severity
6
+
7
+
8
def match_ordering(trace: AgentTrace, case: EvalCase) -> list[MatchResult]:
    """Check relative tool-call order against ``expect.ordering``.

    ``before`` and ``after`` each hold rules with ``first`` and ``second``
    tool names. Rules missing either name are skipped without a result.
    """
    expect = case.expect.get("ordering", {})
    if not expect:
        return []

    outcomes: list[MatchResult] = []
    called = [name for name in trace.event_names() if name is not None]

    # (relation word used in key, result name and message; predicate)
    relations = (
        ("before", _appears_before),
        ("after", _appears_after),
    )
    for relation, holds in relations:
        for rule in expect.get(relation, []):
            first = rule.get("first")
            second = rule.get("second")
            if not (first and second):
                continue
            ok = holds(called, first, second)
            if ok:
                text = f"'{first}' appears {relation} '{second}'."
            else:
                text = f"'{first}' does not appear {relation} '{second}'."
            outcomes.append(
                MatchResult(
                    name=f"ordering.{relation}",
                    passed=ok,
                    message=text,
                    severity=case.severity("severity.ordering", Severity.HIGH),
                    details={"first": first, "second": second, "observed": called},
                )
            )

    return outcomes
59
+
60
+
61
+ def _appears_before(sequence: list[str], first: str, second: str) -> bool:
62
+ if first not in sequence or second not in sequence:
63
+ return False
64
+ return sequence.index(first) < sequence.index(second)
65
+
66
+
67
+ def _appears_after(sequence: list[str], first: str, second: str) -> bool:
68
+ if first not in sequence or second not in sequence:
69
+ return False
70
+ return sequence.index(first) > sequence.index(second)
@@ -0,0 +1,74 @@
1
+ from __future__ import annotations
2
+
3
+ from agent_trace_eval.case import EvalCase
4
+ from agent_trace_eval.models import AgentTrace
5
+ from agent_trace_eval.result import MatchResult, Severity
6
+
7
+
8
def match_recovery(trace: AgentTrace, case: EvalCase) -> list[MatchResult]:
    """Check the trace's recovery decisions against ``expect.recovery``.

    Every rule under ``required`` must match at least one recovery decision;
    every rule under ``forbidden`` must match none. One result is produced
    per rule.
    """
    expect = case.expect.get("recovery", {})
    if not expect:
        return []

    outcomes: list[MatchResult] = []
    observed_events = trace.recovery_decisions()

    # (expect key, must a matching event exist?, pass message, fail message)
    modes = (
        (
            "required",
            True,
            "Required recovery decision was observed.",
            "Required recovery decision was not observed.",
        ),
        (
            "forbidden",
            False,
            "Forbidden recovery decision was not observed.",
            "Forbidden recovery decision was observed.",
        ),
    )
    for key, must_exist, pass_text, fail_text in modes:
        for rule in expect.get(key, []):
            ok = _has_recovery(observed_events, rule) == must_exist
            outcomes.append(
                MatchResult(
                    name=f"recovery.{key}",
                    passed=ok,
                    message=pass_text if ok else fail_text,
                    severity=case.severity("severity.recovery", Severity.HIGH),
                    details={
                        "rule": rule,
                        "observed": [_recovery_summary(item) for item in observed_events],
                    },
                )
            )

    return outcomes
57
+
58
+
59
+ def _has_recovery(decisions: list, rule: dict) -> bool:
60
+ for event in decisions:
61
+ if rule.get("decision") and event.decision != rule.get("decision"):
62
+ continue
63
+ if rule.get("error_class") and event.error_class != rule.get("error_class"):
64
+ continue
65
+ return True
66
+ return False
67
+
68
+
69
+ def _recovery_summary(event) -> dict:
70
+ return {
71
+ "decision": event.decision,
72
+ "error_class": event.error_class,
73
+ "reason": event.reason,
74
+ }
@@ -0,0 +1,67 @@
1
+ from __future__ import annotations
2
+
3
+ from agent_trace_eval.case import EvalCase
4
+ from agent_trace_eval.models import AgentTrace
5
+ from agent_trace_eval.result import MatchResult, Severity
6
+
7
+
8
def match_tools(trace: AgentTrace, case: EvalCase) -> list[MatchResult]:
    """Check which tools were called against ``expect.tools``.

    Supports ``required`` and ``forbidden`` tool-name lists plus an optional
    ``exact_sequence`` that must equal the full list of named tool calls.
    """
    expect = case.expect.get("tools", {})
    if not expect:
        return []

    outcomes: list[MatchResult] = []
    called = [name for name in trace.event_names() if name is not None]

    # (expect key, must the tool be present?, verb on success, verb on failure)
    presence_checks = (
        ("required", True, "was called", "was not called"),
        ("forbidden", False, "was not called", "was called"),
    )
    for key, must_be_present, pass_verb, fail_verb in presence_checks:
        label = key.capitalize()
        for tool_name in expect.get(key, []):
            ok = (tool_name in called) == must_be_present
            verb = pass_verb if ok else fail_verb
            outcomes.append(
                MatchResult(
                    name=f"tools.{key}",
                    passed=ok,
                    message=f"{label} tool '{tool_name}' {verb}.",
                    severity=case.severity("severity.tools", Severity.HIGH),
                    details={"tool": tool_name, "observed": called},
                )
            )

    wanted_sequence = expect.get("exact_sequence")
    if wanted_sequence is None:
        return outcomes

    same_order = called == list(wanted_sequence)
    if same_order:
        text = "Tool call sequence matches expected order."
    else:
        text = "Tool call sequence does not match expected order."
    outcomes.append(
        MatchResult(
            name="tools.exact_sequence",
            passed=same_order,
            message=text,
            # The sequence check takes its severity from the ordering group.
            severity=case.severity("severity.ordering", Severity.HIGH),
            details={"expected": wanted_sequence, "observed": called},
        )
    )
    return outcomes
@@ -0,0 +1,112 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Any
5
+
6
+
7
@dataclass
class TraceEvent:
    """One event of an agent trace (tool call, handoff, recovery decision, ...).

    Keys of the source mapping that do not correspond to a named field are
    preserved in ``metadata``.
    """

    type: str
    name: str | None = None
    arguments: dict[str, Any] = field(default_factory=dict)
    result: dict[str, Any] = field(default_factory=dict)
    payload: dict[str, Any] = field(default_factory=dict)
    from_agent: str | None = None
    to_agent: str | None = None
    reason: str | None = None
    decision: str | None = None
    error_class: str | None = None
    text: str | None = None
    status: str | None = None
    source: str | None = None
    metadata: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "TraceEvent":
        """Build an event from a parsed mapping.

        A missing ``type`` becomes ``"unknown"``; the ``from``/``to`` keys are
        stored as ``from_agent``/``to_agent``.
        """
        # Keys consumed by named fields; everything else lands in metadata.
        recognised = {
            "type",
            "name",
            "arguments",
            "result",
            "payload",
            "from",
            "to",
            "reason",
            "decision",
            "error_class",
            "text",
            "status",
            "source",
        }
        extras = {}
        for key, value in data.items():
            if key not in recognised:
                extras[key] = value

        return cls(
            type=data.get("type", "unknown"),
            name=data.get("name"),
            arguments=dict(data.get("arguments") or {}),
            result=dict(data.get("result") or {}),
            payload=dict(data.get("payload") or {}),
            from_agent=data.get("from"),
            to_agent=data.get("to"),
            reason=data.get("reason"),
            decision=data.get("decision"),
            error_class=data.get("error_class"),
            text=data.get("text"),
            status=_event_status(data),
            source=data.get("source"),
            metadata=extras,
        )
62
+
63
+
64
@dataclass
class AgentTrace:
    """A recorded agent run: identifying fields plus an ordered list of events.

    Top-level keys of the source mapping other than ``case_id``, ``run_id``,
    ``input`` and ``events`` are preserved in ``metadata``.
    """

    case_id: str | None = None
    run_id: str | None = None
    input: str | None = None
    events: list[TraceEvent] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "AgentTrace":
        """Build a trace from a parsed JSON/YAML mapping.

        A missing or null ``events`` entry yields a trace with no events.
        """
        # ``or []`` tolerates ``events: null`` (e.g. an empty YAML key), which
        # ``data.get("events", [])`` would pass through as None.
        raw_events = data.get("events") or []
        events = [TraceEvent.from_dict(event) for event in raw_events]
        return cls(
            case_id=data.get("case_id"),
            run_id=data.get("run_id"),
            input=data.get("input"),
            events=events,
            metadata={
                key: value
                for key, value in data.items()
                if key not in {"case_id", "run_id", "input", "events"}
            },
        )

    def _events_of_type(self, event_type: str) -> list[TraceEvent]:
        """Return the events whose ``type`` equals ``event_type``, in trace order."""
        return [event for event in self.events if event.type == event_type]

    def tool_calls(self) -> list[TraceEvent]:
        """Return all ``tool_call`` events."""
        return self._events_of_type("tool_call")

    def handoffs(self) -> list[TraceEvent]:
        """Return all ``handoff`` events."""
        return self._events_of_type("handoff")

    def recovery_decisions(self) -> list[TraceEvent]:
        """Return all ``recovery_decision`` events."""
        return self._events_of_type("recovery_decision")

    def final_answers(self) -> list[TraceEvent]:
        """Return all ``final_answer`` events."""
        return self._events_of_type("final_answer")

    def event_names(self) -> list[str | None]:
        """Return the ``name`` of each ``tool_call`` event (``None`` if unnamed)."""
        return [event.name for event in self.tool_calls()]

    def find_tool_calls(self, tool_name: str) -> list[TraceEvent]:
        """Return the ``tool_call`` events whose name equals ``tool_name``."""
        return [event for event in self.tool_calls() if event.name == tool_name]
104
+
105
+
106
+ def _event_status(data: dict[str, Any]) -> str | None:
107
+ if "status" in data and data["status"] is not None:
108
+ return str(data["status"])
109
+ result = data.get("result")
110
+ if isinstance(result, dict) and "status" in result:
111
+ return str(result["status"])
112
+ return None
@@ -0,0 +1,40 @@
1
+ from __future__ import annotations
2
+
3
+ from agent_trace_eval.result import CaseResult, SuiteResult
4
+
5
+
6
def render_markdown_report(suite: SuiteResult) -> str:
    """Render a suite result as a Markdown document.

    The report opens with a title and total/passed/failed counts, followed by
    one ``##`` section per case. Each section lists the case's matcher results
    as checkbox items; a box is ticked (``x``) when the matcher failed.

    Args:
        suite: The suite outcome to render.

    Returns:
        The Markdown text, with trailing whitespace stripped and exactly one
        terminating newline.
    """
    passed_total = suite.total - suite.failed_count
    report = [
        "# Agent Trace Regression Report",
        "",
        f"- Total cases: {suite.total}",
        f"- Passed: {passed_total}",
        f"- Failed: {suite.failed_count}",
        "",
    ]

    for case in suite.case_results:
        verdict = "PASS" if case.passed else "FAIL"
        section = [f"## {verdict}: {case.case_id}"]
        if case.description:
            section.append(case.description)
        if case.trace_path:
            section.append(f"Trace: `{case.trace_path}`")
        section.append("")

        if case.matches:
            for check in case.matches:
                box = "x" if check.failed else " "
                section.append(
                    f"- [{box}] **{check.name}** ({check.severity.value}): {check.message}"
                )
        else:
            section.append("_No matchers configured._")
        # Blank separator line after every case section.
        section.append("")
        report.extend(section)

    return "\n".join(report).rstrip() + "\n"
36
+
37
+
38
def render_case_summary(result: CaseResult) -> str:
    """Return a one-line summary: verdict, case id, and number of failures."""
    verdict = "FAIL"
    if result.passed:
        verdict = "PASS"
    failure_total = len(result.failures)
    return f"{verdict} {result.case_id}: {failure_total} failure(s)"
@@ -0,0 +1,62 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from enum import Enum
5
+ from typing import Any
6
+
7
+
8
class Severity(str, Enum):
    """Severity level attached to a matcher result.

    Members also subclass ``str``, so each one compares equal to its
    lowercase value.
    """

    # Declared from least to most severe.
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"
13
+
14
+
15
@dataclass
class MatchResult:
    """Outcome of a single matcher check."""

    # Label identifying the check.
    name: str
    # True when the check succeeded.
    passed: bool
    # Human-readable explanation of the outcome.
    message: str
    # Defaults to MEDIUM when the matcher does not specify one.
    severity: Severity = Severity.MEDIUM
    # Free-form extra data; a fresh dict is created per instance.
    details: dict[str, Any] = field(default_factory=dict)

    @property
    def failed(self) -> bool:
        """Return the logical negation of ``passed``."""
        return not self.passed
26
+
27
+
28
@dataclass
class CaseResult:
    """Result of evaluating one case: verdict plus the individual matches."""

    case_id: str
    description: str
    passed: bool
    matches: list[MatchResult] = field(default_factory=list)
    trace_path: str | None = None

    @property
    def failures(self) -> list[MatchResult]:
        """Return the matches flagged as failed, preserving their order."""
        failed_matches = []
        for candidate in self.matches:
            if candidate.failed:
                failed_matches.append(candidate)
        return failed_matches

    @property
    def highest_severity(self) -> Severity | None:
        """Return the most severe level among failed matches.

        Returns ``None`` when there are no failures.
        """
        failed_matches = self.failures
        if not failed_matches:
            return None
        # Position in this list defines the ranking, lowest first.
        ranking = [Severity.LOW, Severity.MEDIUM, Severity.HIGH, Severity.CRITICAL]

        def rank(item: MatchResult) -> int:
            return ranking.index(item.severity)

        worst = max(failed_matches, key=rank)
        return worst.severity
46
+
47
+
48
@dataclass
class SuiteResult:
    """Aggregate of the per-case results produced by one suite run."""

    case_results: list[CaseResult] = field(default_factory=list)

    @property
    def passed(self) -> bool:
        """Return True when no case failed (also True for an empty suite)."""
        return not any(not outcome.passed for outcome in self.case_results)

    @property
    def total(self) -> int:
        """Return the number of case results."""
        return len(self.case_results)

    @property
    def failed_count(self) -> int:
        """Return how many case results did not pass."""
        failing = [outcome for outcome in self.case_results if not outcome.passed]
        return len(failing)
@@ -0,0 +1,75 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ from agent_trace_eval.case import EvalCase
6
+ from agent_trace_eval.loader import load_case, load_trace
7
+ from agent_trace_eval.matchers import (
8
+ match_arguments,
9
+ match_handoffs,
10
+ match_ordering,
11
+ match_recovery,
12
+ match_tools,
13
+ )
14
+ from agent_trace_eval.models import AgentTrace
15
+ from agent_trace_eval.result import CaseResult, SuiteResult
16
+
17
+
18
class RegressionRunner:
    """Runs evaluation cases against agent traces and collects the results."""

    def __init__(self, traces_dir: str | Path | None = None) -> None:
        """Remember the optional directory used to look up traces by case id.

        A falsy ``traces_dir`` (``None`` or an empty string) leaves the runner
        without a default directory.
        """
        self.traces_dir = None if not traces_dir else Path(traces_dir)

    def run_case(self, case: EvalCase, trace: AgentTrace, trace_path: str | None = None) -> CaseResult:
        """Apply every matcher to ``trace`` and bundle the outcomes.

        Matchers run in a fixed order (tools, arguments, ordering, handoffs,
        recovery) and their results are concatenated in that order. The case
        passes when every collected match passed.
        """
        matchers = (
            match_tools,
            match_arguments,
            match_ordering,
            match_handoffs,
            match_recovery,
        )
        outcomes = []
        for matcher in matchers:
            outcomes.extend(matcher(trace, case))
        verdict = all(outcome.passed for outcome in outcomes)
        return CaseResult(
            case_id=case.id,
            description=case.description,
            passed=verdict,
            matches=outcomes,
            trace_path=trace_path,
        )

    def run_case_file(
        self,
        case_or_path: EvalCase | str | Path,
        trace_path: str | Path | None = None,
    ) -> CaseResult:
        """Evaluate a case, loading the case and/or the trace from disk.

        ``case_or_path`` may be an already-built ``EvalCase`` or a path handed
        to ``load_case``. When ``trace_path`` is falsy, the trace location is
        derived from the case id via the configured ``traces_dir``.
        """
        if isinstance(case_or_path, EvalCase):
            case = case_or_path
        else:
            case = load_case(case_or_path)
        location = trace_path if trace_path else self._default_trace_path(case.id)
        return self.run_case(case, load_trace(location), str(location))

    def run_suite(self, cases: list[EvalCase], traces: dict[str, AgentTrace]) -> SuiteResult:
        """Evaluate each case against the trace stored under its id.

        A case whose id is absent from ``traces`` is recorded as failed, with
        no matches and no trace path.
        """
        outcomes = []
        for case in cases:
            if case.id in traces:
                outcome = self.run_case(case, traces[case.id])
            else:
                outcome = CaseResult(
                    case_id=case.id,
                    description=case.description,
                    passed=False,
                    matches=[],
                    trace_path=None,
                )
            outcomes.append(outcome)
        return SuiteResult(case_results=outcomes)

    def _default_trace_path(self, case_id: str) -> Path:
        """Locate ``<case_id>.json|.yaml|.yml`` inside ``traces_dir``.

        Extensions are tried in that order and the first existing path wins.

        Raises:
            ValueError: If no ``traces_dir`` was configured.
            FileNotFoundError: If none of the candidate files exists.
        """
        if not self.traces_dir:
            raise ValueError("trace_path is required when traces_dir is not configured.")
        candidates = (
            self.traces_dir / f"{case_id}{extension}"
            for extension in (".json", ".yaml", ".yml")
        )
        located = next((path for path in candidates if path.exists()), None)
        if located is None:
            raise FileNotFoundError(f"No trace file found for case '{case_id}' in {self.traces_dir}")
        return located