traceval 0.2.1__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {traceval-0.2.1 → traceval-0.2.2}/CHANGELOG.md +6 -0
- traceval-0.2.2/CONTRIBUTING.md +22 -0
- traceval-0.2.2/Makefile +14 -0
- {traceval-0.2.1 → traceval-0.2.2}/PKG-INFO +8 -3
- {traceval-0.2.1 → traceval-0.2.2}/README.md +7 -2
- traceval-0.2.2/docs/img/report.png +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/pyproject.toml +1 -1
- traceval-0.2.2/src/traceval/__init__.py +1 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/compile/templates/conftest.py.jinja +112 -34
- traceval-0.2.2/tests/test_broken_target.py +90 -0
- {traceval-0.2.1 → traceval-0.2.2}/uv.lock +1 -1
- traceval-0.2.1/docs/img/report.png +0 -0
- traceval-0.2.1/src/traceval/__init__.py +0 -1
- {traceval-0.2.1 → traceval-0.2.2}/.github/workflows/ci.yml +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/.gitignore +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/.pre-commit-config.yaml +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/LICENSE +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/action.yml +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/docs/formats.md +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/examples/demo.sh +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/examples/demo_agent/agent.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/examples/demo_agent/core.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/examples/make_traces.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/examples/synthetic_traces.jsonl +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/scripts/readme-outputs.sh +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/analyze/__init__.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/analyze/cluster.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/analyze/coverage.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/analyze/outcomes.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/analyze/report.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/cli.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/compile/__init__.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/compile/cases.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/compile/emit_pytest.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/compile/emit_yaml.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/compile/rubrics.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/compile/templates/test_generated.py.jinja +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/demo/__init__.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/demo/agent.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/demo/traces.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/ingest/__init__.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/ingest/base.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/ingest/generic.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/ingest/langfuse.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/ingest/langsmith.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/ingest/otel.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/model.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/run/calibrate.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/run/judge.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/run/runner.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/run/scorers.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/run/target.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/src/traceval/store.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/tests/fixtures/README.md +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/tests/fixtures/generic_traces.jsonl +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/tests/fixtures/langfuse_export.jsonl +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/tests/fixtures/langsmith_runs.jsonl +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/tests/fixtures/otel_spans.jsonl +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/tests/test_calibrate.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/tests/test_cli.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/tests/test_e2e_demo.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/tests/test_json_output.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/tests/test_phase1.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/tests/test_phase2.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/tests/test_phase3.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/tests/test_phase4.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/tests/test_phase5.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/tests/test_phase6.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/tests/test_regression_cases.py +0 -0
- {traceval-0.2.1 → traceval-0.2.2}/tests/test_serve.py +0 -0
|
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.2.2] - 2026-07-02
|
|
9
|
+
|
|
10
|
+
### Fixed
|
|
11
|
+
- A run in which zero cases execute (e.g. unresolvable target) now writes a self-describing run report with an `errors` section instead of writing nothing; `--json` never reports `null`. Added a single clear top-level error line on target-resolution failure. Added `make test` and CONTRIBUTING as the canonical dev commands. Reported via external review of a failed-target invocation.
|
|
12
|
+
- Run report schema additions (existing fields unchanged): `summary.errored` counts cases that never executed due to setup/collection errors; top-level `errors` lists `{stage, detail}` entries for `target_resolution`, `collection`, and `setup` failures, with identical details deduplicated into one entry carrying a `count`; `results` is `[]` rather than absent when nothing executed. The terminal summary now shows `Errored: n`.
|
|
13
|
+
|
|
8
14
|
## [0.2.1] - 2026-07-02
|
|
9
15
|
|
|
10
16
|
### Added
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Contributing to traceval
|
|
2
|
+
|
|
3
|
+
## Setup
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
git clone https://github.com/theramkm/traceval.git
|
|
7
|
+
cd traceval
|
|
8
|
+
uv sync # installs the package and dev dependencies (same as CI)
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Development loop
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
make test # pytest -q
|
|
15
|
+
make lint # ruff check, ruff format --check, mypy
|
|
16
|
+
make demo # end-to-end smoke: healthy agent passes, buggy agent fails
|
|
17
|
+
make all # lint + test
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
CI runs the same commands on Python 3.11, 3.12, and 3.13, plus a
|
|
21
|
+
wheel-based demo smoke job, and enforces 85% coverage. Keep all of it
|
|
22
|
+
green; add a test for every behavior change.
|
traceval-0.2.2/Makefile
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
.PHONY: test lint demo all
|
|
2
|
+
|
|
3
|
+
test:
|
|
4
|
+
uv run pytest -q
|
|
5
|
+
|
|
6
|
+
lint:
|
|
7
|
+
uv run ruff check src tests examples
|
|
8
|
+
uv run ruff format --check src tests examples
|
|
9
|
+
uv run mypy src/traceval
|
|
10
|
+
|
|
11
|
+
demo:
|
|
12
|
+
uv run traceval demo -o /tmp/traceval-demo --force
|
|
13
|
+
|
|
14
|
+
all: lint test
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: traceval
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: Trace-to-Eval Compiler
|
|
5
5
|
License: MIT
|
|
6
6
|
License-File: LICENSE
|
|
@@ -140,7 +140,7 @@ traceval Run Summary
|
|
|
140
140
|
│ c_d3f3b631__case_007 │ c_d3f3b631 │ PASS │ 0.0 │
|
|
141
141
|
│ c_e834c13c__case_008 │ c_e834c13c │ PASS │ 0.0 │
|
|
142
142
|
└──────────────────────┴────────────┴─────────┴──────────────┘
|
|
143
|
-
Total: 8 | Passed: 8 | Failed: 0
|
|
143
|
+
Total: 8 | Passed: 8 | Failed: 0 | Errored: 0
|
|
144
144
|
```
|
|
145
145
|
|
|
146
146
|
The target is an HTTP URL or a `module:function` callable. Checks cover `exact`, `contains_any`, `not_contains`, `regex`, `json_schema`, `tool_sequence`, `no_tool_loop`, and `judge`. Run reports land in `<evals_dir>/runs/` (override with `--runs-dir`); pass `--compare <previous report>` to print regressions and improvements between runs. The exit code is nonzero when any case fails or errors before it can execute (for example, when the target cannot be resolved).
|
|
@@ -191,7 +191,7 @@ jobs:
|
|
|
191
191
|
runs-on: ubuntu-latest
|
|
192
192
|
steps:
|
|
193
193
|
- uses: actions/checkout@v4
|
|
194
|
-
- uses: theramkm/traceval@v0.2.1
|
|
194
|
+
- uses: theramkm/traceval@v0.2.2
|
|
195
195
|
with:
|
|
196
196
|
evals-dir: evals/
|
|
197
197
|
target: myapp.agent:invoke_agent # or an HTTP URL
|
|
@@ -200,6 +200,11 @@ jobs:
|
|
|
200
200
|
|
|
201
201
|
Inputs: `evals-dir` and `target` (required); `judge`, `compare`, `only`, `runs-dir`, `traceval-version`, `python-version` (optional). For a real LLM judge, set `judge: openai` and pass `OPENAI_API_KEY` (or `GEMINI_API_KEY`) via `env:` from your repository secrets.
|
|
202
202
|
|
|
203
|
+
## Development
|
|
204
|
+
|
|
205
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for setup.
|
|
206
|
+
Run the test suite with `make test`, the lint gates (ruff and mypy) with `make lint`, or both with `make all`.
|
|
207
|
+
|
|
203
208
|
## Honest Limitations
|
|
204
209
|
|
|
205
210
|
* **Side-Effect Free**: traceval assertions evaluate input/output matches. It does not attempt to replay side effects (e.g., updating database records) on mock tools.
|
|
@@ -123,7 +123,7 @@ traceval Run Summary
|
|
|
123
123
|
│ c_d3f3b631__case_007 │ c_d3f3b631 │ PASS │ 0.0 │
|
|
124
124
|
│ c_e834c13c__case_008 │ c_e834c13c │ PASS │ 0.0 │
|
|
125
125
|
└──────────────────────┴────────────┴─────────┴──────────────┘
|
|
126
|
-
Total: 8 | Passed: 8 | Failed: 0
|
|
126
|
+
Total: 8 | Passed: 8 | Failed: 0 | Errored: 0
|
|
127
127
|
```
|
|
128
128
|
|
|
129
129
|
The target is an HTTP URL or a `module:function` callable. Checks cover `exact`, `contains_any`, `not_contains`, `regex`, `json_schema`, `tool_sequence`, `no_tool_loop`, and `judge`. Run reports land in `<evals_dir>/runs/` (override with `--runs-dir`); pass `--compare <previous report>` to print regressions and improvements between runs. The exit code is nonzero when any case fails or errors before it can execute (for example, when the target cannot be resolved).
|
|
@@ -174,7 +174,7 @@ jobs:
|
|
|
174
174
|
runs-on: ubuntu-latest
|
|
175
175
|
steps:
|
|
176
176
|
- uses: actions/checkout@v4
|
|
177
|
-
- uses: theramkm/traceval@v0.2.1
|
|
177
|
+
- uses: theramkm/traceval@v0.2.2
|
|
178
178
|
with:
|
|
179
179
|
evals-dir: evals/
|
|
180
180
|
target: myapp.agent:invoke_agent # or an HTTP URL
|
|
@@ -183,6 +183,11 @@ jobs:
|
|
|
183
183
|
|
|
184
184
|
Inputs: `evals-dir` and `target` (required); `judge`, `compare`, `only`, `runs-dir`, `traceval-version`, `python-version` (optional). For a real LLM judge, set `judge: openai` and pass `OPENAI_API_KEY` (or `GEMINI_API_KEY`) via `env:` from your repository secrets.
|
|
185
185
|
|
|
186
|
+
## Development
|
|
187
|
+
|
|
188
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for setup.
|
|
189
|
+
Run the test suite with `make test`, the lint gates (ruff and mypy) with `make lint`, or both with `make all`.
|
|
190
|
+
|
|
186
191
|
## Honest Limitations
|
|
187
192
|
|
|
188
193
|
* **Side-Effect Free**: traceval assertions evaluate input/output matches. It does not attempt to replay side effects (e.g., updating database records) on mock tools.
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.2"
|
|
@@ -46,38 +46,111 @@ def pytest_generate_tests(metafunc):
|
|
|
46
46
|
|
|
47
47
|
metafunc.parametrize("eval_case", cases, ids=[c["id"] for c in cases])
|
|
48
48
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
49
|
+
# Target resolution runs ONCE per session so a broken target produces one
|
|
50
|
+
# clear error line up front (and one errors entry in the run report),
|
|
51
|
+
# never a wall of identical per-case tracebacks with no explanation.
|
|
52
|
+
_target_probe = {}
|
|
53
|
+
|
|
54
|
+
def _probe_target(config):
|
|
55
|
+
if _target_probe.get("done"):
|
|
56
|
+
return _target_probe
|
|
57
|
+
_target_probe["done"] = True
|
|
58
|
+
|
|
59
|
+
target_opt = config.getoption("--target")
|
|
52
60
|
if not target_opt:
|
|
53
61
|
config_path = Path(__file__).parent / "traceval.yaml"
|
|
54
62
|
if config_path.exists():
|
|
55
63
|
try:
|
|
56
64
|
with open(config_path, encoding="utf-8") as f:
|
|
57
|
-
|
|
58
|
-
target_opt =
|
|
65
|
+
cfg = yaml.safe_load(f)
|
|
66
|
+
target_opt = cfg.get("target", {}).get("default_url")
|
|
59
67
|
except Exception:
|
|
60
68
|
pass
|
|
69
|
+
|
|
61
70
|
if not target_opt:
|
|
62
|
-
|
|
63
|
-
|
|
71
|
+
detail = "No target specified. Use --target option or set in traceval.yaml."
|
|
72
|
+
_target_probe["error"] = detail
|
|
73
|
+
_record_error("target_resolution", detail)
|
|
74
|
+
return _target_probe
|
|
75
|
+
|
|
76
|
+
try:
|
|
77
|
+
_target_probe["target"] = resolve_target(target_opt)
|
|
78
|
+
except Exception as e:
|
|
79
|
+
detail = f"target '{target_opt}' could not be imported ({e}). Check the module path or URL."
|
|
80
|
+
_target_probe["error"] = detail
|
|
81
|
+
_record_error("target_resolution", detail)
|
|
82
|
+
return _target_probe
|
|
83
|
+
|
|
84
|
+
if target_opt.startswith(("http://", "https://")):
|
|
85
|
+
# First-contact reachability check; connection-level failures only.
|
|
86
|
+
# The resolved target is kept either way so cases still run.
|
|
87
|
+
import httpx
|
|
88
|
+
try:
|
|
89
|
+
httpx.get(target_opt, timeout=2.0)
|
|
90
|
+
except (httpx.ConnectError, httpx.InvalidURL, httpx.UnsupportedProtocol) as e:
|
|
91
|
+
detail = f"target '{target_opt}' is unreachable ({e}). Check the module path or URL."
|
|
92
|
+
_target_probe["error"] = detail
|
|
93
|
+
_record_error("target_resolution", detail)
|
|
94
|
+
|
|
95
|
+
return _target_probe
|
|
96
|
+
|
|
97
|
+
def pytest_report_header(config):
|
|
98
|
+
probe = _probe_target(config)
|
|
99
|
+
if probe.get("error"):
|
|
100
|
+
return [f"ERROR: {probe['error']}"]
|
|
101
|
+
return []
|
|
102
|
+
|
|
103
|
+
@pytest.fixture(scope="session")
|
|
104
|
+
def target_runner(request):
|
|
105
|
+
probe = _probe_target(request.config)
|
|
106
|
+
if "target" not in probe:
|
|
107
|
+
pytest.fail(probe.get("error") or "Target resolution failed.")
|
|
108
|
+
return probe["target"]
|
|
64
109
|
|
|
65
110
|
@pytest.fixture(scope="session")
|
|
66
111
|
def judge_runner(request):
|
|
67
112
|
judge_opt = request.config.getoption("--judge")
|
|
68
113
|
return resolve_judge(judge_opt)
|
|
69
114
|
|
|
70
|
-
#
|
|
115
|
+
# Accumulators for final report
|
|
71
116
|
_results_accumulator = []
|
|
117
|
+
_errors_accumulator = []
|
|
118
|
+
_errored_cases = [0]
|
|
119
|
+
|
|
120
|
+
def _record_error(stage, detail):
|
|
121
|
+
# Deduplicate identical details into one entry with a count field
|
|
122
|
+
for entry in _errors_accumulator:
|
|
123
|
+
if entry["stage"] == stage and entry["detail"] == detail:
|
|
124
|
+
entry["count"] = entry.get("count", 1) + 1
|
|
125
|
+
return
|
|
126
|
+
_errors_accumulator.append({"stage": stage, "detail": detail})
|
|
127
|
+
|
|
128
|
+
def pytest_collectreport(report):
|
|
129
|
+
if report.failed:
|
|
130
|
+
detail = str(report.longrepr).strip().splitlines()[-1]
|
|
131
|
+
_record_error("collection", detail)
|
|
132
|
+
|
|
133
|
+
@pytest.hookimpl(hookwrapper=True)
|
|
134
|
+
def pytest_runtest_makereport(item, call):
|
|
135
|
+
outcome = yield
|
|
136
|
+
report = outcome.get_result()
|
|
137
|
+
if report.when == "setup" and report.failed:
|
|
138
|
+
# Case never executed (fixture/setup error)
|
|
139
|
+
_errored_cases[0] += 1
|
|
140
|
+
detail = getattr(getattr(report, "longrepr", None), "reprcrash", None)
|
|
141
|
+
detail = detail.message if detail else str(report.longrepr).strip().splitlines()[-1]
|
|
142
|
+
_record_error("setup", detail)
|
|
72
143
|
|
|
73
144
|
@pytest.hookimpl(tryfirst=True)
|
|
74
145
|
def pytest_sessionstart(session):
|
|
75
146
|
_results_accumulator.clear()
|
|
147
|
+
_errors_accumulator.clear()
|
|
148
|
+
_errored_cases[0] = 0
|
|
76
149
|
|
|
77
150
|
def pytest_sessionfinish(session, exitstatus):
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
151
|
+
# ALWAYS write a run report, including when zero cases executed:
|
|
152
|
+
# catastrophic failure must produce a self-describing artifact,
|
|
153
|
+
# never silence.
|
|
81
154
|
runs_opt = session.config.getoption("--runs-dir")
|
|
82
155
|
if runs_opt:
|
|
83
156
|
runs_dir = Path(runs_opt).resolve()
|
|
@@ -95,43 +168,48 @@ def pytest_sessionfinish(session, exitstatus):
|
|
|
95
168
|
|
|
96
169
|
passed_count = sum(1 for r in _results_accumulator if r["passed"])
|
|
97
170
|
failed_count = len(_results_accumulator) - passed_count
|
|
98
|
-
|
|
171
|
+
errored_count = _errored_cases[0]
|
|
172
|
+
total_count = len(_results_accumulator) + errored_count
|
|
173
|
+
|
|
99
174
|
report = {
|
|
100
175
|
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
101
176
|
"summary": {
|
|
102
|
-
"total":
|
|
177
|
+
"total": total_count,
|
|
103
178
|
"passed": passed_count,
|
|
104
179
|
"failed": failed_count,
|
|
180
|
+
"errored": errored_count,
|
|
105
181
|
},
|
|
182
|
+
"errors": _errors_accumulator,
|
|
106
183
|
"results": _results_accumulator
|
|
107
184
|
}
|
|
108
|
-
|
|
185
|
+
|
|
109
186
|
report_file.write_text(json.dumps(report, indent=2), encoding="utf-8")
|
|
110
|
-
|
|
187
|
+
|
|
111
188
|
# Rich Table Terminal output
|
|
112
189
|
from rich.console import Console
|
|
113
190
|
from rich.table import Table
|
|
114
|
-
|
|
191
|
+
|
|
115
192
|
console = Console()
|
|
116
193
|
console.print("\n[bold purple]traceval Run Summary[/bold purple]")
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
194
|
+
|
|
195
|
+
if _results_accumulator:
|
|
196
|
+
table = Table(show_header=True, header_style="bold blue")
|
|
197
|
+
table.add_column("Case ID", style="cyan")
|
|
198
|
+
table.add_column("Cluster", style="magenta")
|
|
199
|
+
table.add_column("Outcome", justify="center")
|
|
200
|
+
table.add_column("Latency (ms)", justify="right")
|
|
201
|
+
|
|
202
|
+
for r in _results_accumulator:
|
|
203
|
+
outcome_str = "[bold green]PASS[/bold green]" if r["passed"] else "[bold red]FAIL[/bold red]"
|
|
204
|
+
table.add_row(
|
|
205
|
+
r["case_id"],
|
|
206
|
+
r["cluster"],
|
|
207
|
+
outcome_str,
|
|
208
|
+
f"{r['latency_ms']:.1f}"
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
console.print(table)
|
|
212
|
+
console.print(f"Total: {total_count} | Passed: {passed_count} | Failed: {failed_count} | Errored: {errored_count}")
|
|
135
213
|
console.print(f"Run report written to: {report_file}")
|
|
136
214
|
|
|
137
215
|
# Optional --compare checking
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""Regression tests for the failed-target incident: a run in which zero
|
|
2
|
+
cases execute must write a self-describing run report, print one clear
|
|
3
|
+
error line, and never report null. Silence is the bug.
|
|
4
|
+
|
|
5
|
+
Reported via external review of a failed-target invocation.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
import sys
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from typer.testing import CliRunner
|
|
13
|
+
|
|
14
|
+
from traceval.cli import app
|
|
15
|
+
from traceval.compile import generate_evals
|
|
16
|
+
from traceval.ingest import ingest_file
|
|
17
|
+
from traceval.store import TraceStore
|
|
18
|
+
|
|
19
|
+
FIXTURES_DIR = Path(__file__).parent / "fixtures"
|
|
20
|
+
BROKEN_TARGET = "no.such.module:fn"
|
|
21
|
+
|
|
22
|
+
runner = CliRunner()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _generate_suite(tmp_path):
|
|
26
|
+
db_path = tmp_path / "traces.db"
|
|
27
|
+
store = TraceStore(db_path)
|
|
28
|
+
ingest_file(FIXTURES_DIR / "generic_traces.jsonl", store, format_name="generic")
|
|
29
|
+
store.close()
|
|
30
|
+
evals_dir = tmp_path / "evals"
|
|
31
|
+
generate_evals(db_path, evals_dir, include_failures=True)
|
|
32
|
+
return evals_dir
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _run(evals_dir, *extra_args):
|
|
36
|
+
# `run` calls pytest.main in-process; a previously executed generated
|
|
37
|
+
# suite leaves its conftest cached in sys.modules and poisons this one.
|
|
38
|
+
for mod in ("conftest", "test_generated"):
|
|
39
|
+
sys.modules.pop(mod, None)
|
|
40
|
+
return runner.invoke(
|
|
41
|
+
app,
|
|
42
|
+
[
|
|
43
|
+
"run",
|
|
44
|
+
str(evals_dir),
|
|
45
|
+
"--target",
|
|
46
|
+
BROKEN_TARGET,
|
|
47
|
+
"--judge",
|
|
48
|
+
"fake",
|
|
49
|
+
*extra_args,
|
|
50
|
+
],
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_broken_target_writes_report(tmp_path):
|
|
55
|
+
evals_dir = _generate_suite(tmp_path)
|
|
56
|
+
result = _run(evals_dir)
|
|
57
|
+
assert result.exit_code != 0
|
|
58
|
+
|
|
59
|
+
reports = list((evals_dir / "runs").glob("run_*.json"))
|
|
60
|
+
assert len(reports) == 1, "exactly one report must be written"
|
|
61
|
+
|
|
62
|
+
with open(reports[0], encoding="utf-8") as f:
|
|
63
|
+
report = json.load(f)
|
|
64
|
+
assert report["summary"]["errored"] == report["summary"]["total"] > 0
|
|
65
|
+
assert report["summary"]["passed"] == 0
|
|
66
|
+
assert report["summary"]["failed"] == 0
|
|
67
|
+
assert report["results"] == []
|
|
68
|
+
assert report["errors"][0]["stage"] == "target_resolution"
|
|
69
|
+
assert BROKEN_TARGET in report["errors"][0]["detail"]
|
|
70
|
+
# Identical per-case setup errors deduplicate into one counted entry
|
|
71
|
+
setup_errors = [e for e in report["errors"] if e["stage"] == "setup"]
|
|
72
|
+
assert len(setup_errors) == 1
|
|
73
|
+
assert setup_errors[0]["count"] == report["summary"]["errored"]
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def test_broken_target_json_report_not_null(tmp_path):
|
|
77
|
+
evals_dir = _generate_suite(tmp_path)
|
|
78
|
+
result = _run(evals_dir, "--json")
|
|
79
|
+
data = json.loads(result.stdout)
|
|
80
|
+
assert isinstance(data["report"], str)
|
|
81
|
+
assert Path(data["report"]).exists()
|
|
82
|
+
assert data["exit_code"] == 1
|
|
83
|
+
assert result.exit_code == 1
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def test_broken_target_prints_one_clear_error(tmp_path):
|
|
87
|
+
evals_dir = _generate_suite(tmp_path)
|
|
88
|
+
result = _run(evals_dir)
|
|
89
|
+
error_line = f"ERROR: target '{BROKEN_TARGET}' could not be imported"
|
|
90
|
+
assert result.output.count(error_line) == 1, result.output
|
|
Binary file
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.2.1"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|