traceval 0.2.1__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. {traceval-0.2.1 → traceval-0.2.2}/CHANGELOG.md +6 -0
  2. traceval-0.2.2/CONTRIBUTING.md +22 -0
  3. traceval-0.2.2/Makefile +14 -0
  4. {traceval-0.2.1 → traceval-0.2.2}/PKG-INFO +8 -3
  5. {traceval-0.2.1 → traceval-0.2.2}/README.md +7 -2
  6. traceval-0.2.2/docs/img/report.png +0 -0
  7. {traceval-0.2.1 → traceval-0.2.2}/pyproject.toml +1 -1
  8. traceval-0.2.2/src/traceval/__init__.py +1 -0
  9. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/compile/templates/conftest.py.jinja +112 -34
  10. traceval-0.2.2/tests/test_broken_target.py +90 -0
  11. {traceval-0.2.1 → traceval-0.2.2}/uv.lock +1 -1
  12. traceval-0.2.1/docs/img/report.png +0 -0
  13. traceval-0.2.1/src/traceval/__init__.py +0 -1
  14. {traceval-0.2.1 → traceval-0.2.2}/.github/workflows/ci.yml +0 -0
  15. {traceval-0.2.1 → traceval-0.2.2}/.gitignore +0 -0
  16. {traceval-0.2.1 → traceval-0.2.2}/.pre-commit-config.yaml +0 -0
  17. {traceval-0.2.1 → traceval-0.2.2}/LICENSE +0 -0
  18. {traceval-0.2.1 → traceval-0.2.2}/action.yml +0 -0
  19. {traceval-0.2.1 → traceval-0.2.2}/docs/formats.md +0 -0
  20. {traceval-0.2.1 → traceval-0.2.2}/examples/demo.sh +0 -0
  21. {traceval-0.2.1 → traceval-0.2.2}/examples/demo_agent/agent.py +0 -0
  22. {traceval-0.2.1 → traceval-0.2.2}/examples/demo_agent/core.py +0 -0
  23. {traceval-0.2.1 → traceval-0.2.2}/examples/make_traces.py +0 -0
  24. {traceval-0.2.1 → traceval-0.2.2}/examples/synthetic_traces.jsonl +0 -0
  25. {traceval-0.2.1 → traceval-0.2.2}/scripts/readme-outputs.sh +0 -0
  26. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/analyze/__init__.py +0 -0
  27. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/analyze/cluster.py +0 -0
  28. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/analyze/coverage.py +0 -0
  29. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/analyze/outcomes.py +0 -0
  30. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/analyze/report.py +0 -0
  31. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/cli.py +0 -0
  32. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/compile/__init__.py +0 -0
  33. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/compile/cases.py +0 -0
  34. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/compile/emit_pytest.py +0 -0
  35. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/compile/emit_yaml.py +0 -0
  36. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/compile/rubrics.py +0 -0
  37. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/compile/templates/test_generated.py.jinja +0 -0
  38. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/demo/__init__.py +0 -0
  39. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/demo/agent.py +0 -0
  40. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/demo/traces.py +0 -0
  41. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/ingest/__init__.py +0 -0
  42. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/ingest/base.py +0 -0
  43. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/ingest/generic.py +0 -0
  44. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/ingest/langfuse.py +0 -0
  45. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/ingest/langsmith.py +0 -0
  46. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/ingest/otel.py +0 -0
  47. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/model.py +0 -0
  48. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/run/calibrate.py +0 -0
  49. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/run/judge.py +0 -0
  50. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/run/runner.py +0 -0
  51. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/run/scorers.py +0 -0
  52. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/run/target.py +0 -0
  53. {traceval-0.2.1 → traceval-0.2.2}/src/traceval/store.py +0 -0
  54. {traceval-0.2.1 → traceval-0.2.2}/tests/fixtures/README.md +0 -0
  55. {traceval-0.2.1 → traceval-0.2.2}/tests/fixtures/generic_traces.jsonl +0 -0
  56. {traceval-0.2.1 → traceval-0.2.2}/tests/fixtures/langfuse_export.jsonl +0 -0
  57. {traceval-0.2.1 → traceval-0.2.2}/tests/fixtures/langsmith_runs.jsonl +0 -0
  58. {traceval-0.2.1 → traceval-0.2.2}/tests/fixtures/otel_spans.jsonl +0 -0
  59. {traceval-0.2.1 → traceval-0.2.2}/tests/test_calibrate.py +0 -0
  60. {traceval-0.2.1 → traceval-0.2.2}/tests/test_cli.py +0 -0
  61. {traceval-0.2.1 → traceval-0.2.2}/tests/test_e2e_demo.py +0 -0
  62. {traceval-0.2.1 → traceval-0.2.2}/tests/test_json_output.py +0 -0
  63. {traceval-0.2.1 → traceval-0.2.2}/tests/test_phase1.py +0 -0
  64. {traceval-0.2.1 → traceval-0.2.2}/tests/test_phase2.py +0 -0
  65. {traceval-0.2.1 → traceval-0.2.2}/tests/test_phase3.py +0 -0
  66. {traceval-0.2.1 → traceval-0.2.2}/tests/test_phase4.py +0 -0
  67. {traceval-0.2.1 → traceval-0.2.2}/tests/test_phase5.py +0 -0
  68. {traceval-0.2.1 → traceval-0.2.2}/tests/test_phase6.py +0 -0
  69. {traceval-0.2.1 → traceval-0.2.2}/tests/test_regression_cases.py +0 -0
  70. {traceval-0.2.1 → traceval-0.2.2}/tests/test_serve.py +0 -0
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.2.2] - 2026-07-02
9
+
10
+ ### Fixed
11
+ - A run in which zero cases execute (e.g. unresolvable target) now writes a self-describing run report with an `errors` section instead of writing nothing; `--json` never reports `null`. Added a single clear top-level error line on target-resolution failure. Added a `make test` target and CONTRIBUTING.md to document the canonical dev commands. Reported via external review of a failed-target invocation.
12
+ - Run report schema additions (existing fields unchanged): `summary.errored` counts cases that never executed due to setup/collection errors; top-level `errors` lists `{stage, detail}` entries for `target_resolution`, `collection`, and `setup` failures, with identical details deduplicated into one entry carrying a `count`; `results` is `[]` rather than absent when nothing executed. The terminal summary now shows `Errored: n`.
13
+
8
14
  ## [0.2.1] - 2026-07-02
9
15
 
10
16
  ### Added
@@ -0,0 +1,22 @@
1
+ # Contributing to traceval
2
+
3
+ ## Setup
4
+
5
+ ```bash
6
+ git clone https://github.com/theramkm/traceval.git
7
+ cd traceval
8
+ uv sync # installs the package and dev dependencies (same as CI)
9
+ ```
10
+
11
+ ## Development loop
12
+
13
+ ```bash
14
+ make test # pytest -q
15
+ make lint # ruff check, ruff format --check, mypy
16
+ make demo # end-to-end smoke: healthy agent passes, buggy agent fails
17
+ make all # lint + test
18
+ ```
19
+
20
+ CI runs the same commands on Python 3.11, 3.12, and 3.13, plus a
21
+ wheel-based demo smoke job, and enforces 85% coverage. Keep all of it
22
+ green; add a test for every behavior change.
@@ -0,0 +1,14 @@
1
+ .PHONY: test lint demo all
2
+
3
+ test:
4
+ uv run pytest -q
5
+
6
+ lint:
7
+ uv run ruff check src tests examples
8
+ uv run ruff format --check src tests examples
9
+ uv run mypy src/traceval
10
+
11
+ demo:
12
+ uv run traceval demo -o /tmp/traceval-demo --force
13
+
14
+ all: lint test
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: traceval
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: Trace-to-Eval Compiler
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -140,7 +140,7 @@ traceval Run Summary
140
140
  │ c_d3f3b631__case_007 │ c_d3f3b631 │ PASS │ 0.0 │
141
141
  │ c_e834c13c__case_008 │ c_e834c13c │ PASS │ 0.0 │
142
142
  └──────────────────────┴────────────┴─────────┴──────────────┘
143
- Total: 8 | Passed: 8 | Failed: 0
143
+ Total: 8 | Passed: 8 | Failed: 0 | Errored: 0
144
144
  ```
145
145
 
146
146
  The target is an HTTP URL or a `module:function` callable. Checks cover `exact`, `contains_any`, `not_contains`, `regex`, `json_schema`, `tool_sequence`, `no_tool_loop`, and `judge`. Run reports land in `<evals_dir>/runs/` (override with `--runs-dir`); pass `--compare <previous report>` to print regressions and improvements between runs. The exit code is nonzero when any case fails.
@@ -191,7 +191,7 @@ jobs:
191
191
  runs-on: ubuntu-latest
192
192
  steps:
193
193
  - uses: actions/checkout@v4
194
- - uses: theramkm/traceval@v0.2.1
194
+ - uses: theramkm/traceval@v0.2.2
195
195
  with:
196
196
  evals-dir: evals/
197
197
  target: myapp.agent:invoke_agent # or an HTTP URL
@@ -200,6 +200,11 @@ jobs:
200
200
 
201
201
  Inputs: `evals-dir` and `target` (required); `judge`, `compare`, `only`, `runs-dir`, `traceval-version`, `python-version` (optional). For a real LLM judge, set `judge: openai` and pass `OPENAI_API_KEY` (or `GEMINI_API_KEY`) via `env:` from your repository secrets.
202
202
 
203
+ ## Development
204
+
205
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for setup.
206
+ Run the test suite with `make test`, the lint gates with `make lint`, and the full gate set (lint + test) with `make all`.
207
+
203
208
  ## Honest Limitations
204
209
 
205
210
  * **Side-Effect Free**: traceval assertions evaluate input/output matches. It does not attempt to replay side effects (e.g., updating database records) on mock tools.
@@ -123,7 +123,7 @@ traceval Run Summary
123
123
  │ c_d3f3b631__case_007 │ c_d3f3b631 │ PASS │ 0.0 │
124
124
  │ c_e834c13c__case_008 │ c_e834c13c │ PASS │ 0.0 │
125
125
  └──────────────────────┴────────────┴─────────┴──────────────┘
126
- Total: 8 | Passed: 8 | Failed: 0
126
+ Total: 8 | Passed: 8 | Failed: 0 | Errored: 0
127
127
  ```
128
128
 
129
129
  The target is an HTTP URL or a `module:function` callable. Checks cover `exact`, `contains_any`, `not_contains`, `regex`, `json_schema`, `tool_sequence`, `no_tool_loop`, and `judge`. Run reports land in `<evals_dir>/runs/` (override with `--runs-dir`); pass `--compare <previous report>` to print regressions and improvements between runs. The exit code is nonzero when any case fails.
@@ -174,7 +174,7 @@ jobs:
174
174
  runs-on: ubuntu-latest
175
175
  steps:
176
176
  - uses: actions/checkout@v4
177
- - uses: theramkm/traceval@v0.2.1
177
+ - uses: theramkm/traceval@v0.2.2
178
178
  with:
179
179
  evals-dir: evals/
180
180
  target: myapp.agent:invoke_agent # or an HTTP URL
@@ -183,6 +183,11 @@ jobs:
183
183
 
184
184
  Inputs: `evals-dir` and `target` (required); `judge`, `compare`, `only`, `runs-dir`, `traceval-version`, `python-version` (optional). For a real LLM judge, set `judge: openai` and pass `OPENAI_API_KEY` (or `GEMINI_API_KEY`) via `env:` from your repository secrets.
185
185
 
186
+ ## Development
187
+
188
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for setup.
189
+ Run the test suite with `make test`, the lint gates with `make lint`, and the full gate set (lint + test) with `make all`.
190
+
186
191
  ## Honest Limitations
187
192
 
188
193
  * **Side-Effect Free**: traceval assertions evaluate input/output matches. It does not attempt to replay side effects (e.g., updating database records) on mock tools.
Binary file
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "traceval"
7
- version = "0.2.1"
7
+ version = "0.2.2"
8
8
  description = "Trace-to-Eval Compiler"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11"
@@ -0,0 +1 @@
1
+ __version__ = "0.2.2"
@@ -46,38 +46,111 @@ def pytest_generate_tests(metafunc):
46
46
 
47
47
  metafunc.parametrize("eval_case", cases, ids=[c["id"] for c in cases])
48
48
 
49
- @pytest.fixture(scope="session")
50
- def target_runner(request):
51
- target_opt = request.config.getoption("--target")
49
+ # Target resolution runs ONCE per session so a broken target produces one
50
+ # clear error line up front (and one errors entry in the run report),
51
+ # never a wall of identical per-case tracebacks with no explanation.
52
+ _target_probe = {}
53
+
54
+ def _probe_target(config):
55
+ if _target_probe.get("done"):
56
+ return _target_probe
57
+ _target_probe["done"] = True
58
+
59
+ target_opt = config.getoption("--target")
52
60
  if not target_opt:
53
61
  config_path = Path(__file__).parent / "traceval.yaml"
54
62
  if config_path.exists():
55
63
  try:
56
64
  with open(config_path, encoding="utf-8") as f:
57
- config = yaml.safe_load(f)
58
- target_opt = config.get("target", {}).get("default_url")
65
+ cfg = yaml.safe_load(f)
66
+ target_opt = cfg.get("target", {}).get("default_url")
59
67
  except Exception:
60
68
  pass
69
+
61
70
  if not target_opt:
62
- pytest.fail("No target specified. Use --target option or set in traceval.yaml.")
63
- return resolve_target(target_opt)
71
+ detail = "No target specified. Use --target option or set in traceval.yaml."
72
+ _target_probe["error"] = detail
73
+ _record_error("target_resolution", detail)
74
+ return _target_probe
75
+
76
+ try:
77
+ _target_probe["target"] = resolve_target(target_opt)
78
+ except Exception as e:
79
+ detail = f"target '{target_opt}' could not be imported ({e}). Check the module path or URL."
80
+ _target_probe["error"] = detail
81
+ _record_error("target_resolution", detail)
82
+ return _target_probe
83
+
84
+ if target_opt.startswith(("http://", "https://")):
85
+ # First-contact reachability check; connection-level failures only.
86
+ # The resolved target is kept either way so cases still run.
87
+ import httpx
88
+ try:
89
+ httpx.get(target_opt, timeout=2.0)
90
+ except (httpx.ConnectError, httpx.InvalidURL, httpx.UnsupportedProtocol) as e:
91
+ detail = f"target '{target_opt}' is unreachable ({e}). Check the module path or URL."
92
+ _target_probe["error"] = detail
93
+ _record_error("target_resolution", detail)
94
+
95
+ return _target_probe
96
+
97
+ def pytest_report_header(config):
98
+ probe = _probe_target(config)
99
+ if probe.get("error"):
100
+ return [f"ERROR: {probe['error']}"]
101
+ return []
102
+
103
+ @pytest.fixture(scope="session")
104
+ def target_runner(request):
105
+ probe = _probe_target(request.config)
106
+ if "target" not in probe:
107
+ pytest.fail(probe.get("error") or "Target resolution failed.")
108
+ return probe["target"]
64
109
 
65
110
  @pytest.fixture(scope="session")
66
111
  def judge_runner(request):
67
112
  judge_opt = request.config.getoption("--judge")
68
113
  return resolve_judge(judge_opt)
69
114
 
70
- # Accumulator for final report
115
+ # Accumulators for final report
71
116
  _results_accumulator = []
117
+ _errors_accumulator = []
118
+ _errored_cases = [0]
119
+
120
+ def _record_error(stage, detail):
121
+ # Deduplicate identical details into one entry with a count field
122
+ for entry in _errors_accumulator:
123
+ if entry["stage"] == stage and entry["detail"] == detail:
124
+ entry["count"] = entry.get("count", 1) + 1
125
+ return
126
+ _errors_accumulator.append({"stage": stage, "detail": detail})
127
+
128
+ def pytest_collectreport(report):
129
+ if report.failed:
130
+ detail = str(report.longrepr).strip().splitlines()[-1]
131
+ _record_error("collection", detail)
132
+
133
+ @pytest.hookimpl(hookwrapper=True)
134
+ def pytest_runtest_makereport(item, call):
135
+ outcome = yield
136
+ report = outcome.get_result()
137
+ if report.when == "setup" and report.failed:
138
+ # Case never executed (fixture/setup error)
139
+ _errored_cases[0] += 1
140
+ detail = getattr(getattr(report, "longrepr", None), "reprcrash", None)
141
+ detail = detail.message if detail else str(report.longrepr).strip().splitlines()[-1]
142
+ _record_error("setup", detail)
72
143
 
73
144
  @pytest.hookimpl(tryfirst=True)
74
145
  def pytest_sessionstart(session):
75
146
  _results_accumulator.clear()
147
+ _errors_accumulator.clear()
148
+ _errored_cases[0] = 0
76
149
 
77
150
  def pytest_sessionfinish(session, exitstatus):
78
- if not _results_accumulator:
79
- return
80
-
151
+ # ALWAYS write a run report, including when zero cases executed:
152
+ # catastrophic failure must produce a self-describing artifact,
153
+ # never silence.
81
154
  runs_opt = session.config.getoption("--runs-dir")
82
155
  if runs_opt:
83
156
  runs_dir = Path(runs_opt).resolve()
@@ -95,43 +168,48 @@ def pytest_sessionfinish(session, exitstatus):
95
168
 
96
169
  passed_count = sum(1 for r in _results_accumulator if r["passed"])
97
170
  failed_count = len(_results_accumulator) - passed_count
98
-
171
+ errored_count = _errored_cases[0]
172
+ total_count = len(_results_accumulator) + errored_count
173
+
99
174
  report = {
100
175
  "timestamp": datetime.now(timezone.utc).isoformat(),
101
176
  "summary": {
102
- "total": len(_results_accumulator),
177
+ "total": total_count,
103
178
  "passed": passed_count,
104
179
  "failed": failed_count,
180
+ "errored": errored_count,
105
181
  },
182
+ "errors": _errors_accumulator,
106
183
  "results": _results_accumulator
107
184
  }
108
-
185
+
109
186
  report_file.write_text(json.dumps(report, indent=2), encoding="utf-8")
110
-
187
+
111
188
  # Rich Table Terminal output
112
189
  from rich.console import Console
113
190
  from rich.table import Table
114
-
191
+
115
192
  console = Console()
116
193
  console.print("\n[bold purple]traceval Run Summary[/bold purple]")
117
-
118
- table = Table(show_header=True, header_style="bold blue")
119
- table.add_column("Case ID", style="cyan")
120
- table.add_column("Cluster", style="magenta")
121
- table.add_column("Outcome", justify="center")
122
- table.add_column("Latency (ms)", justify="right")
123
-
124
- for r in _results_accumulator:
125
- outcome_str = "[bold green]PASS[/bold green]" if r["passed"] else "[bold red]FAIL[/bold red]"
126
- table.add_row(
127
- r["case_id"],
128
- r["cluster"],
129
- outcome_str,
130
- f"{r['latency_ms']:.1f}"
131
- )
132
-
133
- console.print(table)
134
- console.print(f"Total: {len(_results_accumulator)} | Passed: {passed_count} | Failed: {failed_count}")
194
+
195
+ if _results_accumulator:
196
+ table = Table(show_header=True, header_style="bold blue")
197
+ table.add_column("Case ID", style="cyan")
198
+ table.add_column("Cluster", style="magenta")
199
+ table.add_column("Outcome", justify="center")
200
+ table.add_column("Latency (ms)", justify="right")
201
+
202
+ for r in _results_accumulator:
203
+ outcome_str = "[bold green]PASS[/bold green]" if r["passed"] else "[bold red]FAIL[/bold red]"
204
+ table.add_row(
205
+ r["case_id"],
206
+ r["cluster"],
207
+ outcome_str,
208
+ f"{r['latency_ms']:.1f}"
209
+ )
210
+
211
+ console.print(table)
212
+ console.print(f"Total: {total_count} | Passed: {passed_count} | Failed: {failed_count} | Errored: {errored_count}")
135
213
  console.print(f"Run report written to: {report_file}")
136
214
 
137
215
  # Optional --compare checking
@@ -0,0 +1,90 @@
1
+ """Regression tests for the failed-target incident: a run in which zero
2
+ cases execute must write a self-describing run report, print one clear
3
+ error line, and never report null. Silence is the bug.
4
+
5
+ Reported via external review of a failed-target invocation.
6
+ """
7
+
8
+ import json
9
+ import sys
10
+ from pathlib import Path
11
+
12
+ from typer.testing import CliRunner
13
+
14
+ from traceval.cli import app
15
+ from traceval.compile import generate_evals
16
+ from traceval.ingest import ingest_file
17
+ from traceval.store import TraceStore
18
+
19
+ FIXTURES_DIR = Path(__file__).parent / "fixtures"
20
+ BROKEN_TARGET = "no.such.module:fn"
21
+
22
+ runner = CliRunner()
23
+
24
+
25
+ def _generate_suite(tmp_path):
26
+ db_path = tmp_path / "traces.db"
27
+ store = TraceStore(db_path)
28
+ ingest_file(FIXTURES_DIR / "generic_traces.jsonl", store, format_name="generic")
29
+ store.close()
30
+ evals_dir = tmp_path / "evals"
31
+ generate_evals(db_path, evals_dir, include_failures=True)
32
+ return evals_dir
33
+
34
+
35
+ def _run(evals_dir, *extra_args):
36
+ # `run` calls pytest.main in-process; a previously executed generated
37
+ # suite leaves its conftest cached in sys.modules and poisons this one.
38
+ for mod in ("conftest", "test_generated"):
39
+ sys.modules.pop(mod, None)
40
+ return runner.invoke(
41
+ app,
42
+ [
43
+ "run",
44
+ str(evals_dir),
45
+ "--target",
46
+ BROKEN_TARGET,
47
+ "--judge",
48
+ "fake",
49
+ *extra_args,
50
+ ],
51
+ )
52
+
53
+
54
+ def test_broken_target_writes_report(tmp_path):
55
+ evals_dir = _generate_suite(tmp_path)
56
+ result = _run(evals_dir)
57
+ assert result.exit_code != 0
58
+
59
+ reports = list((evals_dir / "runs").glob("run_*.json"))
60
+ assert len(reports) == 1, "exactly one report must be written"
61
+
62
+ with open(reports[0], encoding="utf-8") as f:
63
+ report = json.load(f)
64
+ assert report["summary"]["errored"] == report["summary"]["total"] > 0
65
+ assert report["summary"]["passed"] == 0
66
+ assert report["summary"]["failed"] == 0
67
+ assert report["results"] == []
68
+ assert report["errors"][0]["stage"] == "target_resolution"
69
+ assert BROKEN_TARGET in report["errors"][0]["detail"]
70
+ # Identical per-case setup errors deduplicate into one counted entry
71
+ setup_errors = [e for e in report["errors"] if e["stage"] == "setup"]
72
+ assert len(setup_errors) == 1
73
+ assert setup_errors[0]["count"] == report["summary"]["errored"]
74
+
75
+
76
+ def test_broken_target_json_report_not_null(tmp_path):
77
+ evals_dir = _generate_suite(tmp_path)
78
+ result = _run(evals_dir, "--json")
79
+ data = json.loads(result.stdout)
80
+ assert isinstance(data["report"], str)
81
+ assert Path(data["report"]).exists()
82
+ assert data["exit_code"] == 1
83
+ assert result.exit_code == 1
84
+
85
+
86
+ def test_broken_target_prints_one_clear_error(tmp_path):
87
+ evals_dir = _generate_suite(tmp_path)
88
+ result = _run(evals_dir)
89
+ error_line = f"ERROR: target '{BROKEN_TARGET}' could not be imported"
90
+ assert result.output.count(error_line) == 1, result.output
@@ -1037,7 +1037,7 @@ wheels = [
1037
1037
 
1038
1038
  [[package]]
1039
1039
  name = "traceval"
1040
- version = "0.2.1"
1040
+ version = "0.2.2"
1041
1041
  source = { editable = "." }
1042
1042
  dependencies = [
1043
1043
  { name = "httpx" },
Binary file
@@ -1 +0,0 @@
1
- __version__ = "0.2.1"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes