traceval 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {traceval-0.2.0 → traceval-0.2.2}/.github/workflows/ci.yml +35 -10
- {traceval-0.2.0 → traceval-0.2.2}/CHANGELOG.md +24 -0
- traceval-0.2.2/CONTRIBUTING.md +22 -0
- traceval-0.2.2/Makefile +14 -0
- traceval-0.2.2/PKG-INFO +212 -0
- traceval-0.2.2/README.md +195 -0
- traceval-0.2.2/docs/img/report.png +0 -0
- traceval-0.2.2/examples/demo.sh +7 -0
- traceval-0.2.2/examples/demo_agent/core.py +8 -0
- traceval-0.2.2/examples/make_traces.py +11 -0
- {traceval-0.2.0 → traceval-0.2.2}/pyproject.toml +1 -1
- traceval-0.2.2/scripts/readme-outputs.sh +53 -0
- traceval-0.2.2/src/traceval/__init__.py +1 -0
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/analyze/report.py +4 -1
- traceval-0.2.2/src/traceval/cli.py +534 -0
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/compile/__init__.py +7 -2
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/compile/cases.py +51 -6
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/compile/templates/conftest.py.jinja +118 -38
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/compile/templates/test_generated.py.jinja +13 -0
- traceval-0.2.2/src/traceval/demo/__init__.py +9 -0
- traceval-0.2.0/examples/demo_agent/core.py → traceval-0.2.2/src/traceval/demo/agent.py +3 -2
- traceval-0.2.0/examples/make_traces.py → traceval-0.2.2/src/traceval/demo/traces.py +5 -10
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/run/scorers.py +7 -2
- traceval-0.2.2/tests/test_broken_target.py +90 -0
- traceval-0.2.2/tests/test_e2e_demo.py +67 -0
- traceval-0.2.2/tests/test_json_output.py +112 -0
- {traceval-0.2.0 → traceval-0.2.2}/tests/test_phase6.py +12 -1
- {traceval-0.2.0 → traceval-0.2.2}/tests/test_regression_cases.py +102 -0
- traceval-0.2.2/tests/test_serve.py +25 -0
- {traceval-0.2.0 → traceval-0.2.2}/uv.lock +1 -1
- traceval-0.2.0/PKG-INFO +0 -200
- traceval-0.2.0/README.md +0 -183
- traceval-0.2.0/examples/demo.sh +0 -31
- traceval-0.2.0/src/traceval/__init__.py +0 -1
- traceval-0.2.0/src/traceval/cli.py +0 -284
- traceval-0.2.0/tests/test_e2e_demo.py +0 -99
- {traceval-0.2.0 → traceval-0.2.2}/.gitignore +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/.pre-commit-config.yaml +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/LICENSE +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/action.yml +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/docs/formats.md +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/examples/demo_agent/agent.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/examples/synthetic_traces.jsonl +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/analyze/__init__.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/analyze/cluster.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/analyze/coverage.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/analyze/outcomes.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/compile/emit_pytest.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/compile/emit_yaml.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/compile/rubrics.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/ingest/__init__.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/ingest/base.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/ingest/generic.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/ingest/langfuse.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/ingest/langsmith.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/ingest/otel.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/model.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/run/calibrate.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/run/judge.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/run/runner.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/run/target.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/src/traceval/store.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/tests/fixtures/README.md +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/tests/fixtures/generic_traces.jsonl +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/tests/fixtures/langfuse_export.jsonl +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/tests/fixtures/langsmith_runs.jsonl +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/tests/fixtures/otel_spans.jsonl +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/tests/test_calibrate.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/tests/test_cli.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/tests/test_phase1.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/tests/test_phase2.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/tests/test_phase3.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/tests/test_phase4.py +0 -0
- {traceval-0.2.0 → traceval-0.2.2}/tests/test_phase5.py +0 -0
|
@@ -34,6 +34,33 @@ jobs:
|
|
|
34
34
|
- name: Run Tests with Coverage
|
|
35
35
|
run: uv run pytest --cov=src/traceval --cov-fail-under=85
|
|
36
36
|
|
|
37
|
+
demo-wheel:
|
|
38
|
+
# The pip-user path is what ships: build the wheel, install it in a
|
|
39
|
+
# clean venv, and run the packaged demo from an empty directory.
|
|
40
|
+
needs: test
|
|
41
|
+
runs-on: ubuntu-latest
|
|
42
|
+
steps:
|
|
43
|
+
- uses: actions/checkout@v4
|
|
44
|
+
|
|
45
|
+
- name: Set up Python
|
|
46
|
+
uses: actions/setup-python@v5
|
|
47
|
+
with:
|
|
48
|
+
python-version: "3.12"
|
|
49
|
+
|
|
50
|
+
- name: Build wheel
|
|
51
|
+
run: |
|
|
52
|
+
pip install build
|
|
53
|
+
python -m build --wheel
|
|
54
|
+
|
|
55
|
+
- name: Install wheel in a clean venv and run the demo
|
|
56
|
+
run: |
|
|
57
|
+
python -m venv /tmp/wheel-venv
|
|
58
|
+
/tmp/wheel-venv/bin/pip install dist/*.whl
|
|
59
|
+
mkdir /tmp/demo-smoke
|
|
60
|
+
cd /tmp/demo-smoke
|
|
61
|
+
/tmp/wheel-venv/bin/traceval demo
|
|
62
|
+
/tmp/wheel-venv/bin/traceval analyze traceval-demo/traces.db --json | python -m json.tool
|
|
63
|
+
|
|
37
64
|
demo-action:
|
|
38
65
|
# Dogfoods action.yml against the demo agent: the only CI coverage the
|
|
39
66
|
# user-facing GitHub Action gets.
|
|
@@ -50,15 +77,13 @@ jobs:
|
|
|
50
77
|
- name: Generate demo eval suite
|
|
51
78
|
run: |
|
|
52
79
|
pip install .
|
|
53
|
-
|
|
54
|
-
traceval ingest examples/synthetic_traces.jsonl -o demo_traces.db
|
|
55
|
-
traceval generate demo_traces.db -o demo_evals/ --include-failures
|
|
80
|
+
traceval demo -o demo-artifacts
|
|
56
81
|
|
|
57
82
|
- name: Run evals via the action (healthy agent must pass)
|
|
58
83
|
uses: ./
|
|
59
84
|
with:
|
|
60
|
-
evals-dir:
|
|
61
|
-
target:
|
|
85
|
+
evals-dir: demo-artifacts/evals
|
|
86
|
+
target: traceval.demo.agent:invoke_agent
|
|
62
87
|
traceval-version: local
|
|
63
88
|
|
|
64
89
|
- name: Run evals via the action (buggy agent must fail)
|
|
@@ -68,20 +93,20 @@ jobs:
|
|
|
68
93
|
env:
|
|
69
94
|
BUGGY: "true"
|
|
70
95
|
with:
|
|
71
|
-
evals-dir:
|
|
72
|
-
target:
|
|
96
|
+
evals-dir: demo-artifacts/evals
|
|
97
|
+
target: traceval.demo.agent:invoke_agent
|
|
73
98
|
traceval-version: local
|
|
74
99
|
|
|
75
100
|
- name: Assert buggy run failed
|
|
76
101
|
run: |
|
|
77
102
|
if [ "${{ steps.buggy.outcome }}" != "failure" ]; then
|
|
78
|
-
echo "
|
|
103
|
+
echo "Buggy agent unexpectedly passed the eval suite"
|
|
79
104
|
exit 1
|
|
80
105
|
fi
|
|
81
|
-
echo "
|
|
106
|
+
echo "Action correctly failed the job for the buggy agent"
|
|
82
107
|
|
|
83
108
|
tag-and-release:
|
|
84
|
-
needs: [test, demo-action]
|
|
109
|
+
needs: [test, demo-wheel, demo-action]
|
|
85
110
|
if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/master')
|
|
86
111
|
runs-on: ubuntu-latest
|
|
87
112
|
permissions:
|
|
@@ -5,6 +5,30 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.2.2] - 2026-07-02
|
|
9
|
+
|
|
10
|
+
### Fixed
|
|
11
|
+
- A run in which zero cases execute (e.g. unresolvable target) now writes a self-describing run report with an `errors` section instead of writing nothing; `--json` never reports `null`. Added a single clear top-level error line on target-resolution failure. Added `make test` as the canonical test command and a CONTRIBUTING.md documenting the dev commands. Reported via external review of a failed-target invocation.
|
|
12
|
+
- Run report schema additions (existing fields unchanged): `summary.errored` counts cases that never executed due to setup/collection errors; top-level `errors` lists `{stage, detail}` entries for `target_resolution`, `collection`, and `setup` failures, with identical details deduplicated into one entry carrying a `count`; `results` is `[]` rather than absent when nothing executed. The terminal summary now shows `Errored: n`.
|
|
13
|
+
|
|
14
|
+
## [0.2.1] - 2026-07-02
|
|
15
|
+
|
|
16
|
+
### Added
|
|
17
|
+
- `--json` on `ingest`, `analyze`, `generate`, and `run`: suppresses human-readable output and prints a single JSON object to stdout for scripting (`run` still exits nonzero on failures). The README previously claimed this flag existed; now it does.
|
|
18
|
+
- `traceval demo`: runs the full trace-to-eval loop end-to-end with a built-in demo agent from a plain pip install (no repo clone). Creates `./traceval-demo/` (override with `-o`), refuses to write into a non-empty directory unless `--force`, and `--force` only ever replaces the demo's own artifacts. The demo agent and trace generator moved into the package (`traceval.demo`); `examples/` keeps thin wrappers.
|
|
19
|
+
- `traceval serve [dir]`: serves the analysis report directory on localhost with Python's stdlib http.server and prints the report URL. Not a web UI.
|
|
20
|
+
- Generated `test_generated.py` opens with the three commands a new teammate needs; the run summary ends with a `traceval calibrate` hint for the report just written.
|
|
21
|
+
- CI: wheel-based demo smoke job (build wheel, install into a clean venv, run `traceval demo` from an empty directory) so the pip-user path is what CI tests. Releases now gate on it.
|
|
22
|
+
|
|
23
|
+
### Changed
|
|
24
|
+
- Failure-signature tokens for `not_contains` checks are now distinctiveness-filtered: a token qualifies only if it appears in fewer than 10% of success outputs (same-cluster successes preferred, all successes in the db as fallback). If no token survives, the check is omitted rather than emitting a junk forbidden list.
|
|
25
|
+
- `not_contains` matching is word-boundary based and case-insensitive instead of raw substring: forbidding "error" no longer false-fails a healthy "no errors found".
|
|
26
|
+
- README rewritten: real command outputs (regenerable via `scripts/readme-outputs.sh`), a live screenshot of the analysis report, evidence-based feature claims, plain pipeline diagram, and the GitHub Action example pinned to a release tag.
|
|
27
|
+
|
|
28
|
+
### Fixed
|
|
29
|
+
- Generated run summary printed a literal `\n` before "traceval Run Summary" (jinja over-escaping in the conftest template).
|
|
30
|
+
- `analysis/report.html` version badge was hardcoded to v0.1.0; it now shows the installed traceval version.
|
|
31
|
+
|
|
8
32
|
## [0.2.0] - 2026-07-02
|
|
9
33
|
|
|
10
34
|
### Added
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Contributing to traceval
|
|
2
|
+
|
|
3
|
+
## Setup
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
git clone https://github.com/theramkm/traceval.git
|
|
7
|
+
cd traceval
|
|
8
|
+
uv sync # installs the package and dev dependencies (same as CI)
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Development loop
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
make test # pytest -q
|
|
15
|
+
make lint # ruff check, ruff format --check, mypy
|
|
16
|
+
make demo # end-to-end smoke: healthy agent passes, buggy agent fails
|
|
17
|
+
make all # lint + test
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
CI runs the same commands on Python 3.11, 3.12, and 3.13, plus a
|
|
21
|
+
wheel-based demo smoke job, and enforces 85% coverage. Keep all of it
|
|
22
|
+
green; add a test for every behavior change.
|
traceval-0.2.2/Makefile
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
.PHONY: test lint demo all
|
|
2
|
+
|
|
3
|
+
test:
|
|
4
|
+
uv run pytest -q
|
|
5
|
+
|
|
6
|
+
lint:
|
|
7
|
+
uv run ruff check src tests examples
|
|
8
|
+
uv run ruff format --check src tests examples
|
|
9
|
+
uv run mypy src/traceval
|
|
10
|
+
|
|
11
|
+
demo:
|
|
12
|
+
uv run traceval demo -o /tmp/traceval-demo --force
|
|
13
|
+
|
|
14
|
+
all: lint test
|
traceval-0.2.2/PKG-INFO
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: traceval
|
|
3
|
+
Version: 0.2.2
|
|
4
|
+
Summary: Trace-to-Eval Compiler
|
|
5
|
+
License: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Python: >=3.11
|
|
8
|
+
Requires-Dist: httpx>=0.24.0
|
|
9
|
+
Requires-Dist: jinja2>=3.1.0
|
|
10
|
+
Requires-Dist: jsonschema>=4.17.0
|
|
11
|
+
Requires-Dist: pydantic>=2.0.0
|
|
12
|
+
Requires-Dist: pytest>=8.0.0
|
|
13
|
+
Requires-Dist: pyyaml>=6.0.0
|
|
14
|
+
Requires-Dist: rich>=13.0.0
|
|
15
|
+
Requires-Dist: typer>=0.9.0
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# traceval: Trace-to-Eval Compiler
|
|
19
|
+
|
|
20
|
+
<p align="center">
|
|
21
|
+
<img src="https://img.shields.io/badge/Python-3.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python Version" />
|
|
22
|
+
<img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="License" />
|
|
23
|
+
</p>
|
|
24
|
+
|
|
25
|
+
***"Your traces already know how your agent fails. traceval turns them into the test suite you never wrote."***
|
|
26
|
+
|
|
27
|
+
Teams running LLM agents in production have observability traces, but only a fraction maintain evals. The raw material for good tests, thousands of real traces full of edge cases and errors, sits unused because turning it into a regression suite is manual and tedious.
|
|
28
|
+
|
|
29
|
+
traceval ingests agent traces from standard sources, normalizes them into a canonical Pydantic model, labels outcomes, clusters task shapes, and compiles the result into a human-editable eval suite: YAML cases, a pytest harness, and judge rubric scaffolds.
|
|
30
|
+
|
|
31
|
+

|
|
32
|
+
|
|
33
|
+
## Quickstart
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install traceval
|
|
37
|
+
traceval demo
|
|
38
|
+
open traceval-demo/analysis/report.html # xdg-open on Linux
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
`traceval demo` runs the entire loop against a built-in demo agent: it generates 200 synthetic traces, ingests them, clusters the failures, compiles an eval suite, and then proves the headline claim by running that suite twice:
|
|
42
|
+
|
|
43
|
+
```text
|
|
44
|
+
=== Demo complete: healthy agent PASSED, buggy agent FAILED ===
|
|
45
|
+
Failure-cluster report: traceval-demo/analysis/report.html
|
|
46
|
+
Run report: traceval-demo/evals/runs/run_20260702T072029851802Z.json
|
|
47
|
+
Run report: traceval-demo/evals/runs/run_20260702T072030171406Z.json
|
|
48
|
+
|
|
49
|
+
Re-run any stage manually:
|
|
50
|
+
traceval ingest traceval-demo/synthetic_traces.jsonl -o traceval-demo/traces.db
|
|
51
|
+
traceval analyze traceval-demo/traces.db -o traceval-demo/analysis
|
|
52
|
+
traceval generate traceval-demo/traces.db -o traceval-demo/evals --include-failures
|
|
53
|
+
traceval run traceval-demo/evals --target traceval.demo.agent:invoke_agent --judge fake
|
|
54
|
+
traceval calibrate traceval-demo/evals/runs/run_20260702T072030171406Z.json
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## How it works
|
|
58
|
+
|
|
59
|
+
```mermaid
|
|
60
|
+
graph LR
|
|
61
|
+
A[OTel / Langfuse / LangSmith traces] --> B[Canonical trace DB]
|
|
62
|
+
B --> C[Label and cluster]
|
|
63
|
+
C --> D[YAML cases + pytest + rubrics]
|
|
64
|
+
D --> E[Run, diff, calibrate]
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Features
|
|
68
|
+
|
|
69
|
+
* Ingests OpenTelemetry GenAI, Langfuse, and LangSmith exports, plus generic JSONL. Malformed lines are logged as warnings instead of crashing the run (tested against corrupt fixtures in `tests/fixtures/`).
|
|
70
|
+
* Labels every trace with a rule-based outcome taxonomy (`success`, `tool_error`, `validation_error`, `loop`, `timeout`, `bad_output`) that you can extend with your own Python rules via `--rules`.
|
|
71
|
+
* Clusters task shapes with Jaccard shingle similarity, fully offline: no embeddings, no API calls. Numeric tokens are normalized, so "order 57978" and "order 12345" land in the same cluster.
|
|
72
|
+
* Deterministic generation: regenerating a suite from the same database is byte-identical, so evals diff cleanly in git.
|
|
73
|
+
* Regression cases are inverted: a failure trace asserts the failure does *not* recur (forbidden error signatures, tool-loop bounds, non-empty output), never that the agent reproduces it.
|
|
74
|
+
* Redacts emails, credit cards, phone numbers, and API tokens before case inputs are written (add your own scrubber with `--redact-hook`).
|
|
75
|
+
* `traceval run` exits nonzero on any failing case and diffs against a previous report with `--compare`, so CI can gate deploys on it.
|
|
76
|
+
* `traceval calibrate` measures judge-vs-human agreement per cluster and flags rubrics the automated judge scores unreliably.
|
|
77
|
+
|
|
78
|
+
## Walkthrough on your own traces
|
|
79
|
+
|
|
80
|
+
The command outputs below are real, captured from a run over the demo trace set (regenerate them with `scripts/readme-outputs.sh`).
|
|
81
|
+
|
|
82
|
+
### 1. Ingest
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
traceval ingest traces.jsonl -o traces.db # --format auto|otel|langfuse|langsmith|generic
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
```text
|
|
89
|
+
Ingested 200 traces (209 spans).
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Malformed spans do not abort the ingest; warnings are written to `<traces.db>.log`.
|
|
93
|
+
|
|
94
|
+
### 2. Analyze
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
traceval analyze traces.db -o analysis
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
```text
|
|
101
|
+
Outcomes: success 60% · tool_error 15% · loop 10% · timeout 8% · validation_error 8%
|
|
102
|
+
Clusters: 8 task clusters found.
|
|
103
|
+
Top failure cluster: "refund stripe -> stripe_lookup -> (tool_error)" (30 traces)
|
|
104
|
+
Report written to analysis/report.html
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
`analysis/report.html` is the single-file page shown in the screenshot above. Pass `--evals evals/` to overlay eval coverage per cluster, and `--rules my_rules.py` to add your own labeling rules. To view it over HTTP instead of `file://`, `traceval serve analysis` starts a stdlib localhost server and prints the report URL.
|
|
108
|
+
|
|
109
|
+
### 3. Generate
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
traceval generate traces.db -o evals --include-failures
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
```text
|
|
116
|
+
Wrote 8 eval cases across 8 clusters → evals/cases/*.yaml
|
|
117
|
+
Wrote judge rubrics → evals/rubrics/*.md
|
|
118
|
+
Wrote pytest harness → evals/test_generated.py, evals/conftest.py
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
Every case is a reviewable YAML file. Golden cases assert the recorded successful behavior. Regression cases, generated from failure traces, assert the failure does **not** recur: forbidden error tokens (word-boundary matched, filtered against tokens that success traces also use), tool-loop bounds, and non-empty output. A regression case passes for any agent that avoids that failure mode; golden cases carry general bug detection.
|
|
122
|
+
|
|
123
|
+
### 4. Run
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
traceval run evals --target myapp.agent:invoke_agent --judge fake
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
```text
|
|
130
|
+
traceval Run Summary
|
|
131
|
+
┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┓
|
|
132
|
+
┃ Case ID ┃ Cluster ┃ Outcome ┃ Latency (ms) ┃
|
|
133
|
+
┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━┩
|
|
134
|
+
│ c_0c422a7a__case_001 │ c_0c422a7a │ PASS │ 0.0 │
|
|
135
|
+
│ c_1e5d0942__case_002 │ c_1e5d0942 │ PASS │ 0.0 │
|
|
136
|
+
│ c_2c881177__case_003 │ c_2c881177 │ PASS │ 0.0 │
|
|
137
|
+
│ c_361535b0__case_004 │ c_361535b0 │ PASS │ 0.0 │
|
|
138
|
+
│ c_9a8a4644__case_005 │ c_9a8a4644 │ PASS │ 0.0 │
|
|
139
|
+
│ c_d30af83a__case_006 │ c_d30af83a │ PASS │ 0.0 │
|
|
140
|
+
│ c_d3f3b631__case_007 │ c_d3f3b631 │ PASS │ 0.0 │
|
|
141
|
+
│ c_e834c13c__case_008 │ c_e834c13c │ PASS │ 0.0 │
|
|
142
|
+
└──────────────────────┴────────────┴─────────┴──────────────┘
|
|
143
|
+
Total: 8 | Passed: 8 | Failed: 0 | Errored: 0
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
The target is an HTTP URL or a `module:function` callable. Checks cover `exact`, `contains_any`, `not_contains`, `regex`, `json_schema`, `tool_sequence`, `no_tool_loop`, and `judge`. Run reports land in `<evals_dir>/runs/` (override with `--runs-dir`); pass `--compare <previous report>` to print regressions and improvements between runs. The exit code is nonzero when any case fails.
|
|
147
|
+
|
|
148
|
+
### 5. Calibrate the judge
|
|
149
|
+
|
|
150
|
+
An LLM judge is only as trustworthy as its agreement with human judgment. `calibrate` samples judge-scored results from a run report and presents each agent output for blind pass/fail labeling in the terminal; judge verdicts stay hidden until the end so they cannot anchor you.
|
|
151
|
+
|
|
152
|
+
```bash
|
|
153
|
+
traceval calibrate evals/runs/run_<timestamp>.json --sample 8
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
```text
|
|
157
|
+
Judge Calibration Summary
|
|
158
|
+
┏━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━┓
|
|
159
|
+
┃ Cluster ┃ Labeled ┃ Agreement ┃
|
|
160
|
+
┡━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━┩
|
|
161
|
+
│ c_0c422a7a │ 1 │ 100% │
|
|
162
|
+
│ c_1e5d0942 │ 1 │ 100% │
|
|
163
|
+
│ c_2c881177 │ 1 │ 100% │
|
|
164
|
+
│ c_361535b0 │ 1 │ 100% │
|
|
165
|
+
│ c_9a8a4644 │ 1 │ 0% │
|
|
166
|
+
│ c_d30af83a │ 1 │ 100% │
|
|
167
|
+
│ c_d3f3b631 │ 1 │ 100% │
|
|
168
|
+
│ c_e834c13c │ 1 │ 100% │
|
|
169
|
+
└────────────┴─────────┴───────────┘
|
|
170
|
+
Overall agreement: 88% on 8 case(s) | false-pass (judge OK, human not): 1 | false-fail: 0
|
|
171
|
+
WARNING: Judge unreliable (< 80% agreement) for clusters: c_9a8a4644. Review their rubrics before trusting automated scores.
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
False-pass counts (judge approved, human rejected) are called out because that is the dangerous direction: a lenient judge waves bad outputs into production. Clusters below `--min-agreement` (default 80%) are flagged for rubric review, and the full labels plus stats are written to `calibration.json`.
|
|
175
|
+
|
|
176
|
+
## Scripting with --json
|
|
177
|
+
|
|
178
|
+
`ingest`, `analyze`, `generate`, and `run` accept `--json`: human-readable output is suppressed and a single JSON object is printed to stdout. `run` still exits nonzero on failures.
|
|
179
|
+
|
|
180
|
+
```bash
|
|
181
|
+
traceval analyze traces.db --json | python -m json.tool
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
## GitHub Action
|
|
185
|
+
|
|
186
|
+
Gate deploys on your generated eval suite. The action installs traceval, runs the suite, and fails the job on any regression:
|
|
187
|
+
|
|
188
|
+
```yaml
|
|
189
|
+
jobs:
|
|
190
|
+
agent-evals:
|
|
191
|
+
runs-on: ubuntu-latest
|
|
192
|
+
steps:
|
|
193
|
+
- uses: actions/checkout@v4
|
|
194
|
+
- uses: theramkm/traceval@v0.2.2
|
|
195
|
+
with:
|
|
196
|
+
evals-dir: evals/
|
|
197
|
+
target: myapp.agent:invoke_agent # or an HTTP URL
|
|
198
|
+
judge: fake # offline; 'openai' needs an API key
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
Inputs: `evals-dir` and `target` (required); `judge`, `compare`, `only`, `runs-dir`, `traceval-version`, `python-version` (optional). For a real LLM judge, set `judge: openai` and pass `OPENAI_API_KEY` (or `GEMINI_API_KEY`) via `env:` from your repository secrets.
|
|
202
|
+
|
|
203
|
+
## Development
|
|
204
|
+
|
|
205
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for setup.
|
|
206
|
+
Run the test suite with `make test`, the lint and type-check gates with `make lint`, and both together with `make all`.
|
|
207
|
+
|
|
208
|
+
## Honest Limitations
|
|
209
|
+
|
|
210
|
+
* **Side-Effect Free**: traceval assertions evaluate input/output matches. It does not attempt to replay side effects (e.g., updating database records) on mock tools.
|
|
211
|
+
* **Text Telemetry**: The canonical model is optimized for text logs. Image or multimodal payloads in traces are logged as references.
|
|
212
|
+
* **Static Visualization**: The coverage report is a portable, single-file HTML page. There is no hosted web service.
|
traceval-0.2.2/README.md
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# traceval: Trace-to-Eval Compiler
|
|
2
|
+
|
|
3
|
+
<p align="center">
|
|
4
|
+
<img src="https://img.shields.io/badge/Python-3.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python Version" />
|
|
5
|
+
<img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="License" />
|
|
6
|
+
</p>
|
|
7
|
+
|
|
8
|
+
***"Your traces already know how your agent fails. traceval turns them into the test suite you never wrote."***
|
|
9
|
+
|
|
10
|
+
Teams running LLM agents in production have observability traces, but only a fraction maintain evals. The raw material for good tests, thousands of real traces full of edge cases and errors, sits unused because turning it into a regression suite is manual and tedious.
|
|
11
|
+
|
|
12
|
+
traceval ingests agent traces from standard sources, normalizes them into a canonical Pydantic model, labels outcomes, clusters task shapes, and compiles the result into a human-editable eval suite: YAML cases, a pytest harness, and judge rubric scaffolds.
|
|
13
|
+
|
|
14
|
+

|
|
15
|
+
|
|
16
|
+
## Quickstart
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install traceval
|
|
20
|
+
traceval demo
|
|
21
|
+
open traceval-demo/analysis/report.html # xdg-open on Linux
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
`traceval demo` runs the entire loop against a built-in demo agent: it generates 200 synthetic traces, ingests them, clusters the failures, compiles an eval suite, and then proves the headline claim by running that suite twice:
|
|
25
|
+
|
|
26
|
+
```text
|
|
27
|
+
=== Demo complete: healthy agent PASSED, buggy agent FAILED ===
|
|
28
|
+
Failure-cluster report: traceval-demo/analysis/report.html
|
|
29
|
+
Run report: traceval-demo/evals/runs/run_20260702T072029851802Z.json
|
|
30
|
+
Run report: traceval-demo/evals/runs/run_20260702T072030171406Z.json
|
|
31
|
+
|
|
32
|
+
Re-run any stage manually:
|
|
33
|
+
traceval ingest traceval-demo/synthetic_traces.jsonl -o traceval-demo/traces.db
|
|
34
|
+
traceval analyze traceval-demo/traces.db -o traceval-demo/analysis
|
|
35
|
+
traceval generate traceval-demo/traces.db -o traceval-demo/evals --include-failures
|
|
36
|
+
traceval run traceval-demo/evals --target traceval.demo.agent:invoke_agent --judge fake
|
|
37
|
+
traceval calibrate traceval-demo/evals/runs/run_20260702T072030171406Z.json
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## How it works
|
|
41
|
+
|
|
42
|
+
```mermaid
|
|
43
|
+
graph LR
|
|
44
|
+
A[OTel / Langfuse / LangSmith traces] --> B[Canonical trace DB]
|
|
45
|
+
B --> C[Label and cluster]
|
|
46
|
+
C --> D[YAML cases + pytest + rubrics]
|
|
47
|
+
D --> E[Run, diff, calibrate]
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Features
|
|
51
|
+
|
|
52
|
+
* Ingests OpenTelemetry GenAI, Langfuse, and LangSmith exports, plus generic JSONL. Malformed lines are logged as warnings instead of crashing the run (tested against corrupt fixtures in `tests/fixtures/`).
|
|
53
|
+
* Labels every trace with a rule-based outcome taxonomy (`success`, `tool_error`, `validation_error`, `loop`, `timeout`, `bad_output`) that you can extend with your own Python rules via `--rules`.
|
|
54
|
+
* Clusters task shapes with Jaccard shingle similarity, fully offline: no embeddings, no API calls. Numeric tokens are normalized, so "order 57978" and "order 12345" land in the same cluster.
|
|
55
|
+
* Deterministic generation: regenerating a suite from the same database is byte-identical, so evals diff cleanly in git.
|
|
56
|
+
* Regression cases are inverted: a failure trace asserts the failure does *not* recur (forbidden error signatures, tool-loop bounds, non-empty output), never that the agent reproduces it.
|
|
57
|
+
* Redacts emails, credit cards, phone numbers, and API tokens before case inputs are written (add your own scrubber with `--redact-hook`).
|
|
58
|
+
* `traceval run` exits nonzero on any failing case and diffs against a previous report with `--compare`, so CI can gate deploys on it.
|
|
59
|
+
* `traceval calibrate` measures judge-vs-human agreement per cluster and flags rubrics the automated judge scores unreliably.
|
|
60
|
+
|
|
61
|
+
## Walkthrough on your own traces
|
|
62
|
+
|
|
63
|
+
The command outputs below are real, captured from a run over the demo trace set (regenerate them with `scripts/readme-outputs.sh`).
|
|
64
|
+
|
|
65
|
+
### 1. Ingest
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
traceval ingest traces.jsonl -o traces.db # --format auto|otel|langfuse|langsmith|generic
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
```text
|
|
72
|
+
Ingested 200 traces (209 spans).
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Malformed spans do not abort the ingest; warnings are written to `<traces.db>.log`.
|
|
76
|
+
|
|
77
|
+
### 2. Analyze
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
traceval analyze traces.db -o analysis
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
```text
|
|
84
|
+
Outcomes: success 60% · tool_error 15% · loop 10% · timeout 8% · validation_error 8%
|
|
85
|
+
Clusters: 8 task clusters found.
|
|
86
|
+
Top failure cluster: "refund stripe -> stripe_lookup -> (tool_error)" (30 traces)
|
|
87
|
+
Report written to analysis/report.html
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
`analysis/report.html` is the single-file page shown in the screenshot above. Pass `--evals evals/` to overlay eval coverage per cluster, and `--rules my_rules.py` to add your own labeling rules. To view it over HTTP instead of `file://`, `traceval serve analysis` starts a stdlib localhost server and prints the report URL.
|
|
91
|
+
|
|
92
|
+
### 3. Generate
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
traceval generate traces.db -o evals --include-failures
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
```text
|
|
99
|
+
Wrote 8 eval cases across 8 clusters → evals/cases/*.yaml
|
|
100
|
+
Wrote judge rubrics → evals/rubrics/*.md
|
|
101
|
+
Wrote pytest harness → evals/test_generated.py, evals/conftest.py
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Every case is a reviewable YAML file. Golden cases assert the recorded successful behavior. Regression cases, generated from failure traces, assert the failure does **not** recur: forbidden error tokens (word-boundary matched, filtered against tokens that success traces also use), tool-loop bounds, and non-empty output. A regression case passes for any agent that avoids that failure mode; golden cases carry general bug detection.
|
|
105
|
+
|
|
106
|
+
### 4. Run
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
traceval run evals --target myapp.agent:invoke_agent --judge fake
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
```text
|
|
113
|
+
traceval Run Summary
|
|
114
|
+
┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┓
|
|
115
|
+
┃ Case ID ┃ Cluster ┃ Outcome ┃ Latency (ms) ┃
|
|
116
|
+
┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━┩
|
|
117
|
+
│ c_0c422a7a__case_001 │ c_0c422a7a │ PASS │ 0.0 │
|
|
118
|
+
│ c_1e5d0942__case_002 │ c_1e5d0942 │ PASS │ 0.0 │
|
|
119
|
+
│ c_2c881177__case_003 │ c_2c881177 │ PASS │ 0.0 │
|
|
120
|
+
│ c_361535b0__case_004 │ c_361535b0 │ PASS │ 0.0 │
|
|
121
|
+
│ c_9a8a4644__case_005 │ c_9a8a4644 │ PASS │ 0.0 │
|
|
122
|
+
│ c_d30af83a__case_006 │ c_d30af83a │ PASS │ 0.0 │
|
|
123
|
+
│ c_d3f3b631__case_007 │ c_d3f3b631 │ PASS │ 0.0 │
|
|
124
|
+
│ c_e834c13c__case_008 │ c_e834c13c │ PASS │ 0.0 │
|
|
125
|
+
└──────────────────────┴────────────┴─────────┴──────────────┘
|
|
126
|
+
Total: 8 | Passed: 8 | Failed: 0 | Errored: 0
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
The target is an HTTP URL or a `module:function` callable. Checks cover `exact`, `contains_any`, `not_contains`, `regex`, `json_schema`, `tool_sequence`, `no_tool_loop`, and `judge`. Run reports land in `<evals_dir>/runs/` (override with `--runs-dir`); pass `--compare <previous report>` to print regressions and improvements between runs. The exit code is nonzero when any case fails.
|
|
130
|
+
|
|
131
|
+
### 5. Calibrate the judge
|
|
132
|
+
|
|
133
|
+
An LLM judge is only as trustworthy as its agreement with human judgment. `calibrate` samples judge-scored results from a run report and presents each agent output for blind pass/fail labeling in the terminal; judge verdicts stay hidden until the end so they cannot anchor you.
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
traceval calibrate evals/runs/run_<timestamp>.json --sample 8
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
```text
|
|
140
|
+
Judge Calibration Summary
|
|
141
|
+
┏━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━┓
|
|
142
|
+
┃ Cluster ┃ Labeled ┃ Agreement ┃
|
|
143
|
+
┡━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━┩
|
|
144
|
+
│ c_0c422a7a │ 1 │ 100% │
|
|
145
|
+
│ c_1e5d0942 │ 1 │ 100% │
|
|
146
|
+
│ c_2c881177 │ 1 │ 100% │
|
|
147
|
+
│ c_361535b0 │ 1 │ 100% │
|
|
148
|
+
│ c_9a8a4644 │ 1 │ 0% │
|
|
149
|
+
│ c_d30af83a │ 1 │ 100% │
|
|
150
|
+
│ c_d3f3b631 │ 1 │ 100% │
|
|
151
|
+
│ c_e834c13c │ 1 │ 100% │
|
|
152
|
+
└────────────┴─────────┴───────────┘
|
|
153
|
+
Overall agreement: 88% on 8 case(s) | false-pass (judge OK, human not): 1 | false-fail: 0
|
|
154
|
+
WARNING: Judge unreliable (< 80% agreement) for clusters: c_9a8a4644. Review their rubrics before trusting automated scores.
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
False-pass counts (judge approved, human rejected) are called out because that is the dangerous direction: a lenient judge waves bad outputs into production. Clusters below `--min-agreement` (default 80%) are flagged for rubric review, and the full labels plus stats are written to `calibration.json`.
|
|
158
|
+
|
|
159
|
+
## Scripting with --json
|
|
160
|
+
|
|
161
|
+
`ingest`, `analyze`, `generate`, and `run` accept `--json`: human-readable output is suppressed and a single JSON object is printed to stdout. `run` still exits nonzero on failures.
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
traceval analyze traces.db --json | python -m json.tool
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## GitHub Action
|
|
168
|
+
|
|
169
|
+
Gate deploys on your generated eval suite. The action installs traceval, runs the suite, and fails the job on any regression:
|
|
170
|
+
|
|
171
|
+
```yaml
|
|
172
|
+
jobs:
|
|
173
|
+
agent-evals:
|
|
174
|
+
runs-on: ubuntu-latest
|
|
175
|
+
steps:
|
|
176
|
+
- uses: actions/checkout@v4
|
|
177
|
+
- uses: theramkm/traceval@v0.2.2
|
|
178
|
+
with:
|
|
179
|
+
evals-dir: evals/
|
|
180
|
+
target: myapp.agent:invoke_agent # or an HTTP URL
|
|
181
|
+
judge: fake # offline; 'openai' needs an API key
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
Inputs: `evals-dir` and `target` (required); `judge`, `compare`, `only`, `runs-dir`, `traceval-version`, `python-version` (optional). For a real LLM judge, set `judge: openai` and pass `OPENAI_API_KEY` (or `GEMINI_API_KEY`) via `env:` from your repository secrets.
|
|
185
|
+
|
|
186
|
+
## Development
|
|
187
|
+
|
|
188
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for setup.
|
|
189
|
+
Run the test suite with `make test` and the full gate set with `make lint`.
|
|
190
|
+
|
|
191
|
+
## Honest Limitations
|
|
192
|
+
|
|
193
|
+
* **Side-Effect Free**: traceval assertions only compare inputs and outputs. traceval does not attempt to replay side effects (e.g., updating database records) on mock tools.
|
|
194
|
+
* **Text Telemetry**: The canonical model is optimized for text logs. Image and other multimodal payloads in traces are logged as references.
|
|
195
|
+
* **Static Visualization**: The coverage report is a portable, single-file HTML page. There is no hosted web service.
|
|
Binary file
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# traceval e2e quickstart: the whole loop now lives in `traceval demo`
|
|
3
|
+
# (generate traces -> ingest -> analyze -> generate evals -> run healthy
|
|
4
|
+
# agent, must pass -> run buggy agent, must fail).
|
|
5
|
+
set -e
|
|
6
|
+
|
|
7
|
+
uv run traceval demo -o traceval-demo --force "$@"
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Thin wrapper: the demo agent now ships inside the package so
|
|
2
|
+
`traceval demo` works from a plain pip install. Kept so existing
|
|
3
|
+
`--target examples.demo_agent.core:invoke_agent` invocations still work.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from traceval.demo.agent import invoke_agent, run_agent_logic
|
|
7
|
+
|
|
8
|
+
__all__ = ["invoke_agent", "run_agent_logic"]
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Thin wrapper: the trace generator now ships inside the package
|
|
2
|
+
(traceval.demo.traces) so `traceval demo` works from a plain pip install.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from traceval.demo.traces import generate_traces_file
|
|
8
|
+
|
|
9
|
+
if __name__ == "__main__":
|
|
10
|
+
out = Path(__file__).parent / "synthetic_traces.jsonl"
|
|
11
|
+
generate_traces_file(out)
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# Regenerates every sample-output block in README.md, plus
|
|
3
|
+
# docs/img/report.png, from real command runs so the docs cannot drift.
|
|
4
|
+
# Run from anywhere; paste the printed sections into README.md verbatim.
|
|
5
|
+
set -e
|
|
6
|
+
|
|
7
|
+
ROOT=$(cd "$(dirname "$0")/.." && pwd)
|
|
8
|
+
WORK=$(mktemp -d)
|
|
9
|
+
trap 'rm -rf "$WORK"' EXIT
|
|
10
|
+
cd "$WORK"
|
|
11
|
+
|
|
12
|
+
tv() { uv run --project "$ROOT" traceval "$@"; }
|
|
13
|
+
|
|
14
|
+
uv run --project "$ROOT" python3 -c "
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from traceval.demo.traces import generate_traces_file
|
|
17
|
+
generate_traces_file(Path('traces.jsonl'))
|
|
18
|
+
" >/dev/null
|
|
19
|
+
|
|
20
|
+
echo "=== ingest ==="
|
|
21
|
+
tv ingest traces.jsonl -o traces.db
|
|
22
|
+
|
|
23
|
+
echo
|
|
24
|
+
echo "=== analyze ==="
|
|
25
|
+
tv analyze traces.db -o analysis
|
|
26
|
+
|
|
27
|
+
echo
|
|
28
|
+
echo "=== generate ==="
|
|
29
|
+
tv generate traces.db -o evals --include-failures
|
|
30
|
+
|
|
31
|
+
echo
|
|
32
|
+
echo "=== run (healthy demo agent) ==="
|
|
33
|
+
tv run evals --target traceval.demo.agent:invoke_agent --judge fake || true
|
|
34
|
+
|
|
35
|
+
echo
|
|
36
|
+
echo "=== calibrate (example labels: 7x pass, 1x fail) ==="
|
|
37
|
+
REPORT=$(ls -t evals/runs/run_*.json | head -1)
|
|
38
|
+
printf 'y\ny\ny\ny\nn\ny\ny\ny\n' | tv calibrate "$REPORT" --sample 8 | tail -16
|
|
39
|
+
|
|
40
|
+
echo
|
|
41
|
+
echo "=== screenshot -> docs/img/report.png ==="
|
|
42
|
+
# Re-analyze with --evals so the report shows populated eval coverage
|
|
43
|
+
tv analyze traces.db --evals evals -o analysis >/dev/null
|
|
44
|
+
CHROME="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
|
|
45
|
+
if [ -x "$CHROME" ]; then
|
|
46
|
+
"$CHROME" --headless --disable-gpu \
|
|
47
|
+
--screenshot="$ROOT/docs/img/report.png" \
|
|
48
|
+
--window-size=1280,860 --hide-scrollbars \
|
|
49
|
+
"file://$WORK/analysis/report.html" 2>/dev/null
|
|
50
|
+
echo "wrote $ROOT/docs/img/report.png"
|
|
51
|
+
else
|
|
52
|
+
echo "Chrome not found; screenshot skipped"
|
|
53
|
+
fi
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.2"
|