traceval 0.2.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. {traceval-0.2.0 → traceval-0.2.2}/.github/workflows/ci.yml +35 -10
  2. {traceval-0.2.0 → traceval-0.2.2}/CHANGELOG.md +24 -0
  3. traceval-0.2.2/CONTRIBUTING.md +22 -0
  4. traceval-0.2.2/Makefile +14 -0
  5. traceval-0.2.2/PKG-INFO +212 -0
  6. traceval-0.2.2/README.md +195 -0
  7. traceval-0.2.2/docs/img/report.png +0 -0
  8. traceval-0.2.2/examples/demo.sh +7 -0
  9. traceval-0.2.2/examples/demo_agent/core.py +8 -0
  10. traceval-0.2.2/examples/make_traces.py +11 -0
  11. {traceval-0.2.0 → traceval-0.2.2}/pyproject.toml +1 -1
  12. traceval-0.2.2/scripts/readme-outputs.sh +53 -0
  13. traceval-0.2.2/src/traceval/__init__.py +1 -0
  14. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/analyze/report.py +4 -1
  15. traceval-0.2.2/src/traceval/cli.py +534 -0
  16. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/compile/__init__.py +7 -2
  17. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/compile/cases.py +51 -6
  18. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/compile/templates/conftest.py.jinja +118 -38
  19. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/compile/templates/test_generated.py.jinja +13 -0
  20. traceval-0.2.2/src/traceval/demo/__init__.py +9 -0
  21. traceval-0.2.0/examples/demo_agent/core.py → traceval-0.2.2/src/traceval/demo/agent.py +3 -2
  22. traceval-0.2.0/examples/make_traces.py → traceval-0.2.2/src/traceval/demo/traces.py +5 -10
  23. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/run/scorers.py +7 -2
  24. traceval-0.2.2/tests/test_broken_target.py +90 -0
  25. traceval-0.2.2/tests/test_e2e_demo.py +67 -0
  26. traceval-0.2.2/tests/test_json_output.py +112 -0
  27. {traceval-0.2.0 → traceval-0.2.2}/tests/test_phase6.py +12 -1
  28. {traceval-0.2.0 → traceval-0.2.2}/tests/test_regression_cases.py +102 -0
  29. traceval-0.2.2/tests/test_serve.py +25 -0
  30. {traceval-0.2.0 → traceval-0.2.2}/uv.lock +1 -1
  31. traceval-0.2.0/PKG-INFO +0 -200
  32. traceval-0.2.0/README.md +0 -183
  33. traceval-0.2.0/examples/demo.sh +0 -31
  34. traceval-0.2.0/src/traceval/__init__.py +0 -1
  35. traceval-0.2.0/src/traceval/cli.py +0 -284
  36. traceval-0.2.0/tests/test_e2e_demo.py +0 -99
  37. {traceval-0.2.0 → traceval-0.2.2}/.gitignore +0 -0
  38. {traceval-0.2.0 → traceval-0.2.2}/.pre-commit-config.yaml +0 -0
  39. {traceval-0.2.0 → traceval-0.2.2}/LICENSE +0 -0
  40. {traceval-0.2.0 → traceval-0.2.2}/action.yml +0 -0
  41. {traceval-0.2.0 → traceval-0.2.2}/docs/formats.md +0 -0
  42. {traceval-0.2.0 → traceval-0.2.2}/examples/demo_agent/agent.py +0 -0
  43. {traceval-0.2.0 → traceval-0.2.2}/examples/synthetic_traces.jsonl +0 -0
  44. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/analyze/__init__.py +0 -0
  45. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/analyze/cluster.py +0 -0
  46. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/analyze/coverage.py +0 -0
  47. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/analyze/outcomes.py +0 -0
  48. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/compile/emit_pytest.py +0 -0
  49. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/compile/emit_yaml.py +0 -0
  50. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/compile/rubrics.py +0 -0
  51. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/ingest/__init__.py +0 -0
  52. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/ingest/base.py +0 -0
  53. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/ingest/generic.py +0 -0
  54. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/ingest/langfuse.py +0 -0
  55. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/ingest/langsmith.py +0 -0
  56. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/ingest/otel.py +0 -0
  57. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/model.py +0 -0
  58. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/run/calibrate.py +0 -0
  59. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/run/judge.py +0 -0
  60. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/run/runner.py +0 -0
  61. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/run/target.py +0 -0
  62. {traceval-0.2.0 → traceval-0.2.2}/src/traceval/store.py +0 -0
  63. {traceval-0.2.0 → traceval-0.2.2}/tests/fixtures/README.md +0 -0
  64. {traceval-0.2.0 → traceval-0.2.2}/tests/fixtures/generic_traces.jsonl +0 -0
  65. {traceval-0.2.0 → traceval-0.2.2}/tests/fixtures/langfuse_export.jsonl +0 -0
  66. {traceval-0.2.0 → traceval-0.2.2}/tests/fixtures/langsmith_runs.jsonl +0 -0
  67. {traceval-0.2.0 → traceval-0.2.2}/tests/fixtures/otel_spans.jsonl +0 -0
  68. {traceval-0.2.0 → traceval-0.2.2}/tests/test_calibrate.py +0 -0
  69. {traceval-0.2.0 → traceval-0.2.2}/tests/test_cli.py +0 -0
  70. {traceval-0.2.0 → traceval-0.2.2}/tests/test_phase1.py +0 -0
  71. {traceval-0.2.0 → traceval-0.2.2}/tests/test_phase2.py +0 -0
  72. {traceval-0.2.0 → traceval-0.2.2}/tests/test_phase3.py +0 -0
  73. {traceval-0.2.0 → traceval-0.2.2}/tests/test_phase4.py +0 -0
  74. {traceval-0.2.0 → traceval-0.2.2}/tests/test_phase5.py +0 -0
@@ -34,6 +34,33 @@ jobs:
34
34
  - name: Run Tests with Coverage
35
35
  run: uv run pytest --cov=src/traceval --cov-fail-under=85
36
36
 
37
+ demo-wheel:
38
+ # The pip-user path is what ships: build the wheel, install it in a
39
+ # clean venv, and run the packaged demo from an empty directory.
40
+ needs: test
41
+ runs-on: ubuntu-latest
42
+ steps:
43
+ - uses: actions/checkout@v4
44
+
45
+ - name: Set up Python
46
+ uses: actions/setup-python@v5
47
+ with:
48
+ python-version: "3.12"
49
+
50
+ - name: Build wheel
51
+ run: |
52
+ pip install build
53
+ python -m build --wheel
54
+
55
+ - name: Install wheel in a clean venv and run the demo
56
+ run: |
57
+ python -m venv /tmp/wheel-venv
58
+ /tmp/wheel-venv/bin/pip install dist/*.whl
59
+ mkdir /tmp/demo-smoke
60
+ cd /tmp/demo-smoke
61
+ /tmp/wheel-venv/bin/traceval demo
62
+ /tmp/wheel-venv/bin/traceval analyze traceval-demo/traces.db --json | python -m json.tool
63
+
37
64
  demo-action:
38
65
  # Dogfoods action.yml against the demo agent: the only CI coverage the
39
66
  # user-facing GitHub Action gets.
@@ -50,15 +77,13 @@ jobs:
50
77
  - name: Generate demo eval suite
51
78
  run: |
52
79
  pip install .
53
- python examples/make_traces.py
54
- traceval ingest examples/synthetic_traces.jsonl -o demo_traces.db
55
- traceval generate demo_traces.db -o demo_evals/ --include-failures
80
+ traceval demo -o demo-artifacts
56
81
 
57
82
  - name: Run evals via the action (healthy agent must pass)
58
83
  uses: ./
59
84
  with:
60
- evals-dir: demo_evals
61
- target: examples.demo_agent.core:invoke_agent
85
+ evals-dir: demo-artifacts/evals
86
+ target: traceval.demo.agent:invoke_agent
62
87
  traceval-version: local
63
88
 
64
89
  - name: Run evals via the action (buggy agent must fail)
@@ -68,20 +93,20 @@ jobs:
68
93
  env:
69
94
  BUGGY: "true"
70
95
  with:
71
- evals-dir: demo_evals
72
- target: examples.demo_agent.core:invoke_agent
96
+ evals-dir: demo-artifacts/evals
97
+ target: traceval.demo.agent:invoke_agent
73
98
  traceval-version: local
74
99
 
75
100
  - name: Assert buggy run failed
76
101
  run: |
77
102
  if [ "${{ steps.buggy.outcome }}" != "failure" ]; then
78
- echo "Buggy agent unexpectedly passed the eval suite"
103
+ echo "Buggy agent unexpectedly passed the eval suite"
79
104
  exit 1
80
105
  fi
81
- echo "Action correctly failed the job for the buggy agent"
106
+ echo "Action correctly failed the job for the buggy agent"
82
107
 
83
108
  tag-and-release:
84
- needs: [test, demo-action]
109
+ needs: [test, demo-wheel, demo-action]
85
110
  if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/master')
86
111
  runs-on: ubuntu-latest
87
112
  permissions:
@@ -5,6 +5,30 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.2.2] - 2026-07-02
9
+
10
+ ### Fixed
11
+ - A run in which zero cases execute (e.g. unresolvable target) now writes a self-describing run report with an `errors` section instead of writing nothing; `--json` never reports `null`. Added a single clear top-level error line on target-resolution failure. Added a `Makefile` (`make test`) and `CONTRIBUTING.md` documenting the canonical dev commands. Reported via external review of a failed-target invocation.
12
+ - Run report schema additions (existing fields unchanged): `summary.errored` counts cases that never executed due to setup/collection errors; top-level `errors` lists `{stage, detail}` entries for `target_resolution`, `collection`, and `setup` failures, with identical details deduplicated into one entry carrying a `count`; `results` is `[]` rather than absent when nothing executed. The terminal summary now shows `Errored: n`.
13
+
14
+ ## [0.2.1] - 2026-07-02
15
+
16
+ ### Added
17
+ - `--json` on `ingest`, `analyze`, `generate`, and `run`: suppresses human-readable output and prints a single JSON object to stdout for scripting (`run` still exits nonzero on failures). The README previously claimed this flag existed; now it does.
18
+ - `traceval demo`: runs the full trace-to-eval loop end-to-end with a built-in demo agent from a plain pip install (no repo clone). Creates `./traceval-demo/` (override with `-o`), refuses to write into a non-empty directory unless `--force`, and `--force` only ever replaces the demo's own artifacts. The demo agent and trace generator moved into the package (`traceval.demo`); `examples/` keeps thin wrappers.
19
+ - `traceval serve [dir]`: serves the analysis report directory on localhost with Python's stdlib http.server and prints the report URL. Not a web UI.
20
+ - Generated `test_generated.py` opens with the three commands a new teammate needs; the run summary ends with a `traceval calibrate` hint for the report just written.
21
+ - CI: wheel-based demo smoke job (build wheel, install into a clean venv, run `traceval demo` from an empty directory) so the pip-user path is what CI tests. Releases now gate on it.
22
+
23
+ ### Changed
24
+ - Failure-signature tokens for `not_contains` checks are now distinctiveness-filtered: a token qualifies only if it appears in fewer than 10% of success outputs (same-cluster successes preferred, all successes in the db as fallback). If no token survives, the check is omitted rather than emitting a junk forbidden list.
25
+ - `not_contains` matching is word-boundary based and case-insensitive instead of raw substring: forbidding "error" no longer false-fails a healthy "no errors found".
26
+ - README rewritten: real command outputs (regenerable via `scripts/readme-outputs.sh`), a live screenshot of the analysis report, evidence-based feature claims, plain pipeline diagram, and the GitHub Action example pinned to a release tag.
27
+
28
+ ### Fixed
29
+ - Generated run summary printed a literal `\n` before "traceval Run Summary" (jinja over-escaping in the conftest template).
30
+ - `analysis/report.html` version badge was hardcoded to v0.1.0; it now shows the installed traceval version.
31
+
8
32
  ## [0.2.0] - 2026-07-02
9
33
 
10
34
  ### Added
@@ -0,0 +1,22 @@
1
+ # Contributing to traceval
2
+
3
+ ## Setup
4
+
5
+ ```bash
6
+ git clone https://github.com/theramkm/traceval.git
7
+ cd traceval
8
+ uv sync # installs the package and dev dependencies (same as CI)
9
+ ```
10
+
11
+ ## Development loop
12
+
13
+ ```bash
14
+ make test # pytest -q
15
+ make lint # ruff check, ruff format --check, mypy
16
+ make demo # end-to-end smoke: healthy agent passes, buggy agent fails
17
+ make all # lint + test
18
+ ```
19
+
20
+ CI runs the same commands on Python 3.11, 3.12, and 3.13, plus a
21
+ wheel-based demo smoke job, and enforces 85% coverage. Keep all of it
22
+ green; add a test for every behavior change.
@@ -0,0 +1,14 @@
1
+ .PHONY: test lint demo all
2
+
3
+ test:
4
+ uv run pytest -q
5
+
6
+ lint:
7
+ uv run ruff check src tests examples
8
+ uv run ruff format --check src tests examples
9
+ uv run mypy src/traceval
10
+
11
+ demo:
12
+ uv run traceval demo -o /tmp/traceval-demo --force
13
+
14
+ all: lint test
@@ -0,0 +1,212 @@
1
+ Metadata-Version: 2.4
2
+ Name: traceval
3
+ Version: 0.2.2
4
+ Summary: Trace-to-Eval Compiler
5
+ License: MIT
6
+ License-File: LICENSE
7
+ Requires-Python: >=3.11
8
+ Requires-Dist: httpx>=0.24.0
9
+ Requires-Dist: jinja2>=3.1.0
10
+ Requires-Dist: jsonschema>=4.17.0
11
+ Requires-Dist: pydantic>=2.0.0
12
+ Requires-Dist: pytest>=8.0.0
13
+ Requires-Dist: pyyaml>=6.0.0
14
+ Requires-Dist: rich>=13.0.0
15
+ Requires-Dist: typer>=0.9.0
16
+ Description-Content-Type: text/markdown
17
+
18
+ # traceval: Trace-to-Eval Compiler
19
+
20
+ <p align="center">
21
+ <img src="https://img.shields.io/badge/Python-3.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python Version" />
22
+ <img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="License" />
23
+ </p>
24
+
25
+ ***"Your traces already know how your agent fails. traceval turns them into the test suite you never wrote."***
26
+
27
+ Teams running LLM agents in production have observability traces, but only a fraction maintain evals. The raw material for good tests, thousands of real traces full of edge cases and errors, sits unused because turning it into a regression suite is manual and tedious.
28
+
29
+ traceval ingests agent traces from standard sources, normalizes them into a canonical Pydantic model, labels outcomes, clusters task shapes, and compiles the result into a human-editable eval suite: YAML cases, a pytest harness, and judge rubric scaffolds.
30
+
31
+ ![Failure-cluster coverage report generated by traceval analyze](docs/img/report.png)
32
+
33
+ ## Quickstart
34
+
35
+ ```bash
36
+ pip install traceval
37
+ traceval demo
38
+ open traceval-demo/analysis/report.html # xdg-open on Linux
39
+ ```
40
+
41
+ `traceval demo` runs the entire loop against a built-in demo agent: it generates 200 synthetic traces, ingests them, clusters the failures, compiles an eval suite, and then proves the headline claim by running that suite twice:
42
+
43
+ ```text
44
+ === Demo complete: healthy agent PASSED, buggy agent FAILED ===
45
+ Failure-cluster report: traceval-demo/analysis/report.html
46
+ Run report: traceval-demo/evals/runs/run_20260702T072029851802Z.json
47
+ Run report: traceval-demo/evals/runs/run_20260702T072030171406Z.json
48
+
49
+ Re-run any stage manually:
50
+ traceval ingest traceval-demo/synthetic_traces.jsonl -o traceval-demo/traces.db
51
+ traceval analyze traceval-demo/traces.db -o traceval-demo/analysis
52
+ traceval generate traceval-demo/traces.db -o traceval-demo/evals --include-failures
53
+ traceval run traceval-demo/evals --target traceval.demo.agent:invoke_agent --judge fake
54
+ traceval calibrate traceval-demo/evals/runs/run_20260702T072030171406Z.json
55
+ ```
56
+
57
+ ## How it works
58
+
59
+ ```mermaid
60
+ graph LR
61
+ A[OTel / Langfuse / LangSmith traces] --> B[Canonical trace DB]
62
+ B --> C[Label and cluster]
63
+ C --> D[YAML cases + pytest + rubrics]
64
+ D --> E[Run, diff, calibrate]
65
+ ```
66
+
67
+ ## Features
68
+
69
+ * Ingests OpenTelemetry GenAI, Langfuse, and LangSmith exports, plus generic JSONL. Malformed lines are logged as warnings instead of crashing the run (tested against corrupt fixtures in `tests/fixtures/`).
70
+ * Labels every trace with a rule-based outcome taxonomy (`success`, `tool_error`, `validation_error`, `loop`, `timeout`, `bad_output`) that you can extend with your own Python rules via `--rules`.
71
+ * Clusters task shapes with Jaccard shingle similarity, fully offline: no embeddings, no API calls. Numeric tokens are normalized, so "order 57978" and "order 12345" land in the same cluster.
72
+ * Deterministic generation: regenerating a suite from the same database is byte-identical, so evals diff cleanly in git.
73
+ * Regression cases are inverted: a failure trace asserts the failure does *not* recur (forbidden error signatures, tool-loop bounds, non-empty output), never that the agent reproduces it.
74
+ * Redacts emails, credit cards, phone numbers, and API tokens before case inputs are written (add your own scrubber with `--redact-hook`).
75
+ * `traceval run` exits nonzero on any failing case and diffs against a previous report with `--compare`, so CI can gate deploys on it.
76
+ * `traceval calibrate` measures judge-vs-human agreement per cluster and flags rubrics the automated judge scores unreliably.
77
+
78
+ ## Walkthrough on your own traces
79
+
80
+ The command outputs below are real, captured from a run over the demo trace set (regenerate them with `scripts/readme-outputs.sh`).
81
+
82
+ ### 1. Ingest
83
+
84
+ ```bash
85
+ traceval ingest traces.jsonl -o traces.db # --format auto|otel|langfuse|langsmith|generic
86
+ ```
87
+
88
+ ```text
89
+ Ingested 200 traces (209 spans).
90
+ ```
91
+
92
+ Malformed spans do not abort the ingest; warnings are written to `<traces.db>.log`.
93
+
94
+ ### 2. Analyze
95
+
96
+ ```bash
97
+ traceval analyze traces.db -o analysis
98
+ ```
99
+
100
+ ```text
101
+ Outcomes: success 60% · tool_error 15% · loop 10% · timeout 8% · validation_error 8%
102
+ Clusters: 8 task clusters found.
103
+ Top failure cluster: "refund stripe -> stripe_lookup -> (tool_error)" (30 traces)
104
+ Report written to analysis/report.html
105
+ ```
106
+
107
+ `analysis/report.html` is the single-file page shown in the screenshot above. Pass `--evals evals/` to overlay eval coverage per cluster, and `--rules my_rules.py` to add your own labeling rules. To view it over HTTP instead of `file://`, `traceval serve analysis` starts a stdlib localhost server and prints the report URL.
108
+
109
+ ### 3. Generate
110
+
111
+ ```bash
112
+ traceval generate traces.db -o evals --include-failures
113
+ ```
114
+
115
+ ```text
116
+ Wrote 8 eval cases across 8 clusters → evals/cases/*.yaml
117
+ Wrote judge rubrics → evals/rubrics/*.md
118
+ Wrote pytest harness → evals/test_generated.py, evals/conftest.py
119
+ ```
120
+
121
+ Every case is a reviewable YAML file. Golden cases assert the recorded successful behavior. Regression cases, generated from failure traces, assert the failure does **not** recur: forbidden error tokens (word-boundary matched, filtered against tokens that success traces also use), tool-loop bounds, and non-empty output. A regression case passes for any agent that avoids that failure mode; golden cases carry general bug detection.
122
+
123
+ ### 4. Run
124
+
125
+ ```bash
126
+ traceval run evals --target myapp.agent:invoke_agent --judge fake
127
+ ```
128
+
129
+ ```text
130
+ traceval Run Summary
131
+ ┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┓
132
+ ┃ Case ID ┃ Cluster ┃ Outcome ┃ Latency (ms) ┃
133
+ ┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━┩
134
+ │ c_0c422a7a__case_001 │ c_0c422a7a │ PASS │ 0.0 │
135
+ │ c_1e5d0942__case_002 │ c_1e5d0942 │ PASS │ 0.0 │
136
+ │ c_2c881177__case_003 │ c_2c881177 │ PASS │ 0.0 │
137
+ │ c_361535b0__case_004 │ c_361535b0 │ PASS │ 0.0 │
138
+ │ c_9a8a4644__case_005 │ c_9a8a4644 │ PASS │ 0.0 │
139
+ │ c_d30af83a__case_006 │ c_d30af83a │ PASS │ 0.0 │
140
+ │ c_d3f3b631__case_007 │ c_d3f3b631 │ PASS │ 0.0 │
141
+ │ c_e834c13c__case_008 │ c_e834c13c │ PASS │ 0.0 │
142
+ └──────────────────────┴────────────┴─────────┴──────────────┘
143
+ Total: 8 | Passed: 8 | Failed: 0 | Errored: 0
144
+ ```
145
+
146
+ The target is an HTTP URL or a `module:function` callable. Checks cover `exact`, `contains_any`, `not_contains`, `regex`, `json_schema`, `tool_sequence`, `no_tool_loop`, and `judge`. Run reports land in `<evals_dir>/runs/` (override with `--runs-dir`); pass `--compare <previous report>` to print regressions and improvements between runs. The exit code is nonzero when any case fails.
147
+
148
+ ### 5. Calibrate the judge
149
+
150
+ An LLM judge is only as trustworthy as its agreement with human judgment. `calibrate` samples judge-scored results from a run report and presents each agent output for blind pass/fail labeling in the terminal; judge verdicts stay hidden until the end so they cannot anchor you.
151
+
152
+ ```bash
153
+ traceval calibrate evals/runs/run_<timestamp>.json --sample 8
154
+ ```
155
+
156
+ ```text
157
+ Judge Calibration Summary
158
+ ┏━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━┓
159
+ ┃ Cluster ┃ Labeled ┃ Agreement ┃
160
+ ┡━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━┩
161
+ │ c_0c422a7a │ 1 │ 100% │
162
+ │ c_1e5d0942 │ 1 │ 100% │
163
+ │ c_2c881177 │ 1 │ 100% │
164
+ │ c_361535b0 │ 1 │ 100% │
165
+ │ c_9a8a4644 │ 1 │ 0% │
166
+ │ c_d30af83a │ 1 │ 100% │
167
+ │ c_d3f3b631 │ 1 │ 100% │
168
+ │ c_e834c13c │ 1 │ 100% │
169
+ └────────────┴─────────┴───────────┘
170
+ Overall agreement: 88% on 8 case(s) | false-pass (judge OK, human not): 1 | false-fail: 0
171
+ WARNING: Judge unreliable (< 80% agreement) for clusters: c_9a8a4644. Review their rubrics before trusting automated scores.
172
+ ```
173
+
174
+ False-pass counts (judge approved, human rejected) are called out because that is the dangerous direction: a lenient judge waves bad outputs into production. Clusters below `--min-agreement` (default 80%) are flagged for rubric review, and the full labels plus stats are written to `calibration.json`.
175
+
176
+ ## Scripting with --json
177
+
178
+ `ingest`, `analyze`, `generate`, and `run` accept `--json`: human-readable output is suppressed and a single JSON object is printed to stdout. `run` still exits nonzero on failures.
179
+
180
+ ```bash
181
+ traceval analyze traces.db --json | python -m json.tool
182
+ ```
183
+
184
+ ## GitHub Action
185
+
186
+ Gate deploys on your generated eval suite. The action installs traceval, runs the suite, and fails the job on any regression:
187
+
188
+ ```yaml
189
+ jobs:
190
+ agent-evals:
191
+ runs-on: ubuntu-latest
192
+ steps:
193
+ - uses: actions/checkout@v4
194
+ - uses: theramkm/traceval@v0.2.2
195
+ with:
196
+ evals-dir: evals/
197
+ target: myapp.agent:invoke_agent # or an HTTP URL
198
+ judge: fake # offline; 'openai' needs an API key
199
+ ```
200
+
201
+ Inputs: `evals-dir` and `target` (required); `judge`, `compare`, `only`, `runs-dir`, `traceval-version`, `python-version` (optional). For a real LLM judge, set `judge: openai` and pass `OPENAI_API_KEY` (or `GEMINI_API_KEY`) via `env:` from your repository secrets.
202
+
203
+ ## Development
204
+
205
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for setup.
206
+ Run the test suite with `make test`, the lint and type-check gates with `make lint`, and both together with `make all`.
207
+
208
+ ## Honest Limitations
209
+
210
+ * **Side-Effect Free**: traceval assertions evaluate input/output matches. It does not attempt to replay side effects (e.g., updating database records) on mock tools.
211
+ * **Text Telemetry**: The canonical model is optimized for text logs. Image or multimodal payloads in traces are logged as references.
212
+ * **Static Visualization**: The coverage report is a portable, single-file HTML page. There is no hosted web service.
@@ -0,0 +1,195 @@
1
+ # traceval: Trace-to-Eval Compiler
2
+
3
+ <p align="center">
4
+ <img src="https://img.shields.io/badge/Python-3.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python Version" />
5
+ <img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="License" />
6
+ </p>
7
+
8
+ ***"Your traces already know how your agent fails. traceval turns them into the test suite you never wrote."***
9
+
10
+ Teams running LLM agents in production have observability traces, but only a fraction maintain evals. The raw material for good tests, thousands of real traces full of edge cases and errors, sits unused because turning it into a regression suite is manual and tedious.
11
+
12
+ traceval ingests agent traces from standard sources, normalizes them into a canonical Pydantic model, labels outcomes, clusters task shapes, and compiles the result into a human-editable eval suite: YAML cases, a pytest harness, and judge rubric scaffolds.
13
+
14
+ ![Failure-cluster coverage report generated by traceval analyze](docs/img/report.png)
15
+
16
+ ## Quickstart
17
+
18
+ ```bash
19
+ pip install traceval
20
+ traceval demo
21
+ open traceval-demo/analysis/report.html # xdg-open on Linux
22
+ ```
23
+
24
+ `traceval demo` runs the entire loop against a built-in demo agent: it generates 200 synthetic traces, ingests them, clusters the failures, compiles an eval suite, and then proves the headline claim by running that suite twice:
25
+
26
+ ```text
27
+ === Demo complete: healthy agent PASSED, buggy agent FAILED ===
28
+ Failure-cluster report: traceval-demo/analysis/report.html
29
+ Run report: traceval-demo/evals/runs/run_20260702T072029851802Z.json
30
+ Run report: traceval-demo/evals/runs/run_20260702T072030171406Z.json
31
+
32
+ Re-run any stage manually:
33
+ traceval ingest traceval-demo/synthetic_traces.jsonl -o traceval-demo/traces.db
34
+ traceval analyze traceval-demo/traces.db -o traceval-demo/analysis
35
+ traceval generate traceval-demo/traces.db -o traceval-demo/evals --include-failures
36
+ traceval run traceval-demo/evals --target traceval.demo.agent:invoke_agent --judge fake
37
+ traceval calibrate traceval-demo/evals/runs/run_20260702T072030171406Z.json
38
+ ```
39
+
40
+ ## How it works
41
+
42
+ ```mermaid
43
+ graph LR
44
+ A[OTel / Langfuse / LangSmith traces] --> B[Canonical trace DB]
45
+ B --> C[Label and cluster]
46
+ C --> D[YAML cases + pytest + rubrics]
47
+ D --> E[Run, diff, calibrate]
48
+ ```
49
+
50
+ ## Features
51
+
52
+ * Ingests OpenTelemetry GenAI, Langfuse, and LangSmith exports, plus generic JSONL. Malformed lines are logged as warnings instead of crashing the run (tested against corrupt fixtures in `tests/fixtures/`).
53
+ * Labels every trace with a rule-based outcome taxonomy (`success`, `tool_error`, `validation_error`, `loop`, `timeout`, `bad_output`) that you can extend with your own Python rules via `--rules`.
54
+ * Clusters task shapes with Jaccard shingle similarity, fully offline: no embeddings, no API calls. Numeric tokens are normalized, so "order 57978" and "order 12345" land in the same cluster.
55
+ * Deterministic generation: regenerating a suite from the same database is byte-identical, so evals diff cleanly in git.
56
+ * Regression cases are inverted: a failure trace asserts the failure does *not* recur (forbidden error signatures, tool-loop bounds, non-empty output), never that the agent reproduces it.
57
+ * Redacts emails, credit cards, phone numbers, and API tokens before case inputs are written (add your own scrubber with `--redact-hook`).
58
+ * `traceval run` exits nonzero on any failing case and diffs against a previous report with `--compare`, so CI can gate deploys on it.
59
+ * `traceval calibrate` measures judge-vs-human agreement per cluster and flags rubrics the automated judge scores unreliably.
60
+
61
+ ## Walkthrough on your own traces
62
+
63
+ The command outputs below are real, captured from a run over the demo trace set (regenerate them with `scripts/readme-outputs.sh`).
64
+
65
+ ### 1. Ingest
66
+
67
+ ```bash
68
+ traceval ingest traces.jsonl -o traces.db # --format auto|otel|langfuse|langsmith|generic
69
+ ```
70
+
71
+ ```text
72
+ Ingested 200 traces (209 spans).
73
+ ```
74
+
75
+ Malformed spans do not abort the ingest; warnings are written to `<traces.db>.log`.
76
+
77
+ ### 2. Analyze
78
+
79
+ ```bash
80
+ traceval analyze traces.db -o analysis
81
+ ```
82
+
83
+ ```text
84
+ Outcomes: success 60% · tool_error 15% · loop 10% · timeout 8% · validation_error 8%
85
+ Clusters: 8 task clusters found.
86
+ Top failure cluster: "refund stripe -> stripe_lookup -> (tool_error)" (30 traces)
87
+ Report written to analysis/report.html
88
+ ```
89
+
90
+ `analysis/report.html` is the single-file page shown in the screenshot above. Pass `--evals evals/` to overlay eval coverage per cluster, and `--rules my_rules.py` to add your own labeling rules. To view it over HTTP instead of `file://`, `traceval serve analysis` starts a stdlib localhost server and prints the report URL.
91
+
92
+ ### 3. Generate
93
+
94
+ ```bash
95
+ traceval generate traces.db -o evals --include-failures
96
+ ```
97
+
98
+ ```text
99
+ Wrote 8 eval cases across 8 clusters → evals/cases/*.yaml
100
+ Wrote judge rubrics → evals/rubrics/*.md
101
+ Wrote pytest harness → evals/test_generated.py, evals/conftest.py
102
+ ```
103
+
104
+ Every case is a reviewable YAML file. Golden cases assert the recorded successful behavior. Regression cases, generated from failure traces, assert the failure does **not** recur: forbidden error tokens (word-boundary matched, filtered against tokens that success traces also use), tool-loop bounds, and non-empty output. A regression case passes for any agent that avoids that failure mode; golden cases carry general bug detection.
105
+
106
+ ### 4. Run
107
+
108
+ ```bash
109
+ traceval run evals --target myapp.agent:invoke_agent --judge fake
110
+ ```
111
+
112
+ ```text
113
+ traceval Run Summary
114
+ ┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┓
115
+ ┃ Case ID ┃ Cluster ┃ Outcome ┃ Latency (ms) ┃
116
+ ┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━┩
117
+ │ c_0c422a7a__case_001 │ c_0c422a7a │ PASS │ 0.0 │
118
+ │ c_1e5d0942__case_002 │ c_1e5d0942 │ PASS │ 0.0 │
119
+ │ c_2c881177__case_003 │ c_2c881177 │ PASS │ 0.0 │
120
+ │ c_361535b0__case_004 │ c_361535b0 │ PASS │ 0.0 │
121
+ │ c_9a8a4644__case_005 │ c_9a8a4644 │ PASS │ 0.0 │
122
+ │ c_d30af83a__case_006 │ c_d30af83a │ PASS │ 0.0 │
123
+ │ c_d3f3b631__case_007 │ c_d3f3b631 │ PASS │ 0.0 │
124
+ │ c_e834c13c__case_008 │ c_e834c13c │ PASS │ 0.0 │
125
+ └──────────────────────┴────────────┴─────────┴──────────────┘
126
+ Total: 8 | Passed: 8 | Failed: 0 | Errored: 0
127
+ ```
128
+
129
+ The target is an HTTP URL or a `module:function` callable. Checks cover `exact`, `contains_any`, `not_contains`, `regex`, `json_schema`, `tool_sequence`, `no_tool_loop`, and `judge`. Run reports land in `<evals_dir>/runs/` (override with `--runs-dir`); pass `--compare <previous report>` to print regressions and improvements between runs. The exit code is nonzero when any case fails.
130
+
131
+ ### 5. Calibrate the judge
132
+
133
+ An LLM judge is only as trustworthy as its agreement with human judgment. `calibrate` samples judge-scored results from a run report and presents each agent output for blind pass/fail labeling in the terminal; judge verdicts stay hidden until the end so they cannot anchor you.
134
+
135
+ ```bash
136
+ traceval calibrate evals/runs/run_<timestamp>.json --sample 8
137
+ ```
138
+
139
+ ```text
140
+ Judge Calibration Summary
141
+ ┏━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━┓
142
+ ┃ Cluster ┃ Labeled ┃ Agreement ┃
143
+ ┡━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━┩
144
+ │ c_0c422a7a │ 1 │ 100% │
145
+ │ c_1e5d0942 │ 1 │ 100% │
146
+ │ c_2c881177 │ 1 │ 100% │
147
+ │ c_361535b0 │ 1 │ 100% │
148
+ │ c_9a8a4644 │ 1 │ 0% │
149
+ │ c_d30af83a │ 1 │ 100% │
150
+ │ c_d3f3b631 │ 1 │ 100% │
151
+ │ c_e834c13c │ 1 │ 100% │
152
+ └────────────┴─────────┴───────────┘
153
+ Overall agreement: 88% on 8 case(s) | false-pass (judge OK, human not): 1 | false-fail: 0
154
+ WARNING: Judge unreliable (< 80% agreement) for clusters: c_9a8a4644. Review their rubrics before trusting automated scores.
155
+ ```
156
+
157
+ False-pass counts (judge approved, human rejected) are called out because that is the dangerous direction: a lenient judge waves bad outputs into production. Clusters below `--min-agreement` (default 80%) are flagged for rubric review, and the full labels plus stats are written to `calibration.json`.
158
+
159
+ ## Scripting with --json
160
+
161
+ `ingest`, `analyze`, `generate`, and `run` accept `--json`: human-readable output is suppressed and a single JSON object is printed to stdout. `run` still exits nonzero on failures.
162
+
163
+ ```bash
164
+ traceval analyze traces.db --json | python -m json.tool
165
+ ```
166
+
167
+ ## GitHub Action
168
+
169
+ Gate deploys on your generated eval suite. The action installs traceval, runs the suite, and fails the job on any regression:
170
+
171
+ ```yaml
172
+ jobs:
173
+ agent-evals:
174
+ runs-on: ubuntu-latest
175
+ steps:
176
+ - uses: actions/checkout@v4
177
+ - uses: theramkm/traceval@v0.2.2
178
+ with:
179
+ evals-dir: evals/
180
+ target: myapp.agent:invoke_agent # or an HTTP URL
181
+ judge: fake # offline; 'openai' needs an API key
182
+ ```
183
+
184
+ Inputs: `evals-dir` and `target` (required); `judge`, `compare`, `only`, `runs-dir`, `traceval-version`, `python-version` (optional). For a real LLM judge, set `judge: openai` and pass `OPENAI_API_KEY` (or `GEMINI_API_KEY`) via `env:` from your repository secrets.
185
+
186
+ ## Development
187
+
188
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for setup.
189
+ Run the test suite with `make test`, the lint and type-check gates with `make lint`, and both together with `make all`.
190
+
191
+ ## Honest Limitations
192
+
193
+ * **Side-Effect Free**: traceval assertions evaluate input/output matches. It does not attempt to replay side effects (e.g., updating database records) on mock tools.
194
+ * **Text Telemetry**: The canonical model is optimized for text logs. Image or multimodal payloads in traces are logged as references.
195
+ * **Static Visualization**: The coverage report is a portable, single-file HTML page. There is no hosted web service.
Binary file
@@ -0,0 +1,7 @@
#!/bin/bash
# End-to-end quickstart for traceval. The whole loop is driven by
# `traceval demo`: generate traces -> ingest -> analyze -> generate evals ->
# run the healthy agent (must pass) -> run the buggy agent (must fail).
set -o errexit

# Any extra command-line arguments are forwarded to `traceval demo` unchanged.
uv run traceval demo \
    -o traceval-demo \
    --force \
    "$@"
@@ -0,0 +1,8 @@
"""Backward-compatibility shim for the demo agent.

The agent now ships inside the package (``traceval.demo.agent``) so that
`traceval demo` works from a plain pip install. This module only re-exports
it, so existing `--target examples.demo_agent.core:invoke_agent` invocations
still work.
"""

from traceval.demo.agent import (
    invoke_agent,
    run_agent_logic,
)

# Names re-exported for callers that still import from this legacy location.
__all__ = [
    "invoke_agent",
    "run_agent_logic",
]
@@ -0,0 +1,11 @@
"""Compatibility entry point for the synthetic trace generator.

The generator now ships inside the package (traceval.demo.traces) so
`traceval demo` works from a plain pip install. Running this file as a script
calls ``generate_traces_file`` with ``synthetic_traces.jsonl`` located in this
file's own directory.
"""

from pathlib import Path

from traceval.demo.traces import generate_traces_file

if __name__ == "__main__":
    # Target path sits beside this script, independent of the current directory.
    generate_traces_file(Path(__file__).with_name("synthetic_traces.jsonl"))
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "traceval"
7
- version = "0.2.0"
7
+ version = "0.2.2"
8
8
  description = "Trace-to-Eval Compiler"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11"
@@ -0,0 +1,53 @@
#!/bin/bash
# Regenerates every sample-output block in README.md, plus
# docs/img/report.png, from real command runs so the docs cannot drift.
# Run from anywhere; paste the printed sections into README.md verbatim.
set -e

# Repository root, resolved from this script's own location.
ROOT=$(cd "$(dirname "$0")/.." && pwd)
# All intermediate artifacts live in a throwaway directory removed on exit.
WORK=$(mktemp -d)
trap 'rm -rf "$WORK"' EXIT
cd "$WORK"

# Invoke the traceval CLI through uv against the project at $ROOT.
tv() { uv run --project "$ROOT" traceval "$@"; }

# Produce the demo trace file that feeds the rest of the pipeline.
uv run --project "$ROOT" python3 -c "
from pathlib import Path
from traceval.demo.traces import generate_traces_file
generate_traces_file(Path('traces.jsonl'))
" >/dev/null

echo "=== ingest ==="
tv ingest traces.jsonl -o traces.db

echo
echo "=== analyze ==="
tv analyze traces.db -o analysis

echo
echo "=== generate ==="
tv generate traces.db -o evals --include-failures

echo
echo "=== run (healthy demo agent) ==="
# `|| true` keeps a nonzero exit from the run from aborting the script
# under `set -e`.
tv run evals --target traceval.demo.agent:invoke_agent --judge fake || true

echo
echo "=== calibrate (example labels: 7x pass, 1x fail) ==="
# Pick the newest run report. The pipeline's exit status is head's, so a
# failing ls (no report written) would go unnoticed by `set -e`; check for an
# empty result explicitly instead of handing calibrate an empty path.
REPORT=$(ls -t evals/runs/run_*.json 2>/dev/null | head -1)
if [ -z "$REPORT" ]; then
    echo "error: no run report found under evals/runs/" >&2
    exit 1
fi
printf 'y\ny\ny\ny\nn\ny\ny\ny\n' | tv calibrate "$REPORT" --sample 8 | tail -16

echo
echo "=== screenshot -> docs/img/report.png ==="
# Re-analyze with --evals so the report shows populated eval coverage
tv analyze traces.db --evals evals -o analysis >/dev/null
CHROME="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
if [ -x "$CHROME" ]; then
    # Make sure the destination directory exists before Chrome writes into it;
    # Chrome's stderr is discarded below, so a missing directory would be silent.
    mkdir -p "$ROOT/docs/img"
    "$CHROME" --headless --disable-gpu \
        --screenshot="$ROOT/docs/img/report.png" \
        --window-size=1280,860 --hide-scrollbars \
        "file://$WORK/analysis/report.html" 2>/dev/null
    echo "wrote $ROOT/docs/img/report.png"
else
    echo "Chrome not found; screenshot skipped"
fi
@@ -0,0 +1 @@
1
+ __version__ = "0.2.2"