traceval 0.2.1__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {traceval-0.2.1 → traceval-0.2.3}/CHANGELOG.md +18 -0
- traceval-0.2.3/CONTRIBUTING.md +22 -0
- traceval-0.2.3/Makefile +14 -0
- {traceval-0.2.1 → traceval-0.2.3}/PKG-INFO +19 -12
- {traceval-0.2.1 → traceval-0.2.3}/README.md +18 -11
- traceval-0.2.3/docs/extending.md +97 -0
- {traceval-0.2.1 → traceval-0.2.3}/docs/formats.md +47 -8
- traceval-0.2.3/docs/img/report.png +0 -0
- traceval-0.2.3/docs/targets.md +91 -0
- {traceval-0.2.1 → traceval-0.2.3}/pyproject.toml +1 -1
- traceval-0.2.3/src/traceval/__init__.py +1 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/cli.py +14 -1
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/compile/templates/conftest.py.jinja +113 -34
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/ingest/__init__.py +2 -1
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/ingest/base.py +2 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/ingest/generic.py +4 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/ingest/langfuse.py +28 -5
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/ingest/langsmith.py +4 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/ingest/otel.py +21 -3
- {traceval-0.2.1 → traceval-0.2.3}/tests/fixtures/README.md +11 -0
- {traceval-0.2.1 → traceval-0.2.3}/tests/fixtures/langfuse_export.jsonl +1 -0
- {traceval-0.2.1 → traceval-0.2.3}/tests/fixtures/otel_spans.jsonl +2 -0
- traceval-0.2.3/tests/test_broken_target.py +90 -0
- {traceval-0.2.1 → traceval-0.2.3}/tests/test_phase2.py +56 -8
- {traceval-0.2.1 → traceval-0.2.3}/uv.lock +1 -1
- traceval-0.2.1/docs/img/report.png +0 -0
- traceval-0.2.1/src/traceval/__init__.py +0 -1
- {traceval-0.2.1 → traceval-0.2.3}/.github/workflows/ci.yml +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/.gitignore +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/.pre-commit-config.yaml +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/LICENSE +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/action.yml +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/examples/demo.sh +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/examples/demo_agent/agent.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/examples/demo_agent/core.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/examples/make_traces.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/examples/synthetic_traces.jsonl +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/scripts/readme-outputs.sh +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/analyze/__init__.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/analyze/cluster.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/analyze/coverage.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/analyze/outcomes.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/analyze/report.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/compile/__init__.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/compile/cases.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/compile/emit_pytest.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/compile/emit_yaml.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/compile/rubrics.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/compile/templates/test_generated.py.jinja +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/demo/__init__.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/demo/agent.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/demo/traces.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/model.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/run/calibrate.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/run/judge.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/run/runner.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/run/scorers.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/run/target.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/src/traceval/store.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/tests/fixtures/generic_traces.jsonl +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/tests/fixtures/langsmith_runs.jsonl +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/tests/test_calibrate.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/tests/test_cli.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/tests/test_e2e_demo.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/tests/test_json_output.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/tests/test_phase1.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/tests/test_phase3.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/tests/test_phase4.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/tests/test_phase5.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/tests/test_phase6.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/tests/test_regression_cases.py +0 -0
- {traceval-0.2.1 → traceval-0.2.3}/tests/test_serve.py +0 -0
|
@@ -5,6 +5,24 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.2.3] - 2026-07-02
|
|
9
|
+
|
|
10
|
+
### Fixed
|
|
11
|
+
- **Adapter tool detection no longer depends on demo tool names.** The Langfuse and OTel adapters matched span names against the demo agent's tool vocabulary (`order_lookup`, `stripe_lookup`, `kb_search`), so real exports using any other tool names had their tool spans silently classified as `other`, breaking `tool_error` labeling, `tool_sequence`/`no_tool_loop` generation, and cluster signatures. Langfuse SPANs are now classified via `metadata.tool`, user-supplied `--tool-span-names` globs, or a documented input+output/error heuristic; OTel uses GenAI semantic-convention attributes with attribute-only fallbacks. Detection is proven vocabulary-free by new `create_ticket` fixtures. Surfaced by an external review that found the issue by reading the code; testing against a real export would have caught it in seconds.
|
|
12
|
+
- Sub-millisecond latencies render as `<0.1` in the run summary instead of `0.0`.
|
|
13
|
+
|
|
14
|
+
### Added
|
|
15
|
+
- `traceval ingest --tool-span-names`: comma-separated name globs marking spans as tool calls (replaces the Langfuse heuristic, adds an OTel fallback).
|
|
16
|
+
- `docs/targets.md`: the exact run-against-my-agent contract (HTTP request/response shape, callable return shapes, timeout and failure behavior) with an executable FastAPI example.
|
|
17
|
+
- `docs/extending.md`: custom outcome rules, redaction hook, judge configuration as implemented, and an honest `traceval.yaml` reference.
|
|
18
|
+
- `docs/formats.md`: complete annotated generic-format examples (success and tool-failure lines that ingest as-is) and a required-vs-optional field table; Langfuse/OTel sections rewritten to match the new heuristics.
|
|
19
|
+
|
|
20
|
+
## [0.2.2] - 2026-07-02
|
|
21
|
+
|
|
22
|
+
### Fixed
|
|
23
|
+
- A run in which zero cases execute (e.g. unresolvable target) now writes a self-describing run report with an `errors` section instead of writing nothing; `--json` never reports `null`. Added a single clear top-level error line on target-resolution failure. Added `make test` and CONTRIBUTING as the canonical dev commands. Reported via external review of a failed-target invocation.
|
|
24
|
+
- Run report schema additions (existing fields unchanged): `summary.errored` counts cases that never executed due to setup/collection errors; top-level `errors` lists `{stage, detail}` entries for `target_resolution`, `collection`, and `setup` failures, with identical details deduplicated into one entry carrying a `count`; `results` is `[]` rather than absent when nothing executed. The terminal summary now shows `Errored: n`.
|
|
25
|
+
|
|
8
26
|
## [0.2.1] - 2026-07-02
|
|
9
27
|
|
|
10
28
|
### Added
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Contributing to traceval
|
|
2
|
+
|
|
3
|
+
## Setup
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
git clone https://github.com/theramkm/traceval.git
|
|
7
|
+
cd traceval
|
|
8
|
+
uv sync # installs the package and dev dependencies (same as CI)
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Development loop
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
make test # pytest -q
|
|
15
|
+
make lint # ruff check, ruff format --check, mypy
|
|
16
|
+
make demo # end-to-end smoke: healthy agent passes, buggy agent fails
|
|
17
|
+
make all # lint + test
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
CI runs the same commands on Python 3.11, 3.12, and 3.13, plus a
|
|
21
|
+
wheel-based demo smoke job, and enforces 85% coverage. Keep all of it
|
|
22
|
+
green; add a test for every behavior change.
|
traceval-0.2.3/Makefile
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
.PHONY: test lint demo all
|
|
2
|
+
|
|
3
|
+
test:
|
|
4
|
+
uv run pytest -q
|
|
5
|
+
|
|
6
|
+
lint:
|
|
7
|
+
uv run ruff check src tests examples
|
|
8
|
+
uv run ruff format --check src tests examples
|
|
9
|
+
uv run mypy src/traceval
|
|
10
|
+
|
|
11
|
+
demo:
|
|
12
|
+
uv run traceval demo -o /tmp/traceval-demo --force
|
|
13
|
+
|
|
14
|
+
all: lint test
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: traceval
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: Trace-to-Eval Compiler
|
|
5
5
|
License: MIT
|
|
6
6
|
License-File: LICENSE
|
|
@@ -106,6 +106,8 @@ Report written to analysis/report.html
|
|
|
106
106
|
|
|
107
107
|
`analysis/report.html` is the single-file page shown in the screenshot above. Pass `--evals evals/` to overlay eval coverage per cluster, and `--rules my_rules.py` to add your own labeling rules. To view it over HTTP instead of `file://`, `traceval serve analysis` starts a stdlib localhost server and prints the report URL.
|
|
108
108
|
|
|
109
|
+
Custom labeling rules, the redaction hook, and judge configuration are documented in [docs/extending.md](docs/extending.md).
|
|
110
|
+
|
|
109
111
|
### 3. Generate
|
|
110
112
|
|
|
111
113
|
```bash
|
|
@@ -131,19 +133,19 @@ traceval Run Summary
|
|
|
131
133
|
┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┓
|
|
132
134
|
┃ Case ID ┃ Cluster ┃ Outcome ┃ Latency (ms) ┃
|
|
133
135
|
┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━┩
|
|
134
|
-
│ c_0c422a7a__case_001 │ c_0c422a7a │ PASS │
|
|
135
|
-
│ c_1e5d0942__case_002 │ c_1e5d0942 │ PASS │
|
|
136
|
-
│ c_2c881177__case_003 │ c_2c881177 │ PASS │
|
|
137
|
-
│ c_361535b0__case_004 │ c_361535b0 │ PASS │
|
|
138
|
-
│ c_9a8a4644__case_005 │ c_9a8a4644 │ PASS │
|
|
139
|
-
│ c_d30af83a__case_006 │ c_d30af83a │ PASS │
|
|
140
|
-
│ c_d3f3b631__case_007 │ c_d3f3b631 │ PASS │
|
|
141
|
-
│ c_e834c13c__case_008 │ c_e834c13c │ PASS │
|
|
136
|
+
│ c_0c422a7a__case_001 │ c_0c422a7a │ PASS │ <0.1 │
|
|
137
|
+
│ c_1e5d0942__case_002 │ c_1e5d0942 │ PASS │ <0.1 │
|
|
138
|
+
│ c_2c881177__case_003 │ c_2c881177 │ PASS │ <0.1 │
|
|
139
|
+
│ c_361535b0__case_004 │ c_361535b0 │ PASS │ <0.1 │
|
|
140
|
+
│ c_9a8a4644__case_005 │ c_9a8a4644 │ PASS │ <0.1 │
|
|
141
|
+
│ c_d30af83a__case_006 │ c_d30af83a │ PASS │ <0.1 │
|
|
142
|
+
│ c_d3f3b631__case_007 │ c_d3f3b631 │ PASS │ <0.1 │
|
|
143
|
+
│ c_e834c13c__case_008 │ c_e834c13c │ PASS │ <0.1 │
|
|
142
144
|
└──────────────────────┴────────────┴─────────┴──────────────┘
|
|
143
|
-
Total: 8 | Passed: 8 | Failed: 0
|
|
145
|
+
Total: 8 | Passed: 8 | Failed: 0 | Errored: 0
|
|
144
146
|
```
|
|
145
147
|
|
|
146
|
-
The target is an HTTP URL or a `module:function` callable. Checks cover `exact`, `contains_any`, `not_contains`, `regex`, `json_schema`, `tool_sequence`, `no_tool_loop`, and `judge`. Run reports land in `<evals_dir>/runs/` (override with `--runs-dir`); pass `--compare <previous report>` to print regressions and improvements between runs. The exit code is nonzero when any case fails.
|
|
148
|
+
The target is an HTTP URL or a `module:function` callable; the exact request/response contract, with a copy-pasteable FastAPI example, is in [docs/targets.md](docs/targets.md). Checks cover `exact`, `contains_any`, `not_contains`, `regex`, `json_schema`, `tool_sequence`, `no_tool_loop`, and `judge`. Run reports land in `<evals_dir>/runs/` (override with `--runs-dir`); pass `--compare <previous report>` to print regressions and improvements between runs. The exit code is nonzero when any case fails.
|
|
147
149
|
|
|
148
150
|
### 5. Calibrate the judge
|
|
149
151
|
|
|
@@ -191,7 +193,7 @@ jobs:
|
|
|
191
193
|
runs-on: ubuntu-latest
|
|
192
194
|
steps:
|
|
193
195
|
- uses: actions/checkout@v4
|
|
194
|
-
- uses: theramkm/traceval@v0.2.1
|
|
196
|
+
- uses: theramkm/traceval@v0.2.3
|
|
195
197
|
with:
|
|
196
198
|
evals-dir: evals/
|
|
197
199
|
target: myapp.agent:invoke_agent # or an HTTP URL
|
|
@@ -200,6 +202,11 @@ jobs:
|
|
|
200
202
|
|
|
201
203
|
Inputs: `evals-dir` and `target` (required); `judge`, `compare`, `only`, `runs-dir`, `traceval-version`, `python-version` (optional). For a real LLM judge, set `judge: openai` and pass `OPENAI_API_KEY` (or `GEMINI_API_KEY`) via `env:` from your repository secrets.
|
|
202
204
|
|
|
205
|
+
## Development
|
|
206
|
+
|
|
207
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for setup.
|
|
208
|
+
Run the test suite with `make test`, the lint and type-check gates with `make lint`, or both with `make all`.
|
|
209
|
+
|
|
203
210
|
## Honest Limitations
|
|
204
211
|
|
|
205
212
|
* **Side-Effect Free**: traceval assertions evaluate input/output matches. It does not attempt to replay side effects (e.g., updating database records) on mock tools.
|
|
@@ -89,6 +89,8 @@ Report written to analysis/report.html
|
|
|
89
89
|
|
|
90
90
|
`analysis/report.html` is the single-file page shown in the screenshot above. Pass `--evals evals/` to overlay eval coverage per cluster, and `--rules my_rules.py` to add your own labeling rules. To view it over HTTP instead of `file://`, `traceval serve analysis` starts a stdlib localhost server and prints the report URL.
|
|
91
91
|
|
|
92
|
+
Custom labeling rules, the redaction hook, and judge configuration are documented in [docs/extending.md](docs/extending.md).
|
|
93
|
+
|
|
92
94
|
### 3. Generate
|
|
93
95
|
|
|
94
96
|
```bash
|
|
@@ -114,19 +116,19 @@ traceval Run Summary
|
|
|
114
116
|
┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┓
|
|
115
117
|
┃ Case ID ┃ Cluster ┃ Outcome ┃ Latency (ms) ┃
|
|
116
118
|
┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━┩
|
|
117
|
-
│ c_0c422a7a__case_001 │ c_0c422a7a │ PASS │
|
|
118
|
-
│ c_1e5d0942__case_002 │ c_1e5d0942 │ PASS │
|
|
119
|
-
│ c_2c881177__case_003 │ c_2c881177 │ PASS │
|
|
120
|
-
│ c_361535b0__case_004 │ c_361535b0 │ PASS │
|
|
121
|
-
│ c_9a8a4644__case_005 │ c_9a8a4644 │ PASS │
|
|
122
|
-
│ c_d30af83a__case_006 │ c_d30af83a │ PASS │
|
|
123
|
-
│ c_d3f3b631__case_007 │ c_d3f3b631 │ PASS │
|
|
124
|
-
│ c_e834c13c__case_008 │ c_e834c13c │ PASS │
|
|
119
|
+
│ c_0c422a7a__case_001 │ c_0c422a7a │ PASS │ <0.1 │
|
|
120
|
+
│ c_1e5d0942__case_002 │ c_1e5d0942 │ PASS │ <0.1 │
|
|
121
|
+
│ c_2c881177__case_003 │ c_2c881177 │ PASS │ <0.1 │
|
|
122
|
+
│ c_361535b0__case_004 │ c_361535b0 │ PASS │ <0.1 │
|
|
123
|
+
│ c_9a8a4644__case_005 │ c_9a8a4644 │ PASS │ <0.1 │
|
|
124
|
+
│ c_d30af83a__case_006 │ c_d30af83a │ PASS │ <0.1 │
|
|
125
|
+
│ c_d3f3b631__case_007 │ c_d3f3b631 │ PASS │ <0.1 │
|
|
126
|
+
│ c_e834c13c__case_008 │ c_e834c13c │ PASS │ <0.1 │
|
|
125
127
|
└──────────────────────┴────────────┴─────────┴──────────────┘
|
|
126
|
-
Total: 8 | Passed: 8 | Failed: 0
|
|
128
|
+
Total: 8 | Passed: 8 | Failed: 0 | Errored: 0
|
|
127
129
|
```
|
|
128
130
|
|
|
129
|
-
The target is an HTTP URL or a `module:function` callable. Checks cover `exact`, `contains_any`, `not_contains`, `regex`, `json_schema`, `tool_sequence`, `no_tool_loop`, and `judge`. Run reports land in `<evals_dir>/runs/` (override with `--runs-dir`); pass `--compare <previous report>` to print regressions and improvements between runs. The exit code is nonzero when any case fails.
|
|
131
|
+
The target is an HTTP URL or a `module:function` callable; the exact request/response contract, with a copy-pasteable FastAPI example, is in [docs/targets.md](docs/targets.md). Checks cover `exact`, `contains_any`, `not_contains`, `regex`, `json_schema`, `tool_sequence`, `no_tool_loop`, and `judge`. Run reports land in `<evals_dir>/runs/` (override with `--runs-dir`); pass `--compare <previous report>` to print regressions and improvements between runs. The exit code is nonzero when any case fails.
|
|
130
132
|
|
|
131
133
|
### 5. Calibrate the judge
|
|
132
134
|
|
|
@@ -174,7 +176,7 @@ jobs:
|
|
|
174
176
|
runs-on: ubuntu-latest
|
|
175
177
|
steps:
|
|
176
178
|
- uses: actions/checkout@v4
|
|
177
|
-
- uses: theramkm/traceval@v0.2.1
|
|
179
|
+
- uses: theramkm/traceval@v0.2.3
|
|
178
180
|
with:
|
|
179
181
|
evals-dir: evals/
|
|
180
182
|
target: myapp.agent:invoke_agent # or an HTTP URL
|
|
@@ -183,6 +185,11 @@ jobs:
|
|
|
183
185
|
|
|
184
186
|
Inputs: `evals-dir` and `target` (required); `judge`, `compare`, `only`, `runs-dir`, `traceval-version`, `python-version` (optional). For a real LLM judge, set `judge: openai` and pass `OPENAI_API_KEY` (or `GEMINI_API_KEY`) via `env:` from your repository secrets.
|
|
185
187
|
|
|
188
|
+
## Development
|
|
189
|
+
|
|
190
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for setup.
|
|
191
|
+
Run the test suite with `make test`, the lint and type-check gates with `make lint`, or both with `make all`.
|
|
192
|
+
|
|
186
193
|
## Honest Limitations
|
|
187
194
|
|
|
188
195
|
* **Side-Effect Free**: traceval assertions evaluate input/output matches. It does not attempt to replay side effects (e.g., updating database records) on mock tools.
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# Extension points
|
|
2
|
+
|
|
3
|
+
traceval has three: custom outcome rules, a redaction hook, and judge
|
|
4
|
+
configuration.
|
|
5
|
+
|
|
6
|
+
## Custom outcome rules (`traceval analyze --rules my_rules.py`)
|
|
7
|
+
|
|
8
|
+
The file must expose a module-level `RULES` list. Each entry is either a
|
|
9
|
+
`Rule` object from `traceval.analyze.outcomes` or a bare callable with the
|
|
10
|
+
signature `(Trace) -> Outcome | None`. Return `None` to defer to the next rule; the first
|
|
11
|
+
rule that returns an `Outcome` wins. **User rules run before the
|
|
12
|
+
built-ins**, so they can override any built-in label.
|
|
13
|
+
|
|
14
|
+
```python
|
|
15
|
+
# my_rules.py
|
|
16
|
+
from traceval.analyze.outcomes import Rule
|
|
17
|
+
from traceval.model import Outcome, Trace
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def escalation(trace: Trace) -> Outcome | None:
|
|
21
|
+
if trace.final_output and "escalate to human" in trace.final_output.lower():
|
|
22
|
+
return Outcome(
|
|
23
|
+
label="bad_output",
|
|
24
|
+
reason="agent escalated instead of resolving",
|
|
25
|
+
rule_id="R_USER_ESCALATION",
|
|
26
|
+
labeled_by="user_rule",
|
|
27
|
+
)
|
|
28
|
+
return None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
RULES = [Rule("R_USER_ESCALATION", "Escalations count as failures", escalation)]
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
traceval analyze traces.db --rules my_rules.py -o analysis
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
`Outcome` requires `label` (one of `success`, `tool_error`,
|
|
39
|
+
`validation_error`, `loop`, `timeout`, `bad_output`, `unknown`), `reason`,
|
|
40
|
+
and `labeled_by`; `rule_id` is optional but shows up in reports, so set it.
|
|
41
|
+
Built-in rules run afterwards in this order: `R_TOOL_ERROR`, `R_LLM_ERROR`,
|
|
42
|
+
`R_LOOP`, `R_TIMEOUT`, `R_VALIDATION`, `R_EMPTY_OUTPUT`,
|
|
43
|
+
`R_DEFAULT_SUCCESS`, `R_UNKNOWN`.
|
|
44
|
+
|
|
45
|
+
## Redaction hook (`traceval generate --redact-hook module:function`)
|
|
46
|
+
|
|
47
|
+
A `str -> str` function applied to case inputs and reference outputs before
|
|
48
|
+
they are written to YAML, after the built-in scrubbers (emails, credit
|
|
49
|
+
cards, phone numbers, API tokens). The module is imported from the current
|
|
50
|
+
working directory.
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
# my_redactions.py
|
|
54
|
+
import re
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def scrub(text: str) -> str:
|
|
58
|
+
return re.sub(r"ACC-\d{6}", "[REDACTED_ACCOUNT]", text)
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
traceval generate traces.db -o evals --include-failures --redact-hook my_redactions:scrub
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Judge configuration (`traceval run --judge ...`)
|
|
66
|
+
|
|
67
|
+
| Value | Behavior |
|
|
68
|
+
| --- | --- |
|
|
69
|
+
| `fake` (default) | Deterministic offline judge: keyword-overlap heuristic, never gates a run at the generated `min_score`. Use it in CI without API keys. |
|
|
70
|
+
| `<model>` | `OpenAICompatJudge` against `https://api.openai.com/v1` with that model, e.g. `--judge gpt-4o-mini`. |
|
|
71
|
+
| `<model>:<base_url>` | Any OpenAI-compatible endpoint, e.g. `--judge llama3:http://localhost:11434/v1`. |
|
|
72
|
+
|
|
73
|
+
API keys come from the environment: `OPENAI_API_KEY` first, then
|
|
74
|
+
`GEMINI_API_KEY`. If only `GEMINI_API_KEY` is set and the base URL is the
|
|
75
|
+
OpenAI default, traceval automatically routes to Gemini's OpenAI-compatible
|
|
76
|
+
endpoint (`https://generativelanguage.googleapis.com/v1beta/openai`) with
|
|
77
|
+
model `gemini-2.5-flash`.
|
|
78
|
+
|
|
79
|
+
Judge calls are budget-capped at 200 per run (hardcoded); calls beyond the
|
|
80
|
+
budget score 0.0 with an explanatory reason. Validate any real judge with
|
|
81
|
+
`traceval calibrate` before trusting its scores.
|
|
82
|
+
|
|
83
|
+
## `traceval.yaml` reference
|
|
84
|
+
|
|
85
|
+
`traceval generate` writes this scaffold next to the suite. Honest status
|
|
86
|
+
of every key:
|
|
87
|
+
|
|
88
|
+
| Key | Default | Consumed today? |
|
|
89
|
+
| --- | --- | --- |
|
|
90
|
+
| `schema_version` | `"1"` | No, informational. |
|
|
91
|
+
| `target.default_url` | `http://localhost:8000/agent` | **Yes**: used as the target when `traceval run` is invoked without `--target`. |
|
|
92
|
+
| `target.timeout_s` | `30` | No: the HTTP timeout is fixed at 30s in code. |
|
|
93
|
+
| `judge.default_provider` | `fake` | No: the CLI's `--judge` default (`fake`) applies instead. |
|
|
94
|
+
| `judge.max_judge_calls` | `200` | No: the 200-call budget is hardcoded. |
|
|
95
|
+
|
|
96
|
+
The unconsumed keys document intended configuration surface; treat them as
|
|
97
|
+
reserved.
|
|
@@ -22,11 +22,41 @@ All trace adapters transform incoming logs into the `Trace` schema (`src/traceva
|
|
|
22
22
|
|
|
23
23
|
## 1. Generic format (`generic`)
|
|
24
24
|
|
|
25
|
-
A line-by-line JSONL file where each line is a raw JSON string validating directly against our canonical `Trace` model.
|
|
25
|
+
A line-by-line JSONL file where each line is a raw JSON string validating directly against our canonical `Trace` model. This is the format to convert to when your backend is not natively supported; the two examples below are complete and ingest as-is.
|
|
26
|
+
|
|
27
|
+
### Required vs optional fields
|
|
28
|
+
|
|
29
|
+
| Field | Required | Notes |
|
|
30
|
+
| --- | --- | --- |
|
|
31
|
+
| `trace_id` | yes | Unique string. |
|
|
32
|
+
| `source` | yes | Free-form origin label, e.g. `"generic"`. |
|
|
33
|
+
| `started_at` | yes | ISO 8601 datetime. |
|
|
34
|
+
| `task_input` | yes | The user request that started the trace. |
|
|
35
|
+
| `steps` | yes | May be `[]`. Each step needs `index` and `kind` (`llm`, `tool`, `retrieval`, `other`); `llm` steps need an `llm` object with `span_id` and `input_messages`, `tool` steps need a `tool` object with `span_id`, `name`, `arguments_json`. |
|
|
36
|
+
| `ended_at` | no | Missing/`null` triggers the built-in timeout rule. |
|
|
37
|
+
| `final_output` | no | `null`/empty triggers the empty-output rule. |
|
|
38
|
+
| `metadata` | no | `dict[str, str]`. |
|
|
39
|
+
| `schema_version`, `outcome` | no | `outcome` is filled by `traceval analyze`; supply it only to pre-label. |
|
|
40
|
+
|
|
41
|
+
### Example: success trace (one llm step, one tool step)
|
|
42
|
+
|
|
43
|
+
```json
|
|
44
|
+
{"trace_id": "tr-ok-1", "source": "generic", "started_at": "2026-07-01T12:00:00Z", "ended_at": "2026-07-01T12:00:02Z", "task_input": "Where is order 88421?", "final_output": "Your order 88421 is in transit.", "steps": [{"index": 0, "kind": "llm", "llm": {"span_id": "s1", "model": "gpt-4o-mini", "input_messages": [{"role": "user", "content": "Where is order 88421?"}], "output_message": {"role": "assistant", "content": "Let me look that up."}}}, {"index": 1, "kind": "tool", "tool": {"span_id": "s2", "name": "order_lookup", "arguments_json": "{\"order_id\": \"88421\"}", "output": "status: in_transit", "latency_ms": 142.0}}]}
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Annotations: `arguments_json` is a raw JSON *string* (never a parsed object), so nothing is lost in translation. `output_message`, `model`, token counts, and `latency_ms` are all optional.
|
|
48
|
+
|
|
49
|
+
### Example: failure trace (tool step with `error` set)
|
|
50
|
+
|
|
51
|
+
```json
|
|
52
|
+
{"trace_id": "tr-fail-1", "source": "generic", "started_at": "2026-07-01T12:05:00Z", "ended_at": "2026-07-01T12:05:01Z", "task_input": "Refund order 88421", "final_output": "Error: refund service unavailable.", "steps": [{"index": 0, "kind": "tool", "tool": {"span_id": "s3", "name": "refund_api", "arguments_json": "{\"order_id\": \"88421\"}", "output": null, "error": "HTTP 503 Service Unavailable"}}]}
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
A non-null `tool.error` makes the built-in `R_TOOL_ERROR` rule label the trace `tool_error`, which is what routes it into a failure cluster and, with `--include-failures`, into a regression case.
|
|
26
56
|
|
|
27
57
|
### Assumptions & Heuristics
|
|
28
|
-
- Direct structural validation.
|
|
29
|
-
- Lines failing to parse are logged as warnings and skipped.
|
|
58
|
+
- Direct structural validation, no field inference.
|
|
59
|
+
- Lines failing to parse are logged as warnings and skipped; the ingest never aborts.
|
|
30
60
|
|
|
31
61
|
---
|
|
32
62
|
|
|
@@ -40,10 +70,14 @@ OTel traces are ingested from flat lists of JSON span logs (e.g. OTLP export for
|
|
|
40
70
|
- `gen_ai.system`
|
|
41
71
|
- `gen_ai.prompt`
|
|
42
72
|
- `gen_ai.completion`
|
|
43
|
-
- **Tool Call**: Spans
|
|
44
|
-
- `gen_ai.tool.name`
|
|
45
|
-
- `gen_ai.tool.arguments`
|
|
46
|
-
-
|
|
73
|
+
- **Tool Call**: Spans matching any of these signals:
|
|
74
|
+
- `gen_ai.tool.name` present in attributes (primary, per GenAI semantic conventions)
|
|
75
|
+
- `gen_ai.tool.arguments` present in attributes
|
|
76
|
+
- `gen_ai.operation.name` attribute equal to `"execute_tool"`
|
|
77
|
+
- `tool.name` present in attributes
|
|
78
|
+
- Span name matches a user-supplied glob from `traceval ingest --tool-span-names` (comma-separated, e.g. `"*_lookup,tool_*"`)
|
|
79
|
+
|
|
80
|
+
There is no built-in tool-name list; detection never depends on a specific tool vocabulary.
|
|
47
81
|
- **Other**: All other spans are categorized as `other`.
|
|
48
82
|
|
|
49
83
|
### Attribute Translations
|
|
@@ -89,7 +123,12 @@ Langfuse exports traces as JSON objects with nested lists of observations (of ty
|
|
|
89
123
|
- `llm.prompt_tokens` $\leftarrow$ `usage.promptTokens`
|
|
90
124
|
- `llm.completion_tokens` $\leftarrow$ `usage.completionTokens`
|
|
91
125
|
- `llm.error` $\leftarrow$ `statusMessage` when `level == "ERROR"`
|
|
92
|
-
- **SPAN** $\rightarrow$ `ToolCall`
|
|
126
|
+
- **SPAN** $\rightarrow$ `ToolCall` when, in priority order:
|
|
127
|
+
1. the observation's `metadata.tool` is set (explicit marker, always wins), or
|
|
128
|
+
2. `traceval ingest --tool-span-names` globs were supplied and the observation `name` matches one (globs replace the heuristic below), or
|
|
129
|
+
3. default heuristic: the SPAN recorded an `input` AND either an `output` or an error signal (`level == "ERROR"` or `statusMessage` set). Failed tool calls often produce no output, which is why an error counts as the second signal.
|
|
130
|
+
|
|
131
|
+
SPANs matching none of these become `other` steps. Field mapping:
|
|
93
132
|
- `tool.name` $\leftarrow$ Observation `name`
|
|
94
133
|
- `tool.arguments_json` $\leftarrow$ Observation `input` (serialized to JSON)
|
|
95
134
|
- `tool.output` $\leftarrow$ Observation `output` (stringified)
|
|
Binary file
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# Running the suite against your agent: the target contract
|
|
2
|
+
|
|
3
|
+
`traceval run <evals_dir> --target <target>` accepts two target forms. Both
|
|
4
|
+
are resolved once per session; an unresolvable target prints one clear
|
|
5
|
+
`ERROR:` line at the top of the output, records a `target_resolution` entry
|
|
6
|
+
in the run report's `errors` list, and the run exits nonzero.
|
|
7
|
+
|
|
8
|
+
## HTTP target
|
|
9
|
+
|
|
10
|
+
Pass any `http://` or `https://` URL. For every case, traceval sends:
|
|
11
|
+
|
|
12
|
+
```
|
|
13
|
+
POST <your url>
|
|
14
|
+
Content-Type: application/json
|
|
15
|
+
|
|
16
|
+
{"input": "<the case's task text>"}
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
with a 30 second timeout. Non-2xx responses fail the case.
|
|
20
|
+
|
|
21
|
+
The JSON response is interpreted as follows:
|
|
22
|
+
|
|
23
|
+
| Field | Meaning |
|
|
24
|
+
| --- | --- |
|
|
25
|
+
| `output` / `final_output` / `response` | The agent's answer, checked in that priority order; the first present key wins. |
|
|
26
|
+
| any other key | Fallback: if none of the three keys exist, the first non-`tool_calls` value is stringified and used as the output. A non-object response body is stringified whole. |
|
|
27
|
+
| `tool_calls` (optional) | List of tool invocations, either `{"name": "..."}` objects or plain strings. Only the names are kept. |
|
|
28
|
+
|
|
29
|
+
`tool_calls` exists so the `tool_sequence` check (did the agent call the
|
|
30
|
+
recorded tools, in order) and the `no_tool_loop` check (did it avoid calling
|
|
31
|
+
the same tool 3+ times consecutively) have something to score. If your
|
|
32
|
+
endpoint omits it, generated `tool_sequence` checks will fail; either return
|
|
33
|
+
the names or delete those checks from the case YAML.
|
|
34
|
+
|
|
35
|
+
If the URL is unreachable (connection refused, invalid URL) traceval prints
|
|
36
|
+
the one-line `ERROR:` at session start; cases still run and fail
|
|
37
|
+
individually so the run report stays complete.
|
|
38
|
+
|
|
39
|
+
### Minimal FastAPI implementation
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
# my_agent.py
|
|
43
|
+
from fastapi import FastAPI
|
|
44
|
+
from pydantic import BaseModel
|
|
45
|
+
|
|
46
|
+
app = FastAPI()
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class AgentInput(BaseModel):
|
|
50
|
+
input: str
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@app.post("/agent")
|
|
54
|
+
def run_agent(payload: AgentInput) -> dict:
|
|
55
|
+
# Call your real agent here.
|
|
56
|
+
answer = f"You asked: {payload.input}"
|
|
57
|
+
return {
|
|
58
|
+
"output": answer,
|
|
59
|
+
"tool_calls": [{"name": "kb_search"}],
|
|
60
|
+
}
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
uvicorn my_agent:app --port 8000 &
|
|
65
|
+
traceval run evals/ --target http://127.0.0.1:8000/agent --judge fake
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Callable target
|
|
69
|
+
|
|
70
|
+
Pass `module:function` (one colon). traceval inserts the current working
|
|
71
|
+
directory into `sys.path`, imports the module, and calls the function with
|
|
72
|
+
the case's task text as a single string argument:
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
def invoke_agent(input_text: str) -> dict: ...
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
Accepted return shapes:
|
|
79
|
+
|
|
80
|
+
- a dict, interpreted exactly like the HTTP response above
|
|
81
|
+
(`output`/`final_output`/`response` priority, optional `tool_calls`)
|
|
82
|
+
- an object with an `.output` attribute (and optional `.tool_calls`)
|
|
83
|
+
- anything else, stringified whole and used as the output
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
traceval run evals/ --target myapp.agent:invoke_agent --judge fake
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Because the working directory is importable, `myapp/agent.py` in your repo
|
|
90
|
+
root works without installation. A bad module path or missing attribute
|
|
91
|
+
produces the same one-line `ERROR:` plus a self-describing run report.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.3"
|
|
@@ -25,15 +25,28 @@ def ingest(
|
|
|
25
25
|
path: str,
|
|
26
26
|
format: str = typer.Option("auto", help="auto|otel|langfuse|langsmith|generic"),
|
|
27
27
|
output: str = typer.Option("traces.db", "-o", help="SQLite database output path"),
|
|
28
|
+
tool_span_names: str = typer.Option(
|
|
29
|
+
None,
|
|
30
|
+
"--tool-span-names",
|
|
31
|
+
help=(
|
|
32
|
+
"Comma-separated name globs (e.g. '*_lookup,tool_*') that mark "
|
|
33
|
+
"spans as tool calls; replaces the built-in heuristic for "
|
|
34
|
+
"Langfuse SPANs and adds a fallback for OTel spans"
|
|
35
|
+
),
|
|
36
|
+
),
|
|
28
37
|
json_output: bool = typer.Option(
|
|
29
38
|
False, "--json", help="Print a machine-readable JSON summary to stdout"
|
|
30
39
|
),
|
|
31
40
|
) -> None:
|
|
32
41
|
"""Ingest trace logs into SQLite database."""
|
|
42
|
+
globs = None
|
|
43
|
+
if tool_span_names:
|
|
44
|
+
globs = [g.strip() for g in tool_span_names.split(",") if g.strip()]
|
|
45
|
+
|
|
33
46
|
db = TraceStore(output)
|
|
34
47
|
try:
|
|
35
48
|
ok_count, span_count, warn_count, log_file = ingest_file(
|
|
36
|
-
Path(path), db, format_name=format
|
|
49
|
+
Path(path), db, format_name=format, tool_span_globs=globs
|
|
37
50
|
)
|
|
38
51
|
if json_output:
|
|
39
52
|
typer.echo(
|