traceval 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- traceval-0.1.1/.github/workflows/ci.yml +83 -0
- traceval-0.1.1/.gitignore +68 -0
- traceval-0.1.1/.pre-commit-config.yaml +7 -0
- traceval-0.1.1/CHANGELOG.md +18 -0
- traceval-0.1.1/LICENSE +21 -0
- traceval-0.1.1/PKG-INFO +154 -0
- traceval-0.1.1/README.md +138 -0
- traceval-0.1.1/docs/formats.md +126 -0
- traceval-0.1.1/examples/demo.sh +30 -0
- traceval-0.1.1/examples/demo_agent/agent.py +46 -0
- traceval-0.1.1/examples/demo_agent/core.py +57 -0
- traceval-0.1.1/examples/make_traces.py +216 -0
- traceval-0.1.1/examples/synthetic_traces.jsonl +200 -0
- traceval-0.1.1/pyproject.toml +79 -0
- traceval-0.1.1/src/traceval/__init__.py +1 -0
- traceval-0.1.1/src/traceval/analyze/__init__.py +75 -0
- traceval-0.1.1/src/traceval/analyze/cluster.py +226 -0
- traceval-0.1.1/src/traceval/analyze/coverage.py +84 -0
- traceval-0.1.1/src/traceval/analyze/outcomes.py +254 -0
- traceval-0.1.1/src/traceval/analyze/report.py +478 -0
- traceval-0.1.1/src/traceval/cli.py +168 -0
- traceval-0.1.1/src/traceval/compile/__init__.py +50 -0
- traceval-0.1.1/src/traceval/compile/cases.py +204 -0
- traceval-0.1.1/src/traceval/compile/emit_pytest.py +21 -0
- traceval-0.1.1/src/traceval/compile/emit_yaml.py +47 -0
- traceval-0.1.1/src/traceval/compile/rubrics.py +52 -0
- traceval-0.1.1/src/traceval/compile/templates/conftest.py.jinja +158 -0
- traceval-0.1.1/src/traceval/compile/templates/test_generated.py.jinja +17 -0
- traceval-0.1.1/src/traceval/ingest/__init__.py +99 -0
- traceval-0.1.1/src/traceval/ingest/base.py +13 -0
- traceval-0.1.1/src/traceval/ingest/generic.py +40 -0
- traceval-0.1.1/src/traceval/ingest/langfuse.py +241 -0
- traceval-0.1.1/src/traceval/ingest/langsmith.py +257 -0
- traceval-0.1.1/src/traceval/ingest/otel.py +238 -0
- traceval-0.1.1/src/traceval/model.py +66 -0
- traceval-0.1.1/src/traceval/run/judge.py +196 -0
- traceval-0.1.1/src/traceval/run/runner.py +113 -0
- traceval-0.1.1/src/traceval/run/scorers.py +150 -0
- traceval-0.1.1/src/traceval/run/target.py +104 -0
- traceval-0.1.1/src/traceval/store.py +59 -0
- traceval-0.1.1/tests/fixtures/README.md +18 -0
- traceval-0.1.1/tests/fixtures/demo_agent_expected/placeholder.txt +1 -0
- traceval-0.1.1/tests/fixtures/demo_agent_expected/results.json +7 -0
- traceval-0.1.1/tests/fixtures/generic_traces.jsonl +12 -0
- traceval-0.1.1/tests/fixtures/langfuse_export.jsonl +6 -0
- traceval-0.1.1/tests/fixtures/langsmith_runs.jsonl +13 -0
- traceval-0.1.1/tests/fixtures/otel_spans.jsonl +13 -0
- traceval-0.1.1/tests/test_cli.py +12 -0
- traceval-0.1.1/tests/test_phase1.py +141 -0
- traceval-0.1.1/tests/test_phase2.py +139 -0
- traceval-0.1.1/tests/test_phase3.py +143 -0
- traceval-0.1.1/tests/test_phase4.py +79 -0
- traceval-0.1.1/tests/test_phase5.py +116 -0
- traceval-0.1.1/tests/test_phase6.py +275 -0
- traceval-0.1.1/uv.lock +1130 -0
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [ main, master ]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [ main, master ]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.11", "3.12", "3.13"]
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Install uv and set up Python
|
|
20
|
+
uses: astral-sh/setup-uv@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: ${{ matrix.python-version }}
|
|
23
|
+
enable-cache: true
|
|
24
|
+
|
|
25
|
+
- name: Run Ruff Check
|
|
26
|
+
run: uv run ruff check src/ tests/ examples/
|
|
27
|
+
|
|
28
|
+
- name: Run Ruff Format Check
|
|
29
|
+
run: uv run ruff format --check src/ tests/ examples/
|
|
30
|
+
|
|
31
|
+
- name: Run Mypy
|
|
32
|
+
run: uv run mypy src/
|
|
33
|
+
|
|
34
|
+
- name: Run Tests with Coverage
|
|
35
|
+
run: uv run pytest --cov=src/traceval --cov-fail-under=85
|
|
36
|
+
|
|
37
|
+
tag-and-release:
|
|
38
|
+
needs: test
|
|
39
|
+
if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/master')
|
|
40
|
+
runs-on: ubuntu-latest
|
|
41
|
+
permissions:
|
|
42
|
+
contents: write
|
|
43
|
+
id-token: write
|
|
44
|
+
steps:
|
|
45
|
+
- uses: actions/checkout@v4
|
|
46
|
+
with:
|
|
47
|
+
fetch-depth: 0
|
|
48
|
+
|
|
49
|
+
- name: Install uv and set up Python
|
|
50
|
+
uses: astral-sh/setup-uv@v5
|
|
51
|
+
with:
|
|
52
|
+
python-version: "3.12"
|
|
53
|
+
enable-cache: true
|
|
54
|
+
|
|
55
|
+
- name: Check version and build
|
|
56
|
+
id: check-ver
|
|
57
|
+
run: |
|
|
58
|
+
VERSION=$(grep -m 1 'version =' pyproject.toml | tr -d '"' | tr -d "'" | awk '{print $NF}')
|
|
59
|
+
TAG="v$VERSION"
|
|
60
|
+
|
|
61
|
+
if git rev-parse "$TAG" >/dev/null 2>&1; then
|
|
62
|
+
echo "Tag $TAG already exists, skipping release."
|
|
63
|
+
echo "publish=false" >> $GITHUB_OUTPUT
|
|
64
|
+
else
|
|
65
|
+
echo "New version detected. Building and releasing $TAG..."
|
|
66
|
+
uv build
|
|
67
|
+
|
|
68
|
+
# Configure Git and push tag
|
|
69
|
+
git config user.name "github-actions[bot]"
|
|
70
|
+
git config user.email "github-actions[bot]@users.noreply.github.com"
|
|
71
|
+
git tag "$TAG"
|
|
72
|
+
git push origin "$TAG"
|
|
73
|
+
|
|
74
|
+
echo "publish=true" >> $GITHUB_OUTPUT
|
|
75
|
+
fi
|
|
76
|
+
|
|
77
|
+
- name: Publish to PyPI
|
|
78
|
+
if: steps.check-ver.outputs.publish == 'true'
|
|
79
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
80
|
+
with:
|
|
81
|
+
skip-existing: true
|
|
82
|
+
|
|
83
|
+
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# Pytest / coverage
|
|
30
|
+
.cache
|
|
31
|
+
.pytest_cache/
|
|
32
|
+
.coverage
|
|
33
|
+
.coverage.*
|
|
34
|
+
.traceback
|
|
35
|
+
nosetests.xml
|
|
36
|
+
coverage.xml
|
|
37
|
+
htmlcov/
|
|
38
|
+
.htmlcov/
|
|
39
|
+
|
|
40
|
+
# Mypy / Ruff
|
|
41
|
+
.mypy_cache/
|
|
42
|
+
.ruff_cache/
|
|
43
|
+
|
|
44
|
+
# Environments
|
|
45
|
+
.env
|
|
46
|
+
.venv/
|
|
47
|
+
env/
|
|
48
|
+
venv/
|
|
49
|
+
ENV/
|
|
50
|
+
env.bak/
|
|
51
|
+
venv.bak/
|
|
52
|
+
|
|
53
|
+
# SQLite databases & logs (traceval database runs)
|
|
54
|
+
*.db
|
|
55
|
+
*.log
|
|
56
|
+
demo_analysis/
|
|
57
|
+
demo_evals/
|
|
58
|
+
evals/
|
|
59
|
+
|
|
60
|
+
# Specifications & user instructions
|
|
61
|
+
spec.md
|
|
62
|
+
|
|
63
|
+
# OS files
|
|
64
|
+
.DS_Store
|
|
65
|
+
.DS_Store?
|
|
66
|
+
ehthumbs.db
|
|
67
|
+
Icon?
|
|
68
|
+
Thumbs.db
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.1.0] - 2026-07-02
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
- **Canonical Model & SQLite Storage**: Structured canonical schema representing `Trace`, `Step`, `ToolCall`, `LLMCall`, and `Outcome`.
|
|
12
|
+
- **Multi-backend Telemetry Ingestion**: Added log adapters for OpenTelemetry GenAI, Langfuse observation dumps, LangSmith logs, and Generic JSONL files.
|
|
13
|
+
- **Rule-based Labeler**: Dynamic outcome labels classifier (`success`, `tool_error`, `validation_error`, `loop`, `timeout`, `bad_output`, `unknown`) with custom python rule plugins.
|
|
14
|
+
- **Agglomerative Clustering**: Signature and task-input Jaccard shingle clustering.
|
|
15
|
+
- **Jinja2 Coverage Report Visualizer**: Ported self-contained single-page HTML report charts.
|
|
16
|
+
- **Pytest Case Compiler**: Emits YAML test-case configurations and LLM-as-judge scaffolds, and scrubs inputs through custom redaction hooks.
|
|
17
|
+
- **Scorers & Judges**: Scorer implementations for `exact`, `contains`, `regex`, `json_schema`, `tool_sequence` (order/subset modes), and `judge` (FakeJudge, OpenAICompatJudge with call caps).
|
|
18
|
+
- **FastAPI Demo Agent**: Supports mock customer-service tools and a `BUGGY` mode for regression checks.
|
traceval-0.1.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Ramkumar M
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
traceval-0.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: traceval
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Trace-to-Eval Compiler
|
|
5
|
+
License: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Python: >=3.11
|
|
8
|
+
Requires-Dist: httpx>=0.24.0
|
|
9
|
+
Requires-Dist: jinja2>=3.1.0
|
|
10
|
+
Requires-Dist: jsonschema>=4.17.0
|
|
11
|
+
Requires-Dist: pydantic>=2.0.0
|
|
12
|
+
Requires-Dist: pyyaml>=6.0.0
|
|
13
|
+
Requires-Dist: rich>=13.0.0
|
|
14
|
+
Requires-Dist: typer>=0.9.0
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
17
|
+
# 🚀 traceval: Trace-to-Eval Compiler
|
|
18
|
+
|
|
19
|
+
<p align="center">
|
|
20
|
+
<img src="https://img.shields.io/badge/Python-3.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python Version" />
|
|
21
|
+
<img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="License" />
|
|
22
|
+
<img src="https://img.shields.io/badge/Coverage-87%25-green.svg" alt="Coverage" />
|
|
23
|
+
<a href="https://github.com/astral-sh/uv"><img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json" alt="uv" /></a>
|
|
24
|
+
</p>
|
|
25
|
+
|
|
26
|
+
***"Your traces already know how your agent fails. traceval turns them into the test suite you never wrote."***
|
|
27
|
+
|
|
28
|
+
Teams running LLM agents in production have observability traces, but only a fraction maintain robust evals. The raw material for great tests — thousands of real production traces, including edge cases and errors — sits unused because converting them into regression suites is manual and tedious.
|
|
29
|
+
|
|
30
|
+
**traceval** automates this by ingesting agent traces from standard sources, normalizing them into a canonical Pydantic model, analyzing outcomes/clustering task signatures, and **compiling them into a human-editable eval suite**: pytest files + YAML datasets + judge rubric scaffolds.
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## 🎨 Architectural Pipeline
|
|
35
|
+
|
|
36
|
+
```mermaid
|
|
37
|
+
graph LR
|
|
38
|
+
classDef source fill:#2c3e50,stroke:#34495e,stroke-width:2px,color:#fff;
|
|
39
|
+
classDef normalize fill:#16a085,stroke:#1abc9c,stroke-width:2px,color:#fff;
|
|
40
|
+
classDef analyze fill:#2980b9,stroke:#3498db,stroke-width:2px,color:#fff;
|
|
41
|
+
classDef compile fill:#8e44ad,stroke:#9b59b6,stroke-width:2px,color:#fff;
|
|
42
|
+
classDef run fill:#d35400,stroke:#e67e22,stroke-width:2px,color:#fff;
|
|
43
|
+
|
|
44
|
+
A[OTel / Langfuse / LangSmith] --> B(Canonical Trace DB)
|
|
45
|
+
B --> C(Outcome Labeler & Jaccard Clusterer)
|
|
46
|
+
C --> D(YAML cases + Pytest + Rubrics)
|
|
47
|
+
D --> E(HTTP / Callable Runner & Diff Reports)
|
|
48
|
+
|
|
49
|
+
class A source;
|
|
50
|
+
class B normalize;
|
|
51
|
+
class C analyze;
|
|
52
|
+
class D compile;
|
|
53
|
+
class E run;
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## ✨ Key Features
|
|
59
|
+
|
|
60
|
+
* 🔌 **Zero-Configuration Ingest**: Direct compatibility with OpenTelemetry GenAI semantic conventions, Langfuse observations, LangSmith runs, or generic JSONL exports.
|
|
61
|
+
* 🧠 **Smart Outcome Taxonomy**: Automatic categorization of trace outcomes (`success`, `tool_error`, `validation_error`, `loop`, `timeout`, `bad_output`) using rule-based heuristics that you can extend with Python modules.
|
|
62
|
+
* 📊 **Embedding-Free Clustering**: Fast, local Jaccard-similarity shingle grouping that runs 100% offline, keeping your development cycle private and deterministic.
|
|
63
|
+
* 📝 **Clean Code Generation**: Compiles cases into editable YAML files, LLM-as-a-judge rubrics into Markdown checklist scaffolds, and pytest test runs into clean templates.
|
|
64
|
+
* ⚡ **PII Redaction Safeguards**: Automatically scrubs emails, credit cards, phone numbers, and API tokens before writing test inputs.
|
|
65
|
+
* 🛡️ **CI/CD Regression Diff**: Compares execution summaries and scores between runs using exit codes to catch agent failures before deploying.
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## ⏱️ 90-Second E2E Quickstart
|
|
70
|
+
|
|
71
|
+
Experience `traceval` regression testing out of the box using our interactive demo script:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
# Clone & run the demo
|
|
75
|
+
chmod +x examples/demo.sh
|
|
76
|
+
./examples/demo.sh
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### Manual Walkthrough
|
|
80
|
+
|
|
81
|
+
#### 1. Ingest Observability Logs
|
|
82
|
+
```bash
|
|
83
|
+
# Seed 200 synthetic telemetry traces containing successes and failure edge cases
|
|
84
|
+
python3 examples/make_traces.py
|
|
85
|
+
|
|
86
|
+
# Ingest into SQLite database
|
|
87
|
+
traceval ingest examples/synthetic_traces.jsonl -o traces.db
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
#### 2. Label & Analyze Traffic Gaps
|
|
91
|
+
```bash
|
|
92
|
+
traceval analyze traces.db -o analysis/
|
|
93
|
+
```
|
|
94
|
+
*Outputs outcome statistics and generates `analysis/report.html` mapping traffic clusters:*
|
|
95
|
+
```text
|
|
96
|
+
Outcomes: success 60% · tool_error 15% · loop 10% · timeout 8% · validation_error 8%
|
|
97
|
+
Clusters: 37 task clusters found.
|
|
98
|
+
Top failure cluster: "500 refund stripe -> stripe_lookup -> (tool_error)" (30 traces)
|
|
99
|
+
Report written to analysis/report.html
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
#### 3. Compile Cases and Pytest Harness
|
|
103
|
+
```bash
|
|
104
|
+
traceval generate traces.db -o evals/ --include-failures
|
|
105
|
+
```
|
|
106
|
+
*Generates test cases in `evals/cases/` and Markdown rubric checklists in `evals/rubrics/`.*
|
|
107
|
+
|
|
108
|
+
#### 4. Run Evaluations & Detect Regressions
|
|
109
|
+
```bash
|
|
110
|
+
# Run against the healthy agent (100% Pass)
|
|
111
|
+
traceval run evals/ --target examples.demo_agent.agent:invoke_agent --judge fake
|
|
112
|
+
|
|
113
|
+
# Run against the buggy agent (Detects regressions and exits with status 1)
|
|
114
|
+
BUGGY=true traceval run evals/ --target examples.demo_agent.agent:invoke_agent --judge fake
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
## 🛠️ CLI Command Reference
|
|
120
|
+
|
|
121
|
+
> [!NOTE]
|
|
122
|
+
> All CLI commands support `--json` to output machine-readable stdout for scripting.
|
|
123
|
+
|
|
124
|
+
### Ingestion
|
|
125
|
+
```bash
|
|
126
|
+
traceval ingest <path> --format [auto|otel|langfuse|langsmith|generic] -o <traces.db>
|
|
127
|
+
```
|
|
128
|
+
*Ingests telemetry log dumps losslessly. Malformed spans are skipped, and a warning for each is written to `<traces.db>.log`.*
|
|
129
|
+
|
|
130
|
+
### Analysis
|
|
131
|
+
```bash
|
|
132
|
+
traceval analyze <traces.db> [--rules custom_rules.py] [--evals evals/] -o <analysis_dir/>
|
|
133
|
+
```
|
|
134
|
+
*Runs rule pipelines and Jaccard shingle similarity groupings.*
|
|
135
|
+
|
|
136
|
+
### Generation
|
|
137
|
+
```bash
|
|
138
|
+
traceval generate <traces.db> -o <evals_dir/> [--per-cluster 3] [--include-failures] [--redact-hook module:fn]
|
|
139
|
+
```
|
|
140
|
+
*Creates regression cases, Markdown LLM-judge checklists, and conftest runners.*
|
|
141
|
+
|
|
142
|
+
### Runner
|
|
143
|
+
```bash
|
|
144
|
+
traceval run <evals_dir/> --target <url|module:function> [--judge fake|openai] [--compare runs/prev.json]
|
|
145
|
+
```
|
|
146
|
+
*Executes tests, scores output constraints (`exact`, `contains`, `regex`, `json_schema`, `tool_sequence`, `judge`), and logs to project-level `runs/` directory.*
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## 💡 Honest Limitations
|
|
151
|
+
|
|
152
|
+
* **Side-Effect Free**: traceval assertions evaluate input/output matches. It does not attempt to replay side effects (e.g., updating database records) on mock tools.
|
|
153
|
+
* **Text Telemetry**: The canonical model is optimized for text logs. Image or multimodal payloads in traces are logged as references.
|
|
154
|
+
* **Static Visualization**: The coverage report is a portable, single-file HTML page. There is no hosted web service.
|
traceval-0.1.1/README.md
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
# 🚀 traceval: Trace-to-Eval Compiler
|
|
2
|
+
|
|
3
|
+
<p align="center">
|
|
4
|
+
<img src="https://img.shields.io/badge/Python-3.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python Version" />
|
|
5
|
+
<img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="License" />
|
|
6
|
+
<img src="https://img.shields.io/badge/Coverage-87%25-green.svg" alt="Coverage" />
|
|
7
|
+
<a href="https://github.com/astral-sh/uv"><img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json" alt="uv" /></a>
|
|
8
|
+
</p>
|
|
9
|
+
|
|
10
|
+
***"Your traces already know how your agent fails. traceval turns them into the test suite you never wrote."***
|
|
11
|
+
|
|
12
|
+
Teams running LLM agents in production have observability traces, but only a fraction maintain robust evals. The raw material for great tests — thousands of real production traces, including edge cases and errors — sits unused because converting them into regression suites is manual and tedious.
|
|
13
|
+
|
|
14
|
+
**traceval** automates this by ingesting agent traces from standard sources, normalizing them into a canonical Pydantic model, analyzing outcomes/clustering task signatures, and **compiling them into a human-editable eval suite**: pytest files + YAML datasets + judge rubric scaffolds.
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## 🎨 Architectural Pipeline
|
|
19
|
+
|
|
20
|
+
```mermaid
|
|
21
|
+
graph LR
|
|
22
|
+
classDef source fill:#2c3e50,stroke:#34495e,stroke-width:2px,color:#fff;
|
|
23
|
+
classDef normalize fill:#16a085,stroke:#1abc9c,stroke-width:2px,color:#fff;
|
|
24
|
+
classDef analyze fill:#2980b9,stroke:#3498db,stroke-width:2px,color:#fff;
|
|
25
|
+
classDef compile fill:#8e44ad,stroke:#9b59b6,stroke-width:2px,color:#fff;
|
|
26
|
+
classDef run fill:#d35400,stroke:#e67e22,stroke-width:2px,color:#fff;
|
|
27
|
+
|
|
28
|
+
A[OTel / Langfuse / LangSmith] --> B(Canonical Trace DB)
|
|
29
|
+
B --> C(Outcome Labeler & Jaccard Clusterer)
|
|
30
|
+
C --> D(YAML cases + Pytest + Rubrics)
|
|
31
|
+
D --> E(HTTP / Callable Runner & Diff Reports)
|
|
32
|
+
|
|
33
|
+
class A source;
|
|
34
|
+
class B normalize;
|
|
35
|
+
class C analyze;
|
|
36
|
+
class D compile;
|
|
37
|
+
class E run;
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## ✨ Key Features
|
|
43
|
+
|
|
44
|
+
* 🔌 **Zero-Configuration Ingest**: Direct compatibility with OpenTelemetry GenAI semantic conventions, Langfuse observations, LangSmith runs, or generic JSONL exports.
|
|
45
|
+
* 🧠 **Smart Outcome Taxonomy**: Automatic categorization of trace outcomes (`success`, `tool_error`, `validation_error`, `loop`, `timeout`, `bad_output`) using rule-based heuristics that you can extend with Python modules.
|
|
46
|
+
* 📊 **Embedding-Free Clustering**: Fast, local Jaccard-similarity shingle grouping that runs 100% offline, keeping your development cycle private and deterministic.
|
|
47
|
+
* 📝 **Clean Code Generation**: Compiles cases into editable YAML files, LLM-as-a-judge rubrics into Markdown checklist scaffolds, and pytest test runs into clean templates.
|
|
48
|
+
* ⚡ **PII Redaction Safeguards**: Automatically scrubs emails, credit cards, phone numbers, and API tokens before writing test inputs.
|
|
49
|
+
* 🛡️ **CI/CD Regression Diff**: Compares execution summaries and scores between runs using exit codes to catch agent failures before deploying.
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## ⏱️ 90-Second E2E Quickstart
|
|
54
|
+
|
|
55
|
+
Experience `traceval` regression testing out of the box using our interactive demo script:
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
# Clone & run the demo
|
|
59
|
+
chmod +x examples/demo.sh
|
|
60
|
+
./examples/demo.sh
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Manual Walkthrough
|
|
64
|
+
|
|
65
|
+
#### 1. Ingest Observability Logs
|
|
66
|
+
```bash
|
|
67
|
+
# Seed 200 synthetic telemetry traces containing successes and failure edge cases
|
|
68
|
+
python3 examples/make_traces.py
|
|
69
|
+
|
|
70
|
+
# Ingest into SQLite database
|
|
71
|
+
traceval ingest examples/synthetic_traces.jsonl -o traces.db
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
#### 2. Label & Analyze Traffic Gaps
|
|
75
|
+
```bash
|
|
76
|
+
traceval analyze traces.db -o analysis/
|
|
77
|
+
```
|
|
78
|
+
*Outputs outcome statistics and generates `analysis/report.html` mapping traffic clusters:*
|
|
79
|
+
```text
|
|
80
|
+
Outcomes: success 60% · tool_error 15% · loop 10% · timeout 8% · validation_error 8%
|
|
81
|
+
Clusters: 37 task clusters found.
|
|
82
|
+
Top failure cluster: "500 refund stripe -> stripe_lookup -> (tool_error)" (30 traces)
|
|
83
|
+
Report written to analysis/report.html
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
#### 3. Compile Cases and Pytest Harness
|
|
87
|
+
```bash
|
|
88
|
+
traceval generate traces.db -o evals/ --include-failures
|
|
89
|
+
```
|
|
90
|
+
*Generates test cases in `evals/cases/` and Markdown rubric checklists in `evals/rubrics/`.*
|
|
91
|
+
|
|
92
|
+
#### 4. Run Evaluations & Detect Regressions
|
|
93
|
+
```bash
|
|
94
|
+
# Run against the healthy agent (100% Pass)
|
|
95
|
+
traceval run evals/ --target examples.demo_agent.agent:invoke_agent --judge fake
|
|
96
|
+
|
|
97
|
+
# Run against the buggy agent (Detects regressions and exits with status 1)
|
|
98
|
+
BUGGY=true traceval run evals/ --target examples.demo_agent.agent:invoke_agent --judge fake
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## 🛠️ CLI Command Reference
|
|
104
|
+
|
|
105
|
+
> [!NOTE]
|
|
106
|
+
> All CLI commands support `--json` to output machine-readable stdout for scripting.
|
|
107
|
+
|
|
108
|
+
### Ingestion
|
|
109
|
+
```bash
|
|
110
|
+
traceval ingest <path> --format [auto|otel|langfuse|langsmith|generic] -o <traces.db>
|
|
111
|
+
```
|
|
112
|
+
*Ingests telemetry log dumps losslessly. Malformed spans are skipped, and a warning for each is written to `<traces.db>.log`.*
|
|
113
|
+
|
|
114
|
+
### Analysis
|
|
115
|
+
```bash
|
|
116
|
+
traceval analyze <traces.db> [--rules custom_rules.py] [--evals evals/] -o <analysis_dir/>
|
|
117
|
+
```
|
|
118
|
+
*Runs rule pipelines and Jaccard shingle similarity groupings.*
|
|
119
|
+
|
|
120
|
+
### Generation
|
|
121
|
+
```bash
|
|
122
|
+
traceval generate <traces.db> -o <evals_dir/> [--per-cluster 3] [--include-failures] [--redact-hook module:fn]
|
|
123
|
+
```
|
|
124
|
+
*Creates regression cases, Markdown LLM-judge checklists, and conftest runners.*
|
|
125
|
+
|
|
126
|
+
### Runner
|
|
127
|
+
```bash
|
|
128
|
+
traceval run <evals_dir/> --target <url|module:function> [--judge fake|openai] [--compare runs/prev.json]
|
|
129
|
+
```
|
|
130
|
+
*Executes tests, scores output constraints (`exact`, `contains`, `regex`, `json_schema`, `tool_sequence`, `judge`), and logs to project-level `runs/` directory.*
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
## 💡 Honest Limitations
|
|
135
|
+
|
|
136
|
+
* **Side-Effect Free**: traceval assertions evaluate input/output matches. It does not attempt to replay side effects (e.g., updating database records) on mock tools.
|
|
137
|
+
* **Text Telemetry**: The canonical model is optimized for text logs. Image or multimodal payloads in traces are logged as references.
|
|
138
|
+
* **Static Visualization**: The coverage report is a portable, single-file HTML page. There is no hosted web service.
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
# Trace Ingestion Formats & Mappings
|
|
2
|
+
|
|
3
|
+
This document describes how raw trace records from different observability backends are mapped onto the `traceval` canonical `Trace` Pydantic model.
|
|
4
|
+
|
|
5
|
+
## Canonical Trace Model
|
|
6
|
+
|
|
7
|
+
All trace adapters transform incoming logs into the `Trace` schema (`src/traceval/model.py`), which includes:
|
|
8
|
+
- `trace_id` (str)
|
|
9
|
+
- `source` (str: "otel", "langfuse", "langsmith", "generic")
|
|
10
|
+
- `started_at` (datetime)
|
|
11
|
+
- `ended_at` (datetime or None)
|
|
12
|
+
- `task_input` (str, user prompt triggering the trace)
|
|
13
|
+
- `final_output` (str or None, assistant's final response)
|
|
14
|
+
- `steps` (list of Step objects chronologically ordered):
|
|
15
|
+
- `index` (int)
|
|
16
|
+
- `kind` ("llm", "tool", "retrieval", "other")
|
|
17
|
+
- `llm` (LLMCall or None)
|
|
18
|
+
- `tool` (ToolCall or None)
|
|
19
|
+
- `raw_attributes` (dict[str, str], lossless metadata escape hatch)
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## 1. Generic format (`generic`)
|
|
24
|
+
|
|
25
|
+
A JSONL file in which each line is a JSON object that is validated directly against our canonical `Trace` model.
|
|
26
|
+
|
|
27
|
+
### Assumptions & Heuristics
|
|
28
|
+
- Direct structural validation.
|
|
29
|
+
- Lines failing to parse are logged as warnings and skipped.
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## 2. OpenTelemetry GenAI Conventions (`otel`)
|
|
34
|
+
|
|
35
|
+
OTel traces are ingested from flat lists of JSON span logs (e.g. OTLP export format), grouped by `trace_id`.
|
|
36
|
+
|
|
37
|
+
### Span Categorization Rules
|
|
38
|
+
- **Root Span**: Identified by `parent_span_id` being `None` or empty.
|
|
39
|
+
- **LLM Call**: Spans containing GenAI semantic convention keys in their attributes:
|
|
40
|
+
- `gen_ai.system`
|
|
41
|
+
- `gen_ai.prompt`
|
|
42
|
+
- `gen_ai.completion`
|
|
43
|
+
- **Tool Call**: Spans containing:
|
|
44
|
+
- `gen_ai.tool.name`
|
|
45
|
+
- `gen_ai.tool.arguments`
|
|
46
|
+
- Or span name matching `order_lookup`, `stripe_lookup`, or `kb_search`.
|
|
47
|
+
- **Other**: All other spans are categorized as `other`.
|
|
48
|
+
|
|
49
|
+
### Attribute Translations
|
|
50
|
+
|
|
51
|
+
| Canonical Field | OTel Span Path |
|
|
52
|
+
| --- | --- |
|
|
53
|
+
| `started_at` | Root span `start_time` (ISO datetime) |
|
|
54
|
+
| `ended_at` | Root span `end_time` (ISO datetime) |
|
|
55
|
+
| `task_input` | Root span `attributes["gen_ai.task_input"]` |
|
|
56
|
+
| `final_output` | Root span `attributes["gen_ai.final_output"]` |
|
|
57
|
+
| `llm.model` | Span `attributes["gen_ai.request.model"]` |
|
|
58
|
+
| `llm.input_messages` | Parsed JSON array from `attributes["gen_ai.prompt"]` |
|
|
59
|
+
| `llm.output_message` | Assistant role with content from `attributes["gen_ai.completion"]` |
|
|
60
|
+
| `llm.prompt_tokens` | `attributes["gen_ai.usage.prompt_tokens"]` |
|
|
61
|
+
| `llm.completion_tokens` | `attributes["gen_ai.usage.completion_tokens"]` |
|
|
62
|
+
| `llm.error` | `attributes["gen_ai.error"]` |
|
|
63
|
+
| `tool.name` | `attributes["gen_ai.tool.name"]` or Span `name` |
|
|
64
|
+
| `tool.arguments_json` | `attributes["gen_ai.tool.arguments"]` |
|
|
65
|
+
| `tool.output` | `attributes["gen_ai.tool.output"]` |
|
|
66
|
+
| `tool.error` | `attributes["gen_ai.tool.error"]` or `attributes["gen_ai.error"]` |
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## 3. Langfuse Export (`langfuse`)
|
|
71
|
+
|
|
72
|
+
Langfuse exports traces as JSON objects with nested lists of observations (of types `GENERATION`, `SPAN`, `EVENT`).
|
|
73
|
+
|
|
74
|
+
### Mapping Translations
|
|
75
|
+
|
|
76
|
+
| Canonical Field | Langfuse Path |
|
|
77
|
+
| --- | --- |
|
|
78
|
+
| `trace_id` | Trace `id` |
|
|
79
|
+
| `started_at` | Trace `timestamp` |
|
|
80
|
+
| `task_input` | Trace `input` |
|
|
81
|
+
| `final_output` | Trace `output` |
|
|
82
|
+
| `metadata` | Trace `metadata` |
|
|
83
|
+
|
|
84
|
+
### Observation Mapping
|
|
85
|
+
- **GENERATION** $\rightarrow$ `LLMCall`:
|
|
86
|
+
- `llm.model` $\leftarrow$ Observation `model`
|
|
87
|
+
- `llm.input_messages` $\leftarrow$ Observation `input` (parsed list of message objects)
|
|
88
|
+
- `llm.output_message` $\leftarrow$ Observation `output`
|
|
89
|
+
- `llm.prompt_tokens` $\leftarrow$ `usage.promptTokens`
|
|
90
|
+
- `llm.completion_tokens` $\leftarrow$ `usage.completionTokens`
|
|
91
|
+
- `llm.error` $\leftarrow$ `statusMessage` when `level == "ERROR"`
|
|
92
|
+
- **SPAN** $\rightarrow$ `ToolCall` (if name is order/stripe/kb lookup or `metadata.tool` matches):
|
|
93
|
+
- `tool.name` $\leftarrow$ Observation `name`
|
|
94
|
+
- `tool.arguments_json` $\leftarrow$ Observation `input` (serialized to JSON)
|
|
95
|
+
- `tool.output` $\leftarrow$ Observation `output` (stringified)
|
|
96
|
+
- `tool.error` $\leftarrow$ `statusMessage` when `level == "ERROR"`
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
## 4. LangSmith Run Export (`langsmith`)
|
|
101
|
+
|
|
102
|
+
LangSmith exports represent run hierarchies (parent and child runs) grouped by `trace_id`.
|
|
103
|
+
|
|
104
|
+
### Mapping Translations
|
|
105
|
+
|
|
106
|
+
| Canonical Field | LangSmith Path |
|
|
107
|
+
| --- | --- |
|
|
108
|
+
| `trace_id` | Run `trace_id` or root run `id` |
|
|
109
|
+
| `started_at` | Root run `start_time` |
|
|
110
|
+
| `ended_at` | Root run `end_time` |
|
|
111
|
+
| `task_input` | Root run `inputs["input"]` or first key's value |
|
|
112
|
+
| `final_output` | Root run `outputs["output"]` or first key's value |
|
|
113
|
+
|
|
114
|
+
### Child Run Classification
|
|
115
|
+
- **run_type == "llm"** $\rightarrow$ `LLMCall`:
|
|
116
|
+
- `llm.model` $\leftarrow$ `extra.metadata.ls_model_name`
|
|
117
|
+
- `llm.input_messages` $\leftarrow$ `inputs.messages` (list mapping)
|
|
118
|
+
- `llm.output_message` $\leftarrow$ First element of `outputs.generations`
|
|
119
|
+
- `llm.prompt_tokens` $\leftarrow$ `extra.token_usage.prompt_tokens`
|
|
120
|
+
- `llm.completion_tokens` $\leftarrow$ `extra.token_usage.completion_tokens`
|
|
121
|
+
- `llm.error` $\leftarrow$ Run `error` field
|
|
122
|
+
- **run_type == "tool"** $\rightarrow$ `ToolCall`:
|
|
123
|
+
- `tool.name` $\leftarrow$ Run `name`
|
|
124
|
+
- `tool.arguments_json` $\leftarrow$ Serialized run `inputs` dict
|
|
125
|
+
- `tool.output` $\leftarrow$ Run `outputs["output"]` or serialized outputs
|
|
126
|
+
- `tool.error` $\leftarrow$ Run `error` field
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# traceval e2e quickstart demo script
|
|
3
|
+
set -e
|
|
4
|
+
|
|
5
|
+
echo "=== 1. Generate synthetic traces ==="
|
|
6
|
+
python3 examples/make_traces.py
|
|
7
|
+
|
|
8
|
+
echo -e "\n=== 2. Ingest traces into SQLite ==="
|
|
9
|
+
rm -f demo_traces.db
|
|
10
|
+
uv run python3 src/traceval/cli.py ingest examples/synthetic_traces.jsonl -o demo_traces.db
|
|
11
|
+
|
|
12
|
+
echo -e "\n=== 3. Analyze traces (labeling, clustering, outcomes) ==="
|
|
13
|
+
uv run python3 src/traceval/cli.py analyze demo_traces.db -o demo_analysis/
|
|
14
|
+
|
|
15
|
+
echo -e "\n=== 4. Generate eval suite ==="
|
|
16
|
+
rm -rf demo_evals/
|
|
17
|
+
uv run python3 src/traceval/cli.py generate demo_traces.db -o demo_evals/ --include-failures
|
|
18
|
+
|
|
19
|
+
echo -e "\n=== 5. Run evals against healthy demo agent ==="
|
|
20
|
+
# We add --with fastapi --with uvicorn --with pytest so that all run dependencies are active
|
|
21
|
+
HEALTHY_REPORT=$(uv run --with fastapi --with uvicorn --with pytest python3 src/traceval/cli.py run demo_evals/ --target examples.demo_agent.agent:invoke_agent --judge fake | grep -o 'demo_evals/runs/run_.*\.json' | head -n 1) || true
|
|
22
|
+
|
|
23
|
+
echo -e "\n=== 6. Run evals against buggy demo agent and compare ==="
|
|
24
|
+
if [ -n "$HEALTHY_REPORT" ]; then
|
|
25
|
+
# Should report regressions and exit with failure code 1
|
|
26
|
+
BUGGY=true uv run --with fastapi --with uvicorn --with pytest python3 src/traceval/cli.py run demo_evals/ --target examples.demo_agent.agent:invoke_agent --judge fake --compare "$HEALTHY_REPORT" || echo "✅ E2E regression check passed: traceval correctly detected regressions and exited with failure status!"
|
|
27
|
+
else
|
|
28
|
+
# Fallback if grep failed to capture path
|
|
29
|
+
BUGGY=true uv run --with fastapi --with uvicorn --with pytest python3 src/traceval/cli.py run demo_evals/ --target examples.demo_agent.agent:invoke_agent --judge fake || echo "✅ E2E regression check passed: traceval correctly detected regressions!"
|
|
30
|
+
fi
|