traceval 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. traceval-0.1.1/.github/workflows/ci.yml +83 -0
  2. traceval-0.1.1/.gitignore +68 -0
  3. traceval-0.1.1/.pre-commit-config.yaml +7 -0
  4. traceval-0.1.1/CHANGELOG.md +18 -0
  5. traceval-0.1.1/LICENSE +21 -0
  6. traceval-0.1.1/PKG-INFO +154 -0
  7. traceval-0.1.1/README.md +138 -0
  8. traceval-0.1.1/docs/formats.md +126 -0
  9. traceval-0.1.1/examples/demo.sh +30 -0
  10. traceval-0.1.1/examples/demo_agent/agent.py +46 -0
  11. traceval-0.1.1/examples/demo_agent/core.py +57 -0
  12. traceval-0.1.1/examples/make_traces.py +216 -0
  13. traceval-0.1.1/examples/synthetic_traces.jsonl +200 -0
  14. traceval-0.1.1/pyproject.toml +79 -0
  15. traceval-0.1.1/src/traceval/__init__.py +1 -0
  16. traceval-0.1.1/src/traceval/analyze/__init__.py +75 -0
  17. traceval-0.1.1/src/traceval/analyze/cluster.py +226 -0
  18. traceval-0.1.1/src/traceval/analyze/coverage.py +84 -0
  19. traceval-0.1.1/src/traceval/analyze/outcomes.py +254 -0
  20. traceval-0.1.1/src/traceval/analyze/report.py +478 -0
  21. traceval-0.1.1/src/traceval/cli.py +168 -0
  22. traceval-0.1.1/src/traceval/compile/__init__.py +50 -0
  23. traceval-0.1.1/src/traceval/compile/cases.py +204 -0
  24. traceval-0.1.1/src/traceval/compile/emit_pytest.py +21 -0
  25. traceval-0.1.1/src/traceval/compile/emit_yaml.py +47 -0
  26. traceval-0.1.1/src/traceval/compile/rubrics.py +52 -0
  27. traceval-0.1.1/src/traceval/compile/templates/conftest.py.jinja +158 -0
  28. traceval-0.1.1/src/traceval/compile/templates/test_generated.py.jinja +17 -0
  29. traceval-0.1.1/src/traceval/ingest/__init__.py +99 -0
  30. traceval-0.1.1/src/traceval/ingest/base.py +13 -0
  31. traceval-0.1.1/src/traceval/ingest/generic.py +40 -0
  32. traceval-0.1.1/src/traceval/ingest/langfuse.py +241 -0
  33. traceval-0.1.1/src/traceval/ingest/langsmith.py +257 -0
  34. traceval-0.1.1/src/traceval/ingest/otel.py +238 -0
  35. traceval-0.1.1/src/traceval/model.py +66 -0
  36. traceval-0.1.1/src/traceval/run/judge.py +196 -0
  37. traceval-0.1.1/src/traceval/run/runner.py +113 -0
  38. traceval-0.1.1/src/traceval/run/scorers.py +150 -0
  39. traceval-0.1.1/src/traceval/run/target.py +104 -0
  40. traceval-0.1.1/src/traceval/store.py +59 -0
  41. traceval-0.1.1/tests/fixtures/README.md +18 -0
  42. traceval-0.1.1/tests/fixtures/demo_agent_expected/placeholder.txt +1 -0
  43. traceval-0.1.1/tests/fixtures/demo_agent_expected/results.json +7 -0
  44. traceval-0.1.1/tests/fixtures/generic_traces.jsonl +12 -0
  45. traceval-0.1.1/tests/fixtures/langfuse_export.jsonl +6 -0
  46. traceval-0.1.1/tests/fixtures/langsmith_runs.jsonl +13 -0
  47. traceval-0.1.1/tests/fixtures/otel_spans.jsonl +13 -0
  48. traceval-0.1.1/tests/test_cli.py +12 -0
  49. traceval-0.1.1/tests/test_phase1.py +141 -0
  50. traceval-0.1.1/tests/test_phase2.py +139 -0
  51. traceval-0.1.1/tests/test_phase3.py +143 -0
  52. traceval-0.1.1/tests/test_phase4.py +79 -0
  53. traceval-0.1.1/tests/test_phase5.py +116 -0
  54. traceval-0.1.1/tests/test_phase6.py +275 -0
  55. traceval-0.1.1/uv.lock +1130 -0
@@ -0,0 +1,83 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [ main, master ]
6
+ pull_request:
7
+ branches: [ main, master ]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.11", "3.12", "3.13"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Install uv and set up Python
20
+ uses: astral-sh/setup-uv@v5
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+ enable-cache: true
24
+
25
+ - name: Run Ruff Check
26
+ run: uv run ruff check src/ tests/ examples/
27
+
28
+ - name: Run Ruff Format Check
29
+ run: uv run ruff format --check src/ tests/ examples/
30
+
31
+ - name: Run Mypy
32
+ run: uv run mypy src/
33
+
34
+ - name: Run Tests with Coverage
35
+ run: uv run pytest --cov=src/traceval --cov-fail-under=85
36
+
37
+ tag-and-release:
38
+ needs: test
39
+ if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/master')
40
+ runs-on: ubuntu-latest
41
+ permissions:
42
+ contents: write
43
+ id-token: write
44
+ steps:
45
+ - uses: actions/checkout@v4
46
+ with:
47
+ fetch-depth: 0
48
+
49
+ - name: Install uv and set up Python
50
+ uses: astral-sh/setup-uv@v5
51
+ with:
52
+ python-version: "3.12"
53
+ enable-cache: true
54
+
55
+ - name: Check version and build
56
+ id: check-ver
57
+ run: |
58
+ VERSION=$(grep -m 1 '^version[[:space:]]*=' pyproject.toml | tr -d '"' | tr -d "'" | awk '{print $NF}')
59
+ TAG="v${VERSION:?could not read version from pyproject.toml}"
60
+
61
+ if git rev-parse -q --verify "refs/tags/$TAG" >/dev/null 2>&1; then
62
+ echo "Tag $TAG already exists, skipping release."
63
+ echo "publish=false" >> "$GITHUB_OUTPUT"
64
+ else
65
+ echo "New version detected. Building and releasing $TAG..."
66
+ uv build
67
+
68
+ # Configure Git and push tag
69
+ git config user.name "github-actions[bot]"
70
+ git config user.email "github-actions[bot]@users.noreply.github.com"
71
+ git tag "$TAG"
72
+ git push origin "$TAG"
73
+
74
+ echo "publish=true" >> "$GITHUB_OUTPUT"
75
+ fi
76
+
77
+ - name: Publish to PyPI
78
+ if: steps.check-ver.outputs.publish == 'true'
79
+ uses: pypa/gh-action-pypi-publish@release/v1
80
+ with:
81
+ skip-existing: true
82
+
83
+
@@ -0,0 +1,68 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # Pytest / coverage
30
+ .cache
31
+ .pytest_cache/
32
+ .coverage
33
+ .coverage.*
34
+ .traceback
35
+ nosetests.xml
36
+ coverage.xml
37
+ htmlcov/
38
+ .htmlcov/
39
+
40
+ # Mypy / Ruff
41
+ .mypy_cache/
42
+ .ruff_cache/
43
+
44
+ # Environments
45
+ .env
46
+ .venv/
47
+ env/
48
+ venv/
49
+ ENV/
50
+ env.bak/
51
+ venv.bak/
52
+
53
+ # SQLite databases & logs (traceval database runs)
54
+ *.db
55
+ *.log
56
+ demo_analysis/
57
+ demo_evals/
58
+ evals/
59
+
60
+ # Specifications & user instructions
61
+ spec.md
62
+
63
+ # OS files
64
+ .DS_Store
65
+ .DS_Store?
66
+ ehthumbs.db
67
+ Icon?
68
+ Thumbs.db
@@ -0,0 +1,7 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.2.0
4
+ hooks:
5
+ - id: ruff
6
+ args: [ --fix ]
7
+ - id: ruff-format
@@ -0,0 +1,18 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.1.0] - 2026-07-02
9
+
10
+ ### Added
11
+ - **Canonical Model & SQLite Storage**: Structured canonical schema representing `Trace`, `Step`, `ToolCall`, `LLMCall`, and `Outcome`.
12
+ - **Multi-backend Telemetry Ingestion**: Added log adapters for OpenTelemetry GenAI, Langfuse observation dumps, LangSmith logs, and Generic JSONL files.
13
+ - **Rule-based Labeler**: Dynamic outcome-label classifier (`success`, `tool_error`, `validation_error`, `loop`, `timeout`, `bad_output`, `unknown`) with custom Python rule plugins.
14
+ - **Agglomerative Clustering**: Signature and task-input Jaccard shingle clustering.
15
+ - **Jinja2 Coverage Report Visualizer**: Ported self-contained single-page HTML report charts.
16
+ - **Pytest Case Compiler**: Emitter of YAML test case configurations, LLM-as-judge scaffolds, and custom redact hook scrubbers.
17
+ - **Scorers & Judges**: Scorer implementations for `exact`, `contains`, `regex`, `json_schema`, `tool_sequence` (order/subset modes), and `judge` (FakeJudge, OpenAICompatJudge with call caps).
18
+ - **FastAPI Demo Agent**: Supporting mock customer service tools and BUGGY mode regression checks.
traceval-0.1.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Ramkumar M
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,154 @@
1
+ Metadata-Version: 2.4
2
+ Name: traceval
3
+ Version: 0.1.1
4
+ Summary: Trace-to-Eval Compiler
5
+ License: MIT
6
+ License-File: LICENSE
7
+ Requires-Python: >=3.11
8
+ Requires-Dist: httpx>=0.24.0
9
+ Requires-Dist: jinja2>=3.1.0
10
+ Requires-Dist: jsonschema>=4.17.0
11
+ Requires-Dist: pydantic>=2.0.0
12
+ Requires-Dist: pyyaml>=6.0.0
13
+ Requires-Dist: rich>=13.0.0
14
+ Requires-Dist: typer>=0.9.0
15
+ Description-Content-Type: text/markdown
16
+
17
+ # 🚀 traceval: Trace-to-Eval Compiler
18
+
19
+ <p align="center">
20
+ <img src="https://img.shields.io/badge/Python-3.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python Version" />
21
+ <img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="License" />
22
+ <img src="https://img.shields.io/badge/Coverage-87%25-green.svg" alt="Coverage" />
23
+ <a href="https://github.com/astral-sh/uv"><img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json" alt="uv" /></a>
24
+ </p>
25
+
26
+ ***"Your traces already know how your agent fails. traceval turns them into the test suite you never wrote."***
27
+
28
+ Teams running LLM agents in production have observability traces, but only a fraction maintain robust evals. The raw material for great tests — thousands of real production traces, including edge cases and errors — sits unused because converting them into regression suites is manual and tedious.
29
+
30
+ **traceval** automates this by ingesting agent traces from standard sources, normalizing them into a canonical Pydantic model, analyzing outcomes/clustering task signatures, and **compiling them into a human-editable eval suite**: pytest files + YAML datasets + judge rubric scaffolds.
31
+
32
+ ---
33
+
34
+ ## 🎨 Architectural Pipeline
35
+
36
+ ```mermaid
37
+ graph LR
38
+ classDef source fill:#2c3e50,stroke:#34495e,stroke-width:2px,color:#fff;
39
+ classDef normalize fill:#16a085,stroke:#1abc9c,stroke-width:2px,color:#fff;
40
+ classDef analyze fill:#2980b9,stroke:#3498db,stroke-width:2px,color:#fff;
41
+ classDef compile fill:#8e44ad,stroke:#9b59b6,stroke-width:2px,color:#fff;
42
+ classDef run fill:#d35400,stroke:#e67e22,stroke-width:2px,color:#fff;
43
+
44
+ A[OTel / Langfuse / LangSmith] --> B(Canonical Trace DB)
45
+ B --> C(Outcome Labeler & Jaccard Clusterer)
46
+ C --> D(YAML cases + Pytest + Rubrics)
47
+ D --> E(HTTP / Callable Runner & Diff Reports)
48
+
49
+ class A source;
50
+ class B normalize;
51
+ class C analyze;
52
+ class D compile;
53
+ class E run;
54
+ ```
55
+
56
+ ---
57
+
58
+ ## ✨ Key Features
59
+
60
+ * 🔌 **Zero-Configuration Ingest**: Direct compatibility with OpenTelemetry GenAI semantic conventions, Langfuse observations, LangSmith runs, or generic JSONL exports.
61
+ * 🧠 **Smart Outcome Taxonomy**: Automatic categorization of trace outcomes (`success`, `tool_error`, `validation_error`, `loop`, `timeout`, `bad_output`) using rule-based heuristics that you can extend with Python modules.
62
+ * 📊 **Embedding-Free Clustering**: Fast, local Jaccard-similarity shingle grouping that runs 100% offline, keeping your development cycle private and deterministic.
63
+ * 📝 **Clean Code Generation**: Compiles cases into editable YAML files, LLM-as-a-judge rubrics into Markdown checklist scaffolds, and pytest test runs into clean templates.
64
+ * ⚡ **PII Redaction Safeguards**: Automatically scrubs emails, credit cards, phone numbers, and API tokens before writing test inputs.
65
+ * 🛡️ **CI/CD Regression Diff**: Compares execution summaries and scores between runs using exit codes to catch agent failures before deploying.
66
+
67
+ ---
68
+
69
+ ## ⏱️ 90-Second E2E Quickstart
70
+
71
+ Experience `traceval` regression testing out of the box using our interactive demo script:
72
+
73
+ ```bash
74
+ # Clone & run the demo
75
+ chmod +x examples/demo.sh
76
+ ./examples/demo.sh
77
+ ```
78
+
79
+ ### Manual Walkthrough
80
+
81
+ #### 1. Ingest Observability Logs
82
+ ```bash
83
+ # Seed 200 synthetic telemetry traces containing successes and failure edge cases
84
+ python3 examples/make_traces.py
85
+
86
+ # Ingest into SQLite database
87
+ traceval ingest examples/synthetic_traces.jsonl -o traces.db
88
+ ```
89
+
90
+ #### 2. Label & Analyze Traffic Gaps
91
+ ```bash
92
+ traceval analyze traces.db -o analysis/
93
+ ```
94
+ *Outputs outcome statistics and generates `analysis/report.html` mapping traffic clusters:*
95
+ ```text
96
+ Outcomes: success 60% · tool_error 15% · loop 10% · timeout 8% · validation_error 8%
97
+ Clusters: 37 task clusters found.
98
+ Top failure cluster: "500 refund stripe -> stripe_lookup -> (tool_error)" (30 traces)
99
+ Report written to analysis/report.html
100
+ ```
101
+
102
+ #### 3. Compile Cases and Pytest Harness
103
+ ```bash
104
+ traceval generate traces.db -o evals/ --include-failures
105
+ ```
106
+ *Generates test parameters `evals/cases/` and rubric Markdown checklists `evals/rubrics/`.*
107
+
108
+ #### 4. Run Evaluations & Detect Regressions
109
+ ```bash
110
+ # Run against the healthy agent (100% Pass)
111
+ traceval run evals/ --target examples.demo_agent.agent:invoke_agent --judge fake
112
+
113
+ # Run against the buggy agent (Detects regressions and exits with status 1)
114
+ BUGGY=true traceval run evals/ --target examples.demo_agent.agent:invoke_agent --judge fake
115
+ ```
116
+
117
+ ---
118
+
119
+ ## 🛠️ CLI Command Reference
120
+
121
+ > [!NOTE]
122
+ > All CLI commands support `--json` to output machine-readable stdout for scripting.
123
+
124
+ ### Ingestion
125
+ ```bash
126
+ traceval ingest <path> --format [auto|otel|langfuse|langsmith|generic] -o <traces.db>
127
+ ```
128
+ *Ingests telemetry log dumps losslessly. Malformed spans write warnings to `<traces.db>.log`.*
129
+
130
+ ### Analysis
131
+ ```bash
132
+ traceval analyze <traces.db> [--rules custom_rules.py] [--evals evals/] -o <analysis_dir/>
133
+ ```
134
+ *Runs rule pipelines and Jaccard shingle similarity groupings.*
135
+
136
+ ### Generation
137
+ ```bash
138
+ traceval generate <traces.db> -o <evals_dir/> [--per-cluster 3] [--include-failures] [--redact-hook module:fn]
139
+ ```
140
+ *Creates regression cases, Markdown LLM-judge checklists, and conftest runners.*
141
+
142
+ ### Runner
143
+ ```bash
144
+ traceval run <evals_dir/> --target <url|module:function> [--judge fake|openai] [--compare runs/prev.json]
145
+ ```
146
+ *Executes tests, scores output constraints (`exact`, `contains`, `regex`, `json_schema`, `tool_sequence`, `judge`), and logs to project-level `runs/` directory.*
147
+
148
+ ---
149
+
150
+ ## 💡 Honest Limitations
151
+
152
+ * **Side-Effect Free**: traceval assertions evaluate input/output matches. It does not attempt to replay side effects (e.g., updating database records) on mock tools.
153
+ * **Text Telemetry**: The canonical model is optimized for text logs. Image or multimodal payloads in traces are logged as references.
154
+ * **Static Visualization**: The coverage report is a portable, single-file HTML page. There is no hosted web service.
@@ -0,0 +1,138 @@
1
+ # 🚀 traceval: Trace-to-Eval Compiler
2
+
3
+ <p align="center">
4
+ <img src="https://img.shields.io/badge/Python-3.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python Version" />
5
+ <img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="License" />
6
+ <img src="https://img.shields.io/badge/Coverage-87%25-green.svg" alt="Coverage" />
7
+ <a href="https://github.com/astral-sh/uv"><img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json" alt="uv" /></a>
8
+ </p>
9
+
10
+ ***"Your traces already know how your agent fails. traceval turns them into the test suite you never wrote."***
11
+
12
+ Teams running LLM agents in production have observability traces, but only a fraction maintain robust evals. The raw material for great tests — thousands of real production traces, including edge cases and errors — sits unused because converting them into regression suites is manual and tedious.
13
+
14
+ **traceval** automates this by ingesting agent traces from standard sources, normalizing them into a canonical Pydantic model, analyzing outcomes/clustering task signatures, and **compiling them into a human-editable eval suite**: pytest files + YAML datasets + judge rubric scaffolds.
15
+
16
+ ---
17
+
18
+ ## 🎨 Architectural Pipeline
19
+
20
+ ```mermaid
21
+ graph LR
22
+ classDef source fill:#2c3e50,stroke:#34495e,stroke-width:2px,color:#fff;
23
+ classDef normalize fill:#16a085,stroke:#1abc9c,stroke-width:2px,color:#fff;
24
+ classDef analyze fill:#2980b9,stroke:#3498db,stroke-width:2px,color:#fff;
25
+ classDef compile fill:#8e44ad,stroke:#9b59b6,stroke-width:2px,color:#fff;
26
+ classDef run fill:#d35400,stroke:#e67e22,stroke-width:2px,color:#fff;
27
+
28
+ A[OTel / Langfuse / LangSmith] --> B(Canonical Trace DB)
29
+ B --> C(Outcome Labeler & Jaccard Clusterer)
30
+ C --> D(YAML cases + Pytest + Rubrics)
31
+ D --> E(HTTP / Callable Runner & Diff Reports)
32
+
33
+ class A source;
34
+ class B normalize;
35
+ class C analyze;
36
+ class D compile;
37
+ class E run;
38
+ ```
39
+
40
+ ---
41
+
42
+ ## ✨ Key Features
43
+
44
+ * 🔌 **Zero-Configuration Ingest**: Direct compatibility with OpenTelemetry GenAI semantic conventions, Langfuse observations, LangSmith runs, or generic JSONL exports.
45
+ * 🧠 **Smart Outcome Taxonomy**: Automatic categorization of trace outcomes (`success`, `tool_error`, `validation_error`, `loop`, `timeout`, `bad_output`) using rule-based heuristics that you can extend with Python modules.
46
+ * 📊 **Embedding-Free Clustering**: Fast, local Jaccard-similarity shingle grouping that runs 100% offline, keeping your development cycle private and deterministic.
47
+ * 📝 **Clean Code Generation**: Compiles cases into editable YAML files, LLM-as-a-judge rubrics into Markdown checklist scaffolds, and pytest test runs into clean templates.
48
+ * ⚡ **PII Redaction Safeguards**: Automatically scrubs emails, credit cards, phone numbers, and API tokens before writing test inputs.
49
+ * 🛡️ **CI/CD Regression Diff**: Compares execution summaries and scores between runs using exit codes to catch agent failures before deploying.
50
+
51
+ ---
52
+
53
+ ## ⏱️ 90-Second E2E Quickstart
54
+
55
+ Experience `traceval` regression testing out of the box using our interactive demo script:
56
+
57
+ ```bash
58
+ # Clone & run the demo
59
+ chmod +x examples/demo.sh
60
+ ./examples/demo.sh
61
+ ```
62
+
63
+ ### Manual Walkthrough
64
+
65
+ #### 1. Ingest Observability Logs
66
+ ```bash
67
+ # Seed 200 synthetic telemetry traces containing successes and failure edge cases
68
+ python3 examples/make_traces.py
69
+
70
+ # Ingest into SQLite database
71
+ traceval ingest examples/synthetic_traces.jsonl -o traces.db
72
+ ```
73
+
74
+ #### 2. Label & Analyze Traffic Gaps
75
+ ```bash
76
+ traceval analyze traces.db -o analysis/
77
+ ```
78
+ *Outputs outcome statistics and generates `analysis/report.html` mapping traffic clusters:*
79
+ ```text
80
+ Outcomes: success 60% · tool_error 15% · loop 10% · timeout 8% · validation_error 8%
81
+ Clusters: 37 task clusters found.
82
+ Top failure cluster: "500 refund stripe -> stripe_lookup -> (tool_error)" (30 traces)
83
+ Report written to analysis/report.html
84
+ ```
85
+
86
+ #### 3. Compile Cases and Pytest Harness
87
+ ```bash
88
+ traceval generate traces.db -o evals/ --include-failures
89
+ ```
90
+ *Generates test parameters `evals/cases/` and rubric Markdown checklists `evals/rubrics/`.*
91
+
92
+ #### 4. Run Evaluations & Detect Regressions
93
+ ```bash
94
+ # Run against the healthy agent (100% Pass)
95
+ traceval run evals/ --target examples.demo_agent.agent:invoke_agent --judge fake
96
+
97
+ # Run against the buggy agent (Detects regressions and exits with status 1)
98
+ BUGGY=true traceval run evals/ --target examples.demo_agent.agent:invoke_agent --judge fake
99
+ ```
100
+
101
+ ---
102
+
103
+ ## 🛠️ CLI Command Reference
104
+
105
+ > [!NOTE]
106
+ > All CLI commands support `--json` to output machine-readable stdout for scripting.
107
+
108
+ ### Ingestion
109
+ ```bash
110
+ traceval ingest <path> --format [auto|otel|langfuse|langsmith|generic] -o <traces.db>
111
+ ```
112
+ *Ingests telemetry log dumps losslessly. Malformed spans write warnings to `<traces.db>.log`.*
113
+
114
+ ### Analysis
115
+ ```bash
116
+ traceval analyze <traces.db> [--rules custom_rules.py] [--evals evals/] -o <analysis_dir/>
117
+ ```
118
+ *Runs rule pipelines and Jaccard shingle similarity groupings.*
119
+
120
+ ### Generation
121
+ ```bash
122
+ traceval generate <traces.db> -o <evals_dir/> [--per-cluster 3] [--include-failures] [--redact-hook module:fn]
123
+ ```
124
+ *Creates regression cases, Markdown LLM-judge checklists, and conftest runners.*
125
+
126
+ ### Runner
127
+ ```bash
128
+ traceval run <evals_dir/> --target <url|module:function> [--judge fake|openai] [--compare runs/prev.json]
129
+ ```
130
+ *Executes tests, scores output constraints (`exact`, `contains`, `regex`, `json_schema`, `tool_sequence`, `judge`), and logs to project-level `runs/` directory.*
131
+
132
+ ---
133
+
134
+ ## 💡 Honest Limitations
135
+
136
+ * **Side-Effect Free**: traceval assertions evaluate input/output matches. It does not attempt to replay side effects (e.g., updating database records) on mock tools.
137
+ * **Text Telemetry**: The canonical model is optimized for text logs. Image or multimodal payloads in traces are logged as references.
138
+ * **Static Visualization**: The coverage report is a portable, single-file HTML page. There is no hosted web service.
@@ -0,0 +1,126 @@
1
+ # Trace Ingestion Formats & Mappings
2
+
3
+ This document describes how raw trace records from different observability backends are mapped onto the `traceval` canonical `Trace` Pydantic model.
4
+
5
+ ## Canonical Trace Model
6
+
7
+ All trace adapters transform incoming logs into the `Trace` schema (`src/traceval/model.py`), which includes:
8
+ - `trace_id` (str)
9
+ - `source` (str: "otel", "langfuse", "langsmith", "generic")
10
+ - `started_at` (datetime)
11
+ - `ended_at` (datetime or None)
12
+ - `task_input` (str, user prompt triggering the trace)
13
+ - `final_output` (str or None, assistant's final response)
14
+ - `steps` (list of Step objects chronologically ordered):
15
+ - `index` (int)
16
+ - `kind` ("llm", "tool", "retrieval", "other")
17
+ - `llm` (LLMCall or None)
18
+ - `tool` (ToolCall or None)
19
+ - `raw_attributes` (dict[str, str], lossless metadata escape hatch)
20
+
21
+ ---
22
+
23
+ ## 1. Generic format (`generic`)
24
+
25
+ A JSONL file in which each line is a JSON object that validates directly against our canonical `Trace` model.
26
+
27
+ ### Assumptions & Heuristics
28
+ - Direct structural validation.
29
+ - Lines failing to parse are logged as warnings and skipped.
30
+
31
+ ---
32
+
33
+ ## 2. OpenTelemetry GenAI Conventions (`otel`)
34
+
35
+ OTel traces are ingested from flat lists of JSON span logs (e.g. OTLP export format), grouped by `trace_id`.
36
+
37
+ ### Span Categorization Rules
38
+ - **Root Span**: Identified by `parent_span_id` being `None` or empty.
39
+ - **LLM Call**: Spans containing GenAI semantic convention keys in their attributes:
40
+ - `gen_ai.system`
41
+ - `gen_ai.prompt`
42
+ - `gen_ai.completion`
43
+ - **Tool Call**: Spans containing:
44
+ - `gen_ai.tool.name`
45
+ - `gen_ai.tool.arguments`
46
+ - Or span name matching `order_lookup`, `stripe_lookup`, or `kb_search`.
47
+ - **Other**: All other spans are categorized as `other`.
48
+
49
+ ### Attribute Translations
50
+
51
+ | Canonical Field | OTel Span Path |
52
+ | --- | --- |
53
+ | `started_at` | Root span `start_time` (ISO datetime) |
54
+ | `ended_at` | Root span `end_time` (ISO datetime) |
55
+ | `task_input` | Root span `attributes["gen_ai.task_input"]` |
56
+ | `final_output` | Root span `attributes["gen_ai.final_output"]` |
57
+ | `llm.model` | Span `attributes["gen_ai.request.model"]` |
58
+ | `llm.input_messages` | Parsed JSON array from `attributes["gen_ai.prompt"]` |
59
+ | `llm.output_message` | Assistant role with content from `attributes["gen_ai.completion"]` |
60
+ | `llm.prompt_tokens` | `attributes["gen_ai.usage.prompt_tokens"]` |
61
+ | `llm.completion_tokens` | `attributes["gen_ai.usage.completion_tokens"]` |
62
+ | `llm.error` | `attributes["gen_ai.error"]` |
63
+ | `tool.name` | `attributes["gen_ai.tool.name"]` or Span `name` |
64
+ | `tool.arguments_json` | `attributes["gen_ai.tool.arguments"]` |
65
+ | `tool.output` | `attributes["gen_ai.tool.output"]` |
66
+ | `tool.error` | `attributes["gen_ai.tool.error"]` or `attributes["gen_ai.error"]` |
67
+
68
+ ---
69
+
70
+ ## 3. Langfuse Export (`langfuse`)
71
+
72
+ Langfuse exports traces as JSON objects with nested lists of observations (of types `GENERATION`, `SPAN`, `EVENT`).
73
+
74
+ ### Mapping Translations
75
+
76
+ | Canonical Field | Langfuse Path |
77
+ | --- | --- |
78
+ | `trace_id` | Trace `id` |
79
+ | `started_at` | Trace `timestamp` |
80
+ | `task_input` | Trace `input` |
81
+ | `final_output` | Trace `output` |
82
+ | `metadata` | Trace `metadata` |
83
+
84
+ ### Observation Mapping
85
+ - **GENERATION** $\rightarrow$ `LLMCall`:
86
+ - `llm.model` $\leftarrow$ Observation `model`
87
+ - `llm.input_messages` $\leftarrow$ Observation `input` (parsed list of message objects)
88
+ - `llm.output_message` $\leftarrow$ Observation `output`
89
+ - `llm.prompt_tokens` $\leftarrow$ `usage.promptTokens`
90
+ - `llm.completion_tokens` $\leftarrow$ `usage.completionTokens`
91
+ - `llm.error` $\leftarrow$ `statusMessage` when `level == "ERROR"`
92
+ - **SPAN** $\rightarrow$ `ToolCall` (if name is order/stripe/kb lookup or `metadata.tool` matches):
93
+ - `tool.name` $\leftarrow$ Observation `name`
94
+ - `tool.arguments_json` $\leftarrow$ Observation `input` (serialized to JSON)
95
+ - `tool.output` $\leftarrow$ Observation `output` (stringified)
96
+ - `tool.error` $\leftarrow$ `statusMessage` when `level == "ERROR"`
97
+
98
+ ---
99
+
100
+ ## 4. LangSmith Run Export (`langsmith`)
101
+
102
+ LangSmith exports represent run hierarchies grouped by `trace_id`.
103
+
104
+ ### Mapping Translations
105
+
106
+ | Canonical Field | LangSmith Path |
107
+ | --- | --- |
108
+ | `trace_id` | Run `trace_id` or root run `id` |
109
+ | `started_at` | Root run `start_time` |
110
+ | `ended_at` | Root run `end_time` |
111
+ | `task_input` | Root run `inputs["input"]` or first key's value |
112
+ | `final_output` | Root run `outputs["output"]` or first key's value |
113
+
114
+ ### Child Run Classification
115
+ - **run_type == "llm"** $\rightarrow$ `LLMCall`:
116
+ - `llm.model` $\leftarrow$ `extra.metadata.ls_model_name`
117
+ - `llm.input_messages` $\leftarrow$ `inputs.messages` (list mapping)
118
+ - `llm.output_message` $\leftarrow$ First element of `outputs.generations`
119
+ - `llm.prompt_tokens` $\leftarrow$ `extra.token_usage.prompt_tokens`
120
+ - `llm.completion_tokens` $\leftarrow$ `extra.token_usage.completion_tokens`
121
+ - `llm.error` $\leftarrow$ Run `error` field
122
+ - **run_type == "tool"** $\rightarrow$ `ToolCall`:
123
+ - `tool.name` $\leftarrow$ Run `name`
124
+ - `tool.arguments_json` $\leftarrow$ Serialized run `inputs` dict
125
+ - `tool.output` $\leftarrow$ Run `outputs["output"]` or serialized outputs
126
+ - `tool.error` $\leftarrow$ Run `error` field
@@ -0,0 +1,30 @@
1
+ #!/bin/bash
2
+ # traceval e2e quickstart demo script
3
+ set -e
4
+
5
+ echo "=== 1. Generate synthetic traces ==="
6
+ python3 examples/make_traces.py
7
+
8
+ echo -e "\n=== 2. Ingest traces into SQLite ==="
9
+ rm -f demo_traces.db
10
+ uv run python3 src/traceval/cli.py ingest examples/synthetic_traces.jsonl -o demo_traces.db
11
+
12
+ echo -e "\n=== 3. Analyze traces (labeling, clustering, outcomes) ==="
13
+ uv run python3 src/traceval/cli.py analyze demo_traces.db -o demo_analysis/
14
+
15
+ echo -e "\n=== 4. Generate eval suite ==="
16
+ rm -rf demo_evals/
17
+ uv run python3 src/traceval/cli.py generate demo_traces.db -o demo_evals/ --include-failures
18
+
19
+ echo -e "\n=== 5. Run evals against healthy demo agent ==="
20
+ # We add --with fastapi --with uvicorn --with pytest so that all run dependencies are active
21
+ HEALTHY_REPORT=$(uv run --with fastapi --with uvicorn --with pytest python3 src/traceval/cli.py run demo_evals/ --target examples.demo_agent.agent:invoke_agent --judge fake | grep -o 'demo_evals/runs/run_.*\.json' | head -n 1) || true
22
+
23
+ echo -e "\n=== 6. Run evals against buggy demo agent and compare ==="
24
+ if [ -n "$HEALTHY_REPORT" ]; then
25
+ # Should report regressions and exit with failure code 1
26
+ BUGGY=true uv run --with fastapi --with uvicorn --with pytest python3 src/traceval/cli.py run demo_evals/ --target examples.demo_agent.agent:invoke_agent --judge fake --compare "$HEALTHY_REPORT" || echo "✅ E2E regression check passed: traceval correctly detected regressions and exited with failure status!"
27
+ else
28
+ # Fallback if grep failed to capture path
29
+ BUGGY=true uv run --with fastapi --with uvicorn --with pytest python3 src/traceval/cli.py run demo_evals/ --target examples.demo_agent.agent:invoke_agent --judge fake || echo "✅ E2E regression check passed: traceval correctly detected regressions!"
30
+ fi