traceval 0.2.2__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. {traceval-0.2.2 → traceval-0.2.3}/CHANGELOG.md +12 -0
  2. {traceval-0.2.2 → traceval-0.2.3}/PKG-INFO +13 -11
  3. {traceval-0.2.2 → traceval-0.2.3}/README.md +12 -10
  4. traceval-0.2.3/docs/extending.md +97 -0
  5. {traceval-0.2.2 → traceval-0.2.3}/docs/formats.md +47 -8
  6. traceval-0.2.3/docs/img/report.png +0 -0
  7. traceval-0.2.3/docs/targets.md +91 -0
  8. {traceval-0.2.2 → traceval-0.2.3}/pyproject.toml +1 -1
  9. traceval-0.2.3/src/traceval/__init__.py +1 -0
  10. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/cli.py +14 -1
  11. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/compile/templates/conftest.py.jinja +2 -1
  12. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/ingest/__init__.py +2 -1
  13. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/ingest/base.py +2 -0
  14. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/ingest/generic.py +4 -0
  15. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/ingest/langfuse.py +28 -5
  16. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/ingest/langsmith.py +4 -0
  17. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/ingest/otel.py +21 -3
  18. {traceval-0.2.2 → traceval-0.2.3}/tests/fixtures/README.md +11 -0
  19. {traceval-0.2.2 → traceval-0.2.3}/tests/fixtures/langfuse_export.jsonl +1 -0
  20. {traceval-0.2.2 → traceval-0.2.3}/tests/fixtures/otel_spans.jsonl +2 -0
  21. {traceval-0.2.2 → traceval-0.2.3}/tests/test_phase2.py +56 -8
  22. {traceval-0.2.2 → traceval-0.2.3}/uv.lock +1 -1
  23. traceval-0.2.2/docs/img/report.png +0 -0
  24. traceval-0.2.2/src/traceval/__init__.py +0 -1
  25. {traceval-0.2.2 → traceval-0.2.3}/.github/workflows/ci.yml +0 -0
  26. {traceval-0.2.2 → traceval-0.2.3}/.gitignore +0 -0
  27. {traceval-0.2.2 → traceval-0.2.3}/.pre-commit-config.yaml +0 -0
  28. {traceval-0.2.2 → traceval-0.2.3}/CONTRIBUTING.md +0 -0
  29. {traceval-0.2.2 → traceval-0.2.3}/LICENSE +0 -0
  30. {traceval-0.2.2 → traceval-0.2.3}/Makefile +0 -0
  31. {traceval-0.2.2 → traceval-0.2.3}/action.yml +0 -0
  32. {traceval-0.2.2 → traceval-0.2.3}/examples/demo.sh +0 -0
  33. {traceval-0.2.2 → traceval-0.2.3}/examples/demo_agent/agent.py +0 -0
  34. {traceval-0.2.2 → traceval-0.2.3}/examples/demo_agent/core.py +0 -0
  35. {traceval-0.2.2 → traceval-0.2.3}/examples/make_traces.py +0 -0
  36. {traceval-0.2.2 → traceval-0.2.3}/examples/synthetic_traces.jsonl +0 -0
  37. {traceval-0.2.2 → traceval-0.2.3}/scripts/readme-outputs.sh +0 -0
  38. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/analyze/__init__.py +0 -0
  39. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/analyze/cluster.py +0 -0
  40. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/analyze/coverage.py +0 -0
  41. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/analyze/outcomes.py +0 -0
  42. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/analyze/report.py +0 -0
  43. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/compile/__init__.py +0 -0
  44. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/compile/cases.py +0 -0
  45. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/compile/emit_pytest.py +0 -0
  46. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/compile/emit_yaml.py +0 -0
  47. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/compile/rubrics.py +0 -0
  48. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/compile/templates/test_generated.py.jinja +0 -0
  49. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/demo/__init__.py +0 -0
  50. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/demo/agent.py +0 -0
  51. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/demo/traces.py +0 -0
  52. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/model.py +0 -0
  53. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/run/calibrate.py +0 -0
  54. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/run/judge.py +0 -0
  55. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/run/runner.py +0 -0
  56. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/run/scorers.py +0 -0
  57. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/run/target.py +0 -0
  58. {traceval-0.2.2 → traceval-0.2.3}/src/traceval/store.py +0 -0
  59. {traceval-0.2.2 → traceval-0.2.3}/tests/fixtures/generic_traces.jsonl +0 -0
  60. {traceval-0.2.2 → traceval-0.2.3}/tests/fixtures/langsmith_runs.jsonl +0 -0
  61. {traceval-0.2.2 → traceval-0.2.3}/tests/test_broken_target.py +0 -0
  62. {traceval-0.2.2 → traceval-0.2.3}/tests/test_calibrate.py +0 -0
  63. {traceval-0.2.2 → traceval-0.2.3}/tests/test_cli.py +0 -0
  64. {traceval-0.2.2 → traceval-0.2.3}/tests/test_e2e_demo.py +0 -0
  65. {traceval-0.2.2 → traceval-0.2.3}/tests/test_json_output.py +0 -0
  66. {traceval-0.2.2 → traceval-0.2.3}/tests/test_phase1.py +0 -0
  67. {traceval-0.2.2 → traceval-0.2.3}/tests/test_phase3.py +0 -0
  68. {traceval-0.2.2 → traceval-0.2.3}/tests/test_phase4.py +0 -0
  69. {traceval-0.2.2 → traceval-0.2.3}/tests/test_phase5.py +0 -0
  70. {traceval-0.2.2 → traceval-0.2.3}/tests/test_phase6.py +0 -0
  71. {traceval-0.2.2 → traceval-0.2.3}/tests/test_regression_cases.py +0 -0
  72. {traceval-0.2.2 → traceval-0.2.3}/tests/test_serve.py +0 -0
@@ -5,6 +5,18 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.2.3] - 2026-07-02
9
+
10
+ ### Fixed
11
+ - **Adapter tool detection no longer depends on demo tool names.** The Langfuse and OTel adapters matched span names against the demo agent's tool vocabulary (`order_lookup`, `stripe_lookup`, `kb_search`), so for real exports with any other tool names, tool spans were silently classified as `other`, breaking `tool_error` labeling, `tool_sequence`/`no_tool_loop` generation, and cluster signatures. Langfuse SPANs are now classified via `metadata.tool`, user-supplied `--tool-span-names` globs, or a documented input+output/error heuristic; OTel uses GenAI semantic-convention attributes, with attribute-based and user-glob fallbacks. Detection is proven vocabulary-free by new `create_ticket` fixtures. Surfaced by external review and found by reading the code; testing against a real export would have exposed it in seconds.
12
+ - Sub-millisecond latencies render as `<0.1` in the run summary instead of `0.0`.
13
+
14
+ ### Added
15
+ - `traceval ingest --tool-span-names`: comma-separated name globs marking spans as tool calls (replaces the Langfuse heuristic, adds an OTel fallback).
16
+ - `docs/targets.md`: the exact run-against-my-agent contract (HTTP request/response shape, callable return shapes, timeout and failure behavior) with an executable FastAPI example.
17
+ - `docs/extending.md`: custom outcome rules, redaction hook, judge configuration as implemented, and an honest `traceval.yaml` reference.
18
+ - `docs/formats.md`: complete annotated generic-format examples (success and tool-failure lines that ingest as-is) and a required-vs-optional field table; Langfuse/OTel sections rewritten to match the new heuristics.
19
+
8
20
  ## [0.2.2] - 2026-07-02
9
21
 
10
22
  ### Fixed
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: traceval
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: Trace-to-Eval Compiler
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -106,6 +106,8 @@ Report written to analysis/report.html
106
106
 
107
107
  `analysis/report.html` is the single-file page shown in the screenshot above. Pass `--evals evals/` to overlay eval coverage per cluster, and `--rules my_rules.py` to add your own labeling rules. To view it over HTTP instead of `file://`, `traceval serve analysis` starts a stdlib localhost server and prints the report URL.
108
108
 
109
+ Custom labeling rules, the redaction hook, and judge configuration are documented in [docs/extending.md](docs/extending.md).
110
+
109
111
  ### 3. Generate
110
112
 
111
113
  ```bash
@@ -131,19 +133,19 @@ traceval Run Summary
131
133
  ┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┓
132
134
  ┃ Case ID ┃ Cluster ┃ Outcome ┃ Latency (ms) ┃
133
135
  ┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━┩
134
- │ c_0c422a7a__case_001 │ c_0c422a7a │ PASS │ 0.0
135
- │ c_1e5d0942__case_002 │ c_1e5d0942 │ PASS │ 0.0
136
- │ c_2c881177__case_003 │ c_2c881177 │ PASS │ 0.0
137
- │ c_361535b0__case_004 │ c_361535b0 │ PASS │ 0.0
138
- │ c_9a8a4644__case_005 │ c_9a8a4644 │ PASS │ 0.0
139
- │ c_d30af83a__case_006 │ c_d30af83a │ PASS │ 0.0
140
- │ c_d3f3b631__case_007 │ c_d3f3b631 │ PASS │ 0.0
141
- │ c_e834c13c__case_008 │ c_e834c13c │ PASS │ 0.0
136
+ │ c_0c422a7a__case_001 │ c_0c422a7a │ PASS │ <0.1
137
+ │ c_1e5d0942__case_002 │ c_1e5d0942 │ PASS │ <0.1
138
+ │ c_2c881177__case_003 │ c_2c881177 │ PASS │ <0.1
139
+ │ c_361535b0__case_004 │ c_361535b0 │ PASS │ <0.1
140
+ │ c_9a8a4644__case_005 │ c_9a8a4644 │ PASS │ <0.1
141
+ │ c_d30af83a__case_006 │ c_d30af83a │ PASS │ <0.1
142
+ │ c_d3f3b631__case_007 │ c_d3f3b631 │ PASS │ <0.1
143
+ │ c_e834c13c__case_008 │ c_e834c13c │ PASS │ <0.1
142
144
  └──────────────────────┴────────────┴─────────┴──────────────┘
143
145
  Total: 8 | Passed: 8 | Failed: 0 | Errored: 0
144
146
  ```
145
147
 
146
- The target is an HTTP URL or a `module:function` callable. Checks cover `exact`, `contains_any`, `not_contains`, `regex`, `json_schema`, `tool_sequence`, `no_tool_loop`, and `judge`. Run reports land in `<evals_dir>/runs/` (override with `--runs-dir`); pass `--compare <previous report>` to print regressions and improvements between runs. The exit code is nonzero when any case fails.
148
+ The target is an HTTP URL or a `module:function` callable; the exact request/response contract, with a copy-pasteable FastAPI example, is in [docs/targets.md](docs/targets.md). Checks cover `exact`, `contains_any`, `not_contains`, `regex`, `json_schema`, `tool_sequence`, `no_tool_loop`, and `judge`. Run reports land in `<evals_dir>/runs/` (override with `--runs-dir`); pass `--compare <previous report>` to print regressions and improvements between runs. The exit code is nonzero when any case fails.
147
149
 
148
150
  ### 5. Calibrate the judge
149
151
 
@@ -191,7 +193,7 @@ jobs:
191
193
  runs-on: ubuntu-latest
192
194
  steps:
193
195
  - uses: actions/checkout@v4
194
- - uses: theramkm/traceval@v0.2.2
196
+ - uses: theramkm/traceval@v0.2.3
195
197
  with:
196
198
  evals-dir: evals/
197
199
  target: myapp.agent:invoke_agent # or an HTTP URL
@@ -89,6 +89,8 @@ Report written to analysis/report.html
89
89
 
90
90
  `analysis/report.html` is the single-file page shown in the screenshot above. Pass `--evals evals/` to overlay eval coverage per cluster, and `--rules my_rules.py` to add your own labeling rules. To view it over HTTP instead of `file://`, `traceval serve analysis` starts a stdlib localhost server and prints the report URL.
91
91
 
92
+ Custom labeling rules, the redaction hook, and judge configuration are documented in [docs/extending.md](docs/extending.md).
93
+
92
94
  ### 3. Generate
93
95
 
94
96
  ```bash
@@ -114,19 +116,19 @@ traceval Run Summary
114
116
  ┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┓
115
117
  ┃ Case ID ┃ Cluster ┃ Outcome ┃ Latency (ms) ┃
116
118
  ┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━┩
117
- │ c_0c422a7a__case_001 │ c_0c422a7a │ PASS │ 0.0
118
- │ c_1e5d0942__case_002 │ c_1e5d0942 │ PASS │ 0.0
119
- │ c_2c881177__case_003 │ c_2c881177 │ PASS │ 0.0
120
- │ c_361535b0__case_004 │ c_361535b0 │ PASS │ 0.0
121
- │ c_9a8a4644__case_005 │ c_9a8a4644 │ PASS │ 0.0
122
- │ c_d30af83a__case_006 │ c_d30af83a │ PASS │ 0.0
123
- │ c_d3f3b631__case_007 │ c_d3f3b631 │ PASS │ 0.0
124
- │ c_e834c13c__case_008 │ c_e834c13c │ PASS │ 0.0
119
+ │ c_0c422a7a__case_001 │ c_0c422a7a │ PASS │ <0.1
120
+ │ c_1e5d0942__case_002 │ c_1e5d0942 │ PASS │ <0.1
121
+ │ c_2c881177__case_003 │ c_2c881177 │ PASS │ <0.1
122
+ │ c_361535b0__case_004 │ c_361535b0 │ PASS │ <0.1
123
+ │ c_9a8a4644__case_005 │ c_9a8a4644 │ PASS │ <0.1
124
+ │ c_d30af83a__case_006 │ c_d30af83a │ PASS │ <0.1
125
+ │ c_d3f3b631__case_007 │ c_d3f3b631 │ PASS │ <0.1
126
+ │ c_e834c13c__case_008 │ c_e834c13c │ PASS │ <0.1
125
127
  └──────────────────────┴────────────┴─────────┴──────────────┘
126
128
  Total: 8 | Passed: 8 | Failed: 0 | Errored: 0
127
129
  ```
128
130
 
129
- The target is an HTTP URL or a `module:function` callable. Checks cover `exact`, `contains_any`, `not_contains`, `regex`, `json_schema`, `tool_sequence`, `no_tool_loop`, and `judge`. Run reports land in `<evals_dir>/runs/` (override with `--runs-dir`); pass `--compare <previous report>` to print regressions and improvements between runs. The exit code is nonzero when any case fails.
131
+ The target is an HTTP URL or a `module:function` callable; the exact request/response contract, with a copy-pasteable FastAPI example, is in [docs/targets.md](docs/targets.md). Checks cover `exact`, `contains_any`, `not_contains`, `regex`, `json_schema`, `tool_sequence`, `no_tool_loop`, and `judge`. Run reports land in `<evals_dir>/runs/` (override with `--runs-dir`); pass `--compare <previous report>` to print regressions and improvements between runs. The exit code is nonzero when any case fails.
130
132
 
131
133
  ### 5. Calibrate the judge
132
134
 
@@ -174,7 +176,7 @@ jobs:
174
176
  runs-on: ubuntu-latest
175
177
  steps:
176
178
  - uses: actions/checkout@v4
177
- - uses: theramkm/traceval@v0.2.2
179
+ - uses: theramkm/traceval@v0.2.3
178
180
  with:
179
181
  evals-dir: evals/
180
182
  target: myapp.agent:invoke_agent # or an HTTP URL
@@ -0,0 +1,97 @@
1
+ # Extension points
2
+
3
+ traceval has three: custom outcome rules, a redaction hook, and judge
4
+ configuration.
5
+
6
+ ## Custom outcome rules (`traceval analyze --rules my_rules.py`)
7
+
8
+ The file must expose a module-level `RULES` list. Each entry is either a
9
+ `Rule` object from `traceval.analyze.outcomes` or a bare callable with the
10
+ signature `(Trace) -> Outcome | None`. Return `None` to defer to the next rule; the first
11
+ rule that returns an `Outcome` wins. **User rules run before the
12
+ built-ins**, so they can override any built-in label.
13
+
14
+ ```python
15
+ # my_rules.py
16
+ from traceval.analyze.outcomes import Rule
17
+ from traceval.model import Outcome, Trace
18
+
19
+
20
+ def escalation(trace: Trace) -> Outcome | None:
21
+ if trace.final_output and "escalate to human" in trace.final_output.lower():
22
+ return Outcome(
23
+ label="bad_output",
24
+ reason="agent escalated instead of resolving",
25
+ rule_id="R_USER_ESCALATION",
26
+ labeled_by="user_rule",
27
+ )
28
+ return None
29
+
30
+
31
+ RULES = [Rule("R_USER_ESCALATION", "Escalations count as failures", escalation)]
32
+ ```
33
+
34
+ ```bash
35
+ traceval analyze traces.db --rules my_rules.py -o analysis
36
+ ```
37
+
38
+ `Outcome` requires `label` (one of `success`, `tool_error`,
39
+ `validation_error`, `loop`, `timeout`, `bad_output`, `unknown`), `reason`,
40
+ and `labeled_by`; `rule_id` is optional but shows up in reports, so set it.
41
+ Built-in rules run afterwards in this order: `R_TOOL_ERROR`, `R_LLM_ERROR`,
42
+ `R_LOOP`, `R_TIMEOUT`, `R_VALIDATION`, `R_EMPTY_OUTPUT`,
43
+ `R_DEFAULT_SUCCESS`, `R_UNKNOWN`.
44
+
45
+ ## Redaction hook (`traceval generate --redact-hook module:function`)
46
+
47
+ A `str -> str` function applied to case inputs and reference outputs before
48
+ they are written to YAML, after the built-in scrubbers (emails, credit
49
+ cards, phone numbers, API tokens). The module is imported from the current
50
+ working directory.
51
+
52
+ ```python
53
+ # my_redactions.py
54
+ import re
55
+
56
+
57
+ def scrub(text: str) -> str:
58
+ return re.sub(r"ACC-\d{6}", "[REDACTED_ACCOUNT]", text)
59
+ ```
60
+
61
+ ```bash
62
+ traceval generate traces.db -o evals --include-failures --redact-hook my_redactions:scrub
63
+ ```
64
+
65
+ ## Judge configuration (`traceval run --judge ...`)
66
+
67
+ | Value | Behavior |
68
+ | --- | --- |
69
+ | `fake` (default) | Deterministic offline judge: keyword-overlap heuristic, never gates a run at the generated `min_score`. Use it in CI without API keys. |
70
+ | `<model>` | `OpenAICompatJudge` against `https://api.openai.com/v1` with that model, e.g. `--judge gpt-4o-mini`. |
71
+ | `<model>:<base_url>` | Any OpenAI-compatible endpoint, e.g. `--judge llama3:http://localhost:11434/v1`. |
72
+
73
+ API keys come from the environment: `OPENAI_API_KEY` first, then
74
+ `GEMINI_API_KEY`. If only `GEMINI_API_KEY` is set and the base URL is the
75
+ OpenAI default, traceval automatically routes to Gemini's OpenAI-compatible
76
+ endpoint (`https://generativelanguage.googleapis.com/v1beta/openai`) with
77
+ model `gemini-2.5-flash`.
78
+
79
+ Judge calls are budget-capped at 200 per run (hardcoded); calls beyond the
80
+ budget score 0.0 with an explanatory reason. Validate any real judge with
81
+ `traceval calibrate` before trusting its scores.
82
+
83
+ ## `traceval.yaml` reference
84
+
85
+ `traceval generate` writes this scaffold next to the suite. Honest status
86
+ of every key:
87
+
88
+ | Key | Default | Consumed today? |
89
+ | --- | --- | --- |
90
+ | `schema_version` | `"1"` | No, informational. |
91
+ | `target.default_url` | `http://localhost:8000/agent` | **Yes**: used as the target when `traceval run` is invoked without `--target`. |
92
+ | `target.timeout_s` | `30` | No: the HTTP timeout is fixed at 30s in code. |
93
+ | `judge.default_provider` | `fake` | No: the CLI's `--judge` default (`fake`) applies instead. |
94
+ | `judge.max_judge_calls` | `200` | No: the 200-call budget is hardcoded. |
95
+
96
+ The unconsumed keys document intended configuration surface; treat them as
97
+ reserved.
@@ -22,11 +22,41 @@ All trace adapters transform incoming logs into the `Trace` schema (`src/traceva
22
22
 
23
23
  ## 1. Generic format (`generic`)
24
24
 
25
- A line-by-line JSONL file where each line is a raw JSON string validating directly against our canonical `Trace` model.
25
+ A line-by-line JSONL file where each line is a JSON object that validates directly against our canonical `Trace` model. This is the format to convert to when your backend is not natively supported; the two examples below are complete and ingest as-is.
26
+
27
+ ### Required vs optional fields
28
+
29
+ | Field | Required | Notes |
30
+ | --- | --- | --- |
31
+ | `trace_id` | yes | Unique string. |
32
+ | `source` | yes | Free-form origin label, e.g. `"generic"`. |
33
+ | `started_at` | yes | ISO 8601 datetime. |
34
+ | `task_input` | yes | The user request that started the trace. |
35
+ | `steps` | yes | May be `[]`. Each step needs `index` and `kind` (`llm`, `tool`, `retrieval`, `other`); `llm` steps need an `llm` object with `span_id` and `input_messages`, `tool` steps need a `tool` object with `span_id`, `name`, `arguments_json`. |
36
+ | `ended_at` | no | Missing/`null` triggers the built-in timeout rule. |
37
+ | `final_output` | no | `null`/empty triggers the empty-output rule. |
38
+ | `metadata` | no | `dict[str, str]`. |
39
+ | `schema_version`, `outcome` | no | `outcome` is filled by `traceval analyze`; supply it only to pre-label. |
40
+
41
+ ### Example: success trace (one llm step, one tool step)
42
+
43
+ ```json
44
+ {"trace_id": "tr-ok-1", "source": "generic", "started_at": "2026-07-01T12:00:00Z", "ended_at": "2026-07-01T12:00:02Z", "task_input": "Where is order 88421?", "final_output": "Your order 88421 is in transit.", "steps": [{"index": 0, "kind": "llm", "llm": {"span_id": "s1", "model": "gpt-4o-mini", "input_messages": [{"role": "user", "content": "Where is order 88421?"}], "output_message": {"role": "assistant", "content": "Let me look that up."}}}, {"index": 1, "kind": "tool", "tool": {"span_id": "s2", "name": "order_lookup", "arguments_json": "{\"order_id\": \"88421\"}", "output": "status: in_transit", "latency_ms": 142.0}}]}
45
+ ```
46
+
47
+ Annotations: `arguments_json` is a raw JSON *string* (never a parsed object), so nothing is lost in translation. `output_message`, `model`, token counts, and `latency_ms` are all optional.
48
+
49
+ ### Example: failure trace (tool step with `error` set)
50
+
51
+ ```json
52
+ {"trace_id": "tr-fail-1", "source": "generic", "started_at": "2026-07-01T12:05:00Z", "ended_at": "2026-07-01T12:05:01Z", "task_input": "Refund order 88421", "final_output": "Error: refund service unavailable.", "steps": [{"index": 0, "kind": "tool", "tool": {"span_id": "s3", "name": "refund_api", "arguments_json": "{\"order_id\": \"88421\"}", "output": null, "error": "HTTP 503 Service Unavailable"}}]}
53
+ ```
54
+
55
+ A non-null `tool.error` makes the built-in `R_TOOL_ERROR` rule label the trace `tool_error`, which is what routes it into a failure cluster and, with `--include-failures`, into a regression case.
26
56
 
27
57
  ### Assumptions & Heuristics
28
- - Direct structural validation.
29
- - Lines failing to parse are logged as warnings and skipped.
58
+ - Direct structural validation, no field inference.
59
+ - Lines failing to parse are logged as warnings and skipped; the ingest never aborts.
30
60
 
31
61
  ---
32
62
 
@@ -40,10 +70,14 @@ OTel traces are ingested from flat lists of JSON span logs (e.g. OTLP export for
40
70
  - `gen_ai.system`
41
71
  - `gen_ai.prompt`
42
72
  - `gen_ai.completion`
43
- - **Tool Call**: Spans containing:
44
- - `gen_ai.tool.name`
45
- - `gen_ai.tool.arguments`
46
- - Or span name matching `order_lookup`, `stripe_lookup`, or `kb_search`.
73
+ - **Tool Call**: Spans matching any of these signals:
74
+ - `gen_ai.tool.name` present in attributes (primary, per GenAI semantic conventions)
75
+ - `gen_ai.tool.arguments` present in attributes
76
+ - `gen_ai.operation.name` attribute equal to `"execute_tool"`
77
+ - `tool.name` present in attributes
78
+ - Span name matches a user-supplied glob from `traceval ingest --tool-span-names` (comma-separated, e.g. `"*_lookup,tool_*"`)
79
+
80
+ There is no built-in tool-name list; detection never depends on a specific tool vocabulary.
47
81
  - **Other**: All other spans are categorized as `other`.
48
82
 
49
83
  ### Attribute Translations
@@ -89,7 +123,12 @@ Langfuse exports traces as JSON objects with nested lists of observations (of ty
89
123
  - `llm.prompt_tokens` $\leftarrow$ `usage.promptTokens`
90
124
  - `llm.completion_tokens` $\leftarrow$ `usage.completionTokens`
91
125
  - `llm.error` $\leftarrow$ `statusMessage` when `level == "ERROR"`
92
- - **SPAN** $\rightarrow$ `ToolCall` (if name is order/stripe/kb lookup or `metadata.tool` matches):
126
+ - **SPAN** $\rightarrow$ `ToolCall` when, in priority order:
127
+ 1. the observation's `metadata.tool` is set (explicit marker, always wins), or
128
+ 2. `traceval ingest --tool-span-names` globs were supplied and the observation `name` matches one (globs replace the heuristic below), or
129
+ 3. default heuristic: the SPAN recorded an `input` AND either an `output` or an error signal (`level == "ERROR"` or `statusMessage` set). Failed tool calls often produce no output, which is why an error counts as the second signal.
130
+
131
+ SPANs matching none of these become `other` steps. Field mapping:
93
132
  - `tool.name` $\leftarrow$ Observation `name`
94
133
  - `tool.arguments_json` $\leftarrow$ Observation `input` (serialized to JSON)
95
134
  - `tool.output` $\leftarrow$ Observation `output` (stringified)
Binary file
@@ -0,0 +1,91 @@
1
+ # Running the suite against your agent: the target contract
2
+
3
+ `traceval run <evals_dir> --target <target>` accepts two target forms. Both
4
+ are resolved once per session; an unresolvable target prints one clear
5
+ `ERROR:` line at the top of the output, records a `target_resolution` entry
6
+ in the run report's `errors` list, and makes the run exit nonzero.
7
+
8
+ ## HTTP target
9
+
10
+ Pass any `http://` or `https://` URL. For every case, traceval sends:
11
+
12
+ ```
13
+ POST <your url>
14
+ Content-Type: application/json
15
+
16
+ {"input": "<the case's task text>"}
17
+ ```
18
+
19
+ with a 30-second timeout. Non-2xx responses fail the case.
20
+
21
+ The JSON response is interpreted as follows:
22
+
23
+ | Field | Meaning |
24
+ | --- | --- |
25
+ | `output` / `final_output` / `response` | The agent's answer, checked in that priority order; the first present key wins. |
26
+ | any other key | Fallback: if none of the three keys exist, the first non-`tool_calls` value is stringified and used as the output. A non-object response body is stringified whole. |
27
+ | `tool_calls` (optional) | List of tool invocations, either `{"name": "..."}` objects or plain strings. Only the names are kept. |
28
+
29
+ `tool_calls` exists so the `tool_sequence` check (did the agent call the
30
+ recorded tools, in order) and the `no_tool_loop` check (did it avoid calling
31
+ the same tool 3+ times consecutively) have something to score. If your
32
+ endpoint omits it, generated `tool_sequence` checks will fail; either return
33
+ the names or delete those checks from the case YAML.
34
+
35
+ If the URL is unreachable (connection refused, invalid URL) traceval prints
36
+ the one-line `ERROR:` at session start; cases still run and fail
37
+ individually so the run report stays complete.
38
+
39
+ ### Minimal FastAPI implementation
40
+
41
+ ```python
42
+ # my_agent.py
43
+ from fastapi import FastAPI
44
+ from pydantic import BaseModel
45
+
46
+ app = FastAPI()
47
+
48
+
49
+ class AgentInput(BaseModel):
50
+ input: str
51
+
52
+
53
+ @app.post("/agent")
54
+ def run_agent(payload: AgentInput) -> dict:
55
+ # Call your real agent here.
56
+ answer = f"You asked: {payload.input}"
57
+ return {
58
+ "output": answer,
59
+ "tool_calls": [{"name": "kb_search"}],
60
+ }
61
+ ```
62
+
63
+ ```bash
64
+ uvicorn my_agent:app --port 8000 &
65
+ traceval run evals/ --target http://127.0.0.1:8000/agent --judge fake
66
+ ```
67
+
68
+ ## Callable target
69
+
70
+ Pass `module:function` (one colon). traceval inserts the current working
71
+ directory into `sys.path`, imports the module, and calls the function with
72
+ the case's task text as a single string argument:
73
+
74
+ ```python
75
+ def invoke_agent(input_text: str) -> dict: ...
76
+ ```
77
+
78
+ Accepted return shapes:
79
+
80
+ - a dict, interpreted exactly like the HTTP response above
81
+ (`output`/`final_output`/`response` priority, optional `tool_calls`)
82
+ - an object with an `.output` attribute (and optional `.tool_calls`)
83
+ - anything else, stringified whole and used as the output
84
+
85
+ ```bash
86
+ traceval run evals/ --target myapp.agent:invoke_agent --judge fake
87
+ ```
88
+
89
+ Because the working directory is importable, `myapp/agent.py` in your repo
90
+ root works without installation. A bad module path or missing attribute
91
+ produces the same one-line `ERROR:` plus a self-describing run report.
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "traceval"
7
- version = "0.2.2"
7
+ version = "0.2.3"
8
8
  description = "Trace-to-Eval Compiler"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11"
@@ -0,0 +1 @@
1
+ __version__ = "0.2.3"
@@ -25,15 +25,28 @@ def ingest(
25
25
  path: str,
26
26
  format: str = typer.Option("auto", help="auto|otel|langfuse|langsmith|generic"),
27
27
  output: str = typer.Option("traces.db", "-o", help="SQLite database output path"),
28
+ tool_span_names: str = typer.Option(
29
+ None,
30
+ "--tool-span-names",
31
+ help=(
32
+ "Comma-separated name globs (e.g. '*_lookup,tool_*') that mark "
33
+ "spans as tool calls; replaces the built-in heuristic for "
34
+ "Langfuse SPANs and adds a fallback for OTel spans"
35
+ ),
36
+ ),
28
37
  json_output: bool = typer.Option(
29
38
  False, "--json", help="Print a machine-readable JSON summary to stdout"
30
39
  ),
31
40
  ) -> None:
32
41
  """Ingest trace logs into SQLite database."""
42
+ globs = None
43
+ if tool_span_names:
44
+ globs = [g.strip() for g in tool_span_names.split(",") if g.strip()]
45
+
33
46
  db = TraceStore(output)
34
47
  try:
35
48
  ok_count, span_count, warn_count, log_file = ingest_file(
36
- Path(path), db, format_name=format
49
+ Path(path), db, format_name=format, tool_span_globs=globs
37
50
  )
38
51
  if json_output:
39
52
  typer.echo(
@@ -201,11 +201,12 @@ def pytest_sessionfinish(session, exitstatus):
201
201
 
202
202
  for r in _results_accumulator:
203
203
  outcome_str = "[bold green]PASS[/bold green]" if r["passed"] else "[bold red]FAIL[/bold red]"
204
+ latency_str = "<0.1" if r["latency_ms"] < 0.1 else f"{r['latency_ms']:.1f}"
204
205
  table.add_row(
205
206
  r["case_id"],
206
207
  r["cluster"],
207
208
  outcome_str,
208
- f"{r['latency_ms']:.1f}"
209
+ latency_str
209
210
  )
210
211
 
211
212
  console.print(table)
@@ -58,6 +58,7 @@ def ingest_file(
58
58
  store: TraceStore,
59
59
  format_name: str = "auto",
60
60
  log_path: Path | None = None,
61
+ tool_span_globs: list[str] | None = None,
61
62
  ) -> tuple[int, int, int, Path]:
62
63
  if format_name == "auto":
63
64
  format_name = detect_format(path)
@@ -66,7 +67,7 @@ def ingest_file(
66
67
  if not adapter_cls:
67
68
  raise ValueError(f"Unknown format: {format_name}")
68
69
 
69
- adapter = adapter_cls()
70
+ adapter = adapter_cls(tool_span_globs=tool_span_globs)
70
71
 
71
72
  if log_path is None:
72
73
  db_p = Path(store.db_path)
@@ -8,6 +8,8 @@ from traceval.model import Trace
8
8
  class Adapter(Protocol):
9
9
  format_name: ClassVar[str]
10
10
 
11
+ def __init__(self, tool_span_globs: list[str] | None = None) -> None: ...
12
+
11
13
  def detect(self, first_lines: list[str]) -> bool: ...
12
14
 
13
15
  def parse(self, path: Path) -> Iterator[Trace]: ...
@@ -13,6 +13,10 @@ logger = logging.getLogger(__name__)
13
13
  class GenericAdapter(Adapter):
14
14
  format_name: ClassVar[str] = "generic"
15
15
 
16
+ def __init__(self, tool_span_globs: list[str] | None = None) -> None:
17
+ # Generic traces declare step kinds explicitly; globs are unused.
18
+ self.tool_span_globs = tool_span_globs
19
+
16
20
  def detect(self, first_lines: list[str]) -> bool:
17
21
  if not first_lines:
18
22
  return False
@@ -1,3 +1,4 @@
1
+ import fnmatch
1
2
  import json
2
3
  import logging
3
4
  from collections.abc import Iterator
@@ -25,6 +26,9 @@ def parse_iso_datetime(dt_str: str | None) -> datetime | None:
25
26
  class LangfuseAdapter(Adapter):
26
27
  format_name: ClassVar[str] = "langfuse"
27
28
 
29
+ def __init__(self, tool_span_globs: list[str] | None = None) -> None:
30
+ self.tool_span_globs = tool_span_globs
31
+
28
32
  def detect(self, first_lines: list[str]) -> bool:
29
33
  if not first_lines:
30
34
  return False
@@ -142,11 +146,30 @@ class LangfuseAdapter(Adapter):
142
146
  )
143
147
 
144
148
  elif obs_type == "SPAN":
145
- # Check if tool span
146
- is_tool = (
147
- name in ["order_lookup", "stripe_lookup", "kb_search"]
148
- or obs.get("metadata", {}).get("tool") is not None
149
- )
149
+ # Tool classification, in priority order:
150
+ # 1. explicit metadata.tool marker (always wins)
151
+ # 2. user globs (--tool-span-names), which REPLACE
152
+ # the built-in heuristic when provided
153
+ # 3. heuristic: a SPAN that recorded an input and
154
+ # either an output or an error signal is a tool
155
+ # call (failed tools often produce no output,
156
+ # so an ERROR level or a set statusMessage counts as the second signal)
157
+ metadata = obs.get("metadata") or {}
158
+ if metadata.get("tool") is not None:
159
+ is_tool = True
160
+ elif self.tool_span_globs is not None:
161
+ is_tool = any(
162
+ fnmatch.fnmatch(name, pattern)
163
+ for pattern in self.tool_span_globs
164
+ )
165
+ else:
166
+ has_error_signal = (
167
+ level == "ERROR"
168
+ or obs.get("statusMessage") is not None
169
+ )
170
+ is_tool = obs.get("input") is not None and (
171
+ obs.get("output") is not None or has_error_signal
172
+ )
150
173
  if is_tool:
151
174
  obs_input = obs.get("input")
152
175
  if isinstance(obs_input, (dict, list)):
@@ -25,6 +25,10 @@ def parse_iso_datetime(dt_str: str | None) -> datetime | None:
25
25
  class LangsmithAdapter(Adapter):
26
26
  format_name: ClassVar[str] = "langsmith"
27
27
 
28
+ def __init__(self, tool_span_globs: list[str] | None = None) -> None:
29
+ # LangSmith runs carry an explicit run_type; globs are unused.
30
+ self.tool_span_globs = tool_span_globs
31
+
28
32
  def detect(self, first_lines: list[str]) -> bool:
29
33
  if not first_lines:
30
34
  return False
@@ -1,3 +1,4 @@
1
+ import fnmatch
1
2
  import json
2
3
  import logging
3
4
  from collections.abc import Iterator
@@ -26,6 +27,9 @@ def parse_iso_datetime(dt_str: str | None) -> datetime | None:
26
27
  class OtelAdapter(Adapter):
27
28
  format_name: ClassVar[str] = "otel"
28
29
 
30
+ def __init__(self, tool_span_globs: list[str] | None = None) -> None:
31
+ self.tool_span_globs = tool_span_globs
32
+
29
33
  def detect(self, first_lines: list[str]) -> bool:
30
34
  if not first_lines:
31
35
  return False
@@ -170,13 +174,27 @@ class OtelAdapter(Adapter):
170
174
  )
171
175
  )
172
176
 
173
- # Check if Tool call
177
+ # Check if Tool call. Primary signal: GenAI semantic
178
+ # convention attributes. Fallbacks are attribute- or
179
+ # user-glob-based only, never tool-name lists.
174
180
  elif (
175
181
  "gen_ai.tool.name" in span_attrs
176
182
  or "gen_ai.tool.arguments" in span_attrs
177
- or name in ["order_lookup", "stripe_lookup", "kb_search"]
183
+ or span_attrs.get("gen_ai.operation.name") == "execute_tool"
184
+ or "tool.name" in span_attrs
185
+ or (
186
+ self.tool_span_globs is not None
187
+ and any(
188
+ fnmatch.fnmatch(name, pattern)
189
+ for pattern in self.tool_span_globs
190
+ )
191
+ )
178
192
  ):
179
- tool_name = span_attrs.get("gen_ai.tool.name") or name
193
+ tool_name = (
194
+ span_attrs.get("gen_ai.tool.name")
195
+ or span_attrs.get("tool.name")
196
+ or name
197
+ )
180
198
  args_json = span_attrs.get("gen_ai.tool.arguments") or "{}"
181
199
  tool_call = ToolCall(
182
200
  span_id=span["span_id"],
@@ -16,3 +16,14 @@ This directory contains synthetic logs and outputs used for testing traceval ada
16
16
  10. **tr-010 (Simple success)**: Direct assistant output response.
17
17
  11. **tr-011 (Latency timeout)**: Completed but exceeds runtime duration bounds.
18
18
  12. **tr-012 (Success stripe lookup)**: Simple tool path resolving successfully.
19
+
20
+ ## Backend export fixtures (otel_spans / langfuse_export / langsmith_runs)
21
+
22
+ Each backend file carries the same five stories (success with llm+tool,
23
+ tool error, llm error, timeout, tool loop) plus one corrupt line to test
24
+ warning handling. The Langfuse and OTel files additionally contain a
25
+ sixth trace (`lf-006` / `otel-006`) whose tool span is named
26
+ `create_ticket`, a name the demo agent never uses: it proves tool
27
+ detection relies on the documented signals (Langfuse input/output
28
+ heuristic, OTel gen_ai semantic-convention attributes), not on a tool
29
+ vocabulary.
@@ -3,4 +3,5 @@
3
3
  {"id": "lf-003", "name": "agent_run", "timestamp": "2026-07-01T12:02:00Z", "input": "Get user data.", "output": null, "metadata": {}, "observations": [{"id": "obs-4", "type": "GENERATION", "name": "llm_call", "startTime": "2026-07-01T12:02:01Z", "endTime": "2026-07-01T12:02:02Z", "level": "ERROR", "statusMessage": "API Error: Invalid credentials"}]}
4
4
  {"id": "lf-004", "name": "agent_run", "timestamp": "2026-07-01T12:03:00Z", "input": "Run infinite process.", "output": null, "metadata": {}, "observations": []}
5
5
  {"id": "lf-005", "name": "agent_run", "timestamp": "2026-07-01T12:04:00Z", "input": "Find information.", "output": null, "metadata": {}, "observations": [{"id": "obs-5", "type": "SPAN", "name": "kb_search", "startTime": "2026-07-01T12:04:01Z", "endTime": "2026-07-01T12:04:02Z", "input": {"query": "refund"}, "output": "None"}, {"id": "obs-6", "type": "SPAN", "name": "kb_search", "startTime": "2026-07-01T12:04:03Z", "endTime": "2026-07-01T12:04:04Z", "input": {"query": "refund"}, "output": "None"}, {"id": "obs-7", "type": "SPAN", "name": "kb_search", "startTime": "2026-07-01T12:04:05Z", "endTime": "2026-07-01T12:04:06Z", "input": {"query": "refund"}, "output": "None"}]}
6
+ {"id": "lf-006", "name": "agent_run", "timestamp": "2026-07-01T12:06:00Z", "input": "Open a support ticket for my broken keyboard", "output": "Ticket TCK-778 created.", "metadata": {}, "observations": [{"id": "obs-6a", "type": "SPAN", "name": "create_ticket", "startTime": "2026-07-01T12:06:01Z", "endTime": "2026-07-01T12:06:02Z", "input": {"summary": "broken keyboard"}, "output": "TCK-778"}]}
6
7
  INVALID_LINE_CORRUPT_JSON
@@ -10,4 +10,6 @@
10
10
  {"trace_id": "otel-005", "span_id": "tool-5-1", "parent_span_id": "root-5", "name": "kb_search", "start_time": "2026-07-01T12:04:01Z", "end_time": "2026-07-01T12:04:02Z", "attributes": {"gen_ai.tool.name": "kb_search", "gen_ai.tool.arguments": "{\"query\": \"refund\"}"}}
11
11
  {"trace_id": "otel-005", "span_id": "tool-5-2", "parent_span_id": "root-5", "name": "kb_search", "start_time": "2026-07-01T12:04:03Z", "end_time": "2026-07-01T12:04:04Z", "attributes": {"gen_ai.tool.name": "kb_search", "gen_ai.tool.arguments": "{\"query\": \"refund\"}"}}
12
12
  {"trace_id": "otel-005", "span_id": "tool-5-3", "parent_span_id": "root-5", "name": "kb_search", "start_time": "2026-07-01T12:04:05Z", "end_time": "2026-07-01T12:04:06Z", "attributes": {"gen_ai.tool.name": "kb_search", "gen_ai.tool.arguments": "{\"query\": \"refund\"}"}}
13
+ {"trace_id": "otel-006", "span_id": "root-6", "parent_span_id": null, "name": "agent_run", "start_time": "2026-07-01T12:06:00Z", "end_time": "2026-07-01T12:06:03Z", "attributes": {"gen_ai.task_input": "Open a support ticket for my broken keyboard", "gen_ai.final_output": "Ticket TCK-778 created."}}
14
+ {"trace_id": "otel-006", "span_id": "span-6a", "parent_span_id": "root-6", "name": "create_ticket", "start_time": "2026-07-01T12:06:01Z", "end_time": "2026-07-01T12:06:02Z", "attributes": {"gen_ai.tool.name": "create_ticket", "gen_ai.tool.arguments": "{\"summary\": \"broken keyboard\"}", "gen_ai.tool.output": "TCK-778"}}
13
15
  INVALID_LINE_CORRUPT_JSON
@@ -29,13 +29,13 @@ def test_otel_ingest(tmp_path):
29
29
  log_path=log_path,
30
30
  )
31
31
 
32
- # 5 traces are defined in our otel_spans.jsonl + 1 invalid line
33
- assert ok_count == 5
32
+ # 6 traces are defined in our otel_spans.jsonl + 1 invalid line
33
+ assert ok_count == 6
34
34
  assert warn_count == 1
35
35
  assert log_file == log_path
36
36
 
37
37
  traces = list(store.list_traces())
38
- assert len(traces) == 5
38
+ assert len(traces) == 6
39
39
 
40
40
  # Check trace 1 detail: otel-001 has 1 llm step and 1 tool step
41
41
  t1 = next(t for t in traces if t.trace_id == "otel-001")
@@ -70,12 +70,12 @@ def test_langfuse_ingest(tmp_path):
70
70
  log_path=log_path,
71
71
  )
72
72
 
73
- # 5 traces + 1 invalid line
74
- assert ok_count == 5
73
+ # 6 traces + 1 invalid line
74
+ assert ok_count == 6
75
75
  assert warn_count == 1
76
76
 
77
77
  traces = list(store.list_traces())
78
- assert len(traces) == 5
78
+ assert len(traces) == 6
79
79
 
80
80
  t1 = next(t for t in traces if t.trace_id == "lf-001")
81
81
  assert len(t1.steps) == 2
@@ -133,7 +133,55 @@ def test_robustness_on_shuffled_inputs(tmp_path):
133
133
  store,
134
134
  format_name="otel",
135
135
  )
136
- # Reconstructs all 5 traces
137
- assert ok_count == 5
136
+ # Reconstructs all 6 traces
137
+ assert ok_count == 6
138
138
  assert warn_count == 0
139
139
  store.close()
140
+
141
+
142
+ def test_real_tool_names_detected(tmp_path):
143
+ # Tool detection must not depend on the demo agent's tool vocabulary:
144
+ # the create_ticket spans in both fixtures classify as tools via the
145
+ # documented signals (Langfuse input/output heuristic; OTel gen_ai
146
+ # semantic-convention attributes), never a name list.
147
+ for fixture, fmt, trace_id in [
148
+ ("langfuse_export.jsonl", "langfuse", "lf-006"),
149
+ ("otel_spans.jsonl", "otel", "otel-006"),
150
+ ]:
151
+ store = TraceStore(tmp_path / f"{fmt}.db")
152
+ ingest_file(FIXTURES_DIR / fixture, store, format_name=fmt)
153
+ traces = list(store.list_traces())
154
+ store.close()
155
+
156
+ trace = next(t for t in traces if t.trace_id == trace_id)
157
+ tool_steps = [s for s in trace.steps if s.kind == "tool"]
158
+ assert len(tool_steps) == 1, f"{fmt}: create_ticket span not detected"
159
+ assert tool_steps[0].tool.name == "create_ticket"
160
+
161
+
162
+ def test_tool_span_globs_override(tmp_path):
163
+ # User globs replace the built-in Langfuse heuristic: with a glob that
164
+ # matches nothing, the create_ticket SPAN (no metadata.tool) must NOT
165
+ # classify as a tool; with a matching glob it must.
166
+ store = TraceStore(tmp_path / "none.db")
167
+ ingest_file(
168
+ FIXTURES_DIR / "langfuse_export.jsonl",
169
+ store,
170
+ format_name="langfuse",
171
+ tool_span_globs=["nothing_matches_*"],
172
+ )
173
+ trace = next(t for t in store.list_traces() if t.trace_id == "lf-006")
174
+ store.close()
175
+ assert all(s.kind != "tool" for s in trace.steps)
176
+
177
+ store = TraceStore(tmp_path / "match.db")
178
+ ingest_file(
179
+ FIXTURES_DIR / "langfuse_export.jsonl",
180
+ store,
181
+ format_name="langfuse",
182
+ tool_span_globs=["create_*"],
183
+ )
184
+ trace = next(t for t in store.list_traces() if t.trace_id == "lf-006")
185
+ store.close()
186
+ assert [s.kind for s in trace.steps] == ["tool"]
187
+ assert trace.steps[0].tool.name == "create_ticket"
@@ -1037,7 +1037,7 @@ wheels = [
1037
1037
 
1038
1038
  [[package]]
1039
1039
  name = "traceval"
1040
- version = "0.2.2"
1040
+ version = "0.2.3"
1041
1041
  source = { editable = "." }
1042
1042
  dependencies = [
1043
1043
  { name = "httpx" },
Binary file
@@ -1 +0,0 @@
1
- __version__ = "0.2.2"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes