traceval 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- traceval/__init__.py +1 -0
- traceval/analyze/__init__.py +75 -0
- traceval/analyze/cluster.py +226 -0
- traceval/analyze/coverage.py +84 -0
- traceval/analyze/outcomes.py +254 -0
- traceval/analyze/report.py +478 -0
- traceval/cli.py +168 -0
- traceval/compile/__init__.py +50 -0
- traceval/compile/cases.py +204 -0
- traceval/compile/emit_pytest.py +21 -0
- traceval/compile/emit_yaml.py +47 -0
- traceval/compile/rubrics.py +52 -0
- traceval/compile/templates/conftest.py.jinja +158 -0
- traceval/compile/templates/test_generated.py.jinja +17 -0
- traceval/ingest/__init__.py +99 -0
- traceval/ingest/base.py +13 -0
- traceval/ingest/generic.py +40 -0
- traceval/ingest/langfuse.py +241 -0
- traceval/ingest/langsmith.py +257 -0
- traceval/ingest/otel.py +238 -0
- traceval/model.py +66 -0
- traceval/run/judge.py +196 -0
- traceval/run/runner.py +113 -0
- traceval/run/scorers.py +150 -0
- traceval/run/target.py +104 -0
- traceval/store.py +59 -0
- traceval-0.1.1.dist-info/METADATA +154 -0
- traceval-0.1.1.dist-info/RECORD +31 -0
- traceval-0.1.1.dist-info/WHEEL +4 -0
- traceval-0.1.1.dist-info/entry_points.txt +2 -0
- traceval-0.1.1.dist-info/licenses/LICENSE +21 -0
traceval/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Version string of the traceval package.
__version__ = "0.1.1"
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from traceval.analyze.cluster import JaccardClusterer
|
|
5
|
+
from traceval.analyze.coverage import compute_coverage
|
|
6
|
+
from traceval.analyze.outcomes import label_trace, load_user_rules
|
|
7
|
+
from traceval.analyze.report import render_report
|
|
8
|
+
from traceval.store import TraceStore
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def run_analysis(
    db_path: Path,
    rules_path: Path | None = None,
    evals_dir: Path | None = None,
    output_dir: Path | None = None,
) -> dict[str, Any]:
    """Label, cluster and summarise every trace held in the store at ``db_path``.

    Args:
        db_path: Location handed to ``TraceStore``.
        rules_path: Optional file of user rules; loaded with
            ``load_user_rules`` and tried before the built-in rules.
        evals_dir: Optional directory of existing eval cases used to compute
            per-cluster coverage.
        output_dir: When given, it is created if needed and an HTML report
            is written to ``output_dir / "report.html"``.

    Returns:
        A dict with ``total_traces``, ``outcomes`` (label -> count) and
        ``clusters`` (one summary dict per cluster).

    The store is closed on every exit path.
    """
    store = TraceStore(db_path)
    try:
        custom_rules = load_user_rules(rules_path) if rules_path else None

        # Attach a fresh outcome to every stored trace and persist it.
        for stored in list(store.list_traces()):
            stored.outcome = label_trace(stored, user_rules=custom_rules)
            store.save_trace(stored)

        # Re-read so the rest of the pipeline sees what was actually saved.
        labeled_traces = list(store.list_traces())

        clusters = JaccardClusterer().cluster(labeled_traces)
        coverage = compute_coverage(clusters, evals_dir, labeled_traces)

        # Count traces per outcome label; unlabeled traces count as "unknown".
        tally: dict[str, int] = {}
        for item in labeled_traces:
            key = item.outcome.label if item.outcome else "unknown"
            tally[key] = tally.get(key, 0) + 1

        cluster_rows = []
        for found in clusters:
            cluster_rows.append(
                {
                    "id": found.id,
                    "name": found.name,
                    "tool_signature": found.tool_signature,
                    "top_terms": found.top_terms,
                    "trace_count": len(found.trace_ids),
                    "trace_ids": found.trace_ids,
                    "coverage_count": coverage.get(found.id, 0),
                }
            )

        summary = {
            "total_traces": len(labeled_traces),
            "outcomes": tally,
            "clusters": cluster_rows,
        }

        if output_dir:
            output_dir.mkdir(parents=True, exist_ok=True)
            render_report(summary, coverage, output_dir / "report.html")

        return summary
    finally:
        store.close()
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import math
|
|
3
|
+
import re
|
|
4
|
+
from collections import Counter
|
|
5
|
+
from typing import Protocol
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
|
|
9
|
+
from traceval.model import Trace
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Cluster(BaseModel):
    """One group of traces produced by a clusterer.

    The per-field notes describe the values JaccardClusterer fills in.
    """

    # "c_" followed by the first 8 hex digits of the SHA-256 of the
    # comma-joined, sorted trace ids.
    id: str
    # Human-readable label assembled from top terms, tool signature and,
    # for non-"success" clusters, the dominant outcome.
    name: str
    # Ids of the member traces, sorted.
    trace_ids: list[str]
    # Tool names of the member traces' tool steps joined with ">"
    # (empty string when there are none).
    tool_signature: str
    # Up to three highest-scoring TF-IDF terms from the members' task inputs.
    top_terms: list[str]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Clusterer(Protocol):
    """Structural interface for anything that can group traces into clusters."""

    # Takes the traces to group and returns the resulting clusters.
    def cluster(self, traces: list[Trace]) -> list[Cluster]: ...
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Standard English stopwords (plus a few common request verbs); tokenize()
# drops these before n-grams are built.
STOPWORDS: set[str] = set(
    (
        "a an the and or but is are was were "
        "to for in of on at by with "
        "this that these those "
        "i you he she it we they "
        "my your his her its our their "
        "me him us them "
        "what where how why "
        "please can get show find"
    ).split()
)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def tokenize(text: str) -> list[str]:
    """Split ``text`` into lowercase word tokens with stopwords removed.

    A token is a maximal run of word characters (letters, digits,
    underscore); everything else acts as a separator.

    Args:
        text: Free-form text, e.g. a trace's task input.

    Returns:
        The tokens in order of appearance, excluding entries of STOPWORDS.
    """
    # ``\w+`` rather than ``\b[a-z0-9_]+\b``: the two agree on ASCII text,
    # but the ASCII-only class combined with Unicode-aware ``\b`` discarded
    # every word containing a non-ASCII letter (e.g. "café" produced no
    # token at all), leaving non-English inputs with no n-grams to compare.
    words = re.findall(r"\w+", text.lower())
    return [word for word in words if word not in STOPWORDS]
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def get_ngrams(tokens: list[str], max_n: int = 3) -> set[tuple[str, ...]]:
    """Return every distinct contiguous n-gram of ``tokens`` for n = 1..max_n.

    Args:
        tokens: Token sequence to slide over.
        max_n: Largest n-gram length; lengths beyond ``len(tokens)`` are
            skipped because no such window exists.

    Returns:
        A set of tuples, one per distinct n-gram; empty for empty input.
    """
    total = len(tokens)
    longest = min(max_n, total)
    return {
        tuple(tokens[start : start + size])
        for size in range(1, longest + 1)
        for start in range(total - size + 1)
    }
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def get_tool_signature(trace: Trace) -> str:
    """Return the trace's tool-call sequence as a ``>``-joined string.

    Only steps whose ``kind`` is ``"tool"`` and that carry a ``tool`` payload
    contribute, in step order. A trace without such steps yields ``""``.
    """
    return ">".join(
        step.tool.name
        for step in trace.steps
        if step.kind == "tool" and step.tool
    )
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class JaccardClusterer:
    """Cluster traces by exact tool signature, then by task-input similarity.

    Traces are first bucketed by ``get_tool_signature``. Inside each bucket a
    greedy pass assigns every trace to the first existing cluster whose seed
    (the n-grams of that cluster's first trace) has a Jaccard similarity of
    at least ``jaccard_threshold`` with the trace's n-grams; otherwise the
    trace seeds a new cluster.
    """

    def __init__(self, jaccard_threshold: float = 0.35) -> None:
        # Minimum Jaccard similarity (inclusive) for joining a cluster.
        self.jaccard_threshold = jaccard_threshold

    def cluster(self, traces: list[Trace]) -> list[Cluster]:
        """Group ``traces`` into named clusters.

        Args:
            traces: Traces to cluster; an empty list yields an empty result.

        Returns:
            Clusters sorted by id so repeated runs give the same order.
        """
        if not traces:
            return []

        # 1. Group traces by exact tool signature.
        traces_by_sig: dict[str, list[Trace]] = {}
        for trace in traces:
            traces_by_sig.setdefault(get_tool_signature(trace), []).append(trace)

        # 2. Within each signature group, split greedily on n-gram similarity.
        raw_clusters: list[tuple[str, list[Trace]]] = [
            (sig, group)
            for sig, sig_traces in traces_by_sig.items()
            for group in self._greedy_groups(sig_traces)
        ]

        # 3. Score terms with TF-IDF, treating each cluster as one document.
        term_counts = [
            Counter(token for t in group for token in tokenize(t.task_input))
            for _sig, group in raw_clusters
        ]
        idfs = self._smooth_idf(term_counts)

        clusters = [
            self._build_cluster(sig, group, counts, idfs)
            for (sig, group), counts in zip(raw_clusters, term_counts)
        ]

        # Sort clusters by id for deterministic outputs.
        return sorted(clusters, key=lambda c: c.id)

    @staticmethod
    def _jaccard(left: set[tuple[str, ...]], right: set[tuple[str, ...]]) -> float:
        """Return |left & right| / |left | right|, or 0.0 if either set is empty."""
        if not left or not right:
            return 0.0
        return len(left & right) / len(left | right)

    def _greedy_groups(self, sig_traces: list[Trace]) -> list[list[Trace]]:
        """Split one signature bucket into similarity groups."""
        groups: list[list[Trace]] = []
        seeds: list[set[tuple[str, ...]]] = []
        # Visit traces in trace_id order so grouping does not depend on the
        # order in which the caller supplied them.
        for trace in sorted(sig_traces, key=lambda t: t.trace_id):
            ngrams = get_ngrams(tokenize(trace.task_input))
            for group, seed in zip(groups, seeds):
                if self._jaccard(ngrams, seed) >= self.jaccard_threshold:
                    group.append(trace)
                    break
            else:
                # No existing cluster was close enough: this trace seeds one.
                groups.append([trace])
                seeds.append(ngrams)
        return groups

    @staticmethod
    def _smooth_idf(term_counts: list[Counter[str]]) -> dict[str, float]:
        """Return the smoothed IDF, ln((N + 1) / (df + 1)) + 1, for every term."""
        doc_frequencies: Counter[str] = Counter()
        for counts in term_counts:
            doc_frequencies.update(counts.keys())
        num_clusters = len(term_counts)
        return {
            term: math.log((num_clusters + 1) / (df + 1)) + 1
            for term, df in doc_frequencies.items()
        }

    @staticmethod
    def _build_cluster(
        sig: str,
        group: list[Trace],
        counts: Counter[str],
        idfs: dict[str, float],
    ) -> Cluster:
        """Name one raw cluster and wrap it in a ``Cluster`` model."""
        # Raw in-cluster term frequency times IDF; the stable sort keeps
        # first-seen order among equal scores.
        term_scores = {term: tf * idfs[term] for term, tf in counts.items()}
        top_terms = sorted(term_scores, key=term_scores.__getitem__, reverse=True)[:3]

        # Render the signature for the title ('>' becomes ' -> ').
        formatted_sig = sig.replace(">", " -> ") if sig else ""

        # The most frequent outcome label is appended unless it is "success".
        outcome_counter = Counter(
            t.outcome.label if t.outcome else "unknown" for t in group
        )
        dominant_outcome = outcome_counter.most_common(1)[0][0]

        name_parts = []
        if top_terms:
            name_parts.append(" ".join(top_terms))
        if formatted_sig:
            name_parts.append(formatted_sig)
        if dominant_outcome != "success":
            name_parts.append(f"({dominant_outcome})")
        name = " -> ".join(name_parts) if name_parts else "unclassified"

        # Stable content-hash id derived from the sorted member trace ids.
        sorted_ids = sorted(t.trace_id for t in group)
        content_hash = hashlib.sha256(",".join(sorted_ids).encode("utf-8")).hexdigest()[:8]

        return Cluster(
            id=f"c_{content_hash}",
            name=name,
            trace_ids=sorted_ids,
            tool_signature=sig,
            top_terms=top_terms,
        )
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import yaml
|
|
4
|
+
|
|
5
|
+
from traceval.analyze.cluster import Cluster, get_ngrams, tokenize
|
|
6
|
+
from traceval.model import Trace
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def compute_coverage(
    clusters: list[Cluster],
    evals_dir: Path | None,
    traces: list[Trace],
    similarity_threshold: float = 0.35,
) -> dict[str, int]:
    """Count how many existing eval cases fall into each cluster.

    Every ``*.yaml`` file under ``evals_dir`` (except ``traceval.yaml``) whose
    top level is a mapping with an ``input`` key is treated as one eval case.
    A case is attributed to a cluster by, in order:

    A. its explicit ``cluster`` field, when that names a known cluster id;
    B. otherwise its ``source_trace_id``, when some cluster contains it;
    C. otherwise the cluster holding the trace whose task input is most
       similar (Jaccard over n-grams), if that similarity reaches
       ``similarity_threshold``.

    Args:
        clusters: Clusters to attribute cases to.
        evals_dir: Root directory of eval files; ``None`` or a missing
            directory yields zero coverage everywhere.
        traces: Traces the clusters refer to (looked up by ``trace_id``).
        similarity_threshold: Minimum similarity for the fallback match.

    Returns:
        Mapping of cluster id to the number of eval cases attributed to it.
    """
    import logging  # local import: this module has no module-level logger

    coverage: dict[str, int] = {c.id: 0 for c in clusters}
    if not evals_dir or not evals_dir.exists():
        return coverage

    logger = logging.getLogger("traceval.analyze")

    # 1. Parse all eval files (*.yaml) in evals_dir recursively.
    eval_cases = []
    for p in evals_dir.rglob("*.yaml"):
        # Skip potential metadata yaml like traceval.yaml
        if p.name == "traceval.yaml":
            continue
        try:
            with open(p, encoding="utf-8") as f:
                data = yaml.safe_load(f)
        except Exception as exc:
            # Best effort: an unreadable file must not abort the analysis,
            # but it should not disappear without a trace either.
            logger.warning("Skipping unreadable eval file %s: %s", p, exc)
            continue
        if isinstance(data, dict) and "input" in data:
            eval_cases.append(data)

    # 2. Build the lookups once instead of rescanning clusters per case.
    traces_by_id = {t.trace_id: t for t in traces}
    cluster_of_trace: dict[str, str] = {}
    for c in clusters:
        for tid in c.trace_ids:
            # setdefault: the first cluster listing a trace id wins.
            cluster_of_trace.setdefault(tid, c.id)

    # (cluster id, n-grams) per clustered trace, in cluster then member
    # order; built lazily because only the fallback match needs it.
    trace_ngrams: list[tuple[str, set[tuple[str, ...]]]] | None = None

    for case in eval_cases:
        mapped_cluster_id = None

        # Scenario A: case has an explicit 'cluster' field. The isinstance
        # guards keep unhashable YAML values (lists, mappings) from raising.
        case_cluster = case.get("cluster")
        if isinstance(case_cluster, str) and case_cluster and case_cluster in coverage:
            mapped_cluster_id = case_cluster

        # Scenario B: look up the source trace and find its cluster.
        elif "source_trace_id" in case:
            tid = case["source_trace_id"]
            if isinstance(tid, str):
                mapped_cluster_id = cluster_of_trace.get(tid)

        # Scenario C: fallback to input similarity match.
        if not mapped_cluster_id:
            raw_input = case.get("input", "")
            # YAML may yield None, numbers or containers here; tokenize()
            # needs a string.
            if raw_input is None:
                case_input = ""
            elif isinstance(raw_input, str):
                case_input = raw_input
            else:
                case_input = str(raw_input)
            case_ngrams = get_ngrams(tokenize(case_input))

            if trace_ngrams is None:
                trace_ngrams = []
                for c in clusters:
                    for tid in c.trace_ids:
                        t = traces_by_id.get(tid)
                        if not t:
                            continue
                        trace_ngrams.append((c.id, get_ngrams(tokenize(t.task_input))))

            best_sim = -1.0
            best_cid = None
            for cid, t_ngrams in trace_ngrams:
                if not case_ngrams or not t_ngrams:
                    sim = 0.0
                else:
                    sim = len(case_ngrams & t_ngrams) / len(case_ngrams | t_ngrams)
                # Strict '>' keeps the earliest trace among equal scores.
                if sim > best_sim:
                    best_sim = sim
                    best_cid = cid
            if best_sim >= similarity_threshold:
                mapped_cluster_id = best_cid

        if mapped_cluster_id:
            coverage[mapped_cluster_id] += 1

    return coverage
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
import importlib.util
|
|
2
|
+
import logging
|
|
3
|
+
import re
|
|
4
|
+
import sys
|
|
5
|
+
from collections.abc import Callable
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from traceval.model import Outcome, Trace
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Rule:
    """A named labeling rule wrapping a callable.

    label_trace accepts these objects (or bare callables) as user rules and
    reads ``fn`` and ``rule_id`` from them.
    """

    def __init__(
        self,
        rule_id: str,
        description: str,
        fn: Callable[[Trace], Outcome | None],
    ) -> None:
        """Store the rule's identifier, description and callable.

        Args:
            rule_id: Identifier recorded on outcomes this rule produces.
            description: Free-text description of the rule.
            fn: Callable taking a trace; ``None`` means the rule did not
                match.
        """
        self.rule_id = rule_id
        self.description = description
        self.fn = fn
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def R_TOOL_ERROR(trace: Trace) -> Outcome | None:
    """Label the trace ``tool_error`` if any tool step reports an error.

    The first failing tool step, in step order, supplies the reason text.
    Returns ``None`` when no tool step has an error.
    """
    failed = next(
        (
            step.tool
            for step in trace.steps
            if step.kind == "tool" and step.tool and step.tool.error
        ),
        None,
    )
    if failed is None:
        return None
    return Outcome(
        label="tool_error",
        reason=f"Tool '{failed.name}' failed with error: {failed.error}",
        labeled_by="rule",
        rule_id="R_TOOL_ERROR",
    )
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def R_LLM_ERROR(trace: Trace) -> Outcome | None:
    """Label the trace ``bad_output`` if any LLM step reports an error.

    The first failing LLM step, in step order, supplies the reason text.
    Returns ``None`` when no LLM step has an error.
    """
    for step in trace.steps:
        if not (step.kind == "llm" and step.llm and step.llm.error):
            continue
        message = f"LLM call in step {step.index} failed with error: {step.llm.error}"
        return Outcome(
            label="bad_output",
            reason=message,
            labeled_by="rule",
            rule_id="R_LLM_ERROR",
        )
    return None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def R_LOOP(trace: Trace, loop_step_threshold: int = 25) -> Outcome | None:
    """Label the trace ``loop`` when it looks stuck.

    Two signals are checked, in order:

    1. the trace has more than ``loop_step_threshold`` steps;
    2. the same tool is called with identical ``arguments_json`` three times
       in a row, with no other kind of step in between.

    Returns ``None`` when neither applies.
    """
    step_total = len(trace.steps)
    if step_total > loop_step_threshold:
        return Outcome(
            label="loop",
            reason=(
                f"Total steps ({step_total}) exceeded loop step "
                f"threshold ({loop_step_threshold})"
            ),
            labeled_by="rule",
            rule_id="R_LOOP",
        )

    previous_call = None  # (tool name, arguments) of the preceding tool step
    streak = 0  # length of the current run of identical tool calls

    for step in trace.steps:
        if not (step.kind == "tool" and step.tool):
            # Any other kind of step breaks the run.
            previous_call = None
            streak = 0
            continue

        call = (step.tool.name, step.tool.arguments_json)
        if call != previous_call:
            previous_call = call
            streak = 1
            continue

        streak += 1
        if streak >= 3:
            return Outcome(
                label="loop",
                reason=(
                    f"Tool '{call[0]}' called with identical "
                    f"arguments {streak} times consecutively"
                ),
                labeled_by="rule",
                rule_id="R_LOOP",
            )

    return None
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def R_TIMEOUT(trace: Trace, timeout_s: float = 300.0) -> Outcome | None:
    """Label the trace ``timeout`` if it never ended or ran too long.

    A trace without ``ended_at`` is treated as not completed. Otherwise the
    elapsed time between ``started_at`` and ``ended_at`` must not exceed
    ``timeout_s`` seconds. Returns ``None`` when the trace is within limits.
    """
    reason = None
    if not trace.ended_at:
        reason = "Trace has not completed (ended_at is missing)"
    else:
        elapsed = (trace.ended_at - trace.started_at).total_seconds()
        if elapsed > timeout_s:
            reason = f"Trace duration ({elapsed:.1f}s) exceeded limit of {timeout_s}s"

    if reason is None:
        return None
    return Outcome(
        label="timeout",
        reason=reason,
        labeled_by="rule",
        rule_id="R_TIMEOUT",
    )
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# Case-insensitive signatures searched for in a trace's final output by
# R_VALIDATION; the first pattern that matches is reported.
DEFAULT_VALIDATION_REGEXES = [
    re.compile(source, re.IGNORECASE)
    for source in (
        r"validation\s+error",
        r"field\s+required",
        r"value\s+is\s+not\s+a\s+valid",
        r"json-schema",
        r"json\s+schema\s+error",
    )
]
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def R_VALIDATION(trace: Trace) -> Outcome | None:
    """Label the trace ``validation_error`` if its output looks like one.

    The final output is searched with each pattern in
    DEFAULT_VALIDATION_REGEXES, in list order; the first hit is named in the
    reason. Returns ``None`` for an empty output or when nothing matches.
    """
    output = trace.final_output
    if not output:
        return None

    hit = next((rx for rx in DEFAULT_VALIDATION_REGEXES if rx.search(output)), None)
    if hit is None:
        return None

    return Outcome(
        label="validation_error",
        reason=(
            f"Final output matched validation signature pattern: "
            f"'{hit.pattern}'"
        ),
        labeled_by="rule",
        rule_id="R_VALIDATION",
    )
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def R_EMPTY_OUTPUT(trace: Trace) -> Outcome | None:
    """Label the trace ``bad_output`` when a non-empty input produced no output.

    Returns ``None`` if the task input is empty or a final output exists.
    """
    if not trace.task_input or trace.final_output:
        return None
    return Outcome(
        label="bad_output",
        reason="Task input is non-empty, but final output is empty or missing",
        labeled_by="rule",
        rule_id="R_EMPTY_OUTPUT",
    )
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def R_DEFAULT_SUCCESS(trace: Trace) -> Outcome | None:
    """Label the trace ``success`` whenever it has a non-empty final output.

    Returns ``None`` when the final output is empty or missing.
    """
    if not trace.final_output:
        return None
    return Outcome(
        label="success",
        reason="Trace completed with non-empty final output",
        labeled_by="rule",
        rule_id="R_DEFAULT_SUCCESS",
    )
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def R_UNKNOWN(trace: Trace) -> Outcome | None:
    """Catch-all rule: always return an ``unknown`` outcome.

    The trace itself is not inspected.
    """
    return Outcome(
        label="unknown",
        reason="No rules matched this trace",
        labeled_by="rule",
        rule_id="R_UNKNOWN",
    )
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
# Built-in rules in evaluation order. label_trace returns the first
# non-None result, so earlier entries take precedence over later ones.
BUILTIN_RULES = [
    R_TOOL_ERROR,
    R_LLM_ERROR,
    R_LOOP,
    R_TIMEOUT,
    R_VALIDATION,
    R_EMPTY_OUTPUT,
    R_DEFAULT_SUCCESS,
    # Always returns an outcome, so it has to stay last.
    R_UNKNOWN,
]
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def load_user_rules(rules_path: str | Path) -> list[Any]:
|
|
187
|
+
path = Path(rules_path)
|
|
188
|
+
if not path.exists():
|
|
189
|
+
raise FileNotFoundError(f"Custom rules file not found: {rules_path}")
|
|
190
|
+
|
|
191
|
+
spec = importlib.util.spec_from_file_location("custom_rules", path)
|
|
192
|
+
if spec is None or spec.loader is None:
|
|
193
|
+
raise ValueError(f"Could not load custom rules from {rules_path}")
|
|
194
|
+
|
|
195
|
+
mod = importlib.util.module_from_spec(spec)
|
|
196
|
+
sys.modules["custom_rules"] = mod
|
|
197
|
+
spec.loader.exec_module(mod)
|
|
198
|
+
|
|
199
|
+
rules = getattr(mod, "RULES", [])
|
|
200
|
+
return rules
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def label_trace(
    trace: Trace,
    user_rules: list[Any] | None = None,
    loop_step_threshold: int = 25,
    timeout_s: float = 300.0,
) -> Outcome:
    """Return the outcome of the first rule that matches ``trace``.

    User rules run first, in the order given. Each may be a ``Rule``-like
    object exposing ``fn`` (and optionally ``rule_id``) or a bare callable.
    A rule that returns ``None`` is skipped; one that raises is logged and
    skipped. An ``Outcome`` result is re-issued with
    ``labeled_by="user_rule"``; any other non-``None`` result is used as the
    label. The built-in rules then run in ``BUILTIN_RULES`` order.

    Args:
        trace: Trace to label.
        user_rules: Optional custom rules tried before the built-ins.
        loop_step_threshold: Forwarded to ``R_LOOP``.
        timeout_s: Forwarded to ``R_TIMEOUT``.

    Returns:
        The first non-``None`` outcome, or an ``unknown`` outcome if every
        rule declined.
    """
    # 1. User rules take precedence over the built-ins.
    for candidate in user_rules or ():
        # Support both Rule objects and simple callables.
        fn = getattr(candidate, "fn", candidate)
        rule_id = getattr(candidate, "rule_id", getattr(fn, "__name__", "user_rule"))
        try:
            res = fn(trace)
            if res is None:
                continue
            if isinstance(res, Outcome):
                # Re-issue so the metadata marks this as a user-rule label.
                return Outcome(
                    label=res.label,
                    reason=res.reason,
                    labeled_by="user_rule",
                    rule_id=res.rule_id or rule_id,
                )
            return Outcome(
                label=res,
                reason=f"Matched custom rule '{rule_id}'",
                labeled_by="user_rule",
                rule_id=rule_id,
            )
        except Exception as e:
            logger = logging.getLogger("traceval.analyze")
            logger.error("Error executing user rule '%s': %s", rule_id, str(e))

    # 2. Built-in rules; only the loop and timeout rules take extra settings.
    for rule_fn in BUILTIN_RULES:
        if rule_fn is R_LOOP:
            extra = {"loop_step_threshold": loop_step_threshold}
        elif rule_fn is R_TIMEOUT:
            extra = {"timeout_s": timeout_s}
        else:
            extra = {}

        res = rule_fn(trace, **extra)
        if res is not None:
            return res

    return Outcome(
        label="unknown",
        reason="Default fallback",
        labeled_by="rule",
        rule_id="R_UNKNOWN",
    )
|