traceval 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
traceval/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "0.1.1"
@@ -0,0 +1,75 @@
1
+ from pathlib import Path
2
+ from typing import Any
3
+
4
+ from traceval.analyze.cluster import JaccardClusterer
5
+ from traceval.analyze.coverage import compute_coverage
6
+ from traceval.analyze.outcomes import label_trace, load_user_rules
7
+ from traceval.analyze.report import render_report
8
+ from traceval.store import TraceStore
9
+
10
+
11
def run_analysis(
    db_path: Path,
    rules_path: Path | None = None,
    evals_dir: Path | None = None,
    output_dir: Path | None = None,
) -> dict[str, Any]:
    """Label, cluster and summarise every trace in a trace store.

    Args:
        db_path: Location handed to ``TraceStore``.
        rules_path: Optional file of custom labelling rules; loaded with
            ``load_user_rules`` when given.
        evals_dir: Optional directory of eval cases, forwarded to
            ``compute_coverage``.
        output_dir: When given, the directory is created if needed and
            ``report.html`` is rendered into it.

    Returns:
        A dict with ``total_traces``, ``outcomes`` (label -> count) and
        ``clusters`` (one dict per cluster, including its coverage count).
    """
    store = TraceStore(db_path)
    try:
        user_rules = load_user_rules(rules_path) if rules_path else None

        # Materialise the listing first: traces are saved back while looping.
        for pending in list(store.list_traces()):
            pending.outcome = label_trace(pending, user_rules=user_rules)
            store.save_trace(pending)

        # Re-read so everything downstream sees the persisted labels.
        labeled_traces = list(store.list_traces())

        clusters = JaccardClusterer().cluster(labeled_traces)
        coverage = compute_coverage(clusters, evals_dir, labeled_traces)

        # Tally outcome labels; traces without an outcome count as "unknown".
        outcome_counts: dict[str, int] = {}
        for item in labeled_traces:
            key = item.outcome.label if item.outcome else "unknown"
            outcome_counts[key] = outcome_counts.get(key, 0) + 1

        cluster_rows = []
        for entry in clusters:
            cluster_rows.append(
                {
                    "id": entry.id,
                    "name": entry.name,
                    "tool_signature": entry.tool_signature,
                    "top_terms": entry.top_terms,
                    "trace_count": len(entry.trace_ids),
                    "trace_ids": entry.trace_ids,
                    "coverage_count": coverage.get(entry.id, 0),
                }
            )

        summary = {
            "total_traces": len(labeled_traces),
            "outcomes": outcome_counts,
            "clusters": cluster_rows,
        }

        # The HTML report is optional and only written on request.
        if output_dir:
            output_dir.mkdir(parents=True, exist_ok=True)
            render_report(summary, coverage, output_dir / "report.html")

        return summary
    finally:
        store.close()
@@ -0,0 +1,226 @@
1
+ import hashlib
2
+ import math
3
+ import re
4
+ from collections import Counter
5
+ from typing import Protocol
6
+
7
+ from pydantic import BaseModel
8
+
9
+ from traceval.model import Trace
10
+
11
+
12
class Cluster(BaseModel):
    """A named group of traces produced by a clusterer."""

    # Cluster identifier.
    id: str
    # Human-readable label for the cluster.
    name: str
    # Ids of the traces that belong to this cluster.
    trace_ids: list[str]
    # Tool-call signature shared by the member traces ("" when no tools ran).
    tool_signature: str
    # Highest-scoring terms used to describe the cluster.
    top_terms: list[str]
18
+
19
+
20
class Clusterer(Protocol):
    """Structural interface for anything that can group traces into clusters."""

    # Takes the traces to group and returns the resulting clusters.
    def cluster(self, traces: list[Trace]) -> list[Cluster]: ...
22
+
23
+
24
# Standard English stopwords, plus a few request verbs ("please", "show", ...)
# that carry no topical signal in task inputs.
STOPWORDS: set[str] = set(
    (
        "a an the and or but "
        "is are was were "
        "to for in of on at by with "
        "this that these those "
        "i you he she it we they "
        "my your his her its our their "
        "me him us them "
        "what where how why "
        "please can get show find"
    ).split()
)


def tokenize(text: str) -> list[str]:
    """Split *text* into lowercase word tokens with stopwords removed.

    A token is a run of ASCII letters, digits or underscores delimited by
    word boundaries; everything else acts as a separator. Order and
    duplicates are preserved.
    """
    candidates = re.findall(r"\b[a-z0-9_]+\b", text.lower())
    return [token for token in candidates if token not in STOPWORDS]
83
+
84
+
85
def get_ngrams(tokens: list[str], max_n: int = 3) -> set[tuple[str, ...]]:
    """Return the set of all contiguous n-grams of *tokens* for n = 1..max_n.

    The largest n is capped at ``len(tokens)``, so short inputs simply yield
    fewer sizes. Each n-gram is a tuple of tokens; duplicates collapse
    because the result is a set.
    """
    total = len(tokens)
    largest = min(max_n, total)
    return {
        tuple(tokens[start : start + size])
        for size in range(1, largest + 1)
        for start in range(total - size + 1)
    }
92
+
93
+
94
def get_tool_signature(trace: Trace) -> str:
    """Return the trace's tool calls as a ``>``-joined string of tool names.

    Only steps whose ``kind`` is ``"tool"`` and that carry a tool payload
    contribute, in step order. A trace with no such steps yields ``""``.
    """
    return ">".join(
        step.tool.name
        for step in trace.steps
        if step.kind == "tool" and step.tool
    )
100
+
101
+
102
class JaccardClusterer:
    """Cluster traces by tool signature, then by task-input similarity.

    Traces are first partitioned by their exact tool signature. Inside each
    partition a greedy pass assigns every trace to the first existing cluster
    whose seed trace has an n-gram Jaccard similarity of at least
    ``jaccard_threshold``; otherwise the trace seeds a new cluster.
    """

    def __init__(self, jaccard_threshold: float = 0.35) -> None:
        self.jaccard_threshold = jaccard_threshold

    @staticmethod
    def _similarity(left: set[tuple[str, ...]], right: set[tuple[str, ...]]) -> float:
        """Jaccard index of two n-gram sets; 0.0 when either one is empty."""
        if not left or not right:
            return 0.0
        return len(left & right) / len(left | right)

    def _greedy_split(self, members: list[Trace]) -> list[list[Trace]]:
        """Split one signature group into similarity clusters, first match wins."""
        buckets: list[list[Trace]] = []
        seeds: list[set[tuple[str, ...]]] = []

        # Visiting traces in trace_id order keeps the result deterministic.
        for candidate in sorted(members, key=lambda t: t.trace_id):
            grams = get_ngrams(tokenize(candidate.task_input))
            home = next(
                (
                    bucket
                    for bucket, seed in zip(buckets, seeds)
                    if self._similarity(grams, seed) >= self.jaccard_threshold
                ),
                None,
            )
            if home is None:
                # No cluster is close enough: this trace becomes a new seed.
                buckets.append([candidate])
                seeds.append(grams)
            else:
                home.append(candidate)
        return buckets

    def cluster(self, traces: list[Trace]) -> list[Cluster]:
        """Group *traces* into named clusters, returned sorted by cluster id.

        Each cluster is named from its top three TF-IDF terms, its tool
        signature, and its dominant outcome label when that is not
        ``"success"``. The id is ``c_`` plus the first 8 hex digits of the
        SHA-256 of the comma-joined, sorted member trace ids.
        """
        if not traces:
            return []

        # 1. Partition by exact tool signature.
        by_signature: dict[str, list[Trace]] = {}
        for trace in traces:
            by_signature.setdefault(get_tool_signature(trace), []).append(trace)

        # 2. Greedy Jaccard clustering inside every partition.
        grouped: list[tuple[str, list[Trace]]] = []
        for signature, members in by_signature.items():
            for bucket in self._greedy_split(members):
                grouped.append((signature, bucket))

        # 3. Term statistics, treating every cluster as one document.
        term_counts: list[Counter[str]] = [
            Counter(tok for member in bucket for tok in tokenize(member.task_input))
            for _signature, bucket in grouped
        ]
        doc_frequencies: Counter[str] = Counter()
        for counts in term_counts:
            doc_frequencies.update(counts.keys())

        # Smooth IDF: log((N + 1) / (df + 1)) + 1.
        n_docs = len(grouped)
        idfs: dict[str, float] = {
            term: math.log((n_docs + 1) / (df + 1)) + 1
            for term, df in doc_frequencies.items()
        }

        clusters: list[Cluster] = []
        for (signature, bucket), counts in zip(grouped, term_counts):
            # Raw in-cluster frequency times IDF; the sort is stable on ties.
            scores = {term: freq * idfs[term] for term, freq in counts.items()}
            ranked = sorted(scores, key=lambda term: scores[term], reverse=True)
            top_terms = ranked[:3]

            # Display form of the signature: '>' becomes ' -> '.
            pretty_signature = signature.replace(">", " -> ")

            labels = [m.outcome.label if m.outcome else "unknown" for m in bucket]
            dominant_outcome = Counter(labels).most_common(1)[0][0]

            # Keep only the non-empty name components, in a fixed order.
            components = [
                " ".join(top_terms),
                pretty_signature,
                "" if dominant_outcome == "success" else f"({dominant_outcome})",
            ]
            name = " -> ".join(part for part in components if part) or "unclassified"

            # Content-addressed id: stable for a given set of member traces.
            member_ids = sorted(m.trace_id for m in bucket)
            digest = hashlib.sha256(",".join(member_ids).encode("utf-8")).hexdigest()

            clusters.append(
                Cluster(
                    id=f"c_{digest[:8]}",
                    name=name,
                    trace_ids=member_ids,
                    tool_signature=signature,
                    top_terms=top_terms,
                )
            )

        # Sort by id for deterministic output.
        clusters.sort(key=lambda item: item.id)
        return clusters
@@ -0,0 +1,84 @@
1
+ from pathlib import Path
2
+
3
+ import yaml
4
+
5
+ from traceval.analyze.cluster import Cluster, get_ngrams, tokenize
6
+ from traceval.model import Trace
7
+
8
+
9
def compute_coverage(
    clusters: list[Cluster],
    evals_dir: Path | None,
    traces: list[Trace],
    similarity_threshold: float = 0.35,
) -> dict[str, int]:
    """Count how many eval cases map onto each cluster.

    Every ``*.yaml`` file under *evals_dir* (recursively, except files named
    ``traceval.yaml``) that parses to a mapping with an ``input`` key is one
    eval case. A case is attributed to at most one cluster, trying in order:

    1. its ``cluster`` field, when that names a known cluster id;
    2. otherwise its ``source_trace_id``, when some cluster contains it;
    3. otherwise the cluster owning the trace whose task input is most
       similar (n-gram Jaccard) to the case input, provided the best
       similarity reaches *similarity_threshold*.

    Args:
        clusters: Clusters to compute coverage for.
        evals_dir: Directory holding eval YAML files; ``None`` or a missing
            directory yields all-zero coverage.
        traces: Traces referenced by the clusters' ``trace_ids``.
        similarity_threshold: Minimum Jaccard similarity for the fallback
            match in step 3.

    Returns:
        Mapping of cluster id to the number of eval cases attributed to it.
    """
    import logging  # local import: nothing else in this module needs it

    coverage: dict[str, int] = {c.id: 0 for c in clusters}
    if not evals_dir or not evals_dir.exists():
        return coverage

    logger = logging.getLogger(__name__)

    # 1. Collect eval cases. Unreadable or invalid files are skipped on
    #    purpose (best effort), but reported instead of vanishing silently.
    eval_cases = []
    for p in evals_dir.rglob("*.yaml"):
        # Skip potential metadata yaml like traceval.yaml
        if p.name == "traceval.yaml":
            continue
        try:
            with open(p, encoding="utf-8") as f:
                data = yaml.safe_load(f)
        except Exception as exc:
            logger.warning("Skipping unreadable eval file %s: %s", p, exc)
            continue
        if isinstance(data, dict) and "input" in data:
            eval_cases.append(data)

    # 2. Lookup tables shared by all cases.
    traces_by_id = {t.trace_id: t for t in traces}
    # setdefault keeps the first cluster that lists a trace id.
    cluster_of_trace: dict[str, str] = {}
    for c in clusters:
        for tid in c.trace_ids:
            cluster_of_trace.setdefault(tid, c.id)
    # Trace n-grams are computed lazily and reused across eval cases.
    ngram_cache: dict[str, set[tuple[str, ...]]] = {}

    for case in eval_cases:
        mapped_cluster_id = None

        # YAML values can be of any type; only strings can name a cluster or
        # a trace, and the isinstance checks avoid hashing lists or dicts.
        case_cluster = case.get("cluster")
        if case_cluster and isinstance(case_cluster, str) and case_cluster in coverage:
            # Scenario A: explicit 'cluster' field.
            mapped_cluster_id = case_cluster
        elif "source_trace_id" in case:
            # Scenario B: the cluster that contains the source trace.
            source_tid = case["source_trace_id"]
            if isinstance(source_tid, str):
                mapped_cluster_id = cluster_of_trace.get(source_tid)

        # Scenario C: fall back to input similarity.
        if not mapped_cluster_id:
            raw_input = case.get("input", "")
            if raw_input is None:
                case_input = ""
            elif isinstance(raw_input, str):
                case_input = raw_input
            else:
                # e.g. a number or mapping in the YAML: compare its text form.
                case_input = str(raw_input)
            case_ngrams = get_ngrams(tokenize(case_input))

            best_sim = -1.0
            best_cid = None
            for c in clusters:
                for tid in c.trace_ids:
                    t = traces_by_id.get(tid)
                    if not t:
                        continue
                    if tid not in ngram_cache:
                        ngram_cache[tid] = get_ngrams(tokenize(t.task_input))
                    t_ngrams = ngram_cache[tid]
                    if not case_ngrams or not t_ngrams:
                        sim = 0.0
                    else:
                        sim = len(case_ngrams & t_ngrams) / len(case_ngrams | t_ngrams)
                    # Strict '>' keeps the earliest trace on ties.
                    if sim > best_sim:
                        best_sim = sim
                        best_cid = c.id
            if best_sim >= similarity_threshold:
                mapped_cluster_id = best_cid

        if mapped_cluster_id:
            coverage[mapped_cluster_id] += 1

    return coverage
@@ -0,0 +1,254 @@
1
+ import importlib.util
2
+ import logging
3
+ import re
4
+ import sys
5
+ from collections.abc import Callable
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from traceval.model import Outcome, Trace
10
+
11
+
12
class Rule:
    """A labelling rule: an id, a description and the callable that applies it.

    ``fn`` receives a trace and returns an ``Outcome`` when the rule matches,
    or ``None`` when it does not.
    """

    def __init__(
        self,
        rule_id: str,
        description: str,
        fn: Callable[[Trace], Outcome | None],
    ) -> None:
        self.rule_id, self.description, self.fn = rule_id, description, fn
22
+
23
+
24
def R_TOOL_ERROR(trace: Trace) -> Outcome | None:
    """Label the trace ``tool_error`` if any tool step reports an error.

    Returns ``None`` when no tool step carries an error. The reason names
    the first failing tool, in step order.
    """
    failed_tool = next(
        (
            step.tool
            for step in trace.steps
            if step.kind == "tool" and step.tool and step.tool.error
        ),
        None,
    )
    if failed_tool is None:
        return None
    return Outcome(
        label="tool_error",
        reason=f"Tool '{failed_tool.name}' failed with error: {failed_tool.error}",
        labeled_by="rule",
        rule_id="R_TOOL_ERROR",
    )
34
+
35
+
36
def R_LLM_ERROR(trace: Trace) -> Outcome | None:
    """Label the trace ``bad_output`` if any LLM step reports an error.

    Returns ``None`` when no LLM step carries an error. The reason cites the
    index and error of the first failing LLM step.
    """
    culprit = next(
        (
            step
            for step in trace.steps
            if step.kind == "llm" and step.llm and step.llm.error
        ),
        None,
    )
    if culprit is None:
        return None
    return Outcome(
        label="bad_output",
        reason=(
            f"LLM call in step {culprit.index} failed with error: {culprit.llm.error}"
        ),
        labeled_by="rule",
        rule_id="R_LLM_ERROR",
    )
48
+
49
+
50
def R_LOOP(trace: Trace, loop_step_threshold: int = 25) -> Outcome | None:
    """Label the trace ``loop`` when it looks stuck.

    Two signals are checked, in order:

    1. the trace has more than *loop_step_threshold* steps;
    2. the same tool is called with identical ``arguments_json`` three times
       in a row, with no other kind of step in between.

    Returns ``None`` when neither signal fires.
    """
    total_steps = len(trace.steps)
    if total_steps > loop_step_threshold:
        return Outcome(
            label="loop",
            reason=(
                f"Total steps ({total_steps}) exceeded loop step "
                f"threshold ({loop_step_threshold})"
            ),
            labeled_by="rule",
            rule_id="R_LOOP",
        )

    streak = 0
    prev_name = None
    prev_args = None

    for step in trace.steps:
        if not (step.kind == "tool" and step.tool):
            # Any non-tool step breaks a run of identical calls.
            streak, prev_name, prev_args = 0, None, None
            continue

        name = step.tool.name
        args = step.tool.arguments_json
        if name != prev_name or args != prev_args:
            # A different call starts a fresh run of length one.
            streak, prev_name, prev_args = 1, name, args
            continue

        streak += 1
        if streak >= 3:
            return Outcome(
                label="loop",
                reason=(
                    f"Tool '{name}' called with identical "
                    f"arguments {streak} times consecutively"
                ),
                labeled_by="rule",
                rule_id="R_LOOP",
            )

    return None
95
+
96
+
97
def R_TIMEOUT(trace: Trace, timeout_s: float = 300.0) -> Outcome | None:
    """Label the trace ``timeout`` if it never ended or ran longer than allowed.

    A trace without ``ended_at`` is treated as timed out. Otherwise the
    elapsed time between ``started_at`` and ``ended_at`` must be strictly
    greater than *timeout_s* for the rule to fire; ``None`` is returned when
    the trace finished within the limit.
    """
    if trace.ended_at:
        elapsed = (trace.ended_at - trace.started_at).total_seconds()
        if not elapsed > timeout_s:
            return None
        why = f"Trace duration ({elapsed:.1f}s) exceeded limit of {timeout_s}s"
    else:
        why = "Trace has not completed (ended_at is missing)"

    return Outcome(
        label="timeout",
        reason=why,
        labeled_by="rule",
        rule_id="R_TIMEOUT",
    )
115
+
116
+
117
# Case-insensitive signatures of schema/validation failures, searched for in
# a trace's final output.
DEFAULT_VALIDATION_REGEXES = [
    re.compile(source, re.IGNORECASE)
    for source in (
        r"validation\s+error",
        r"field\s+required",
        r"value\s+is\s+not\s+a\s+valid",
        r"json-schema",
        r"json\s+schema\s+error",
    )
]
124
+
125
+
126
def R_VALIDATION(trace: Trace) -> Outcome | None:
    """Label the trace ``validation_error`` if its output looks like one.

    The final output is searched with each pattern in
    ``DEFAULT_VALIDATION_REGEXES``; the first pattern that matches is quoted
    in the reason. Returns ``None`` for an empty output or when nothing
    matches.
    """
    output = trace.final_output
    if not output:
        return None

    matched = next(
        (rx for rx in DEFAULT_VALIDATION_REGEXES if rx.search(output)),
        None,
    )
    if matched is None:
        return None

    return Outcome(
        label="validation_error",
        reason=(
            f"Final output matched validation signature pattern: "
            f"'{matched.pattern}'"
        ),
        labeled_by="rule",
        rule_id="R_VALIDATION",
    )
141
+
142
+
143
def R_EMPTY_OUTPUT(trace: Trace) -> Outcome | None:
    """Label the trace ``bad_output`` when it had input but produced nothing.

    Returns ``None`` when the task input is empty or a final output exists.
    """
    if not trace.task_input or trace.final_output:
        return None
    return Outcome(
        label="bad_output",
        reason="Task input is non-empty, but final output is empty or missing",
        labeled_by="rule",
        rule_id="R_EMPTY_OUTPUT",
    )
152
+
153
+
154
def R_DEFAULT_SUCCESS(trace: Trace) -> Outcome | None:
    """Label the trace ``success`` when it has a non-empty final output.

    Returns ``None`` when the final output is empty or missing.
    """
    if not trace.final_output:
        return None
    return Outcome(
        label="success",
        reason="Trace completed with non-empty final output",
        labeled_by="rule",
        rule_id="R_DEFAULT_SUCCESS",
    )
163
+
164
+
165
def R_UNKNOWN(trace: Trace) -> Outcome | None:
    """Catch-all rule: always label the trace ``unknown``.

    The trace itself is not inspected.
    """
    catch_all = Outcome(
        label="unknown",
        reason="No rules matched this trace",
        labeled_by="rule",
        rule_id="R_UNKNOWN",
    )
    return catch_all
172
+
173
+
174
# Built-in rules in priority order: label_trace walks this list and returns
# the first non-None outcome, so earlier entries win. R_UNKNOWN always
# returns an outcome and therefore has to stay last.
BUILTIN_RULES = [
    R_TOOL_ERROR,
    R_LLM_ERROR,
    R_LOOP,
    R_TIMEOUT,
    R_VALIDATION,
    R_EMPTY_OUTPUT,
    R_DEFAULT_SUCCESS,
    R_UNKNOWN,
]
184
+
185
+
186
def load_user_rules(rules_path: str | Path) -> list[Any]:
    """Load custom labelling rules from a Python source file.

    The file is executed as a module registered under the name
    ``custom_rules`` and its module-level ``RULES`` attribute is returned.

    NOTE: this runs arbitrary code from *rules_path*; only load files you
    trust.

    Args:
        rules_path: Path to the Python file defining ``RULES``.

    Returns:
        A new list holding the entries of ``RULES``; empty when the module
        defines no ``RULES`` (or sets it to ``None``).

    Raises:
        FileNotFoundError: If *rules_path* does not exist.
        ValueError: If no import spec/loader can be created for the file.
        TypeError: If ``RULES`` is a string or is not iterable.
        Exception: Whatever the rules file itself raises while executing.
    """
    path = Path(rules_path)
    if not path.exists():
        raise FileNotFoundError(f"Custom rules file not found: {rules_path}")

    spec = importlib.util.spec_from_file_location("custom_rules", path)
    if spec is None or spec.loader is None:
        raise ValueError(f"Could not load custom rules from {rules_path}")

    mod = importlib.util.module_from_spec(spec)
    # Register before executing so the module can be found by name while it
    # runs, but never leave a half-initialised module behind on failure.
    previous = sys.modules.get("custom_rules")
    sys.modules["custom_rules"] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        if previous is None:
            sys.modules.pop("custom_rules", None)
        else:
            sys.modules["custom_rules"] = previous
        raise

    rules = getattr(mod, "RULES", None)
    if rules is None:
        return []
    # A string is iterable but would be split into characters; reject it
    # here instead of failing obscurely when the rules are applied.
    if isinstance(rules, (str, bytes)):
        raise TypeError(
            f"RULES in {rules_path} must be a list of rules, "
            f"got {type(rules).__name__}"
        )
    try:
        return list(rules)
    except TypeError as exc:
        raise TypeError(
            f"RULES in {rules_path} must be a list of rules, "
            f"got {type(rules).__name__}"
        ) from exc
201
+
202
+
203
def label_trace(
    trace: Trace,
    user_rules: list[Any] | None = None,
    loop_step_threshold: int = 25,
    timeout_s: float = 300.0,
) -> Outcome:
    """Assign an outcome to *trace*: user rules first, then built-in rules.

    Each user rule may be a ``Rule``-like object (its ``fn`` is called) or a
    bare callable. The first rule that returns a non-``None`` value decides
    the outcome, which is always re-wrapped with ``labeled_by="user_rule"``;
    a non-``Outcome`` return value is used as the label itself. A user rule
    that raises is logged and skipped.

    If no user rule matches, ``BUILTIN_RULES`` are tried in order, with
    *loop_step_threshold* and *timeout_s* forwarded to ``R_LOOP`` and
    ``R_TIMEOUT`` respectively.
    """
    # 1. User rules take precedence over the built-in ones.
    for rule in user_rules or ():
        # Accept both Rule objects and plain callables.
        fn = getattr(rule, "fn", rule)
        fallback_id = getattr(fn, "__name__", "user_rule")
        rule_id = getattr(rule, "rule_id", fallback_id)
        try:
            verdict = fn(trace)
            if verdict is None:
                continue
            if isinstance(verdict, Outcome):
                label = verdict.label
                reason = verdict.reason
                effective_id = verdict.rule_id or rule_id
            else:
                label = verdict
                reason = f"Matched custom rule '{rule_id}'"
                effective_id = rule_id
            # Rebuild the outcome so the user-rule metadata is always set.
            return Outcome(
                label=label,
                reason=reason,
                labeled_by="user_rule",
                rule_id=effective_id,
            )
        except Exception as e:
            # A broken user rule must not abort labelling; log and move on.
            logging.getLogger("traceval.analyze").error(
                "Error executing user rule '%s': %s", rule_id, str(e)
            )

    # 2. Built-in rules; only the loop and timeout rules take extra settings.
    extra_kwargs = {
        R_LOOP: {"loop_step_threshold": loop_step_threshold},
        R_TIMEOUT: {"timeout_s": timeout_s},
    }
    for rule_fn in BUILTIN_RULES:
        outcome = rule_fn(trace, **extra_kwargs.get(rule_fn, {}))
        if outcome is not None:
            return outcome

    # Defensive fallback in case no built-in rule produced an outcome.
    return Outcome(
        label="unknown",
        reason="Default fallback",
        labeled_by="rule",
        rule_id="R_UNKNOWN",
    )