cane-eval 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cane_eval/__init__.py +44 -0
- cane_eval/cli.py +610 -0
- cane_eval/engine.py +358 -0
- cane_eval/export.py +250 -0
- cane_eval/integrations/__init__.py +26 -0
- cane_eval/integrations/_base.py +143 -0
- cane_eval/integrations/fastapi_agent.py +216 -0
- cane_eval/integrations/langchain.py +151 -0
- cane_eval/integrations/llamaindex.py +122 -0
- cane_eval/integrations/openai_compat.py +234 -0
- cane_eval/judge.py +268 -0
- cane_eval/mining.py +348 -0
- cane_eval/rca.py +425 -0
- cane_eval/suite.py +249 -0
- cane_eval-0.1.0.dist-info/METADATA +469 -0
- cane_eval-0.1.0.dist-info/RECORD +19 -0
- cane_eval-0.1.0.dist-info/WHEEL +4 -0
- cane_eval-0.1.0.dist-info/entry_points.txt +2 -0
- cane_eval-0.1.0.dist-info/licenses/LICENSE +21 -0
cane_eval/__init__.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cane-eval -- LLM-as-Judge evaluation for AI agents.
|
|
3
|
+
|
|
4
|
+
Open-source eval toolkit: YAML test suites, Claude-powered judging,
|
|
5
|
+
regression diffs, failure mining, root cause analysis, and training data export.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
# Package version string, exposed as ``cane_eval.__version__``.
__version__ = "0.1.0"
|
|
9
|
+
|
|
10
|
+
from cane_eval.suite import TestSuite, TestCase
|
|
11
|
+
from cane_eval.judge import Judge, JudgeResult, CriteriaScore
|
|
12
|
+
from cane_eval.engine import EvalRunner, EvalResult, RunSummary
|
|
13
|
+
from cane_eval.export import Exporter
|
|
14
|
+
from cane_eval.mining import FailureMiner
|
|
15
|
+
from cane_eval.rca import RootCauseAnalyzer, RCAResult, TargetedRCAResult
|
|
16
|
+
|
|
17
|
+
# Integrations (lazy-loaded to avoid import errors if frameworks not installed)
|
|
18
|
+
from cane_eval.integrations import (
|
|
19
|
+
evaluate_langchain,
|
|
20
|
+
evaluate_llamaindex,
|
|
21
|
+
evaluate_openai,
|
|
22
|
+
evaluate_fastapi,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
# Public API of the package: the names ``from cane_eval import *`` exports.
# Every entry corresponds to a name imported at the top of this module.
__all__ = [
    "TestSuite",
    "TestCase",
    "Judge",
    "JudgeResult",
    "CriteriaScore",
    "EvalRunner",
    "EvalResult",
    "RunSummary",
    "Exporter",
    "FailureMiner",
    "RootCauseAnalyzer",
    "RCAResult",
    "TargetedRCAResult",
    # Integrations
    "evaluate_langchain",
    "evaluate_llamaindex",
    "evaluate_openai",
    "evaluate_fastapi",
]
|
cane_eval/cli.py
ADDED
|
@@ -0,0 +1,610 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cli.py -- Command-line interface for cane-eval.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
cane-eval run tests.yaml
|
|
6
|
+
cane-eval run tests.yaml --model claude-sonnet-4-5-20250929
|
|
7
|
+
cane-eval run tests.yaml --tags policy,returns
|
|
8
|
+
cane-eval run tests.yaml --export dpo --output training.jsonl
|
|
9
|
+
cane-eval run tests.yaml --mine --mine-threshold 60
|
|
10
|
+
cane-eval diff results_v1.json results_v2.json
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import argparse
|
|
14
|
+
import json
|
|
15
|
+
import sys
|
|
16
|
+
import os
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# ---- Color helpers (no dependency needed) ----
|
|
21
|
+
|
|
22
|
+
# ANSI escape sequences keyed by the style names that ``c`` accepts.
# "reset" restores the terminal's default rendition.
COLORS = dict(
    red="\033[91m",
    green="\033[92m",
    yellow="\033[93m",
    blue="\033[94m",
    magenta="\033[95m",
    cyan="\033[96m",
    bold="\033[1m",
    dim="\033[2m",
    reset="\033[0m",
)
|
|
33
|
+
|
|
34
|
+
def _supports_color():
    """Decide whether ANSI color codes should be written.

    A non-empty ``NO_COLOR`` environment variable disables color and takes
    precedence. Otherwise a non-empty ``FORCE_COLOR`` enables it. With
    neither set, color is used only when ``sys.stdout`` has an ``isatty``
    method that reports a terminal.
    """
    if os.environ.get("NO_COLOR"):
        return False
    forced = bool(os.environ.get("FORCE_COLOR"))
    return forced or (hasattr(sys.stdout, "isatty") and sys.stdout.isatty())
|
|
41
|
+
|
|
42
|
+
# Computed once when the module is imported; ``c`` reads this flag on every
# call, so later changes to the environment or to stdout are not picked up.
USE_COLOR = _supports_color()
|
|
43
|
+
|
|
44
|
+
def c(text, color):
    """Wrap *text* in the ANSI sequence for *color*, followed by a reset.

    When color output is disabled, *text* is returned unchanged. A color
    name missing from ``COLORS`` adds no prefix, but the reset suffix is
    still appended.
    """
    if USE_COLOR:
        prefix = COLORS.get(color, "")
        return f"{prefix}{text}{COLORS['reset']}"
    return text
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# ---- Output formatters ----
|
|
51
|
+
|
|
52
|
+
def _status_badge(status):
    """Return the colored badge for a result status.

    ``"pass"`` and ``"warn"`` get their own badges; every other value is
    rendered as a red FAIL badge.
    """
    known_badges = (
        ("pass", " PASS ", "green"),
        ("warn", " WARN ", "yellow"),
    )
    for name, label, shade in known_badges:
        if status == name:
            return c(label, shade)
    return c(" FAIL ", "red")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _score_color(score):
    """Format *score* with no decimals, colored by band.

    Green for 80 and above, yellow for 60 up to (not including) 80, red
    otherwise.
    """
    if score >= 80:
        shade = "green"
    elif score >= 60:
        shade = "yellow"
    else:
        shade = "red"
    return c(f"{score:.0f}", shade)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _bar(score, width=20):
    """Return a bracketed text progress bar for *score* on a 0-100 scale.

    The filled part is drawn with ``=`` and colored by score band (green
    for >= 80, yellow for >= 60, red below); the rest is drawn with dim
    ``-`` characters.

    Args:
        score: Numeric score; values outside 0-100 are tolerated.
        width: Number of characters between the brackets.

    Returns:
        The bar as a string, e.g. ``[=====-----]`` when color is off.
    """
    # Clamp the fill: a score above 100 or below 0 would otherwise make
    # filled + empty exceed ``width`` and the bar would overflow.
    filled = max(0, min(width, int(score / 100 * width)))
    empty = width - filled
    if score >= 80:
        color = "green"
    elif score >= 60:
        color = "yellow"
    else:
        color = "red"
    bar_str = c("=" * filled, color) + c("-" * empty, "dim")
    return f"[{bar_str}]"
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def print_result(result, index, total):
    """Print one line for a single eval result.

    The line shows the status badge, the ``index/total`` position, a score
    bar, the score and the question (cut to 70 characters plus ``...``).
    For a result whose status is ``"fail"``, the first 120 characters of
    the judge's overall reasoning follow on a second line when present.
    """
    question = result.question
    if len(question) > 70:
        question = question[:70] + "..."

    position = c(f"{index}/{total}", "dim")
    print(
        f" {_status_badge(result.status)} {position} "
        f"{_bar(result.score)} {_score_color(result.score)} {question}"
    )

    if result.status != "fail":
        return
    reasoning = result.judge_result.overall_reasoning
    if reasoning:
        print(f" {c(reasoning[:120], 'dim')}")
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def print_summary(summary):
    """Print the end-of-run report for a run summary.

    Shows a separator, the suite name with the run duration, the overall
    score as a bar and number, the passed/warned/failed counts with the
    total, and the pass rate.
    """
    print()
    print(c(" =" * 40, "dim"))
    print()

    elapsed = c(f"{summary.duration_seconds:.1f}s", "dim")
    print(f" {c(summary.suite_name, 'bold')} {elapsed}")
    print()

    # Overall score, as a 30-character bar plus the number.
    overall = summary.overall_score
    print(f" Overall: {_bar(overall, 30)} {_score_color(overall)}")
    print()

    # Pass / warn / fail tallies.
    tally = " ".join((
        c(f"{summary.passed} passed", "green"),
        c(f"{summary.warned} warned", "yellow"),
        c(f"{summary.failed} failed", "red"),
    ))
    print(f" {tally} ({summary.total} total)")
    rate = c(f"{summary.pass_rate:.0f}%", "bold")
    print(f" Pass rate: {rate}")
    print()
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def print_diff(old_results, new_results):
    """Print a regression diff between two lists of result dicts.

    Results are matched by their ``"question"`` value. A question present
    in both runs counts as a regression when ``"overall_score"`` dropped by
    more than 5 points and as an improvement when it rose by more than 5;
    smaller changes are not listed. Questions found in only one run are
    listed as new or removed.
    """
    def _clip(text):
        # Keep questions to 55 characters so each entry fits on one line.
        return text[:55] + "..." if len(text) > 55 else text

    old_by_q = {entry["question"]: entry for entry in old_results}
    new_by_q = {entry["question"]: entry for entry in new_results}

    regressions, improvements, new_cases, removed_cases = [], [], [], []

    # Old questions first (in order), then questions only in the new run.
    for question in list({**old_by_q, **new_by_q}):
        before = old_by_q.get(question)
        after = new_by_q.get(question)

        if before and not after:
            removed_cases.append(question)
            continue
        if after and not before:
            new_cases.append((question, after))
            continue

        before_score = before["overall_score"]
        after_score = after["overall_score"]
        change = after_score - before_score
        if change < -5:
            regressions.append((question, before_score, after_score, change))
        elif change > 5:
            improvements.append((question, before_score, after_score, change))

    print()
    print(c(" Regression Diff", "bold"))
    print(c(" " + "-" * 60, "dim"))

    # Regressions are listed worst drop first, improvements biggest gain first.
    scored_sections = (
        (regressions, "Regressions", "red", lambda row: row[3]),
        (improvements, "Improvements", "green", lambda row: -row[3]),
    )
    for rows, title, shade, order in scored_sections:
        if not rows:
            continue
        print()
        print(f" {c(f'{len(rows)} {title}', shade)}")
        for question, was, now, change in sorted(rows, key=order):
            print(f" {c(f'{change:+.0f}', shade)} {was:.0f} -> {now:.0f} {_clip(question)}")

    if new_cases:
        print()
        print(f" {c(f'{len(new_cases)} New', 'cyan')}")
        for question, entry in new_cases:
            print(f" {_score_color(entry['overall_score'])} {_clip(question)}")

    if removed_cases:
        print()
        print(f" {c(f'{len(removed_cases)} Removed', 'dim')}")
        for question in removed_cases:
            print(f" {c('--', 'dim')} {_clip(question)}")

    if not (regressions or improvements or new_cases or removed_cases):
        print(f"\n {c('No significant changes detected.', 'dim')}")

    print()
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def print_mining_result(mining_result):
    """Print the failure-mining summary.

    Shows how many examples were mined out of how many failures, then one
    line per failure type (most frequent first) with its count and its
    share of the mined examples.
    """
    print()
    print(c(" Failure Mining", "bold"))
    print(c(" " + "-" * 40, "dim"))
    print(f" Mined {c(str(mining_result.total_mined), 'bold')} examples from {mining_result.total_failures} failures")
    print()

    # NOTE(review): the original indentation was not recoverable from the
    # flattened source; the trailing print() is assumed to belong to this
    # ``if`` block -- confirm against the upstream file.
    if mining_result.failure_distribution:
        print(" Failure types:")
        for ftype, count in sorted(mining_result.failure_distribution.items(), key=lambda x: -x[1]):
            pct = count / mining_result.total_mined * 100 if mining_result.total_mined else 0
            # Pad the plain label before coloring it: padding the colored
            # string counts the invisible ANSI escape characters toward the
            # 30-column width and misaligns the count column.
            label = c(f"{ftype:30s}", "cyan")
            print(f" {label} {count:3d} ({pct:.0f}%)")
        print()
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _severity_badge(severity):
    """Return *severity* upper-cased, padded with spaces and colored.

    ``critical`` and ``high`` are red, ``medium`` is yellow, and ``low`` or
    any unrecognized value is dim.
    """
    if severity in ("critical", "high"):
        tone = "red"
    elif severity == "medium":
        tone = "yellow"
    else:
        tone = "dim"
    return c(f" {severity.upper()} ", tone)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _category_label(category):
    """Return *category* in cyan with underscores shown as spaces."""
    readable = " ".join(category.split("_"))
    return c(readable, "cyan")
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def print_rca_result(rca_result):
    """Print a batch root cause analysis result.

    Prints the number of analyzed failures, the average failure score and
    score range when set, the summary and top recommendation when present,
    and then each root cause with its severity badge, category, title,
    description, up to three evidence items and its recommendation.
    """
    # NOTE(review): the original indentation was not recoverable from the
    # flattened source. Nesting here is inferred; in particular, confirm
    # the placement of the two trailing print() calls against upstream.
    print()
    print(c(" Root Cause Analysis", "bold"))
    print(c(" " + "-" * 50, "dim"))
    print()
    print(f" Analyzed {c(str(rca_result.total_analyzed), 'bold')} failures")
    # A falsy average (None or 0) is skipped.
    if rca_result.avg_failure_score:
        print(f" Avg failure score: {_score_color(rca_result.avg_failure_score)}")
    # [0, 0] is skipped as well as an empty/missing range.
    if rca_result.score_range and rca_result.score_range != [0, 0]:
        print(f" Score range: {rca_result.score_range[0]:.0f} - {rca_result.score_range[1]:.0f}")
    print()

    if rca_result.summary:
        print(f" {c('Summary:', 'bold')} {rca_result.summary}")
        print()

    if rca_result.top_recommendation:
        print(f" {c('Top recommendation:', 'green')} {rca_result.top_recommendation}")
        print()

    if rca_result.root_causes:
        print(f" {c(f'{len(rca_result.root_causes)} Root Causes Found', 'bold')}")
        print()
        for i, rc in enumerate(rca_result.root_causes):
            badge = _severity_badge(rc.severity)
            cat = _category_label(rc.category)
            print(f" {i+1}. {badge} {cat} {c(rc.title, 'bold')}")
            if rc.description:
                print(f" {rc.description}")
            if rc.evidence:
                # Only the first three evidence items are shown.
                for ev in rc.evidence[:3]:
                    print(f" {c('>', 'dim')} {ev}")
            if rc.recommendation:
                print(f" {c('Fix:', 'green')} {rc.recommendation}")
            print()
    print()
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def print_targeted_rca_result(targeted):
    """Print a targeted (single result) root cause analysis.

    Prints the question (cut to 80 characters plus ``...``), its score,
    the likely cause and the confidence, followed by the diagnosis,
    contributing factors and fix actions when each is present.
    """
    # NOTE(review): the original indentation was not recoverable from the
    # flattened source. Nesting here is inferred; confirm the placement of
    # the two trailing print() calls against upstream.
    print()
    print(c(" Targeted Root Cause Analysis", "bold"))
    print(c(" " + "-" * 50, "dim"))
    print()

    q = targeted.question[:80] + "..." if len(targeted.question) > 80 else targeted.question
    print(f" Question: {q}")
    print(f" Score: {_score_color(targeted.score)}")
    print(f" Likely cause: {c(targeted.likely_cause.replace('_', ' '), 'cyan')}")
    print(f" Confidence: {c(f'{targeted.confidence}%', 'bold')}")
    print()

    if targeted.diagnosis:
        print(f" {c('Diagnosis:', 'bold')}")
        print(f" {targeted.diagnosis}")
        print()

    if targeted.contributing_factors:
        print(f" {c('Contributing factors:', 'bold')}")
        for factor in targeted.contributing_factors:
            print(f" {c('>', 'dim')} {factor}")
        print()

    if targeted.fix_actions:
        print(f" {c('Fix actions:', 'bold')}")
        for fa in targeted.fix_actions:
            # Priorities other than high/medium/low are rendered dim.
            priority_color = {"high": "red", "medium": "yellow", "low": "dim"}.get(fa.priority, "dim")
            print(f" [{c(fa.priority.upper(), priority_color)}] {fa.action} {c(f'({fa.effort})', 'dim')}")
        print()
    print()
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
# ---- Commands ----
|
|
292
|
+
|
|
293
|
+
def cmd_run(args):
    """Run an eval suite and optionally save, export and mine the results.

    Loads the YAML suite named by ``args.suite``, runs it through
    ``EvalRunner`` and prints the summary unless ``args.quiet`` is set.
    Depending on the flags it then writes the full results as JSON
    (``--output-json``), exports training data (``--export``) and mines
    failures (``--mine``).

    Always terminates the process via ``sys.exit``: status 1 when the suite
    cannot be loaded or any result failed (or warned, with
    ``--fail-on-warn``), status 0 otherwise.
    """
    from cane_eval.suite import TestSuite
    from cane_eval.engine import EvalRunner
    from cane_eval.export import Exporter

    # Load suite
    try:
        suite = TestSuite.from_yaml(args.suite)
    except FileNotFoundError:
        print(c(f" Error: Suite file not found: {args.suite}", "red"))
        sys.exit(1)
    except Exception as e:
        # Top-level CLI boundary: report any load/parse problem and stop.
        print(c(f" Error loading suite: {e}", "red"))
        sys.exit(1)

    # Override model if specified
    if args.model:
        suite.model = args.model

    print()
    print(f" {c('cane-eval', 'cyan')} {c(suite.name, 'bold')}")
    print(f" {len(suite.tests)} test cases | model: {suite.model}")
    print()

    # Parse tags. Whitespace around each tag and empty entries are dropped
    # so that "--tags 'policy, returns'" or a trailing comma still matches;
    # when nothing usable remains, no tag filter is applied.
    tags = None
    if args.tags:
        tags = [tag.strip() for tag in args.tags.split(",") if tag.strip()] or None

    # Run
    runner = EvalRunner(
        api_key=args.api_key or os.environ.get("ANTHROPIC_API_KEY"),
        model=args.model,
        verbose=not args.quiet,
        on_result=print_result if not args.quiet else None,
    )

    summary = runner.run(suite, tags=tags)

    # Print summary
    if not args.quiet:
        print_summary(summary)

    # Save results JSON
    if args.output_json:
        with open(args.output_json, "w") as f:
            json.dump(summary.to_dict(), f, indent=2)
        print(f" Results saved to {args.output_json}")

    # Export training data
    if args.export:
        exporter = Exporter(summary)
        output_path = args.output or f"eval_{args.export}.jsonl"
        exporter.to_file(output_path, format=args.export)
        print(f" Exported {args.export} data to {output_path}")

    # Mine failures
    if args.mine:
        from cane_eval.mining import FailureMiner

        miner = FailureMiner(
            api_key=args.api_key or os.environ.get("ANTHROPIC_API_KEY"),
            model=args.model or suite.model,
            verbose=not args.quiet,
        )

        mining_result = miner.mine(
            summary,
            max_score=args.mine_threshold,
            max_examples=args.mine_max,
        )

        if not args.quiet:
            print_mining_result(mining_result)

        if mining_result.total_mined > 0:
            mine_output = args.mine_output or "mined_dpo.jsonl"
            mining_result.to_file(mine_output, format=args.mine_format)
            print(f" Mined data saved to {mine_output}")

    # Exit code based on failures
    if args.fail_on_warn:
        sys.exit(1 if (summary.failed > 0 or summary.warned > 0) else 0)
    else:
        sys.exit(1 if summary.failed > 0 else 0)
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def cmd_diff(args):
    """Compare two eval result files and print the regression diff.

    Each file (``args.old``, ``args.new``) may hold either a JSON object
    with a ``"results"`` list or a bare JSON list of result dicts.

    Exits with status 1 when a file is missing or is not valid JSON.
    """
    try:
        with open(args.old, "r", encoding="utf-8") as f:
            old_data = json.load(f)
        with open(args.new, "r", encoding="utf-8") as f:
            new_data = json.load(f)
    except FileNotFoundError as e:
        print(c(f" Error: {e}", "red"))
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(c(f" Error: invalid JSON: {e}", "red"))
        sys.exit(1)

    # Check for a bare list before calling .get(): lists have no .get(), so
    # the previous "list" fallback inside the .get() default never ran.
    old_results = old_data if isinstance(old_data, list) else old_data.get("results", [])
    new_results = new_data if isinstance(new_data, list) else new_data.get("results", [])

    print_diff(old_results, new_results)
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
def cmd_rca(args):
    """Run root cause analysis on eval failures.

    Loads the YAML suite named by ``args.suite``. The results to analyze
    come either from an existing results JSON (``--results``; an object
    with a ``"results"`` list or a bare list of result dicts) or from
    running the suite first. ``RootCauseAnalyzer`` then analyzes results
    scoring at or below ``--threshold``; ``--targeted`` additionally
    analyzes the worst failures one by one, and ``--output`` saves the
    batch result as JSON.

    Always terminates the process via ``sys.exit``: status 1 when the suite
    or results file cannot be loaded or any root cause has severity
    ``"critical"``, status 0 otherwise.
    """
    from cane_eval.suite import TestSuite
    from cane_eval.engine import EvalRunner
    from cane_eval.rca import RootCauseAnalyzer

    # Load suite
    try:
        suite = TestSuite.from_yaml(args.suite)
    except FileNotFoundError:
        print(c(f" Error: Suite file not found: {args.suite}", "red"))
        sys.exit(1)
    except Exception as e:
        # Top-level CLI boundary: report any load/parse problem and stop.
        print(c(f" Error loading suite: {e}", "red"))
        sys.exit(1)

    # Override model if specified
    if args.model:
        suite.model = args.model

    print()
    print(f" {c('cane-eval rca', 'cyan')} {c(suite.name, 'bold')}")
    print(f" {len(suite.tests)} test cases | model: {suite.model}")
    print()

    # Parse tags. Whitespace around each tag and empty entries are dropped;
    # when nothing usable remains, no tag filter is applied.
    tags = None
    if args.tags:
        tags = [tag.strip() for tag in args.tags.split(",") if tag.strip()] or None

    # If results JSON provided, load from file instead of running
    if args.results:
        try:
            with open(args.results, "r", encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            print(c(f" Error: Results file not found: {args.results}", "red"))
            sys.exit(1)
        except json.JSONDecodeError as e:
            print(c(f" Error: invalid JSON in {args.results}: {e}", "red"))
            sys.exit(1)

        from cane_eval.engine import EvalResult as ER
        from cane_eval.judge import JudgeResult, CriteriaScore

        # Check for a bare list before calling .get(): lists have no
        # .get(), so the previous list fallback could never be reached.
        if isinstance(data, list):
            results_list = data
            suite_name = "loaded"
        else:
            results_list = data.get("results", [])
            suite_name = data.get("suite_name", "loaded")

        eval_results = []
        for r in results_list:
            # Saved results carry only a name -> score mapping, so the
            # per-criterion reasoning is rebuilt as an empty string.
            criteria_scores = [
                CriteriaScore(name=name, score=float(score_val), reasoning="")
                for name, score_val in (r.get("criteria_scores") or {}).items()
            ]
            jr = JudgeResult(
                overall_score=r.get("overall_score", 0),
                overall_reasoning=r.get("judge_reasoning", ""),
                status=r.get("status", "fail"),
                criteria_scores=criteria_scores,
            )
            eval_results.append(ER(
                question=r.get("question", ""),
                expected_answer=r.get("expected_answer", ""),
                agent_answer=r.get("agent_answer", ""),
                judge_result=jr,
                tags=r.get("tags", []),
            ))

        from cane_eval.engine import RunSummary
        summary = RunSummary(
            suite_name=suite_name,
            total=len(eval_results),
            results=eval_results,
        )
    else:
        # Run the eval first
        print(" Running eval first...")
        print()

        runner = EvalRunner(
            api_key=args.api_key or os.environ.get("ANTHROPIC_API_KEY"),
            model=args.model,
            verbose=not args.quiet,
            on_result=print_result if not args.quiet else None,
        )

        summary = runner.run(suite, tags=tags)

        if not args.quiet:
            print_summary(summary)

    # Now run RCA
    analyzer = RootCauseAnalyzer(
        api_key=args.api_key or os.environ.get("ANTHROPIC_API_KEY"),
        model=args.model or suite.model,
        verbose=not args.quiet,
    )

    print(f" {c('Running root cause analysis...', 'cyan')}")
    print()

    rca_result = analyzer.analyze(
        summary,
        max_score=args.threshold,
        max_failures=args.max_failures,
    )

    if not args.quiet:
        print_rca_result(rca_result)

    # Optionally run targeted analysis on each failure
    if args.targeted:
        failures = [r for r in summary.results if r.score <= args.threshold]
        failures.sort(key=lambda r: r.score)  # lowest score first
        targeted_limit = min(len(failures), args.targeted_max)

        if failures:
            print(f" {c(f'Running targeted analysis on {targeted_limit} worst failures...', 'cyan')}")
            print()

            for r in failures[:targeted_limit]:
                targeted = analyzer.analyze_result(r)
                if not args.quiet:
                    print_targeted_rca_result(targeted)

    # Save results JSON
    if args.output:
        with open(args.output, "w") as f:
            json.dump(rca_result.to_dict(), f, indent=2)
        print(f" Results saved to {args.output}")

    # Exit code based on critical root causes
    critical_count = sum(1 for rc in rca_result.root_causes if rc.severity == "critical")
    if critical_count > 0:
        sys.exit(1)
    sys.exit(0)
|
|
524
|
+
|
|
525
|
+
|
|
526
|
+
def cmd_validate(args):
    """Validate a test suite YAML file and print a short description.

    Prints the suite name, the number of test cases and criteria, the
    number of custom rules when there are any, and the target when its
    type is not ``"callable"``. Any exception raised while loading or
    describing the suite is reported as ``Invalid`` and the process exits
    with status 1.
    """
    from cane_eval.suite import TestSuite

    try:
        loaded = TestSuite.from_yaml(args.suite)
        print(f" {c('Valid', 'green')} {loaded.name}")
        print(f" {len(loaded.tests)} test cases, {len(loaded.criteria)} criteria")
        rules = loaded.custom_rules
        if rules:
            print(f" {len(rules)} custom rules")
        target = loaded.target
        if target.type != "callable":
            print(f" Target: {target.type} ({target.url or target.command})")
    except Exception as err:
        print(c(f" Invalid: {err}", "red"))
        sys.exit(1)
|
|
541
|
+
|
|
542
|
+
|
|
543
|
+
# ---- Main ----
|
|
544
|
+
|
|
545
|
+
def main():
    """Parse the command line and dispatch to the chosen sub-command.

    Sub-commands are ``run``, ``diff``, ``rca`` and ``validate``. When no
    sub-command is given, the help text is printed and the process exits
    with status 0.
    """
    parser = argparse.ArgumentParser(
        prog="cane-eval",
        description="LLM-as-Judge evaluation for AI agents",
    )
    parser.add_argument("--version", action="version", version="%(prog)s 0.1.0")

    commands = parser.add_subparsers(dest="command", help="Command to run")

    # run
    run_cmd = commands.add_parser("run", help="Run an eval suite")
    run_cmd.add_argument("suite", help="Path to YAML test suite")
    run_cmd.add_argument("--model", help="Override judge model")
    run_cmd.add_argument("--api-key", help="Anthropic API key (or set ANTHROPIC_API_KEY)")
    run_cmd.add_argument("--tags", help="Comma-separated tag filter")
    run_cmd.add_argument("--export", choices=["dpo", "sft", "openai", "raw"], help="Export format")
    run_cmd.add_argument("--output", help="Export output path")
    run_cmd.add_argument("--output-json", help="Save full results as JSON")
    run_cmd.add_argument("--mine", action="store_true", help="Run failure mining after eval")
    run_cmd.add_argument("--mine-threshold", type=float, default=60, help="Max score for mining (default: 60)")
    run_cmd.add_argument("--mine-max", type=int, default=100, help="Max examples to mine (default: 100)")
    run_cmd.add_argument("--mine-format", choices=["dpo", "sft"], default="dpo", help="Mining export format")
    run_cmd.add_argument("--mine-output", help="Mining export path")
    run_cmd.add_argument("--quiet", "-q", action="store_true", help="Minimal output")
    run_cmd.add_argument("--fail-on-warn", action="store_true", help="Exit 1 on warnings too")

    # diff
    diff_cmd = commands.add_parser("diff", help="Compare two eval runs (regression diff)")
    diff_cmd.add_argument("old", help="Path to older results JSON")
    diff_cmd.add_argument("new", help="Path to newer results JSON")

    # rca
    rca_cmd = commands.add_parser("rca", help="Run root cause analysis on eval failures")
    rca_cmd.add_argument("suite", help="Path to YAML test suite")
    rca_cmd.add_argument("--model", help="Override judge/analysis model")
    rca_cmd.add_argument("--api-key", help="Anthropic API key (or set ANTHROPIC_API_KEY)")
    rca_cmd.add_argument("--tags", help="Comma-separated tag filter")
    rca_cmd.add_argument("--results", help="Path to existing results JSON (skip running eval)")
    rca_cmd.add_argument("--threshold", type=float, default=60, help="Max score for analysis (default: 60)")
    rca_cmd.add_argument("--max-failures", type=int, default=30, help="Max failures to analyze (default: 30)")
    rca_cmd.add_argument("--targeted", action="store_true", help="Also run targeted analysis on each failure")
    rca_cmd.add_argument("--targeted-max", type=int, default=5, help="Max results for targeted analysis (default: 5)")
    rca_cmd.add_argument("--output", help="Save RCA results as JSON")
    rca_cmd.add_argument("--quiet", "-q", action="store_true", help="Minimal output")

    # validate
    validate_cmd = commands.add_parser("validate", help="Validate a test suite YAML")
    validate_cmd.add_argument("suite", help="Path to YAML test suite")

    args = parser.parse_args()

    # Map each sub-command name to its handler; no sub-command shows help.
    handlers = {
        "run": cmd_run,
        "diff": cmd_diff,
        "rca": cmd_rca,
        "validate": cmd_validate,
    }
    handler = handlers.get(args.command)
    if handler is None:
        parser.print_help()
        sys.exit(0)
    handler(args)
|
|
607
|
+
|
|
608
|
+
|
|
609
|
+
# Run the CLI when this file is executed as a script.
if __name__ == "__main__":
    main()
|