cane-eval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cane_eval/__init__.py ADDED
@@ -0,0 +1,44 @@
1
+ """
2
+ cane-eval -- LLM-as-Judge evaluation for AI agents.
3
+
4
+ Open-source eval toolkit: YAML test suites, Claude-powered judging,
5
+ regression diffs, failure mining, root cause analysis, and training data export.
6
+ """
7
+
8
+ __version__ = "0.1.0"
9
+
10
+ from cane_eval.suite import TestSuite, TestCase
11
+ from cane_eval.judge import Judge, JudgeResult, CriteriaScore
12
+ from cane_eval.engine import EvalRunner, EvalResult, RunSummary
13
+ from cane_eval.export import Exporter
14
+ from cane_eval.mining import FailureMiner
15
+ from cane_eval.rca import RootCauseAnalyzer, RCAResult, TargetedRCAResult
16
+
17
+ # Integrations (lazy-loaded to avoid import errors if frameworks not installed)
18
+ from cane_eval.integrations import (
19
+ evaluate_langchain,
20
+ evaluate_llamaindex,
21
+ evaluate_openai,
22
+ evaluate_fastapi,
23
+ )
24
+
25
+ __all__ = [
26
+ "TestSuite",
27
+ "TestCase",
28
+ "Judge",
29
+ "JudgeResult",
30
+ "CriteriaScore",
31
+ "EvalRunner",
32
+ "EvalResult",
33
+ "RunSummary",
34
+ "Exporter",
35
+ "FailureMiner",
36
+ "RootCauseAnalyzer",
37
+ "RCAResult",
38
+ "TargetedRCAResult",
39
+ # Integrations
40
+ "evaluate_langchain",
41
+ "evaluate_llamaindex",
42
+ "evaluate_openai",
43
+ "evaluate_fastapi",
44
+ ]
cane_eval/cli.py ADDED
@@ -0,0 +1,610 @@
1
+ """
2
+ cli.py -- Command-line interface for cane-eval.
3
+
4
+ Usage:
5
+ cane-eval run tests.yaml
6
+ cane-eval run tests.yaml --model claude-sonnet-4-5-20250929
7
+ cane-eval run tests.yaml --tags policy,returns
8
+ cane-eval run tests.yaml --export dpo --output training.jsonl
9
+ cane-eval run tests.yaml --mine --mine-threshold 60
10
+ cane-eval diff results_v1.json results_v2.json
11
+ """
12
+
13
+ import argparse
14
+ import json
15
+ import sys
16
+ import os
17
+ from pathlib import Path
18
+
19
+
20
+ # ---- Color helpers (no dependency needed) ----
21
+
22
# ANSI escape sequences keyed by the style names accepted by c().
COLORS = dict(
    red="\033[91m",
    green="\033[92m",
    yellow="\033[93m",
    blue="\033[94m",
    magenta="\033[95m",
    cyan="\033[96m",
    bold="\033[1m",
    dim="\033[2m",
    reset="\033[0m",
)
33
+
34
def _supports_color():
    """Decide whether ANSI color codes should be emitted.

    A non-empty ``NO_COLOR`` environment variable disables color and is
    checked before ``FORCE_COLOR``, whose non-empty value enables it.  With
    neither set, color is used only when ``sys.stdout`` reports a TTY.
    """
    env = os.environ
    if env.get("NO_COLOR"):
        return False
    if env.get("FORCE_COLOR"):
        return True
    stream = sys.stdout
    return hasattr(stream, "isatty") and stream.isatty()


# Evaluated once at import time; c() consults this flag on every call.
USE_COLOR = _supports_color()
43
+
44
def c(text, color):
    """Wrap ``text`` in the ANSI code for ``color`` followed by a reset.

    Returns ``text`` untouched when color output is disabled.  An unknown
    ``color`` name contributes no opening code, but the reset code is still
    appended.
    """
    if USE_COLOR:
        opening = COLORS.get(color, "")
        closing = COLORS["reset"]
        return f"{opening}{text}{closing}"
    return text
48
+
49
+
50
+ # ---- Output formatters ----
51
+
52
def _status_badge(status):
    """Return the colored badge for a result status.

    ``"pass"`` maps to a green PASS badge and ``"warn"`` to a yellow WARN
    badge; every other status is shown as a red FAIL badge.
    """
    badges = {
        "pass": (" PASS ", "green"),
        "warn": (" WARN ", "yellow"),
    }
    label, tone = badges.get(status, (" FAIL ", "red"))
    return c(label, tone)
59
+
60
+
61
def _score_color(score):
    """Format ``score`` with no decimals, colored by threshold.

    Scores of 80 and above are green, 60 and above yellow, anything lower red.
    """
    if score >= 80:
        tone = "green"
    elif score >= 60:
        tone = "yellow"
    else:
        tone = "red"
    return c(f"{score:.0f}", tone)
68
+
69
+
70
def _bar(score, width=20):
    """Return a fixed-width text progress bar for ``score``.

    Args:
        score: Value on a 0-100 scale.  Values outside that range are
            clamped so the bar never grows past ``width`` characters.
        width: Total number of cells between the brackets.

    Returns:
        A string of the form ``[====----]`` where the filled part is colored
        green (>= 80), yellow (>= 60) or red, and the remainder is dimmed.
    """
    # Clamp: without this, a score above 100 yields more than ``width`` filled
    # cells and a negative score yields more than ``width`` empty cells.
    filled = max(0, min(width, int(score / 100 * width)))
    empty = width - filled
    if score >= 80:
        color = "green"
    elif score >= 60:
        color = "yellow"
    else:
        color = "red"
    bar_str = c("=" * filled, color) + c("-" * empty, "dim")
    return f"[{bar_str}]"
82
+
83
+
84
def print_result(result, index, total):
    """Print one line for a single eval result.

    The line shows the status badge, the ``index/total`` position, a score
    bar, the score and the question (cut to 70 characters plus "..." when
    longer).  For failed results the first 120 characters of the judge's
    overall reasoning, if any, are printed on a following line.
    """
    question = result.question
    if len(question) > 70:
        question = question[:70] + "..."

    badge = _status_badge(result.status)
    score = _score_color(result.score)
    bar = _bar(result.score)
    position = c(f"{index}/{total}", "dim")
    print(f" {badge} {position} {bar} {score} {question}")

    if result.status != "fail":
        return
    reasoning = result.judge_result.overall_reasoning
    if reasoning:
        excerpt = c(reasoning[:120], "dim")
        print(f" {excerpt}")
97
+
98
+
99
def print_summary(summary):
    """Print the end-of-run summary block.

    Output consists of a divider, the suite name with its duration, the
    overall score bar, the pass/warn/fail counts with the total, and the
    pass rate.
    """
    print()
    print(c(" =" * 40, "dim"))
    print()

    title = c(summary.suite_name, "bold")
    elapsed = c(f"{summary.duration_seconds:.1f}s", "dim")
    print(f" {title} {elapsed}")
    print()

    # Overall score line
    overall_bar = _bar(summary.overall_score, 30)
    overall_score = _score_color(summary.overall_score)
    print(f" Overall: {overall_bar} {overall_score}")
    print()

    # Pass/Warn/Fail counts
    passed = c(f"{summary.passed} passed", "green")
    warned = c(f"{summary.warned} warned", "yellow")
    failed = c(f"{summary.failed} failed", "red")
    print(f" {passed} {warned} {failed} ({summary.total} total)")

    rate = c(f"{summary.pass_rate:.0f}%", "bold")
    print(f" Pass rate: {rate}")
    print()
120
+
121
+
122
def _truncate_question(text, limit=55):
    """Return ``text`` cut to ``limit`` characters, with "..." appended when cut."""
    return text[:limit] + "..." if len(text) > limit else text


def print_diff(old_results, new_results):
    """Print a regression diff between two lists of result dicts.

    Results are matched by their ``"question"`` value.  A question present in
    both lists is reported as a regression when its ``"overall_score"``
    dropped by more than 5 points and as an improvement when it rose by more
    than 5; smaller changes are not listed.  Questions found only in
    ``new_results`` are listed as new, those found only in ``old_results`` as
    removed.

    Args:
        old_results: Iterable of result dicts from the earlier run.
        new_results: Iterable of result dicts from the later run.

    Entries missing ``"question"`` or ``"overall_score"`` fall back to ``""``
    and ``0`` respectively, the same defaults cmd_rca uses when it loads a
    results file, instead of raising ``KeyError``.
    """
    old_by_q = {r.get("question", ""): r for r in old_results}
    new_by_q = {r.get("question", ""): r for r in new_results}

    regressions = []
    improvements = []
    new_cases = []
    removed_cases = []

    # dict.fromkeys de-duplicates while keeping first-seen order (old run first).
    for q in dict.fromkeys([*old_by_q, *new_by_q]):
        old = old_by_q.get(q)
        new = new_by_q.get(q)

        # Every q comes from one of the two maps, so at most one side is None.
        if new is None:
            removed_cases.append(q)
        elif old is None:
            new_cases.append((q, new))
        else:
            old_score = old.get("overall_score", 0)
            new_score = new.get("overall_score", 0)
            delta = new_score - old_score

            if delta < -5:
                regressions.append((q, old_score, new_score, delta))
            elif delta > 5:
                improvements.append((q, old_score, new_score, delta))

    print()
    print(c(" Regression Diff", "bold"))
    print(c(" " + "-" * 60, "dim"))

    if regressions:
        print()
        print(f" {c(f'{len(regressions)} Regressions', 'red')}")
        # Largest drop first.
        for q, old_s, new_s, delta in sorted(regressions, key=lambda x: x[3]):
            q_short = _truncate_question(q)
            print(f" {c(f'{delta:+.0f}', 'red')} {old_s:.0f} -> {new_s:.0f} {q_short}")

    if improvements:
        print()
        print(f" {c(f'{len(improvements)} Improvements', 'green')}")
        # Largest gain first.
        for q, old_s, new_s, delta in sorted(improvements, key=lambda x: -x[3]):
            q_short = _truncate_question(q)
            print(f" {c(f'{delta:+.0f}', 'green')} {old_s:.0f} -> {new_s:.0f} {q_short}")

    if new_cases:
        print()
        print(f" {c(f'{len(new_cases)} New', 'cyan')}")
        for q, r in new_cases:
            q_short = _truncate_question(q)
            print(f" {_score_color(r.get('overall_score', 0))} {q_short}")

    if removed_cases:
        print()
        print(f" {c(f'{len(removed_cases)} Removed', 'dim')}")
        for q in removed_cases:
            q_short = _truncate_question(q)
            print(f" {c('--', 'dim')} {q_short}")

    if not (regressions or improvements or new_cases or removed_cases):
        print(f"\n {c('No significant changes detected.', 'dim')}")

    print()
188
+
189
+
190
def print_mining_result(mining_result):
    """Print the failure mining summary.

    Shows how many examples were mined out of how many failures, followed by
    the failure-type distribution (most frequent first) with each type's
    share of the mined total.
    """
    total_mined = mining_result.total_mined

    print()
    print(c(" Failure Mining", "bold"))
    print(c(" " + "-" * 40, "dim"))
    print(f" Mined {c(str(total_mined), 'bold')} examples from {mining_result.total_failures} failures")
    print()

    if mining_result.failure_distribution:
        print(" Failure types:")
        by_count = sorted(mining_result.failure_distribution.items(), key=lambda x: -x[1])
        for ftype, count in by_count:
            # Guard against division by zero when nothing was mined.
            pct = count / total_mined * 100 if total_mined else 0
            # Pad BEFORE colorizing: padding the colored string would count the
            # invisible ANSI escape bytes toward the 30-column width and
            # misalign the count column whenever color is enabled.
            label = c(f"{ftype:30s}", "cyan")
            print(f" {label} {count:3d} ({pct:.0f}%)")
        print()
204
+
205
+
206
def _severity_badge(severity):
    """Return the upper-cased severity padded with spaces and colored.

    ``critical`` and ``high`` are red, ``medium`` is yellow, and everything
    else (including ``low``) is dimmed.
    """
    if severity in ("critical", "high"):
        tone = "red"
    elif severity == "medium":
        tone = "yellow"
    else:
        tone = "dim"
    label = f" {severity.upper()} "
    return c(label, tone)
211
+
212
+
213
def _category_label(category):
    """Return ``category`` in cyan with underscores shown as spaces."""
    readable = " ".join(category.split("_"))
    return c(readable, "cyan")
216
+
217
+
218
def print_rca_result(rca_result):
    """Print the batch root cause analysis report.

    Shows the number of analyzed failures, the average failure score and
    score range when available, the summary and top recommendation when
    present, then every root cause with its severity badge, category, title,
    description, up to three evidence items and the recommended fix.
    """
    print()
    print(c(" Root Cause Analysis", "bold"))
    print(c(" " + "-" * 50, "dim"))
    print()

    analyzed = c(str(rca_result.total_analyzed), "bold")
    print(f" Analyzed {analyzed} failures")

    if rca_result.avg_failure_score:
        print(f" Avg failure score: {_score_color(rca_result.avg_failure_score)}")

    # A [0, 0] range is treated as "no data" and not shown.
    if rca_result.score_range and rca_result.score_range != [0, 0]:
        low, high = rca_result.score_range[0], rca_result.score_range[1]
        print(f" Score range: {low:.0f} - {high:.0f}")
    print()

    if rca_result.summary:
        print(f" {c('Summary:', 'bold')} {rca_result.summary}")
        print()

    if rca_result.top_recommendation:
        print(f" {c('Top recommendation:', 'green')} {rca_result.top_recommendation}")
        print()

    if rca_result.root_causes:
        found = c(f"{len(rca_result.root_causes)} Root Causes Found", "bold")
        print(f" {found}")
        print()
        for number, cause in enumerate(rca_result.root_causes, start=1):
            badge = _severity_badge(cause.severity)
            category = _category_label(cause.category)
            print(f" {number}. {badge} {category} {c(cause.title, 'bold')}")
            if cause.description:
                print(f" {cause.description}")
            if cause.evidence:
                # Only the first three evidence items are shown.
                for item in cause.evidence[:3]:
                    print(f" {c('>', 'dim')} {item}")
            if cause.recommendation:
                print(f" {c('Fix:', 'green')} {cause.recommendation}")
            print()
    print()
255
+
256
+
257
def print_targeted_rca_result(targeted):
    """Print the root cause analysis of a single eval result.

    Shows the question (cut to 80 characters plus "..." when longer), its
    score, the likely cause and confidence, then the diagnosis, contributing
    factors and prioritized fix actions when each is present.
    """
    print()
    print(c(" Targeted Root Cause Analysis", "bold"))
    print(c(" " + "-" * 50, "dim"))
    print()

    question = targeted.question
    if len(question) > 80:
        question = question[:80] + "..."
    print(f" Question: {question}")
    print(f" Score: {_score_color(targeted.score)}")

    cause = targeted.likely_cause.replace("_", " ")
    print(f" Likely cause: {c(cause, 'cyan')}")

    confidence = f"{targeted.confidence}%"
    print(f" Confidence: {c(confidence, 'bold')}")
    print()

    if targeted.diagnosis:
        print(f" {c('Diagnosis:', 'bold')}")
        print(f" {targeted.diagnosis}")
        print()

    if targeted.contributing_factors:
        print(f" {c('Contributing factors:', 'bold')}")
        bullet = c(">", "dim")
        for factor in targeted.contributing_factors:
            print(f" {bullet} {factor}")
        print()

    if targeted.fix_actions:
        # Unknown priorities fall back to the dim style.
        priority_tones = {"high": "red", "medium": "yellow", "low": "dim"}
        print(f" {c('Fix actions:', 'bold')}")
        for fix in targeted.fix_actions:
            tone = priority_tones.get(fix.priority, "dim")
            tag = c(fix.priority.upper(), tone)
            effort = c(f"({fix.effort})", "dim")
            print(f" [{tag}] {fix.action} {effort}")
        print()
    print()
289
+
290
+
291
+ # ---- Commands ----
292
+
293
def cmd_run(args):
    """Run an eval suite, then optionally save, export and mine the results.

    Args:
        args: Parsed ``run`` sub-command namespace (see main() for the
            available options).

    Exits the process with status 1 when the suite cannot be loaded or when
    any test failed (or warned, with ``--fail-on-warn``); otherwise with
    status 0.
    """
    from cane_eval.suite import TestSuite
    from cane_eval.engine import EvalRunner
    from cane_eval.export import Exporter

    # Load suite
    try:
        suite = TestSuite.from_yaml(args.suite)
    except FileNotFoundError:
        print(c(f" Error: Suite file not found: {args.suite}", "red"))
        sys.exit(1)
    except Exception as e:  # CLI boundary: report any load problem and exit
        print(c(f" Error loading suite: {e}", "red"))
        sys.exit(1)

    # Override model if specified
    if args.model:
        suite.model = args.model

    print()
    print(f" {c('cane-eval', 'cyan')} {c(suite.name, 'bold')}")
    print(f" {len(suite.tests)} test cases | model: {suite.model}")
    print()

    # Parse tags: tolerate "a, b" and trailing commas by stripping whitespace
    # and dropping empty entries; no usable tag means no filter.
    tags = None
    if args.tags:
        tags = [t.strip() for t in args.tags.split(",") if t.strip()] or None

    api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY")

    # Run
    runner = EvalRunner(
        api_key=api_key,
        model=args.model,
        verbose=not args.quiet,
        on_result=print_result if not args.quiet else None,
    )

    summary = runner.run(suite, tags=tags)

    # Print summary
    if not args.quiet:
        print_summary(summary)

    # Save results JSON
    if args.output_json:
        with open(args.output_json, "w", encoding="utf-8") as f:
            json.dump(summary.to_dict(), f, indent=2)
        print(f" Results saved to {args.output_json}")

    # Export training data
    if args.export:
        exporter = Exporter(summary)
        output_path = args.output or f"eval_{args.export}.jsonl"
        exporter.to_file(output_path, format=args.export)
        print(f" Exported {args.export} data to {output_path}")

    # Mine failures
    if args.mine:
        from cane_eval.mining import FailureMiner

        miner = FailureMiner(
            api_key=api_key,
            model=args.model or suite.model,
            verbose=not args.quiet,
        )

        mining_result = miner.mine(
            summary,
            max_score=args.mine_threshold,
            max_examples=args.mine_max,
        )

        if not args.quiet:
            print_mining_result(mining_result)

        if mining_result.total_mined > 0:
            # Name the default file after the chosen format so that
            # "--mine-format sft" no longer lands in "mined_dpo.jsonl".
            mine_output = args.mine_output or f"mined_{args.mine_format}.jsonl"
            mining_result.to_file(mine_output, format=args.mine_format)
            print(f" Mined data saved to {mine_output}")

    # Exit code based on failures
    if args.fail_on_warn:
        sys.exit(1 if (summary.failed > 0 or summary.warned > 0) else 0)
    else:
        sys.exit(1 if summary.failed > 0 else 0)
377
+
378
+
379
def cmd_diff(args):
    """Compare two eval result files and print the regression diff.

    Each file may contain either an object with a ``"results"`` list or a
    bare list of result dicts.  Any other JSON shape is treated as having no
    results.

    Args:
        args: Parsed ``diff`` namespace with ``old`` and ``new`` file paths.

    Exits with status 1 when a file is missing or is not valid JSON.
    """
    loaded = []
    for path in (args.old, args.new):
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError as e:
            print(c(f" Error: {e}", "red"))
            sys.exit(1)
        except json.JSONDecodeError as e:
            print(c(f" Error: {path} is not valid JSON: {e}", "red"))
            sys.exit(1)

        # Branch on the root type explicitly: calling ``.get`` on a list root
        # raises AttributeError, which made the bare-list form unusable.
        if isinstance(data, dict):
            loaded.append(data.get("results", []))
        elif isinstance(data, list):
            loaded.append(data)
        else:
            loaded.append([])

    old_results, new_results = loaded
    print_diff(old_results, new_results)
394
+
395
+
396
def cmd_rca(args):
    """Run root cause analysis on eval failures.

    The failures come either from an existing results JSON (``--results``)
    or from running the suite first.  A results file may be an object with a
    ``"results"`` list (and optional ``"suite_name"``) or a bare list of
    result dicts.

    Args:
        args: Parsed ``rca`` sub-command namespace (see main() for options).

    Exits with status 1 when the suite or results file cannot be loaded, or
    when at least one root cause has ``critical`` severity; otherwise 0.
    """
    from cane_eval.suite import TestSuite
    from cane_eval.engine import EvalRunner
    from cane_eval.rca import RootCauseAnalyzer

    # Load suite
    try:
        suite = TestSuite.from_yaml(args.suite)
    except FileNotFoundError:
        print(c(f" Error: Suite file not found: {args.suite}", "red"))
        sys.exit(1)
    except Exception as e:  # CLI boundary: report any load problem and exit
        print(c(f" Error loading suite: {e}", "red"))
        sys.exit(1)

    # Override model if specified
    if args.model:
        suite.model = args.model

    print()
    print(f" {c('cane-eval rca', 'cyan')} {c(suite.name, 'bold')}")
    print(f" {len(suite.tests)} test cases | model: {suite.model}")
    print()

    # Parse tags: strip whitespace and drop empty entries; no usable tag
    # means no filter.
    tags = None
    if args.tags:
        tags = [t.strip() for t in args.tags.split(",") if t.strip()] or None

    api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY")

    # If results JSON provided, load from file instead of running
    if args.results:
        try:
            with open(args.results, "r", encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            print(c(f" Error: Results file not found: {args.results}", "red"))
            sys.exit(1)
        except json.JSONDecodeError as e:
            print(c(f" Error: Results file is not valid JSON: {e}", "red"))
            sys.exit(1)

        from cane_eval.engine import EvalResult, RunSummary
        from cane_eval.judge import JudgeResult, CriteriaScore

        # Branch on the root type explicitly: calling ``.get`` on a list root
        # raises AttributeError, which made the bare-list form unusable.
        if isinstance(data, dict):
            results_list = data.get("results", [])
            suite_name = data.get("suite_name", "loaded")
        else:
            results_list = data if isinstance(data, list) else []
            suite_name = "loaded"

        eval_results = []
        for r in results_list:
            # Saved files keep only the numeric score per criterion, so the
            # per-criterion reasoning is rebuilt as an empty string.
            criteria_scores = [
                CriteriaScore(name=name, score=float(score_val), reasoning="")
                for name, score_val in (r.get("criteria_scores") or {}).items()
            ]
            jr = JudgeResult(
                overall_score=r.get("overall_score", 0),
                overall_reasoning=r.get("judge_reasoning", ""),
                status=r.get("status", "fail"),
                criteria_scores=criteria_scores,
            )
            eval_results.append(EvalResult(
                question=r.get("question", ""),
                expected_answer=r.get("expected_answer", ""),
                agent_answer=r.get("agent_answer", ""),
                judge_result=jr,
                tags=r.get("tags", []),
            ))

        summary = RunSummary(
            suite_name=suite_name,
            total=len(eval_results),
            results=eval_results,
        )
    else:
        # Run the eval first
        print(" Running eval first...")
        print()

        runner = EvalRunner(
            api_key=api_key,
            model=args.model,
            verbose=not args.quiet,
            on_result=print_result if not args.quiet else None,
        )

        summary = runner.run(suite, tags=tags)

        if not args.quiet:
            print_summary(summary)

    # Now run RCA
    analyzer = RootCauseAnalyzer(
        api_key=api_key,
        model=args.model or suite.model,
        verbose=not args.quiet,
    )

    print(f" {c('Running root cause analysis...', 'cyan')}")
    print()

    rca_result = analyzer.analyze(
        summary,
        max_score=args.threshold,
        max_failures=args.max_failures,
    )

    if not args.quiet:
        print_rca_result(rca_result)

    # Optionally run targeted analysis on each failure, worst scores first
    if args.targeted:
        failures = sorted(
            (r for r in summary.results if r.score <= args.threshold),
            key=lambda r: r.score,
        )
        targeted_limit = min(len(failures), args.targeted_max)

        if failures:
            print(f" {c(f'Running targeted analysis on {targeted_limit} worst failures...', 'cyan')}")
            print()

        for r in failures[:targeted_limit]:
            targeted = analyzer.analyze_result(r)
            if not args.quiet:
                print_targeted_rca_result(targeted)

    # Save results JSON
    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(rca_result.to_dict(), f, indent=2)
        print(f" Results saved to {args.output}")

    # Exit code based on critical root causes
    has_critical = any(rc.severity == "critical" for rc in rca_result.root_causes)
    sys.exit(1 if has_critical else 0)
524
+
525
+
526
def cmd_validate(args):
    """Validate a test suite YAML file and print a short description of it.

    On success prints the suite name, the number of test cases and criteria,
    the number of custom rules when there are any, and the target when its
    type is not ``callable``.  Any exception raised while loading or
    describing the suite is reported as invalid and exits with status 1.
    """
    from cane_eval.suite import TestSuite

    try:
        suite = TestSuite.from_yaml(args.suite)
        ok = c("Valid", "green")
        print(f" {ok} {suite.name}")

        test_count = len(suite.tests)
        criteria_count = len(suite.criteria)
        print(f" {test_count} test cases, {criteria_count} criteria")

        if suite.custom_rules:
            print(f" {len(suite.custom_rules)} custom rules")

        if suite.target.type != "callable":
            # Show whichever endpoint is set: the URL, else the command.
            endpoint = suite.target.url or suite.target.command
            print(f" Target: {suite.target.type} ({endpoint})")
    except Exception as e:
        print(c(f" Invalid: {e}", "red"))
        sys.exit(1)
541
+
542
+
543
+ # ---- Main ----
544
+
545
def main():
    """Build the argument parser and dispatch to the chosen sub-command.

    Sub-commands are ``run``, ``diff``, ``rca`` and ``validate``.  When no
    sub-command is given, the help text is printed and the process exits
    with status 0.
    """
    parser = argparse.ArgumentParser(
        prog="cane-eval",
        description="LLM-as-Judge evaluation for AI agents",
    )
    parser.add_argument("--version", action="version", version="%(prog)s 0.1.0")

    commands = parser.add_subparsers(dest="command", help="Command to run")

    # Each table row is (flags, keyword arguments) for add_argument; rows are
    # registered in order so the generated help text keeps its layout.
    run_options = (
        (("suite",), dict(help="Path to YAML test suite")),
        (("--model",), dict(help="Override judge model")),
        (("--api-key",), dict(help="Anthropic API key (or set ANTHROPIC_API_KEY)")),
        (("--tags",), dict(help="Comma-separated tag filter")),
        (("--export",), dict(choices=["dpo", "sft", "openai", "raw"], help="Export format")),
        (("--output",), dict(help="Export output path")),
        (("--output-json",), dict(help="Save full results as JSON")),
        (("--mine",), dict(action="store_true", help="Run failure mining after eval")),
        (("--mine-threshold",), dict(type=float, default=60, help="Max score for mining (default: 60)")),
        (("--mine-max",), dict(type=int, default=100, help="Max examples to mine (default: 100)")),
        (("--mine-format",), dict(choices=["dpo", "sft"], default="dpo", help="Mining export format")),
        (("--mine-output",), dict(help="Mining export path")),
        (("--quiet", "-q"), dict(action="store_true", help="Minimal output")),
        (("--fail-on-warn",), dict(action="store_true", help="Exit 1 on warnings too")),
    )
    diff_options = (
        (("old",), dict(help="Path to older results JSON")),
        (("new",), dict(help="Path to newer results JSON")),
    )
    rca_options = (
        (("suite",), dict(help="Path to YAML test suite")),
        (("--model",), dict(help="Override judge/analysis model")),
        (("--api-key",), dict(help="Anthropic API key (or set ANTHROPIC_API_KEY)")),
        (("--tags",), dict(help="Comma-separated tag filter")),
        (("--results",), dict(help="Path to existing results JSON (skip running eval)")),
        (("--threshold",), dict(type=float, default=60, help="Max score for analysis (default: 60)")),
        (("--max-failures",), dict(type=int, default=30, help="Max failures to analyze (default: 30)")),
        (("--targeted",), dict(action="store_true", help="Also run targeted analysis on each failure")),
        (("--targeted-max",), dict(type=int, default=5, help="Max results for targeted analysis (default: 5)")),
        (("--output",), dict(help="Save RCA results as JSON")),
        (("--quiet", "-q"), dict(action="store_true", help="Minimal output")),
    )
    validate_options = (
        (("suite",), dict(help="Path to YAML test suite")),
    )

    layout = (
        ("run", "Run an eval suite", run_options),
        ("diff", "Compare two eval runs (regression diff)", diff_options),
        ("rca", "Run root cause analysis on eval failures", rca_options),
        ("validate", "Validate a test suite YAML", validate_options),
    )
    for command_name, command_help, options in layout:
        sub = commands.add_parser(command_name, help=command_help)
        for flags, settings in options:
            sub.add_argument(*flags, **settings)

    args = parser.parse_args()

    # Handlers are wrapped in lambdas so each name is resolved only when its
    # command is actually selected.
    dispatch = {
        "run": lambda: cmd_run(args),
        "diff": lambda: cmd_diff(args),
        "rca": lambda: cmd_rca(args),
        "validate": lambda: cmd_validate(args),
    }
    action = dispatch.get(args.command)
    if action is None:
        parser.print_help()
        sys.exit(0)
    action()
607
+
608
+
609
+ if __name__ == "__main__":
610
+ main()