claude-turing 2.2.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/.claude-plugin/plugin.json +2 -2
  2. package/README.md +5 -2
  3. package/commands/diff.md +48 -0
  4. package/commands/regress.md +53 -0
  5. package/commands/turing.md +6 -0
  6. package/commands/watch.md +60 -0
  7. package/config/watch_alerts.yaml +36 -0
  8. package/package.json +1 -1
  9. package/src/install.js +2 -0
  10. package/src/verify.js +4 -0
  11. package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
  12. package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
  13. package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
  14. package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
  15. package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  16. package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
  17. package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc +0 -0
  18. package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
  19. package/templates/scripts/__pycache__/experiment_diff.cpython-314.pyc +0 -0
  20. package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
  21. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  22. package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
  23. package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
  24. package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
  25. package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
  26. package/templates/scripts/__pycache__/regression_gate.cpython-314.pyc +0 -0
  27. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  28. package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
  29. package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
  30. package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
  31. package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
  32. package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
  33. package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
  34. package/templates/scripts/__pycache__/training_monitor.cpython-314.pyc +0 -0
  35. package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
  36. package/templates/scripts/experiment_diff.py +703 -0
  37. package/templates/scripts/generate_brief.py +44 -0
  38. package/templates/scripts/regression_gate.py +536 -0
  39. package/templates/scripts/scaffold.py +6 -0
  40. package/templates/scripts/training_monitor.py +611 -0
  41. package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
  42. package/templates/tests/__pycache__/__init__.cpython-314.pyc +0 -0
  43. package/templates/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
  44. package/templates/tests/__pycache__/test_cost_frontier.cpython-314-pytest-9.0.2.pyc +0 -0
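
The diff below is file 36, the new package/templates/scripts/experiment_diff.py (+703 lines). Before the full listing, a minimal sketch of driving it from Python rather than the CLI; this assumes the scripts/ package layout implied by the module's own imports, and the experiment IDs are the illustrative ones from its docstring:

    # Sketch only: assumes scripts/ is importable and experiments/log.jsonl
    # contains both IDs (exp-042 and exp-053 are illustrative).
    from scripts.experiment_diff import experiment_diff, format_diff_report

    report = experiment_diff("exp-042", "exp-053", include_code=True)
    if "error" in report:
        raise SystemExit(report["error"])
    print(format_diff_report(report))

The equivalent CLI call is python scripts/experiment_diff.py exp-042 exp-053 --code.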
@@ -0,0 +1,703 @@
+#!/usr/bin/env python3
+"""Deep experiment comparison for the autoresearch pipeline.
+
+Goes beyond simple metric tables to answer "at what point did these two
+experiments diverge and why?" Shows config diffs with magnitudes, metric
+deltas with statistical significance, per-class performance regressions,
+training curve divergence points, and feature importance shifts.
+
+Usage:
+    python scripts/experiment_diff.py exp-042 exp-053
+    python scripts/experiment_diff.py exp-042 exp-053 --code
+    python scripts/experiment_diff.py exp-042 exp-053 --json
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import subprocess
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+import yaml
+
+from scripts.turing_io import load_config, load_experiments
+
+DEFAULT_LOG_PATH = "experiments/log.jsonl"
+
+
+def find_experiment(experiments: list[dict], exp_id: str) -> dict | None:
+    """Find an experiment by ID."""
+    for exp in experiments:
+        if exp.get("experiment_id") == exp_id:
+            return exp
+    return None
+
+
+# --- Config Diff ---
+
+
+def diff_configs(config_a: dict, config_b: dict) -> list[dict]:
+    """Compute config differences between two experiments.
+
+    Flattens nested config dicts and computes magnitude of change
+    for numeric values.
+
+    Returns:
+        List of diff dicts with keys: key, value_a, value_b, changed,
+        and optionally pct_change for numeric values.
+    """
+    flat_a = _flatten_dict(config_a)
+    flat_b = _flatten_dict(config_b)
+    all_keys = sorted(set(flat_a) | set(flat_b))
+
+    diffs = []
+    for key in all_keys:
+        val_a = flat_a.get(key)
+        val_b = flat_b.get(key)
+        changed = val_a != val_b
+
+        entry = {
+            "key": key,
+            "value_a": val_a,
+            "value_b": val_b,
+            "changed": changed,
+        }
+
+        if changed and isinstance(val_a, (int, float)) and isinstance(val_b, (int, float)):
+            if val_a != 0:
+                entry["pct_change"] = (val_b - val_a) / abs(val_a) * 100
+            else:
+                entry["pct_change"] = float("inf") if val_b != 0 else 0.0
+
+        diffs.append(entry)
+
+    return diffs
+
+
+def _flatten_dict(d: dict, prefix: str = "") -> dict:
+    """Flatten a nested dict with dot-separated keys."""
+    items = {}
+    for k, v in d.items():
+        full_key = f"{prefix}.{k}" if prefix else k
+        if isinstance(v, dict):
+            items.update(_flatten_dict(v, full_key))
+        else:
+            items[full_key] = v
+    return items
+
+
+# --- Metric Diff ---
+
+
+def diff_metrics(
+    metrics_a: dict,
+    metrics_b: dict,
+    lower_is_better_metrics: set[str] | None = None,
+    seed_studies: dict[str, dict] | None = None,
+) -> list[dict]:
+    """Compute metric differences with optional significance testing.
+
+    Args:
+        metrics_a: Metrics from experiment A.
+        metrics_b: Metrics from experiment B.
+        lower_is_better_metrics: Set of metric names where lower is better.
+        seed_studies: Map of exp_id -> seed study data for significance.
+
+    Returns:
+        List of metric diff dicts.
+    """
+    if lower_is_better_metrics is None:
+        lower_is_better_metrics = set()
+
+    all_keys = sorted(set(metrics_a) | set(metrics_b))
+    # Non-numeric metadata keys get no better/worse direction
+    metadata_keys = {"model_type", "train_seconds", "n_params", "model_size_bytes"}
+
+    diffs = []
+    for key in all_keys:
+        val_a = metrics_a.get(key)
+        val_b = metrics_b.get(key)
+
+        entry = {
+            "metric": key,
+            "value_a": val_a,
+            "value_b": val_b,
+        }
+
+        if isinstance(val_a, (int, float)) and isinstance(val_b, (int, float)):
+            delta = val_b - val_a
+            entry["delta"] = round(delta, 6)
+
+            if key in lower_is_better_metrics:
+                entry["direction"] = "better" if delta < 0 else "worse" if delta > 0 else "same"
+            elif key not in metadata_keys:
+                entry["direction"] = "better" if delta > 0 else "worse" if delta < 0 else "same"
+            else:
+                entry["direction"] = "N/A"
+
+            # Significance from seed studies if available
+            if seed_studies:
+                entry["significance"] = _check_significance(
+                    key, val_a, val_b, seed_studies,
+                )
+
+        diffs.append(entry)
+
+    return diffs
+
+
+def _check_significance(
+    metric: str,
+    val_a: float,
+    val_b: float,
+    seed_studies: dict[str, dict],
+) -> dict | None:
+    """Check if a metric difference is statistically significant.
+
+    Uses seed study standard deviations to estimate a rough p-value
+    via the pooled two-sample z-test approximation.
+    """
+    # Collect std estimates from any available seed studies
+    stds = []
+    for study in seed_studies.values():
+        per_metric = study.get("per_metric", {})
+        if metric in per_metric and "std" in per_metric[metric]:
+            stds.append(per_metric[metric]["std"])
+
+    if not stds:
+        return None
+
+    pooled_std = sum(stds) / len(stds)
+    if pooled_std == 0:
+        return {"significant": val_a != val_b, "method": "zero_variance"}
+
+    z = abs(val_b - val_a) / (pooled_std * math.sqrt(2))
+
+    # Approximate two-tailed p-value from the z-score
+    # via the error-function form of the normal CDF
+    p_value = 2 * (1 - _norm_cdf(z))
+
+    return {
+        "z_score": round(z, 3),
+        "p_value": round(p_value, 4),
+        "significant": p_value < 0.05,
+        "method": "pooled_z_test",
+    }
+
+
+def _norm_cdf(x: float) -> float:
+    """Standard normal CDF via the error function."""
+    return 0.5 * (1 + math.erf(x / math.sqrt(2)))
+
+
+# --- Per-Class Diff ---
+
+
+def diff_per_class(
+    class_metrics_a: dict | None,
+    class_metrics_b: dict | None,
+) -> list[dict]:
+    """Compare per-class performance between two experiments.
+
+    Args:
+        class_metrics_a: Dict of {class_name: {metric: value}} from exp A.
+        class_metrics_b: Dict of {class_name: {metric: value}} from exp B.
+
+    Returns:
+        List of per-class diffs, highlighting regressions.
+    """
+    if not class_metrics_a or not class_metrics_b:
+        return []
+
+    all_classes = sorted(set(class_metrics_a) | set(class_metrics_b))
+    diffs = []
+
+    for cls in all_classes:
+        a_metrics = class_metrics_a.get(cls, {})
+        b_metrics = class_metrics_b.get(cls, {})
+        all_metrics = sorted(set(a_metrics) | set(b_metrics))
+
+        for metric in all_metrics:
+            val_a = a_metrics.get(metric)
+            val_b = b_metrics.get(metric)
+
+            entry = {
+                "class": cls,
+                "metric": metric,
+                "value_a": val_a,
+                "value_b": val_b,
+            }
+
+            if isinstance(val_a, (int, float)) and isinstance(val_b, (int, float)):
+                delta = val_b - val_a
+                entry["delta"] = round(delta, 6)
+                entry["regression"] = delta < -0.01  # Flag meaningful regressions
+
+            diffs.append(entry)
+
+    return diffs
+
+
+# --- Training Curve Divergence ---
+
+
+def find_curve_divergence(
+    curve_a: list[dict] | None,
+    curve_b: list[dict] | None,
+    metric: str = "loss",
+    threshold: float = 0.05,
+) -> dict | None:
+    """Find the epoch where two training curves meaningfully diverge.
+
+    Args:
+        curve_a: List of {epoch, metric_value} from experiment A.
+        curve_b: List of {epoch, metric_value} from experiment B.
+        metric: Which metric to compare.
+        threshold: Relative difference to consider "diverged".
+
+    Returns:
+        Dict with divergence_epoch, metric values at divergence, or None.
+    """
+    if not curve_a or not curve_b:
+        return None
+
+    # Build epoch -> value maps
+    map_a = {}
+    map_b = {}
+    for entry in curve_a:
+        epoch = entry.get("epoch")
+        val = entry.get(metric)
+        if epoch is not None and val is not None:
+            map_a[epoch] = val
+    for entry in curve_b:
+        epoch = entry.get("epoch")
+        val = entry.get(metric)
+        if epoch is not None and val is not None:
+            map_b[epoch] = val
+
+    common_epochs = sorted(set(map_a) & set(map_b))
+    if not common_epochs:
+        return None
+
+    for epoch in common_epochs:
+        va = map_a[epoch]
+        vb = map_b[epoch]
+        denom = abs(va) if va != 0 else 1.0
+        rel_diff = abs(vb - va) / denom
+
+        if rel_diff > threshold:
+            return {
+                "divergence_epoch": epoch,
+                "value_a": round(va, 6),
+                "value_b": round(vb, 6),
+                "relative_diff": round(rel_diff, 4),
+                "metric": metric,
+                "total_common_epochs": len(common_epochs),
+            }
+
+    return None
+
+
+# --- Feature Importance Diff ---
+
+
+def diff_feature_importance(
+    importance_a: dict | None,
+    importance_b: dict | None,
+    top_k: int = 10,
+) -> list[dict]:
+    """Compare feature importances between experiments.
+
+    Args:
+        importance_a: {feature_name: importance_value} from exp A.
+        importance_b: {feature_name: importance_value} from exp B.
+        top_k: Show top K features by absolute importance change.
+
+    Returns:
+        List of feature importance diffs, sorted by absolute delta.
+    """
+    if not importance_a or not importance_b:
+        return []
+
+    all_features = set(importance_a) | set(importance_b)
+    diffs = []
+
+    for feat in all_features:
+        val_a = importance_a.get(feat, 0.0)
+        val_b = importance_b.get(feat, 0.0)
+        delta = val_b - val_a
+
+        diffs.append({
+            "feature": feat,
+            "importance_a": round(val_a, 6),
+            "importance_b": round(val_b, 6),
+            "delta": round(delta, 6),
+            "abs_delta": round(abs(delta), 6),
+        })
+
+    diffs.sort(key=lambda d: d["abs_delta"], reverse=True)
+    return diffs[:top_k]
+
+
+# --- Code Diff ---
+
+
+def get_code_diff(commit_a: str | None, commit_b: str | None) -> str | None:
+    """Get git diff of train.py between two experiment commits.
+
+    Returns None if commits not available or git fails.
+    """
+    if not commit_a or not commit_b:
+        return None
+
+    try:
+        result = subprocess.run(
+            ["git", "diff", commit_a, commit_b, "--", "train.py"],
+            capture_output=True, text=True, timeout=30,
+        )
+        if result.returncode == 0 and result.stdout.strip():
+            return result.stdout.strip()
+    except (subprocess.TimeoutExpired, FileNotFoundError):
+        pass
+
+    return None
+
+
+# --- Full Diff ---
+
+
+def experiment_diff(
+    exp_id_a: str,
+    exp_id_b: str,
+    config_path: str = "config.yaml",
+    log_path: str = DEFAULT_LOG_PATH,
+    include_code: bool = False,
+) -> dict:
+    """Compute a comprehensive diff between two experiments.
+
+    Args:
+        exp_id_a: First experiment ID.
+        exp_id_b: Second experiment ID.
+        config_path: Path to config.yaml.
+        log_path: Path to experiment log.
+        include_code: Include git diff of train.py.
+
+    Returns:
+        Complete diff report dict.
+    """
+    config = load_config(config_path)
+    eval_cfg = config.get("evaluation", {})
+    primary_metric = eval_cfg.get("primary_metric", "accuracy")
+    lower_is_better = eval_cfg.get("lower_is_better", False)
+    lower_is_better_metrics = set(eval_cfg.get("metrics", [])) if lower_is_better else set()
+
+    experiments = load_experiments(log_path)
+
+    exp_a = find_experiment(experiments, exp_id_a)
+    exp_b = find_experiment(experiments, exp_id_b)
+
+    if not exp_a:
+        return {"error": f"Experiment {exp_id_a} not found in {log_path}"}
+    if not exp_b:
+        return {"error": f"Experiment {exp_id_b} not found in {log_path}"}
+
+    # Load seed studies if available
+    seed_studies = _load_seed_studies(exp_id_a, exp_id_b)
+
+    report = {
+        "experiment_a": exp_id_a,
+        "experiment_b": exp_id_b,
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+        "primary_metric": primary_metric,
+    }
+
+    # Config diff
+    config_a = exp_a.get("config", {})
+    config_b = exp_b.get("config", {})
+    report["config_diff"] = diff_configs(config_a, config_b)
+
+    # Metric diff
+    metrics_a = exp_a.get("metrics", {})
+    metrics_b = exp_b.get("metrics", {})
+    report["metric_diff"] = diff_metrics(
+        metrics_a, metrics_b, lower_is_better_metrics, seed_studies,
+    )
+
+    # Per-class diff
+    class_a = exp_a.get("per_class_metrics")
+    class_b = exp_b.get("per_class_metrics")
+    report["per_class_diff"] = diff_per_class(class_a, class_b)
+
+    # Training curve divergence
+    curve_a = exp_a.get("training_curve")
+    curve_b = exp_b.get("training_curve")
+    report["curve_divergence"] = find_curve_divergence(curve_a, curve_b)
+
+    # Feature importance diff
+    imp_a = exp_a.get("feature_importance")
+    imp_b = exp_b.get("feature_importance")
+    report["feature_importance_diff"] = diff_feature_importance(imp_a, imp_b)
+
+    # Code diff
+    if include_code:
+        commit_a = exp_a.get("git_commit")
+        commit_b = exp_b.get("git_commit")
+        report["code_diff"] = get_code_diff(commit_a, commit_b)
+
+    # Summary verdict
+    report["summary"] = _build_summary(report, primary_metric)
+
+    return report
+
+
+def _load_seed_studies(exp_id_a: str, exp_id_b: str) -> dict[str, dict]:
+    """Load seed studies for both experiments if available."""
+    from scripts.turing_io import load_seed_study
+
+    studies = {}
+    for exp_id in (exp_id_a, exp_id_b):
+        study = load_seed_study(exp_id)
+        if study:
+            studies[exp_id] = study
+    return studies
+
+
+def _build_summary(report: dict, primary_metric: str) -> dict:
+    """Build a summary of the key differences."""
+    config_changes = [d for d in report.get("config_diff", []) if d["changed"]]
+    metric_diffs = report.get("metric_diff", [])
+    regressions = [d for d in report.get("per_class_diff", []) if d.get("regression")]
+    divergence = report.get("curve_divergence")
+    fi_shifts = report.get("feature_importance_diff", [])
+
+    # Find primary metric change
+    primary_change = None
+    for m in metric_diffs:
+        if m["metric"] == primary_metric:
+            primary_change = m
+            break
+
+    return {
+        "config_changes": len(config_changes),
+        "metric_changes": len([d for d in metric_diffs if d.get("delta", 0) != 0]),
+        "per_class_regressions": len(regressions),
+        "has_curve_divergence": divergence is not None,
+        "divergence_epoch": divergence["divergence_epoch"] if divergence else None,
+        "feature_importance_shifts": len(fi_shifts),
+        "primary_metric_delta": primary_change.get("delta") if primary_change else None,
+        "primary_metric_direction": primary_change.get("direction") if primary_change else None,
+    }
+
+
+def save_diff_report(report: dict, output_dir: str = "experiments/diffs") -> Path:
+    """Save diff report to a YAML file."""
+    out_path = Path(output_dir)
+    out_path.mkdir(parents=True, exist_ok=True)
+
+    a = report.get("experiment_a", "unknown")
+    b = report.get("experiment_b", "unknown")
+    filename = f"{a}-vs-{b}.yaml"
+    filepath = out_path / filename
+
+    with open(filepath, "w") as f:
+        yaml.dump(report, f, default_flow_style=False, sort_keys=False)
+
+    return filepath
+
+
+def format_diff_report(report: dict) -> str:
+    """Format diff report as human-readable markdown."""
+    if "error" in report:
+        return f"ERROR: {report['error']}"
+
+    a = report["experiment_a"]
+    b = report["experiment_b"]
+    primary = report.get("primary_metric", "accuracy")
+
+    lines = [
+        f"# Experiment Diff: {a} vs {b}",
+        "",
+        f"*Generated {report.get('generated_at', 'N/A')[:19]}*",
+        "",
+    ]
+
+    # Config diff
+    config_diffs = report.get("config_diff", [])
+    changed = [d for d in config_diffs if d["changed"]]
+    if changed:
+        lines.extend(["## Config Changes", ""])
+        lines.append(f"| Parameter | {a} | {b} | Change |")
+        lines.append("|-----------|-----|-----|--------|")
+        for d in changed:
+            pct = f" ({d['pct_change']:+.0f}%)" if "pct_change" in d else ""
+            lines.append(
+                f"| {d['key']} | {d['value_a']} | {d['value_b']} | {pct} |"
+            )
+        lines.append("")
+    else:
+        lines.extend(["## Config Changes", "", "No config differences.", ""])
+
+    # Metric diff
+    metric_diffs = report.get("metric_diff", [])
+    if metric_diffs:
+        lines.extend(["## Metric Comparison", ""])
+        lines.append(f"| Metric | {a} | {b} | Delta | Verdict |")
+        lines.append("|--------|-----|-----|-------|---------|")
+        for m in metric_diffs:
+            va = m.get("value_a")
+            vb = m.get("value_b")
+            va_str = f"{va:.4f}" if isinstance(va, float) else str(va)
+            vb_str = f"{vb:.4f}" if isinstance(vb, float) else str(vb)
+            delta_str = f"{m['delta']:+.4f}" if "delta" in m else "N/A"
+            direction = m.get("direction", "")
+            sig = m.get("significance")
+            if sig and sig.get("significant"):
+                direction += f" (p={sig['p_value']:.3f} sig)"
+            elif sig and not sig.get("significant"):
+                direction += f" (p={sig['p_value']:.3f} ns)"
+            lines.append(
+                f"| {m['metric']} | {va_str} | {vb_str} | {delta_str} | {direction} |"
+            )
+        lines.append("")
+
+    # Per-class diff
+    class_diffs = report.get("per_class_diff", [])
+    if class_diffs:
+        regressions = [d for d in class_diffs if d.get("regression")]
+        lines.extend(["## Per-Class Performance", ""])
+        if regressions:
+            lines.append(f"**{len(regressions)} class regression(s) detected:**")
+            lines.append("")
+        lines.append(f"| Class | Metric | {a} | {b} | Delta | |")
+        lines.append("|-------|--------|-----|-----|-------|-|")
+        for d in class_diffs:
+            va = d.get("value_a")
+            vb = d.get("value_b")
+            va_str = f"{va:.4f}" if isinstance(va, float) else str(va)
+            vb_str = f"{vb:.4f}" if isinstance(vb, float) else str(vb)
+            delta_str = f"{d['delta']:+.4f}" if "delta" in d else "N/A"
+            flag = "REGRESSION" if d.get("regression") else ""
+            lines.append(
+                f"| {d['class']} | {d['metric']} | {va_str} | {vb_str} | {delta_str} | {flag} |"
+            )
+        lines.append("")
+
+    # Curve divergence
+    divergence = report.get("curve_divergence")
+    if divergence:
+        lines.extend([
+            "## Training Curve Divergence",
+            "",
+            f"Curves diverge at **epoch {divergence['divergence_epoch']}** "
+            f"({divergence['metric']}: {divergence['value_a']:.4f} vs {divergence['value_b']:.4f}, "
+            f"{divergence['relative_diff']:.1%} relative difference)",
+            f"out of {divergence['total_common_epochs']} common epochs.",
+            "",
+        ])
+
+    # Feature importance
+    fi_diffs = report.get("feature_importance_diff", [])
+    if fi_diffs:
+        lines.extend(["## Feature Importance Shifts", ""])
+        lines.append(f"| Feature | {a} | {b} | Delta |")
+        lines.append("|---------|-----|-----|-------|")
+        for f in fi_diffs:
+            lines.append(
+                f"| {f['feature']} | {f['importance_a']:.4f} | {f['importance_b']:.4f} | {f['delta']:+.4f} |"
+            )
+        lines.append("")
+
+    # Code diff
+    code_diff = report.get("code_diff")
+    if code_diff:
+        lines.extend([
+            "## Code Changes (train.py)",
+            "",
+            "```diff",
+            code_diff,
+            "```",
+            "",
+        ])
+
+    # Summary
+    summary = report.get("summary", {})
+    lines.extend([
+        "## Summary",
+        "",
+        f"- **Config changes:** {summary.get('config_changes', 0)}",
+        f"- **Metric changes:** {summary.get('metric_changes', 0)}",
+        f"- **Per-class regressions:** {summary.get('per_class_regressions', 0)}",
+    ])
+    if summary.get("has_curve_divergence"):
+        lines.append(f"- **Curves diverge at epoch:** {summary['divergence_epoch']}")
+    if summary.get("primary_metric_delta") is not None:
+        lines.append(
+            f"- **{primary} delta:** {summary['primary_metric_delta']:+.4f} "
+            f"({summary.get('primary_metric_direction', 'N/A')})"
+        )
+
+    return "\n".join(lines)
+
+
+def main() -> None:
+    """CLI entry point."""
+    parser = argparse.ArgumentParser(
+        description="Deep experiment comparison",
+    )
+    parser.add_argument(
+        "exp_a",
+        help="First experiment ID (e.g., exp-042)",
+    )
+    parser.add_argument(
+        "exp_b",
+        help="Second experiment ID (e.g., exp-053)",
+    )
+    parser.add_argument(
+        "--config", default="config.yaml",
+        help="Path to config.yaml",
+    )
+    parser.add_argument(
+        "--log", default=DEFAULT_LOG_PATH,
+        help="Path to experiment log",
+    )
+    parser.add_argument(
+        "--code", action="store_true",
+        help="Include git diff of train.py between experiments",
+    )
+    parser.add_argument(
+        "--json", action="store_true",
+        help="Output raw JSON instead of formatted report",
+    )
+    args = parser.parse_args()
+
+    report = experiment_diff(
+        exp_id_a=args.exp_a,
+        exp_id_b=args.exp_b,
+        config_path=args.config,
+        log_path=args.log,
+        include_code=args.code,
+    )
+
+    # Save report
+    if "error" not in report:
+        filepath = save_diff_report(report)
+        print(f"Saved to {filepath}", file=sys.stderr)
+
+    # Output
+    if args.json:
+        print(json.dumps(report, indent=2, default=str))
+    else:
+        print(format_diff_report(report))
+
+    # Exit code: 1 if regressions detected
+    summary = report.get("summary", {})
+    if summary.get("per_class_regressions", 0) > 0:
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
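
For reference, a worked sketch of the pooled z-test approximation that _check_significance applies above; the numbers are invented for illustration (a seed-study std of 0.01 and primary-metric values of 0.897 vs 0.912):

    # Invented numbers for illustration only.
    import math

    pooled_std = 0.01  # mean of the per-metric stds from the seed studies
    z = abs(0.912 - 0.897) / (pooled_std * math.sqrt(2))   # z ~= 1.06
    p = 2 * (1 - 0.5 * (1 + math.erf(z / math.sqrt(2))))   # two-tailed p ~= 0.29

With this much seed-to-seed variance, a 1.5-point gain would be marked "ns" (not significant at p < 0.05) in the metric table.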