claude-turing 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/.claude-plugin/plugin.json +2 -2
  2. package/README.md +33 -2
  3. package/commands/ablate.md +47 -0
  4. package/commands/diagnose.md +52 -0
  5. package/commands/frontier.md +45 -0
  6. package/commands/reproduce.md +48 -0
  7. package/commands/seed.md +47 -0
  8. package/commands/turing.md +10 -0
  9. package/package.json +1 -1
  10. package/src/install.js +2 -1
  11. package/src/verify.js +5 -0
  12. package/templates/config.yaml +10 -0
  13. package/templates/program.md +5 -0
  14. package/templates/scripts/__pycache__/ablation_study.cpython-314.pyc +0 -0
  15. package/templates/scripts/__pycache__/diagnose_errors.cpython-314.pyc +0 -0
  16. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  17. package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
  18. package/templates/scripts/__pycache__/pareto_frontier.cpython-314.pyc +0 -0
  19. package/templates/scripts/__pycache__/reproduce_experiment.cpython-314.pyc +0 -0
  20. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  21. package/templates/scripts/__pycache__/seed_runner.cpython-314.pyc +0 -0
  22. package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
  23. package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
  24. package/templates/scripts/ablation_study.py +487 -0
  25. package/templates/scripts/diagnose_errors.py +601 -0
  26. package/templates/scripts/generate_brief.py +117 -0
  27. package/templates/scripts/generate_model_card.py +25 -0
  28. package/templates/scripts/leaderboard.py +10 -0
  29. package/templates/scripts/pareto_frontier.py +470 -0
  30. package/templates/scripts/reproduce_experiment.py +548 -0
  31. package/templates/scripts/scaffold.py +11 -0
  32. package/templates/scripts/seed_runner.py +414 -0
  33. package/templates/scripts/show_metrics.py +17 -0
  34. package/templates/scripts/turing_io.py +36 -0
  35. package/templates/scripts/update_state.py +13 -0
@@ -25,6 +25,7 @@ import yaml
 
 from scripts.cost_frontier import compute_pareto_frontier, load_cost_data, _format_seconds
 from scripts.turing_io import load_config, load_experiments, load_hypotheses
+from scripts.seed_runner import CV_THRESHOLD
 
 
 def compute_campaign_summary(experiments: list[dict]) -> dict:
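The new CV_THRESHOLD import suggests generate_brief now shares seed_runner's cutoff for flagging seed-sensitive results. As a rough sketch of that relationship (an assumption; the actual classification lives in seed_runner.py, which this diff does not show), the cv_percent reported below would be compared against the shared threshold:

import statistics

def is_seed_sensitive(scores: list[float], cv_threshold: float) -> bool:
    # Hypothetical helper: coefficient of variation as a percent of the mean,
    # flagged when it exceeds the shared cutoff. seed_runner.py's real logic
    # may differ; needs at least two scores for stdev.
    mean = statistics.mean(scores)
    if mean == 0:
        return True  # CV undefined at zero mean; treat as sensitive
    cv_percent = statistics.stdev(scores) / abs(mean) * 100
    return cv_percent > cv_threshold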
@@ -211,6 +212,57 @@ def detect_environment_drift(experiments: list[dict]) -> list[str]:
     return warnings
 
 
+def load_diagnoses(diag_dir: str = "experiments/diagnoses") -> list[dict]:
+    """Load all diagnosis reports from YAML files."""
+    path = Path(diag_dir)
+    if not path.exists():
+        return []
+    diagnoses = []
+    for f in sorted(path.glob("*-diagnosis.yaml")):
+        try:
+            with open(f) as fh:
+                diag = yaml.safe_load(fh)
+            if diag and isinstance(diag, dict):
+                diagnoses.append(diag)
+        except (yaml.YAMLError, OSError):
+            continue
+    return diagnoses
+
+
+def load_seed_studies(seed_dir: str = "experiments/seed_studies") -> list[dict]:
+    """Load all seed study results from YAML files."""
+    path = Path(seed_dir)
+    if not path.exists():
+        return []
+    studies = []
+    for f in sorted(path.glob("*-seeds.yaml")):
+        try:
+            with open(f) as fh:
+                study = yaml.safe_load(fh)
+            if study and isinstance(study, dict):
+                studies.append(study)
+        except (yaml.YAMLError, OSError):
+            continue
+    return studies
+
+
+def load_reproductions(repro_dir: str = "experiments/reproductions") -> list[dict]:
+    """Load all reproduction reports from YAML files."""
+    path = Path(repro_dir)
+    if not path.exists():
+        return []
+    reports = []
+    for f in sorted(path.glob("*-repro.yaml")):
+        try:
+            with open(f) as fh:
+                report = yaml.safe_load(fh)
+            if report and isinstance(report, dict):
+                reports.append(report)
+        except (yaml.YAMLError, OSError):
+            continue
+    return reports
+
+
 def format_brief(
     campaign: dict,
     best: dict | None,
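For orientation, here is a hypothetical record in the shape these loaders return, with keys inferred from how format_brief and generate_card read seed studies later in this diff (the authoritative schema is whatever seed_runner.py actually writes):

seed_study = {
    "experiment_id": "exp-001",   # illustrative ID, not from the package
    "metric": "accuracy",
    "mean": 0.9132,
    "std": 0.0047,
    "cv_percent": 0.51,
    "ci_95": [0.9041, 0.9223],
    "seeds_run": [0, 1, 2, 3, 4],
    "seed_sensitive": False,
}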
@@ -223,6 +275,9 @@ def format_brief(
     env_warnings: list[str] | None = None,
     cost_data: list | None = None,
     cost_frontier: list | None = None,
+    seed_studies: list[dict] | None = None,
+    reproductions: list[dict] | None = None,
+    diagnoses: list[dict] | None = None,
 ) -> str:
     """Format the research briefing as markdown."""
     direction = "lower" if lower_is_better else "higher"
@@ -361,6 +416,60 @@ def format_brief(
             f"The {pct:.1f}% improvement costs {ratio:.0f}x more compute.",
         ])
 
+    # Seed studies
+    if seed_studies:
+        lines.extend(["", "## Seed Studies", ""])
+        for study in seed_studies:
+            exp_id = study.get("experiment_id", "?")
+            sensitive = study.get("seed_sensitive", False)
+            status = "SEED-SENSITIVE" if sensitive else "STABLE"
+            lines.append(
+                f"- **{exp_id}:** {study.get('metric', metric)} = "
+                f"{study.get('mean', 0):.4f} +/- {study.get('std', 0):.4f} "
+                f"(CV={study.get('cv_percent', 0):.1f}%) — **{status}**"
+            )
+            if sensitive:
+                lines.append(
+                    f"  - 95% CI: [{study['ci_95'][0]:.4f}, {study['ci_95'][1]:.4f}] "
+                    f"over {len(study.get('seeds_run', []))} seeds"
+                )
+        if any(s.get("seed_sensitive") for s in seed_studies):
+            lines.extend(["", "*Some results are seed-sensitive. Report distributions, not point estimates.*"])
+
+    # Reproduction reports
+    if reproductions:
+        lines.extend(["", "## Reproducibility", ""])
+        verdict_markers = {
+            "reproducible": "PASS",
+            "approximately_reproducible": "PASS (approx)",
+            "not_reproducible": "FAIL",
+            "environment_changed": "WARN (env)",
+        }
+        for report in reproductions:
+            exp_id = report.get("experiment_id", "?")
+            verdict = report.get("verdict", "unknown")
+            marker = verdict_markers.get(verdict, verdict)
+            lines.append(f"- **{exp_id}:** {marker} — {report.get('reason', 'N/A')}")
+        failed = [r for r in reproductions if r.get("verdict") in ("not_reproducible", "environment_changed")]
+        if failed:
+            lines.extend(["", f"*{len(failed)} experiment(s) failed reproducibility checks.*"])
+
+    # Diagnoses (error analysis)
+    if diagnoses:
+        lines.extend(["", "## Error Analysis", ""])
+        for diag in diagnoses:
+            exp_id = diag.get("experiment_id", "?")
+            modes = diag.get("failure_modes", [])
+            if modes:
+                lines.append(f"**{exp_id}** — {len(modes)} failure mode(s):")
+                for mode in modes[:3]:
+                    lines.append(f"- {mode.get('id', '?')}: {mode.get('description', 'N/A')}")
+                if len(modes) > 3:
+                    lines.append(f"  *...and {len(modes) - 3} more (see full diagnosis)*")
+        auto_hyps = sum(len(d.get("auto_hypotheses", [])) for d in diagnoses)
+        if auto_hyps:
+            lines.append(f"\n*{auto_hyps} auto-generated hypotheses from failure analysis.*")
+
     lines.extend([
         "",
         "## Recommendations",
@@ -420,11 +529,19 @@ def generate_brief(
     cost_records = load_cost_data(log_path, metric)
     pareto = compute_pareto_frontier(cost_records, lower_is_better) if cost_records else []
 
+    # Load seed studies, reproduction reports, and diagnoses
+    seed_studies = load_seed_studies()
+    reproductions = load_reproductions()
+    diagnoses = load_diagnoses()
+
     return format_brief(
         campaign, best, trajectory, model_types, hypotheses,
         metric, lower_is_better, failures, env_warnings,
         cost_data=cost_records if cost_records else None,
         cost_frontier=pareto if cost_records else None,
+        seed_studies=seed_studies if seed_studies else None,
+        reproductions=reproductions if reproductions else None,
+        diagnoses=diagnoses if diagnoses else None,
     )
 
 
@@ -243,6 +243,31 @@ def generate_card(
     else:
         lines.append("No experiments completed yet.")
 
+    # --- Seed Study ---
+    if best:
+        seed_study_path = Path("experiments/seed_studies") / f"{best.get('experiment_id', 'unknown')}-seeds.yaml"
+        if seed_study_path.exists():
+            import yaml
+            with open(seed_study_path) as f:
+                seed_study = yaml.safe_load(f) or {}
+            if seed_study and "mean" in seed_study:
+                sensitive = seed_study.get("seed_sensitive", False)
+                status = "SEED-SENSITIVE" if sensitive else "STABLE"
+                lines.extend([
+                    "",
+                    "### Seed Study",
+                    "",
+                    f"- **Status:** {status}",
+                    f"- **{metric}:** {seed_study['mean']:.4f} +/- {seed_study.get('std', 0):.4f}",
+                ])
+                if "ci_95" in seed_study:
+                    ci = seed_study["ci_95"]
+                    lines.append(f"- **95% CI:** [{ci[0]:.4f}, {ci[1]:.4f}]")
+                lines.append(f"- **CV:** {seed_study.get('cv_percent', 0):.2f}%")
+                lines.append(f"- **Seeds tested:** {len(seed_study.get('seeds_run', []))}")
+                if sensitive:
+                    lines.append("- *Result varies significantly across seeds. Report distribution, not point estimate.*")
+
     # --- Training History ---
     lines.extend([
         "",
@@ -503,6 +503,16 @@ def main() -> None:
     print()
     print(footer)
 
+    # Show seed study status for #1 if available
+    if ranked and args.fmt not in ("csv",):
+        from scripts.turing_io import load_seed_study
+        best_id = ranked[0].get("experiment_id")
+        if best_id:
+            study = load_seed_study(best_id)
+            if study and "mean" in study:
+                sensitive = "SEED-SENSITIVE" if study.get("seed_sensitive") else "STABLE"
+                print(f"\n  Seed study: {metric}={study['mean']:.4f}±{study.get('std', 0):.4f} ({sensitive})")
+
 
 if __name__ == "__main__":
     main()
@@ -0,0 +1,470 @@
+#!/usr/bin/env python3
+"""Multi-objective Pareto frontier visualization.
+
+Visualizes the Pareto frontier across multiple objectives from
+experiment history. Answers "which model is actually best?" when
+there are tradeoffs between accuracy, latency, model size, etc.
+
+Extends the existing cost_frontier.py (2D: metric vs train_time) to
+N-dimensional Pareto analysis across arbitrary metric sets.
+
+Usage:
+    python scripts/pareto_frontier.py                                             # Default metrics
+    python scripts/pareto_frontier.py --metrics "accuracy,train_seconds"          # Specific metrics
+    python scripts/pareto_frontier.py --metrics "accuracy,train_seconds,n_params" # 3D
+    python scripts/pareto_frontier.py --ascii                                     # ASCII scatter
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+import yaml
+
+from scripts.turing_io import load_config, load_experiments
+
+DEFAULT_LOG_PATH = "experiments/log.jsonl"
+
+
+def extract_metric_vectors(
+    experiments: list[dict],
+    metrics: list[str],
+    status_filter: str = "kept",
+) -> list[dict]:
+    """Extract metric vectors from experiments for Pareto analysis.
+
+    Returns a list of dicts with experiment_id, model_type, and metric values.
+    Only includes experiments that have ALL requested metrics.
+    """
+    results = []
+    for exp in experiments:
+        if status_filter and exp.get("status") != status_filter:
+            continue
+
+        exp_metrics = exp.get("metrics", {})
+        values = {}
+        complete = True
+        for m in metrics:
+            val = exp_metrics.get(m)
+            if val is None:
+                complete = False
+                break
+            try:
+                values[m] = float(val)
+            except (ValueError, TypeError):
+                complete = False
+                break
+
+        if complete:
+            results.append({
+                "experiment_id": exp.get("experiment_id", "?"),
+                "model_type": exp.get("config", {}).get("model_type", "unknown"),
+                "metrics": values,
+                "description": exp.get("description", ""),
+            })
+
+    return results
+
+
+def compute_pareto_set(
+    data: list[dict],
+    metrics: list[str],
+    directions: dict[str, str],
+) -> list[dict]:
+    """Compute the N-dimensional Pareto-optimal set.
+
+    An experiment is Pareto-optimal if no other experiment is at least as
+    good on all metrics and strictly better on at least one.
+
+    Args:
+        data: List of experiment dicts with metrics.
+        metrics: List of metric names.
+        directions: Dict mapping metric name to "higher" or "lower".
+
+    Returns:
+        List of Pareto-optimal experiment dicts.
+    """
+    if not data:
+        return []
+
+    frontier = []
+    for i, candidate in enumerate(data):
+        dominated = False
+        for j, other in enumerate(data):
+            if i == j:
+                continue
+            # Check if other dominates candidate: at least as good on all
+            # metrics and strictly better on at least one.
+            all_at_least_as_good = True
+            strictly_better_on_one = False
+
+            for m in metrics:
+                c_val = candidate["metrics"][m]
+                o_val = other["metrics"][m]
+                direction = directions.get(m, "higher")
+
+                if direction == "higher":
+                    if o_val < c_val:
+                        all_at_least_as_good = False
+                        break
+                    if o_val > c_val:
+                        strictly_better_on_one = True
+                else:  # lower is better
+                    if o_val > c_val:
+                        all_at_least_as_good = False
+                        break
+                    if o_val < c_val:
+                        strictly_better_on_one = True
+
+            if all_at_least_as_good and strictly_better_on_one:
+                dominated = True
+                break
+
+        if not dominated:
+            frontier.append(candidate)
+
+    return frontier
+
+
+def find_closest_pareto_neighbor(
+    dominated: dict,
+    frontier: list[dict],
+    metrics: list[str],
+    directions: dict[str, str],
+) -> dict | None:
+    """Find the closest Pareto-optimal experiment to a dominated one.
+
+    Uses normalized Euclidean distance across all metrics.
+    """
+    if not frontier:
+        return None
+
+    # Compute normalization ranges
+    all_points = frontier + [dominated]
+    ranges = {}
+    for m in metrics:
+        values = [p["metrics"][m] for p in all_points]
+        r = max(values) - min(values)
+        ranges[m] = r if r > 0 else 1.0
+
+    best_dist = float("inf")
+    best_neighbor = None
+
+    for fp in frontier:
+        dist = 0.0
+        for m in metrics:
+            norm_diff = (dominated["metrics"][m] - fp["metrics"][m]) / ranges[m]
+            dist += norm_diff ** 2
+        dist = math.sqrt(dist)
+
+        if dist < best_dist:
+            best_dist = dist
+            best_neighbor = fp
+
+    return best_neighbor
+
+
+def format_ascii_scatter(
+    data: list[dict],
+    frontier: list[dict],
+    x_metric: str,
+    y_metric: str,
+    width: int = 60,
+    height: int = 20,
+) -> str:
+    """Generate an ASCII scatter plot of two metrics with the Pareto frontier marked."""
+    if not data:
+        return "No data to plot."
+
+    frontier_ids = {e["experiment_id"] for e in frontier}
+
+    x_vals = [d["metrics"][x_metric] for d in data]
+    y_vals = [d["metrics"][y_metric] for d in data]
+
+    x_min, x_max = min(x_vals), max(x_vals)
+    y_min, y_max = min(y_vals), max(y_vals)
+
+    # Add margin
+    x_range = x_max - x_min if x_max != x_min else 1.0
+    y_range = y_max - y_min if y_max != y_min else 1.0
+    x_min -= x_range * 0.05
+    x_max += x_range * 0.05
+    y_min -= y_range * 0.05
+    y_max += y_range * 0.05
+    x_range = x_max - x_min
+    y_range = y_max - y_min
+
+    # Create grid
+    grid = [[" "] * width for _ in range(height)]
+
+    # Plot points
+    for d in data:
+        x = int((d["metrics"][x_metric] - x_min) / x_range * (width - 1))
+        y = int((d["metrics"][y_metric] - y_min) / y_range * (height - 1))
+        x = max(0, min(width - 1, x))
+        y = max(0, min(height - 1, y))
+        y = height - 1 - y  # Flip y axis
+
+        if d["experiment_id"] in frontier_ids:
+            grid[y][x] = "*"
+        elif grid[y][x] == " ":
+            grid[y][x] = "·"
+
+    # Build output
+    lines = [f"  {y_metric} vs {x_metric}  (* = Pareto-optimal, · = dominated)", ""]
+
+    # Y axis labels
+    y_top = f"{y_max:.3f}"
+    y_bot = f"{y_min:.3f}"
+
+    for i, row in enumerate(grid):
+        label = ""
+        if i == 0:
+            label = y_top.rjust(8)
+        elif i == height - 1:
+            label = y_bot.rjust(8)
+        else:
+            label = " " * 8
+        lines.append(f"{label} |{''.join(row)}|")
+
+    # X axis
+    lines.append(f"{'':>8} +{'-' * width}+")
+    x_label = f"{x_min:.3f}{'':>{width - 12}}{x_max:.3f}"
+    lines.append(f"{'':>9} {x_label}")
+
+    return "\n".join(lines)
+
+
+def format_frontier_report(
+    data: list[dict],
+    frontier: list[dict],
+    metrics: list[str],
+    directions: dict[str, str],
+) -> str:
+    """Format the Pareto frontier analysis as a markdown report."""
+    lines = [
+        "# Pareto Frontier Analysis",
+        "",
+        f"*{len(frontier)} Pareto-optimal of {len(data)} experiments across {len(metrics)} metrics*",
+        "",
+    ]
+
+    # Directions
+    dir_strs = [f"{m} ({'↓' if directions.get(m) == 'lower' else '↑'})" for m in metrics]
+    lines.append(f"**Metrics:** {', '.join(dir_strs)}")
+    lines.append("")
+
+    # Pareto-optimal table
+    lines.append("## Pareto-Optimal Experiments")
+    lines.append("")
+
+    header = "| Experiment | Model |"
+    sep = "|------------|-------|"
+    for m in metrics:
+        header += f" {m} |"
+        sep += f"{'---' * max(len(m) // 3, 1)}--|"
+    header += " Notes |"
+    sep += "-------|"
+    lines.append(header)
+    lines.append(sep)
+
+    for exp in frontier:
+        row = f"| {exp['experiment_id']} | {exp['model_type']} |"
+        for m in metrics:
+            row += f" {exp['metrics'][m]:.4f} |"
+        # Determine what this experiment is best at
+        best_at = []
+        for m in metrics:
+            is_best = True
+            for other in frontier:
+                if other is exp:
+                    continue
+                if directions.get(m) == "lower":
+                    if other["metrics"][m] < exp["metrics"][m]:
+                        is_best = False
+                        break
+                else:
+                    if other["metrics"][m] > exp["metrics"][m]:
+                        is_best = False
+                        break
+            if is_best:
+                best_at.append(f"Best {m}")
+        notes = ", ".join(best_at) if best_at else "Balanced"
+        row += f" {notes} |"
+        lines.append(row)
+
+    # Dominated experiments with nearest Pareto neighbor
+    dominated = [d for d in data if d not in frontier]
+    if dominated:
+        lines.extend([
+            "",
+            "## Dominated Experiments",
+            "",
+            "| Experiment | Model |",
+        ])
+        header2 = "|------------|-------|"
+        for m in metrics:
+            lines[-1] += f" {m} |"
+            header2 += f"{'---' * max(len(m) // 3, 1)}--|"
+        lines[-1] += " Nearest Pareto |"
+        header2 += "----------------|"
+        lines.append(header2)
+
+        for exp in dominated[:10]:
+            row = f"| {exp['experiment_id']} | {exp['model_type']} |"
+            for m in metrics:
+                row += f" {exp['metrics'][m]:.4f} |"
+            neighbor = find_closest_pareto_neighbor(exp, frontier, metrics, directions)
+            if neighbor:
+                row += f" {neighbor['experiment_id']} |"
+            else:
+                row += " — |"
+            lines.append(row)
+
+        if len(dominated) > 10:
+            lines.append(f"*...and {len(dominated) - 10} more dominated experiments*")
+
+    return "\n".join(lines)
+
+
+def save_frontier(frontier_data: dict, output_dir: str = "experiments/frontiers") -> Path:
+    """Save the frontier analysis to a YAML file."""
+    out_path = Path(output_dir)
+    out_path.mkdir(parents=True, exist_ok=True)
+    date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+    filepath = out_path / f"frontier-{date_str}.yaml"
+    with open(filepath, "w") as f:
+        yaml.dump(frontier_data, f, default_flow_style=False, sort_keys=False)
+    return filepath
+
+
+def run_frontier_analysis(
+    metrics_str: str | None = None,
+    config_path: str = "config.yaml",
+    log_path: str = DEFAULT_LOG_PATH,
+    ascii_plot: bool = False,
+) -> dict:
+    """Run the Pareto frontier analysis.
+
+    Args:
+        metrics_str: Comma-separated metric names (defaults to config metrics).
+        config_path: Path to config.yaml.
+        log_path: Path to the experiment log.
+        ascii_plot: Whether to generate an ASCII scatter plot.
+
+    Returns:
+        Frontier analysis result dict.
+    """
+    config = load_config(config_path)
+    eval_cfg = config.get("evaluation", {})
+    primary_metric = eval_cfg.get("primary_metric", "accuracy")
+    lower_is_better = eval_cfg.get("lower_is_better", False)
+
+    # Determine metrics to analyze
+    if metrics_str:
+        metrics = [m.strip() for m in metrics_str.split(",")]
+    else:
+        # Default: configured metrics + train_seconds
+        configured_metrics = eval_cfg.get("metrics", [primary_metric])
+        metrics = list(dict.fromkeys(configured_metrics + ["train_seconds"]))
+
+    # Determine direction for each metric
+    lower_metrics = {"train_seconds", "latency", "latency_ms", "n_params", "model_size",
+                     "mse", "rmse", "mae", "loss", "log_loss", "error_rate"}
+    directions = {}
+    for m in metrics:
+        if m == primary_metric:
+            directions[m] = "lower" if lower_is_better else "higher"
+        elif m in lower_metrics:
+            directions[m] = "lower"
+        else:
+            directions[m] = "higher"
+
+    # Extract data
+    experiments = load_experiments(log_path)
+    data = extract_metric_vectors(experiments, metrics)
+
+    if not data:
+        return {
+            "error": f"No experiments with all metrics: {metrics}",
+            "metrics_requested": metrics,
+            "hint": "Ensure experiments log all requested metrics.",
+        }
+
+    # Compute Pareto set
+    frontier = compute_pareto_set(data, metrics, directions)
+
+    result = {
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+        "metrics": metrics,
+        "directions": directions,
+        "total_experiments": len(data),
+        "pareto_optimal": len(frontier),
+        "frontier": [
+            {
+                "experiment_id": e["experiment_id"],
+                "model_type": e["model_type"],
+                "metrics": e["metrics"],
+            }
+            for e in frontier
+        ],
+        "dominated": len(data) - len(frontier),
+    }
+
+    if ascii_plot and len(metrics) >= 2:
+        result["ascii_plot"] = format_ascii_scatter(
+            data, frontier, metrics[0], metrics[1],
+        )
+
+    return result
+
+
+def main() -> None:
+    """CLI entry point."""
+    parser = argparse.ArgumentParser(description="Multi-objective Pareto frontier visualization")
+    parser.add_argument("--metrics", default=None, help="Comma-separated metric names")
+    parser.add_argument("--config", default="config.yaml", help="Path to config.yaml")
+    parser.add_argument("--log", default=DEFAULT_LOG_PATH, help="Path to experiment log")
+    parser.add_argument("--ascii", action="store_true", help="Include ASCII scatter plot")
+    parser.add_argument("--json", action="store_true", help="Output raw JSON")
+    args = parser.parse_args()
+
+    result = run_frontier_analysis(
+        metrics_str=args.metrics,
+        config_path=args.config,
+        log_path=args.log,
+        ascii_plot=args.ascii,
+    )
+
+    if "error" not in result:
+        filepath = save_frontier(result)
+        print(f"Saved to {filepath}", file=sys.stderr)
+
+    if args.json:
+        print(json.dumps(result, indent=2))
+    else:
+        if "error" in result:
+            print(f"ERROR: {result['error']}")
+        else:
+            # Re-extract full data for report formatting
+            config = load_config(args.config)
+            experiments = load_experiments(args.log)
+            metrics = result["metrics"]
+            data = extract_metric_vectors(experiments, metrics)
+            frontier_exps = [d for d in data if d["experiment_id"] in
+                             {f["experiment_id"] for f in result["frontier"]}]
+            report = format_frontier_report(data, frontier_exps, metrics, result["directions"])
+            print(report)
+
+            if result.get("ascii_plot"):
+                print()
+                print(result["ascii_plot"])
+
+
+if __name__ == "__main__":
+    main()
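To make the dominance rule in compute_pareto_set concrete: with accuracy to maximize and train_seconds to minimize, a point is excluded only when some other point is at least as good on both axes and strictly better on one. A minimal sketch on toy data (the import path mirrors how the script imports its siblings; none of these values come from the package):

from scripts.pareto_frontier import compute_pareto_set

points = [
    {"experiment_id": "a", "metrics": {"accuracy": 0.90, "train_seconds": 120.0}},
    {"experiment_id": "b", "metrics": {"accuracy": 0.92, "train_seconds": 300.0}},
    {"experiment_id": "c", "metrics": {"accuracy": 0.89, "train_seconds": 150.0}},
]
directions = {"accuracy": "higher", "train_seconds": "lower"}

frontier = compute_pareto_set(points, ["accuracy", "train_seconds"], directions)
print([p["experiment_id"] for p in frontier])
# -> ['a', 'b']: "a" dominates "c" (higher accuracy, faster), while "a" and
#    "b" trade accuracy against time, so both stay on the frontier.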