claude-turing 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -212,6 +212,40 @@ def detect_environment_drift(experiments: list[dict]) -> list[str]:
212
212
  return warnings
213
213
 
214
214
 
215
def load_profiles(profile_dir: str = "experiments/profiles") -> list[dict]:
    """Load all profiling results from YAML files.

    Scans *profile_dir* for ``*-profile.yaml`` files in sorted order and
    returns each parsed mapping. Files that cannot be read or parsed, and
    documents that are not non-empty mappings, are skipped silently.
    """
    base = Path(profile_dir)
    if not base.exists():
        return []
    loaded: list[dict] = []
    for candidate in sorted(base.glob("*-profile.yaml")):
        try:
            with open(candidate) as handle:
                parsed = yaml.safe_load(handle)
        except (yaml.YAMLError, OSError):
            # Best-effort loader: a broken file must not abort the scan.
            continue
        if parsed and isinstance(parsed, dict):
            loaded.append(parsed)
    return loaded
230
+
231
+
232
def load_diagnoses(diag_dir: str = "experiments/diagnoses") -> list[dict]:
    """Load all diagnosis reports from YAML files.

    Scans *diag_dir* for ``*-diagnosis.yaml`` files in sorted order and
    returns each parsed mapping. Unreadable or malformed files, and
    documents that are not non-empty mappings, are skipped silently.
    """
    base = Path(diag_dir)
    if not base.exists():
        return []
    reports: list[dict] = []
    for candidate in sorted(base.glob("*-diagnosis.yaml")):
        try:
            with open(candidate) as handle:
                parsed = yaml.safe_load(handle)
        except (yaml.YAMLError, OSError):
            # Best-effort loader: a broken file must not abort the scan.
            continue
        if parsed and isinstance(parsed, dict):
            reports.append(parsed)
    return reports
247
+
248
+
215
249
  def load_seed_studies(seed_dir: str = "experiments/seed_studies") -> list[dict]:
216
250
  """Load all seed study results from YAML files."""
217
251
  path = Path(seed_dir)
@@ -260,6 +294,8 @@ def format_brief(
260
294
  cost_frontier: list | None = None,
261
295
  seed_studies: list[dict] | None = None,
262
296
  reproductions: list[dict] | None = None,
297
+ diagnoses: list[dict] | None = None,
298
+ profiles: list[dict] | None = None,
263
299
  ) -> str:
264
300
  """Format the research briefing as markdown."""
265
301
  direction = "lower" if lower_is_better else "higher"
@@ -436,6 +472,39 @@ def format_brief(
436
472
  if failed:
437
473
  lines.extend(["", f"*{len(failed)} experiment(s) failed reproducibility checks.*"])
438
474
 
475
+ # Profiles
476
+ if profiles:
477
+ lines.extend(["", "## Performance Profile", ""])
478
+ for prof in profiles[-1:]: # Show most recent
479
+ exp_id = prof.get("experiment_id", "?")
480
+ p = prof.get("profile", {})
481
+ bn = prof.get("bottleneck", {})
482
+ lines.append(f"**{exp_id}:** {p.get('total_time_sec', 0):.1f}s total")
483
+ mem = p.get("memory", {})
484
+ if mem.get("peak_rss_mb"):
485
+ lines.append(f"- Peak memory: {mem['peak_rss_mb']:.0f} MB")
486
+ if bn.get("type") and bn["type"] != "none_detected":
487
+ lines.append(f"- Bottleneck: **{bn['type']}** ({bn.get('severity', 'unknown')})")
488
+ recs = prof.get("recommendations", [])
489
+ if recs:
490
+ lines.append(f"- Top recommendation: {recs[0]}")
491
+
492
+ # Diagnoses (error analysis)
493
+ if diagnoses:
494
+ lines.extend(["", "## Error Analysis", ""])
495
+ for diag in diagnoses:
496
+ exp_id = diag.get("experiment_id", "?")
497
+ modes = diag.get("failure_modes", [])
498
+ if modes:
499
+ lines.append(f"**{exp_id}** — {len(modes)} failure mode(s):")
500
+ for mode in modes[:3]:
501
+ lines.append(f"- {mode.get('id', '?')}: {mode.get('description', 'N/A')}")
502
+ if len(modes) > 3:
503
+ lines.append(f" *...and {len(modes) - 3} more (see full diagnosis)*")
504
+ auto_hyps = sum(len(d.get("auto_hypotheses", [])) for d in diagnoses)
505
+ if auto_hyps:
506
+ lines.append(f"\n*{auto_hyps} auto-generated hypotheses from failure analysis.*")
507
+
439
508
  lines.extend([
440
509
  "",
441
510
  "## Recommendations",
@@ -495,9 +564,11 @@ def generate_brief(
495
564
  cost_records = load_cost_data(log_path, metric)
496
565
  pareto = compute_pareto_frontier(cost_records, lower_is_better) if cost_records else []
497
566
 
498
- # Load seed studies and reproduction reports
567
+ # Load seed studies, reproduction reports, diagnoses, and profiles
499
568
  seed_studies = load_seed_studies()
500
569
  reproductions = load_reproductions()
570
+ diagnoses = load_diagnoses()
571
+ profiles = load_profiles()
501
572
 
502
573
  return format_brief(
503
574
  campaign, best, trajectory, model_types, hypotheses,
@@ -506,6 +577,8 @@ def generate_brief(
506
577
  cost_frontier=pareto if cost_records else None,
507
578
  seed_studies=seed_studies if seed_studies else None,
508
579
  reproductions=reproductions if reproductions else None,
580
+ diagnoses=diagnoses if diagnoses else None,
581
+ profiles=profiles if profiles else None,
509
582
  )
510
583
 
511
584
 
@@ -0,0 +1,470 @@
1
+ #!/usr/bin/env python3
2
+ """Multi-objective Pareto frontier visualization.
3
+
4
+ Visualizes the Pareto frontier across multiple objectives from
5
+ experiment history. Answers "which model is actually best?" when
6
+ there are tradeoffs between accuracy, latency, model size, etc.
7
+
8
+ Extends the existing cost_frontier.py (2D: metric vs train_time) to
9
+ N-dimensional Pareto analysis across arbitrary metric sets.
10
+
11
+ Usage:
12
+ python scripts/pareto_frontier.py # Default metrics
13
+ python scripts/pareto_frontier.py --metrics "accuracy,train_seconds" # Specific metrics
14
+ python scripts/pareto_frontier.py --metrics "accuracy,train_seconds,n_params" # 3D
15
+ python scripts/pareto_frontier.py --ascii # ASCII scatter
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import json
22
+ import math
23
+ import sys
24
+ from datetime import datetime, timezone
25
+ from pathlib import Path
26
+
27
+ import yaml
28
+
29
+ from scripts.turing_io import load_config, load_experiments
30
+
31
+ DEFAULT_LOG_PATH = "experiments/log.jsonl"
32
+
33
+
34
def extract_metric_vectors(
    experiments: list[dict],
    metrics: list[str],
    status_filter: str = "kept",
) -> list[dict]:
    """Extract metric vectors from experiments for Pareto analysis.

    Returns list of dicts with experiment_id, model_type, and metric values.
    Only includes experiments that have ALL requested metrics (missing or
    non-numeric values exclude the experiment). An empty *status_filter*
    disables status filtering.
    """
    vectors: list[dict] = []
    for record in experiments:
        if status_filter and record.get("status") != status_filter:
            continue

        raw = record.get("metrics", {})
        numeric: dict[str, float] = {}
        for name in metrics:
            value = raw.get(name)
            if value is None:
                break
            try:
                numeric[name] = float(value)
            except (TypeError, ValueError):
                break
        else:
            # for/else: only reached when every metric converted cleanly.
            vectors.append({
                "experiment_id": record.get("experiment_id", "?"),
                "model_type": record.get("config", {}).get("model_type", "unknown"),
                "metrics": numeric,
                "description": record.get("description", ""),
            })

    return vectors
72
+
73
+
74
def compute_pareto_set(
    data: list[dict],
    metrics: list[str],
    directions: dict[str, str],
) -> list[dict]:
    """Compute N-dimensional Pareto-optimal set.

    An experiment is Pareto-optimal if no other experiment is at least as
    good on every metric and strictly better on at least one.

    Args:
        data: List of experiment dicts with metrics.
        metrics: List of metric names.
        directions: Dict mapping metric name to "higher" or "lower"
            (unknown metrics default to "higher").

    Returns:
        List of Pareto-optimal experiment dicts, in input order.
    """
    if not data:
        return []

    def _dominates(challenger: dict, target: dict) -> bool:
        # challenger dominates target iff it is at least as good on every
        # metric and strictly better on at least one.
        strict = False
        for name in metrics:
            c_val = challenger["metrics"][name]
            t_val = target["metrics"][name]
            if directions.get(name, "higher") == "higher":
                if c_val < t_val:
                    return False
                if c_val > t_val:
                    strict = True
            else:  # lower is better
                if c_val > t_val:
                    return False
                if c_val < t_val:
                    strict = True
        return strict

    return [
        point
        for idx, point in enumerate(data)
        if not any(
            _dominates(rival, point)
            for pos, rival in enumerate(data)
            if pos != idx
        )
    ]
132
+
133
+
134
+ def find_closest_pareto_neighbor(
135
+ dominated: dict,
136
+ frontier: list[dict],
137
+ metrics: list[str],
138
+ directions: dict[str, str],
139
+ ) -> dict | None:
140
+ """Find the closest Pareto-optimal experiment to a dominated one.
141
+
142
+ Uses normalized Euclidean distance across all metrics.
143
+ """
144
+ if not frontier:
145
+ return None
146
+
147
+ # Compute normalization ranges
148
+ all_points = frontier + [dominated]
149
+ ranges = {}
150
+ for m in metrics:
151
+ values = [p["metrics"][m] for p in all_points]
152
+ r = max(values) - min(values)
153
+ ranges[m] = r if r > 0 else 1.0
154
+
155
+ best_dist = float("inf")
156
+ best_neighbor = None
157
+
158
+ for fp in frontier:
159
+ dist = 0
160
+ for m in metrics:
161
+ norm_diff = (dominated["metrics"][m] - fp["metrics"][m]) / ranges[m]
162
+ dist += norm_diff ** 2
163
+ dist = math.sqrt(dist)
164
+
165
+ if dist < best_dist:
166
+ best_dist = dist
167
+ best_neighbor = fp
168
+
169
+ return best_neighbor
170
+
171
+
172
def format_ascii_scatter(
    data: list[dict],
    frontier: list[dict],
    x_metric: str,
    y_metric: str,
    width: int = 60,
    height: int = 20,
) -> str:
    """Generate an ASCII scatter plot of two metrics with Pareto frontier marked.

    Pareto-optimal points render as ``*`` (and overwrite dominated marks
    in the same cell); dominated points render as ``·``.
    """
    if not data:
        return "No data to plot."

    optimal_ids = {entry["experiment_id"] for entry in frontier}

    xs = [entry["metrics"][x_metric] for entry in data]
    ys = [entry["metrics"][y_metric] for entry in data]
    x_lo, x_hi = min(xs), max(xs)
    y_lo, y_hi = min(ys), max(ys)

    # Pad each axis by 5% so extreme points sit inside the frame; a
    # degenerate (constant) axis gets a unit span to avoid divide-by-zero.
    span_x = x_hi - x_lo if x_hi != x_lo else 1.0
    span_y = y_hi - y_lo if y_hi != y_lo else 1.0
    x_lo, x_hi = x_lo - span_x * 0.05, x_hi + span_x * 0.05
    y_lo, y_hi = y_lo - span_y * 0.05, y_hi + span_y * 0.05
    span_x, span_y = x_hi - x_lo, y_hi - y_lo

    canvas = [[" "] * width for _ in range(height)]

    for entry in data:
        col = int((entry["metrics"][x_metric] - x_lo) / span_x * (width - 1))
        row = int((entry["metrics"][y_metric] - y_lo) / span_y * (height - 1))
        col = max(0, min(width - 1, col))
        row = max(0, min(height - 1, row))
        row = height - 1 - row  # screen rows grow downward
        if entry["experiment_id"] in optimal_ids:
            canvas[row][col] = "*"
        elif canvas[row][col] == " ":
            canvas[row][col] = "·"

    out = [f" {y_metric} vs {x_metric} (* = Pareto-optimal, · = dominated)", ""]

    # Y-axis extremes label the first and last plotted rows.
    top_label = f"{y_hi:.3f}"
    bottom_label = f"{y_lo:.3f}"
    for idx, cells in enumerate(canvas):
        if idx == 0:
            prefix = top_label.rjust(8)
        elif idx == height - 1:
            prefix = bottom_label.rjust(8)
        else:
            prefix = " " * 8
        out.append(f"{prefix} |{''.join(cells)}|")

    # X axis rule and end labels.
    out.append(f"{'':>8} +{'-' * width}+")
    x_axis = f"{x_lo:.3f}{'':>{width - 12}}{x_hi:.3f}"
    out.append(f"{'':>9} {x_axis}")

    return "\n".join(out)
241
+
242
+
243
def format_frontier_report(
    data: list[dict],
    frontier: list[dict],
    metrics: list[str],
    directions: dict[str, str],
) -> str:
    """Format Pareto frontier analysis as a markdown report.

    Args:
        data: All experiment dicts considered in the analysis.
        frontier: The Pareto-optimal subset of ``data``.
        metrics: Metric names, in column order.
        directions: Metric name -> "higher" or "lower" (better direction).

    Returns:
        A markdown document: a table of Pareto-optimal experiments with a
        "what is this best at" note, plus (when present) a table of up to
        10 dominated experiments with their nearest Pareto neighbor.
    """
    lines = [
        "# Pareto Frontier Analysis",
        "",
        f"*{len(frontier)} Pareto-optimal of {len(data)} experiments across {len(metrics)} metrics*",
        "",
    ]

    # Metric list annotated with the better direction (↓ = lower wins).
    dir_strs = [f"{m} ({'↓' if directions.get(m) == 'lower' else '↑'})" for m in metrics]
    lines.append(f"**Metrics:** {', '.join(dir_strs)}")
    lines.append("")

    lines.append("## Pareto-Optimal Experiments")
    lines.append("")
    lines.append(_report_header(metrics, " Notes |"))
    lines.append(_report_separator(metrics, "-------|"))

    for exp in frontier:
        notes = _best_at_notes(exp, frontier, metrics, directions)
        lines.append(_metric_row(exp, metrics) + f" {notes} |")

    # Dominated experiments with their nearest Pareto neighbor.
    dominated = [d for d in data if d not in frontier]
    if dominated:
        lines.extend(["", "## Dominated Experiments", ""])
        lines.append(_report_header(metrics, " Nearest Pareto |"))
        lines.append(_report_separator(metrics, "----------------|"))
        for exp in dominated[:10]:
            neighbor = find_closest_pareto_neighbor(exp, frontier, metrics, directions)
            tail = f" {neighbor['experiment_id']} |" if neighbor else " — |"
            lines.append(_metric_row(exp, metrics) + tail)
        if len(dominated) > 10:
            lines.append(f"*...and {len(dominated) - 10} more dominated experiments*")

    return "\n".join(lines)


def _report_header(metrics: list[str], last_col: str) -> str:
    """Build a markdown table header row: id, model, one cell per metric, *last_col*."""
    header = "| Experiment | Model |"
    for m in metrics:
        header += f" {m} |"
    return header + last_col


def _report_separator(metrics: list[str], last_col: str) -> str:
    """Build the markdown separator row matching ``_report_header``."""
    sep = "|------------|-------|"
    for m in metrics:
        # Dash count loosely tracks the column-name length; markdown only
        # requires at least three dashes per column.
        sep += f"{'---' * max(len(m) // 3, 1)}--|"
    return sep + last_col


def _metric_row(exp: dict, metrics: list[str]) -> str:
    """Build the leading cells (id, model, 4-decimal metric values) of a table row."""
    row = f"| {exp['experiment_id']} | {exp['model_type']} |"
    for m in metrics:
        row += f" {exp['metrics'][m]:.4f} |"
    return row


def _best_at_notes(exp: dict, frontier: list[dict], metrics: list[str], directions: dict[str, str]) -> str:
    """Summarize which metrics *exp* leads the frontier on ("Balanced" if none)."""
    best_at = []
    for m in metrics:
        if directions.get(m) == "lower":
            is_best = all(other["metrics"][m] >= exp["metrics"][m]
                          for other in frontier if other is not exp)
        else:
            is_best = all(other["metrics"][m] <= exp["metrics"][m]
                          for other in frontier if other is not exp)
        if is_best:
            best_at.append(f"Best {m}")
    return ", ".join(best_at) if best_at else "Balanced"
333
+
334
+
335
def save_frontier(frontier_data: dict, output_dir: str = "experiments/frontiers") -> Path:
    """Save frontier analysis to a date-stamped YAML file and return its path.

    Creates *output_dir* (and parents) if needed; the filename is
    ``frontier-YYYY-MM-DD.yaml`` in UTC, so a second run on the same day
    overwrites the earlier file.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    target = target_dir / f"frontier-{stamp}.yaml"
    with open(target, "w") as sink:
        yaml.dump(frontier_data, sink, default_flow_style=False, sort_keys=False)
    return target
344
+
345
+
346
def run_frontier_analysis(
    metrics_str: str | None = None,
    config_path: str = "config.yaml",
    log_path: str = DEFAULT_LOG_PATH,
    ascii_plot: bool = False,
) -> dict:
    """Run Pareto frontier analysis.

    Args:
        metrics_str: Comma-separated metric names (defaults to config metrics).
        config_path: Path to config.yaml.
        log_path: Path to experiment log.
        ascii_plot: Whether to generate ASCII scatter plot.

    Returns:
        Frontier analysis result dict, or an ``error`` dict when no
        experiment carries all requested metrics.
    """
    cfg = load_config(config_path)
    eval_cfg = cfg.get("evaluation", {})
    primary = eval_cfg.get("primary_metric", "accuracy")
    primary_lower = eval_cfg.get("lower_is_better", False)

    # Metric list: explicit override wins, else config metrics plus
    # train_seconds (deduplicated, order-preserving).
    if metrics_str:
        metric_names = [m.strip() for m in metrics_str.split(",")]
    else:
        configured = eval_cfg.get("metrics", [primary])
        metric_names = list(dict.fromkeys(configured + ["train_seconds"]))

    # Metrics where smaller is better; everything unknown defaults to "higher".
    known_lower = {"train_seconds", "latency", "latency_ms", "n_params", "model_size",
                   "mse", "rmse", "mae", "loss", "log_loss", "error_rate"}
    directions: dict[str, str] = {}
    for name in metric_names:
        if name == primary:
            # The primary metric's direction comes from config, not the heuristic set.
            directions[name] = "lower" if primary_lower else "higher"
        else:
            directions[name] = "lower" if name in known_lower else "higher"

    points = extract_metric_vectors(load_experiments(log_path), metric_names)
    if not points:
        return {
            "error": f"No experiments with all metrics: {metric_names}",
            "metrics_requested": metric_names,
            "hint": "Ensure experiments log all requested metrics.",
        }

    pareto = compute_pareto_set(points, metric_names, directions)

    result = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "metrics": metric_names,
        "directions": directions,
        "total_experiments": len(points),
        "pareto_optimal": len(pareto),
        "frontier": [
            {
                "experiment_id": entry["experiment_id"],
                "model_type": entry["model_type"],
                "metrics": entry["metrics"],
            }
            for entry in pareto
        ],
        "dominated": len(points) - len(pareto),
    }

    if ascii_plot and len(metric_names) >= 2:
        # Plot the first two metrics only; higher dimensions don't fit ASCII.
        result["ascii_plot"] = format_ascii_scatter(
            points, pareto, metric_names[0], metric_names[1],
        )

    return result
425
+
426
+
427
def main() -> None:
    """CLI entry point: run the analysis, persist it, and print a report.

    Fix: the original re-loaded the config into an unused local
    (``config = load_config(args.config)``) before formatting the report;
    ``run_frontier_analysis`` already read it, so the redundant file read
    is dropped.
    """
    parser = argparse.ArgumentParser(description="Multi-objective Pareto frontier visualization")
    parser.add_argument("--metrics", default=None, help="Comma-separated metric names")
    parser.add_argument("--config", default="config.yaml", help="Path to config.yaml")
    parser.add_argument("--log", default=DEFAULT_LOG_PATH, help="Path to experiment log")
    parser.add_argument("--ascii", action="store_true", help="Include ASCII scatter plot")
    parser.add_argument("--json", action="store_true", help="Output raw JSON")
    args = parser.parse_args()

    result = run_frontier_analysis(
        metrics_str=args.metrics,
        config_path=args.config,
        log_path=args.log,
        ascii_plot=args.ascii,
    )

    # Persist successful analyses; the save path goes to stderr so stdout
    # stays clean for the report/JSON output.
    if "error" not in result:
        filepath = save_frontier(result)
        print(f"Saved to {filepath}", file=sys.stderr)

    if args.json:
        print(json.dumps(result, indent=2))
        return

    if "error" in result:
        print(f"ERROR: {result['error']}")
        return

    # Rebuild the full metric vectors so the report can show dominated
    # rows too (the saved result only keeps the frontier entries).
    experiments = load_experiments(args.log)
    metrics = result["metrics"]
    data = extract_metric_vectors(experiments, metrics)
    frontier_ids = {f["experiment_id"] for f in result["frontier"]}
    frontier_exps = [d for d in data if d["experiment_id"] in frontier_ids]
    print(format_frontier_report(data, frontier_exps, metrics, result["directions"]))

    if result.get("ascii_plot"):
        print()
        print(result["ascii_plot"])
467
+
468
+
469
# Script entry point: run the CLI when executed directly
# (e.g. `python scripts/pareto_frontier.py --ascii`).
if __name__ == "__main__":
    main()