claude-turing 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. package/.claude-plugin/plugin.json +2 -2
  2. package/README.md +48 -7
  3. package/commands/brief.md +13 -1
  4. package/commands/card.md +36 -0
  5. package/commands/init.md +13 -0
  6. package/commands/train.md +16 -7
  7. package/commands/turing.md +4 -2
  8. package/package.json +1 -1
  9. package/src/install.js +1 -1
  10. package/src/verify.js +1 -0
  11. package/templates/model_contract.md +49 -0
  12. package/templates/model_registry.yaml +69 -0
  13. package/templates/program.md +2 -0
  14. package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc +0 -0
  15. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  16. package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
  17. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  18. package/templates/scripts/cleanup.py +599 -0
  19. package/templates/scripts/cost_frontier.py +292 -0
  20. package/templates/scripts/diff_configs.py +534 -0
  21. package/templates/scripts/export_results.py +457 -0
  22. package/templates/scripts/generate_brief.py +54 -0
  23. package/templates/scripts/generate_model_card.py +342 -0
  24. package/templates/scripts/leaderboard.py +508 -0 (shown in full below)
  25. package/templates/scripts/plot_trajectory.py +611 -0
  26. package/templates/scripts/scaffold.py +9 -0
  27. package/templates/scripts/show_metrics.py +23 -2
  28. package/templates/tests/__pycache__/__init__.cpython-314.pyc +0 -0
  29. package/templates/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
  30. package/templates/tests/__pycache__/test_cost_frontier.cpython-314-pytest-9.0.2.pyc +0 -0
  31. package/templates/tests/test_cost_frontier.py +222 -0
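The largest addition is package/templates/scripts/leaderboard.py (entry 24 above); the +508-line hunk below contains its full contents. The script ranks entries from experiments/log.jsonl by the primary metric declared in config.yaml and renders a text, Markdown, CSV, or compact table. As a rough illustration only, a log record would look something like the following; the field names are inferred from the script itself, not from package documentation, and the values are invented:

    # Hypothetical experiments/log.jsonl record, reconstructed from the fields
    # leaderboard.py reads (experiment_id, status, timestamp, config, metrics).
    {
        "experiment_id": "exp_012",
        "status": "kept",
        "timestamp": "2025-01-15T09:30:00",
        "config": {"model_type": "xgboost", "hyperparams": {"max_depth": 6, "learning_rate": 0.1}},
        "metrics": {"accuracy": 0.912, "f1_weighted": 0.905}
    }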
@@ -0,0 +1,508 @@
+"""Ranked leaderboard of experiments by primary metric.
+
+One-glance view of where we stand: experiments ranked best-to-worst
+with model type, key hyperparams, delta vs leader, and kept/discarded status.
+Reads experiments/log.jsonl and config.yaml.
+
+Usage:
+    python scripts/leaderboard.py                       # Top kept experiments
+    python scripts/leaderboard.py --top 5               # Top 5 only
+    python scripts/leaderboard.py --status all          # Include discarded
+    python scripts/leaderboard.py --format markdown     # For docs/README
+    python scripts/leaderboard.py --metric f1_weighted  # Rank by F1
+    python scripts/leaderboard.py --compact             # Minimal output
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import io
+import json
+import sys
+from pathlib import Path
+
+from scripts.turing_io import load_config, load_experiments
+
+DEFAULT_LOG_PATH = "experiments/log.jsonl"
+
+# Key hyperparameter names to surface in the compact column.
+# Order matters — first match wins when space is limited.
+_HYPERPARAM_KEYS = [
+    "max_depth",
+    "depth",
+    "learning_rate",
+    "lr",
+    "n_estimators",
+    "n_est",
+    "num_leaves",
+    "subsample",
+    "colsample_bytree",
+    "min_child_weight",
+    "reg_alpha",
+    "reg_lambda",
+    "C",
+    "gamma",
+    "hidden_size",
+    "num_layers",
+    "dropout",
+    "batch_size",
+    "epochs",
+]
+
+_HYPERPARAM_ALIASES = {
+    "max_depth": "depth",
+    "learning_rate": "lr",
+    "n_estimators": "n_est",
+    "num_leaves": "n_leaves",
+    "colsample_bytree": "col_samp",
+    "min_child_weight": "min_cw",
+    "hidden_size": "hidden",
+    "num_layers": "layers",
+    "batch_size": "bs",
+}
+
+
+def _compact_hyperparams(hyperparams: dict, max_pairs: int = 4) -> str:
+    """Format hyperparams as a compact key=value string.
+
+    Picks the most informative parameters (bias toward the key ones defined
+    in _HYPERPARAM_KEYS), aliases long names, and truncates beyond max_pairs.
+    """
+    if not hyperparams:
+        return "—"
+
+    chosen: list[tuple[str, object]] = []
+
+    # First pass: grab known interesting keys in priority order.
+    for key in _HYPERPARAM_KEYS:
+        if key in hyperparams:
+            alias = _HYPERPARAM_ALIASES.get(key, key)
+            chosen.append((alias, hyperparams[key]))
+            if len(chosen) >= max_pairs:
+                break
+
+    # Second pass: fill remaining slots with leftover keys.
+    if len(chosen) < max_pairs:
+        seen = {k for k, _ in chosen}
+        for key, val in hyperparams.items():
+            if key not in seen and isinstance(val, (int, float, str, bool)):
+                alias = _HYPERPARAM_ALIASES.get(key, key)
+                chosen.append((alias, val))
+                if len(chosen) >= max_pairs:
+                    break
+
+    parts = []
+    for k, v in chosen:
+        if isinstance(v, float):
+            parts.append(f"{k}={v:.4g}")
+        else:
+            parts.append(f"{k}={v}")
+
+    return ", ".join(parts) if parts else "—"
+
+
+def _fmt_metric(value: float | None, precision: int = 4) -> str:
+    """Format a metric value, returning '—' for None."""
+    if value is None:
+        return "—"
+    return f"{value:.{precision}f}"
+
+
+def _fmt_delta(delta: float | None, lower_is_better: bool) -> str:
+    """Format delta vs leader. Always non-negative; sign indicates direction."""
+    if delta is None:
+        return "—"
+    if abs(delta) < 1e-9:
+        return "—"  # This IS the leader; no delta shown.
+    # delta is already (leader_value - this_value) in absolute terms,
+    # normalised so that positive = worse than leader.
+    return f"-{abs(delta):.4f}"
+
+
+def _status_marker(status: str) -> str:
+    return "✓" if status == "kept" else "✗"
+
+
+def _sort_key(exp: dict, metric: str, lower_is_better: bool):
+    """Sort key: best first regardless of direction."""
+    val = exp.get("metrics", {}).get(metric)
+    if val is None:
+        # Push experiments with missing metric to the bottom.
+        # (float("inf") sorts last for both metric directions.)
+        return float("inf")
+    return val if lower_is_better else -val
+
+
+def rank_experiments(
+    experiments: list[dict],
+    metric: str,
+    lower_is_better: bool,
+    status_filter: str,
+) -> list[dict]:
+    """Filter and rank experiments, returning list with injected 'rank' field."""
+    if status_filter == "kept":
+        filtered = [e for e in experiments if e.get("status") == "kept"]
+    else:
+        filtered = list(experiments)
+
+    ranked = sorted(filtered, key=lambda e: _sort_key(e, metric, lower_is_better))
+
+    # Inject rank
+    for i, exp in enumerate(ranked):
+        exp = dict(exp)  # shallow copy — don't mutate the original
+        exp["_rank"] = i + 1
+        ranked[i] = exp
+
+    return ranked
+
+
+def compute_delta(ranked: list[dict], metric: str, lower_is_better: bool) -> list[dict]:
+    """Inject '_delta' key: absolute gap behind the leader (positive = worse)."""
+    if not ranked:
+        return ranked
+
+    leader_val = ranked[0].get("metrics", {}).get(metric)
+
+    result = []
+    for exp in ranked:
+        exp = dict(exp)
+        val = exp.get("metrics", {}).get(metric)
+        if leader_val is not None and val is not None:
+            # For higher-is-better: leader - this (positive means worse).
+            # For lower-is-better: this - leader (positive means worse).
+            if lower_is_better:
+                exp["_delta"] = val - leader_val
+            else:
+                exp["_delta"] = leader_val - val
+        else:
+            exp["_delta"] = None
+        result.append(exp)
+
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Formatters
+# ---------------------------------------------------------------------------
+
+def _build_rows(ranked: list[dict], metric: str, lower_is_better: bool) -> list[dict]:
+    """Build uniform row dicts for all formatters."""
+    rows = []
+    for exp in ranked:
+        rank = exp["_rank"]
+        exp_id = exp.get("experiment_id", "?")
+        model_type = exp.get("config", {}).get("model_type", "?")
+        hyperparams = exp.get("config", {}).get("hyperparams", {})
+        metric_val = exp.get("metrics", {}).get(metric)
+        delta = exp.get("_delta")
+        status = exp.get("status", "?")
+
+        rows.append({
+            "rank": rank,
+            "rank_label": f"#{rank}",
+            "experiment_id": exp_id,
+            "model_type": model_type,
+            "metric_value": metric_val,
+            "metric_str": _fmt_metric(metric_val),
+            "hyperparams": _compact_hyperparams(hyperparams),
+            "delta": _fmt_delta(delta, lower_is_better),
+            "status": status,
+            "status_marker": _status_marker(status),
+        })
+    return rows
+
+
+def format_text(rows: list[dict], metric: str) -> str:
+    """Render as a fixed-width text table."""
+    if not rows:
+        return "No experiments to display."
+
+    # Column widths
+    w_rank = 4
+    w_id = max(len(r["experiment_id"]) for r in rows)
+    w_id = max(w_id, len("Experiment"))
+    w_model = max(len(r["model_type"]) for r in rows)
+    w_model = max(w_model, len("Model"))
+    w_metric = max(len(r["metric_str"]) for r in rows)
+    w_metric = max(w_metric, len(metric))
+    w_hp = max(len(r["hyperparams"]) for r in rows)
+    w_hp = max(w_hp, len("Hyperparams"))
+    w_delta = max(len(r["delta"]) for r in rows)
+    w_delta = max(w_delta, len("vs #1"))
+    w_status = 6  # "Status"
+
+    def sep():
+        return (
+            "+"
+            + "-" * (w_rank + 2)
+            + "+"
+            + "-" * (w_id + 2)
+            + "+"
+            + "-" * (w_model + 2)
+            + "+"
+            + "-" * (w_metric + 2)
+            + "+"
+            + "-" * (w_hp + 2)
+            + "+"
+            + "-" * (w_delta + 2)
+            + "+"
+            + "-" * (w_status + 2)
+            + "+"
+        )
+
+    def row_line(rank, exp_id, model, metric_v, hp, delta, status, highlight=False):
+        metric_cell = metric_v.ljust(w_metric)
+        if highlight:
+            metric_cell = f"[{metric_cell.strip()}]".ljust(w_metric)
+        return (
+            f"| {rank:<{w_rank}} "
+            f"| {exp_id:<{w_id}} "
+            f"| {model:<{w_model}} "
+            f"| {metric_cell} "
+            f"| {hp:<{w_hp}} "
+            f"| {delta:<{w_delta}} "
+            f"| {status:<{w_status}} |"
+        )
+
+    lines = [sep()]
+    header = row_line(
+        "Rank", "Experiment", "Model", metric[:w_metric], "Hyperparams", "vs #1", "Status"
+    )
+    lines.append(header)
+    lines.append(sep())
+
+    for r in rows:
+        highlight = r["rank"] == 1 and r["metric_value"] is not None
+        line = row_line(
+            r["rank_label"],
+            r["experiment_id"],
+            r["model_type"],
+            r["metric_str"],
+            r["hyperparams"],
+            r["delta"],
+            r["status_marker"],
+            highlight=highlight,
+        )
+        lines.append(line)
+
+    lines.append(sep())
+    return "\n".join(lines)
+
+
+def format_markdown(rows: list[dict], metric: str) -> str:
+    """Render as a GitHub-flavored Markdown table."""
+    if not rows:
+        return "_No experiments to display._"
+
+    header = f"| Rank | Experiment | Model | {metric} | Hyperparams | vs #1 | Status |"
+    sep = "|------|------------|-------|" + "-" * (len(metric) + 2) + "|-------------|-------|--------|"
+
+    lines = [header, sep]
+    for r in rows:
+        metric_cell = r["metric_str"]
+        if r["rank"] == 1 and r["metric_value"] is not None:
+            metric_cell = f"**{metric_cell}**"
+        lines.append(
+            f"| {r['rank_label']} "
+            f"| {r['experiment_id']} "
+            f"| {r['model_type']} "
+            f"| {metric_cell} "
+            f"| {r['hyperparams']} "
+            f"| {r['delta']} "
+            f"| {r['status_marker']} |"
+        )
+
+    return "\n".join(lines)
+
+
+def format_csv(rows: list[dict], metric: str) -> str:
+    """Render as CSV."""
+    if not rows:
+        return ""
+
+    buf = io.StringIO()
+    writer = csv.writer(buf)
+    writer.writerow(["rank", "experiment_id", "model_type", metric, "hyperparams", "vs_leader", "status"])
+    for r in rows:
+        writer.writerow([
+            r["rank"],
+            r["experiment_id"],
+            r["model_type"],
+            r["metric_str"],
+            r["hyperparams"],
+            r["delta"],
+            r["status"],
+        ])
+    return buf.getvalue().rstrip()
+
+
+def format_compact(rows: list[dict], metric: str) -> str:
+    """One line per experiment, no borders."""
+    if not rows:
+        return "No experiments to display."
+
+    lines = []
+    for r in rows:
+        marker = "*" if r["rank"] == 1 else " "
+        lines.append(
+            f"{marker}{r['rank_label']:<4} {r['experiment_id']:<10} "
+            f"{r['model_type']:<15} {metric}={r['metric_str']} "
+            f"({r['hyperparams']}) vs#1={r['delta']} {r['status_marker']}"
+        )
+    return "\n".join(lines)
+
+
+# ---------------------------------------------------------------------------
+# Footer
+# ---------------------------------------------------------------------------
+
+def build_footer(
+    all_experiments: list[dict],
+    ranked: list[dict],
+    metric: str,
+) -> str:
+    """Compose the summary footer."""
+    total = len(all_experiments)
+    total_kept = sum(1 for e in all_experiments if e.get("status") == "kept")
+
+    lines = []
+
+    if not ranked:
+        lines.append(f"Total experiments: {total} | Kept: {total_kept}")
+        return "\n".join(lines)
+
+    leader = ranked[0]
+    leader_id = leader.get("experiment_id", "?")
+    leader_val = leader.get("metrics", {}).get(metric)
+    leader_val_str = _fmt_metric(leader_val)
+
+    lines.append(
+        f"Total: {total} experiments | Kept: {total_kept}"
+        f" | Best {metric}: {leader_val_str} ({leader_id})"
+    )
+
+    # Gap between #1 and #2
+    if len(ranked) >= 2:
+        second = ranked[1]
+        second_val = second.get("metrics", {}).get(metric)
+        if leader_val is not None and second_val is not None:
+            gap = abs(leader_val - second_val)
+            lines.append(f"Gap #1 → #2: {gap:.4f}")
+
+    # Timestamp of most recent experiment (by timestamp field, not rank)
+    timestamps = [
+        e.get("timestamp", "")
+        for e in all_experiments
+        if e.get("timestamp")
+    ]
+    if timestamps:
+        latest = max(timestamps)
+        lines.append(f"Most recent: {latest[:19]} UTC")
+
+    return "\n".join(lines)
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main() -> None:
+    """CLI entry point."""
+    parser = argparse.ArgumentParser(
+        description="Ranked leaderboard of experiments by primary metric.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__,
+    )
+    parser.add_argument(
+        "--log",
+        default=DEFAULT_LOG_PATH,
+        help=f"Path to experiment log (default: {DEFAULT_LOG_PATH})",
+    )
+    parser.add_argument(
+        "--status",
+        choices=["kept", "all"],
+        default="kept",
+        help="Which experiments to include: 'kept' (default) or 'all'",
+    )
+    parser.add_argument(
+        "--top",
+        type=int,
+        default=None,
+        metavar="N",
+        help="Show top N experiments only (default: all)",
+    )
+    parser.add_argument(
+        "--metric",
+        default=None,
+        metavar="NAME",
+        help="Rank by this metric (default: primary_metric from config.yaml)",
+    )
+    parser.add_argument(
+        "--format",
+        choices=["text", "markdown", "csv"],
+        default="text",
+        dest="fmt",
+        help="Output format (default: text)",
+    )
+    parser.add_argument(
+        "--compact",
+        action="store_true",
+        help="One-line-per-experiment minimal output (overrides --format)",
+    )
+    args = parser.parse_args()
+
+    # Load config
+    config = load_config()
+    eval_cfg = config.get("evaluation", {})
+    primary_metric = eval_cfg.get("primary_metric", "accuracy")
+    lower_is_better = eval_cfg.get("lower_is_better", False)
+    metric = args.metric if args.metric else primary_metric
+
+    # Load experiments
+    all_experiments = load_experiments(args.log)
+
+    if not all_experiments:
+        print(f"No experiments found in {args.log}.", file=sys.stderr)
+        sys.exit(0)
+
+    # Filter and rank
+    ranked = rank_experiments(all_experiments, metric, lower_is_better, args.status)
+
+    if not ranked:
+        if args.status == "kept":
+            print("No kept experiments found.", file=sys.stderr)
+        else:
+            print("No experiments match the filter.", file=sys.stderr)
+        sys.exit(0)
+
+    # Compute deltas vs leader
+    ranked = compute_delta(ranked, metric, lower_is_better)
+
+    # Apply --top
+    display = ranked[: args.top] if args.top and args.top > 0 else ranked
+
+    # Build row data
+    rows = _build_rows(display, metric, lower_is_better)
+
+    # Render
+    if args.compact:
+        body = format_compact(rows, metric)
+    elif args.fmt == "markdown":
+        body = format_markdown(rows, metric)
+    elif args.fmt == "csv":
+        body = format_csv(rows, metric)
+    else:
+        body = format_text(rows, metric)
+
+    print(body)
+
+    # Footer (skip for CSV and compact — they're meant for machines/scripts)
+    if args.fmt not in ("csv",) and not args.compact:
+        footer = build_footer(all_experiments, ranked, metric)
+        if footer:
+            print()
+            print(footer)
+
+
+if __name__ == "__main__":
+    main()
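
Beyond the CLI, the ranking helpers can be called directly, for example from a notebook. A minimal sketch, assuming the templates have been scaffolded into a project so that scripts/leaderboard.py is importable as scripts.leaderboard; the experiment records are invented for illustration:

    # Sketch only: rank two in-memory experiment records the same way the CLI does.
    from scripts.leaderboard import compute_delta, rank_experiments

    experiments = [
        {"experiment_id": "exp_001", "status": "kept",
         "config": {"model_type": "logreg", "hyperparams": {"C": 1.0}},
         "metrics": {"accuracy": 0.874}},
        {"experiment_id": "exp_002", "status": "kept",
         "config": {"model_type": "xgboost", "hyperparams": {"max_depth": 6}},
         "metrics": {"accuracy": 0.901}},
    ]

    ranked = rank_experiments(experiments, metric="accuracy", lower_is_better=False, status_filter="kept")
    ranked = compute_delta(ranked, metric="accuracy", lower_is_better=False)
    # exp_002 leads; exp_001 trails by 0.027, rendered as "-0.0270" in the table output.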