claude-turing 1.0.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/.claude-plugin/plugin.json +2 -2
  2. package/README.md +66 -3
  3. package/commands/card.md +36 -0
  4. package/commands/explore.md +107 -0
  5. package/commands/suggest.md +68 -4
  6. package/commands/turing.md +4 -0
  7. package/package.json +1 -1
  8. package/src/claude-md.js +1 -0
  9. package/src/install.js +2 -2
  10. package/src/verify.js +2 -0
  11. package/templates/requirements.txt +4 -0
  12. package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc +0 -0
  13. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  14. package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
  15. package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
  16. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  17. package/templates/scripts/__pycache__/treequest_suggest.cpython-314.pyc +0 -0
  18. package/templates/scripts/cleanup.py +599 -0
  19. package/templates/scripts/cost_frontier.py +292 -0
  20. package/templates/scripts/diff_configs.py +534 -0
  21. package/templates/scripts/export_results.py +457 -0
  22. package/templates/scripts/generate_brief.py +58 -3
  23. package/templates/scripts/generate_model_card.py +342 -0
  24. package/templates/scripts/leaderboard.py +508 -0
  25. package/templates/scripts/manage_hypotheses.py +2 -2
  26. package/templates/scripts/plot_trajectory.py +611 -0
  27. package/templates/scripts/scaffold.py +8 -0
  28. package/templates/scripts/show_metrics.py +23 -2
  29. package/templates/scripts/treequest_suggest.py +520 -0
  30. package/templates/tests/__pycache__/__init__.cpython-314.pyc +0 -0
  31. package/templates/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
  32. package/templates/tests/__pycache__/test_cost_frontier.cpython-314-pytest-9.0.2.pyc +0 -0
  33. package/templates/tests/test_cost_frontier.py +222 -0
@@ -0,0 +1,534 @@
1
+ """Compare configurations and metrics between two experiments.
2
+
3
+ Computes a structured diff of config dicts (recursively) and metric deltas,
4
+ showing what changed, what was added, and what was removed between two
5
+ experiment log entries.
6
+
7
+ Usage:
8
+ python scripts/diff_configs.py exp-005 exp-012 # Human-readable diff
9
+ python scripts/diff_configs.py exp-005 exp-012 --json # Machine-readable JSON
10
+ python scripts/diff_configs.py exp-005 best # Compare against current best
11
+
12
+ The special keyword "best" resolves to the best kept experiment according to
13
+ the primary_metric and lower_is_better settings in config.yaml.
14
+
15
+ Ignored metadata fields: timestamp, experiment_id, git_commit.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import json
22
+ import sys
23
+ from pathlib import Path
24
+ from typing import Any
25
+
26
+ import yaml
27
+
28
+ from scripts.turing_io import load_config, load_experiments
29
+
30
+ DEFAULT_LOG_PATH = "experiments/log.jsonl"
31
+
32
+ # Fields stripped before diffing — they change every run and carry no signal
33
+ IGNORED_METADATA_FIELDS = {"timestamp", "experiment_id", "git_commit"}
34
+
35
+ # Hyperparameter-related config keys to surface in the focused HP section
36
+ HYPERPARAMETER_KEYS = {"hyperparams", "model", "learning_rate", "n_estimators",
37
+ "max_depth", "dropout", "weight_decay", "batch_size",
38
+ "optimizer", "scheduler", "epochs", "num_layers"}
39
+
40
+
41
+ # ---------------------------------------------------------------------------
42
+ # Experiment loading helpers
43
+ # ---------------------------------------------------------------------------
44
+
45
def load_experiment(log_path: str, experiment_id: str) -> dict | None:
    """Load a single experiment entry by ID from experiments/log.jsonl.

    Args:
        log_path: Path to the JSONL log file.
        experiment_id: Experiment ID string, e.g. "exp-005".

    Returns:
        Experiment dict, or None if the file is missing or the ID is not found.
    """
    path = Path(log_path)
    if not path.exists():
        return None

    # JSONL logs are UTF-8 by convention; be explicit so decoding does not
    # depend on the platform locale encoding (PEP 597).
    with path.open(encoding="utf-8") as f:
        for raw in f:
            raw = raw.strip()
            if not raw:
                continue  # skip blank lines between entries
            try:
                entry = json.loads(raw)
            except json.JSONDecodeError:
                continue  # tolerate a corrupt line rather than abort the scan
            if entry.get("experiment_id") == experiment_id:
                return entry
    return None
71
+
72
+
73
def resolve_best(log_path: str, primary_metric: str, lower_is_better: bool) -> dict | None:
    """Return the best kept experiment ranked by the primary metric.

    Args:
        log_path: Path to the JSONL log file.
        primary_metric: Metric name used for ranking.
        lower_is_better: True for loss/error metrics, False for accuracy/F1.

    Returns:
        Best experiment dict, or None if no kept experiment has the metric.
    """
    # Only "kept" experiments that actually recorded the metric are eligible.
    candidates = [
        exp
        for exp in load_experiments(log_path)
        if exp.get("status") == "kept"
        and exp.get("metrics", {}).get(primary_metric) is not None
    ]
    if not candidates:
        return None

    # min/max both return the first extreme on ties, matching a strict
    # better-than scan over the log in order.
    pick = min if lower_is_better else max
    return pick(candidates, key=lambda exp: exp["metrics"][primary_metric])
99
+
100
+
101
+ # ---------------------------------------------------------------------------
102
+ # Core diff logic
103
+ # ---------------------------------------------------------------------------
104
+
105
def flatten_dict(d: dict, prefix: str = "") -> dict[str, Any]:
    """Flatten a nested dict into dot-separated leaf paths.

    Args:
        d: Dict to flatten (may contain nested dicts).
        prefix: Key prefix accumulated during recursion.

    Returns:
        Flat dict mapping dot-separated paths to leaf values.
    """
    flat: dict[str, Any] = {}
    for key, value in d.items():
        path = f"{prefix}.{key}" if prefix else key
        if not isinstance(value, dict):
            flat[path] = value
        else:
            # Recurse; an empty nested dict contributes no leaves.
            flat.update(flatten_dict(value, path))
    return flat
123
+
124
+
125
def strip_ignored(d: dict) -> dict:
    """Return a shallow copy of *d* without the ignored metadata fields."""
    cleaned = dict(d)
    for field in IGNORED_METADATA_FIELDS:
        cleaned.pop(field, None)
    return cleaned
128
+
129
+
130
def compute_config_diff(config_a: dict, config_b: dict) -> dict:
    """Diff two config dicts, recursing via dot-separated key paths.

    Args:
        config_a: Config from experiment A (baseline).
        config_b: Config from experiment B (comparison target).

    Returns:
        Dict with keys:
            "changed": {key: {"old": val_a, "new": val_b}}
            "added": {key: val} — keys only in B
            "removed": {key: val} — keys only in A
            "unchanged_count": int — number of identical keys (not listed)
    """
    flat_a = flatten_dict(strip_ignored(config_a))
    flat_b = flatten_dict(strip_ignored(config_b))

    changed: dict[str, dict] = {}
    added: dict[str, Any] = {}
    removed: dict[str, Any] = {}
    same = 0

    for key in sorted(set(flat_a) | set(flat_b)):
        if key not in flat_a:
            added[key] = flat_b[key]
        elif key not in flat_b:
            removed[key] = flat_a[key]
        elif flat_a[key] != flat_b[key]:
            changed[key] = {"old": flat_a[key], "new": flat_b[key]}
        else:
            same += 1

    return {
        "changed": changed,
        "added": added,
        "removed": removed,
        "unchanged_count": same,
    }
175
+
176
+
177
def compute_metric_diff(
    metrics_a: dict,
    metrics_b: dict,
    primary_metric: str,
    lower_is_better: bool,
) -> dict:
    """Compute per-metric deltas with improvement direction indicators.

    Args:
        metrics_a: Metrics from experiment A.
        metrics_b: Metrics from experiment B.
        primary_metric: Name of the primary metric for the project.
        lower_is_better: True for loss/error metrics.

    Returns:
        Dict with keys:
            "primary": {metric: {val_a, val_b, delta, direction, is_improvement}}
            "others": {metric: {val_a, val_b, delta, direction}}
            "added": {metric: val} — metrics only in B
            "removed": {metric: val} — metrics only in A
    """
    primary: dict[str, Any] = {}
    others: dict[str, dict] = {}
    added: dict[str, Any] = {}
    removed: dict[str, Any] = {}

    for name in sorted(set(metrics_a) | set(metrics_b)):
        if name not in metrics_a:
            added[name] = metrics_b[name]
            continue
        if name not in metrics_b:
            removed[name] = metrics_a[name]
            continue

        val_a = metrics_a[name]
        val_b = metrics_b[name]
        if isinstance(val_a, (int, float)) and isinstance(val_b, (int, float)):
            delta = val_b - val_a
            # Treat sub-1e-10 deltas as "no change" to absorb float noise.
            if abs(delta) < 1e-10:
                direction, improved = "=", False
            elif lower_is_better:
                direction, improved = ("↓" if delta < 0 else "↑"), delta < 0
            else:
                direction, improved = ("↑" if delta > 0 else "↓"), delta > 0
            entry = {
                "val_a": val_a,
                "val_b": val_b,
                "delta": delta,
                "direction": direction,
                "is_improvement": improved,
            }
        else:
            # Non-numeric metrics: only report equality vs. difference.
            entry = {
                "val_a": val_a,
                "val_b": val_b,
                "delta": None,
                "direction": "=" if val_a == val_b else "~",
                "is_improvement": False,
            }

        (primary if name == primary_metric else others)[name] = entry

    return {
        "primary": primary,
        "others": others,
        "added": added,
        "removed": removed,
    }
256
+
257
+
258
+ # ---------------------------------------------------------------------------
259
+ # Formatting helpers
260
+ # ---------------------------------------------------------------------------
261
+
262
def _is_hp_key(key: str) -> bool:
    """True when any dot-path segment of *key* names a hyperparameter field."""
    return any(part in HYPERPARAMETER_KEYS for part in key.split("."))
266
+
267
+
268
def format_value(v: Any) -> str:
    """Render a value for display; floats use 6 significant figures."""
    return f"{v:.6g}" if isinstance(v, float) else str(v)
273
+
274
+
275
def _append_config_rows(lines: list[str], changed: dict, added: dict, removed: dict) -> None:
    """Append formatted changed/added/removed config rows to *lines*.

    Shared by the hyperparameter and other-config sections so both render
    identically.
    """
    for key, diff in sorted(changed.items()):
        lines.append(f" ~ {key:<30s} {format_value(diff['old']):>15} → {format_value(diff['new'])}")
    for key, val in sorted(added.items()):
        lines.append(f" + {key:<30s} {'':>15} {format_value(val)} (added)")
    for key, val in sorted(removed.items()):
        lines.append(f" - {key:<30s} {format_value(val):>15} {''} (removed)")


def format_text_diff(
    exp_a: dict,
    exp_b: dict,
    config_diff: dict,
    metric_diff: dict,
    primary_metric: str,
    lower_is_better: bool,
) -> str:
    """Render a human-readable diff report.

    Args:
        exp_a: Full experiment A dict.
        exp_b: Full experiment B dict.
        config_diff: Output of compute_config_diff.
        metric_diff: Output of compute_metric_diff.
        primary_metric: Name of primary metric.
        lower_is_better: True for loss/error metrics.

    Returns:
        Formatted multi-line string ready for printing.
    """
    id_a = exp_a.get("experiment_id", "?")
    id_b = exp_b.get("experiment_id", "?")

    lines: list[str] = []
    lines.append("=" * 70)
    lines.append(f" Config & Metric Diff: {id_a} → {id_b}")
    lines.append("=" * 70)

    # --- Primary metric spotlight ---
    lines.append("")
    lines.append("PRIMARY METRIC")
    lines.append("-" * 40)
    primary = metric_diff.get("primary", {})
    if primary:
        for metric_name, info in primary.items():
            val_a = info["val_a"]
            val_b = info["val_b"]
            delta = info["delta"]
            direction = info["direction"]
            improvement_label = ""
            if direction != "=":
                improvement_label = " [IMPROVED]" if info["is_improvement"] else " [REGRESSED]"

            # Conditional expression, not the fragile `cond and a or b` idiom
            # (which misbehaves whenever the first operand is falsy).
            direction_label = "lower=better" if lower_is_better else "higher=better"
            lines.append(
                f" {metric_name:<20s} {format_value(val_a):>12} → {format_value(val_b):<12}"
                f" {direction} Δ={format_value(delta) if delta is not None else 'N/A'}"
                f" ({direction_label}){improvement_label}"
            )
    else:
        lines.append(f" {primary_metric} not found in one or both experiments.")

    # --- Split config changes into hyperparameter vs. everything else ---
    changed = config_diff.get("changed", {})
    added_cfg = config_diff.get("added", {})
    removed_cfg = config_diff.get("removed", {})

    hp_changed = {k: v for k, v in changed.items() if _is_hp_key(k)}
    hp_added = {k: v for k, v in added_cfg.items() if _is_hp_key(k)}
    hp_removed = {k: v for k, v in removed_cfg.items() if _is_hp_key(k)}

    other_changed = {k: v for k, v in changed.items() if not _is_hp_key(k)}
    other_added = {k: v for k, v in added_cfg.items() if not _is_hp_key(k)}
    other_removed = {k: v for k, v in removed_cfg.items() if not _is_hp_key(k)}

    # The hyperparameter header is always printed, even when empty.
    lines.append("")
    lines.append("HYPERPARAMETER CHANGES")
    lines.append("-" * 40)
    if hp_changed or hp_added or hp_removed:
        _append_config_rows(lines, hp_changed, hp_added, hp_removed)
    else:
        lines.append(" (no hyperparameter changes)")

    # --- Other config changes (section shown only when non-empty) ---
    if other_changed or other_added or other_removed:
        lines.append("")
        lines.append("OTHER CONFIG CHANGES")
        lines.append("-" * 40)
        _append_config_rows(lines, other_changed, other_added, other_removed)

    unchanged_count = config_diff.get("unchanged_count", 0)
    lines.append("")
    lines.append(f" ({unchanged_count} config keys unchanged)")

    # --- Other metrics ---
    other_metrics = metric_diff.get("others", {})
    metric_added = metric_diff.get("added", {})
    metric_removed = metric_diff.get("removed", {})

    if other_metrics or metric_added or metric_removed:
        lines.append("")
        lines.append("OTHER METRICS")
        lines.append("-" * 40)
        for metric_name, info in sorted(other_metrics.items()):
            direction = info["direction"]
            delta = info["delta"]
            delta_str = format_value(delta) if delta is not None else "N/A"
            lines.append(
                f" {metric_name:<22s} {format_value(info['val_a']):>12} → {format_value(info['val_b']):<12}"
                f" {direction} Δ={delta_str}"
            )
        for metric_name, val in sorted(metric_added.items()):
            lines.append(f" + {metric_name:<20s} (only in {id_b}): {format_value(val)}")
        for metric_name, val in sorted(metric_removed.items()):
            lines.append(f" - {metric_name:<20s} (only in {id_a}): {format_value(val)}")

    # --- Experiment metadata footer ---
    lines.append("")
    lines.append("EXPERIMENT INFO")
    lines.append("-" * 40)
    lines.append(f" {'ID':<12} {id_a:<24} {id_b}")
    lines.append(f" {'Status':<12} {exp_a.get('status', '?'):<24} {exp_b.get('status', '?')}")
    # Truncate ISO timestamps to seconds precision for column alignment.
    lines.append(f" {'Timestamp':<12} {exp_a.get('timestamp', '?')[:19]:<24} {exp_b.get('timestamp', '?')[:19]}")
    desc_a = (exp_a.get("description") or "")[:50]
    desc_b = (exp_b.get("description") or "")[:50]
    if desc_a or desc_b:
        lines.append(f" {'Description':<12} {desc_a:<24} {desc_b}")
    lines.append("=" * 70)

    return "\n".join(lines)
412
+
413
+
414
def format_json_diff(
    exp_a: dict,
    exp_b: dict,
    config_diff: dict,
    metric_diff: dict,
    primary_metric: str,
) -> str:
    """Render a machine-readable JSON diff.

    Args:
        exp_a: Full experiment A dict.
        exp_b: Full experiment B dict.
        config_diff: Output of compute_config_diff.
        metric_diff: Output of compute_metric_diff.
        primary_metric: Name of primary metric.

    Returns:
        JSON string.
    """
    id_a = exp_a.get("experiment_id")
    id_b = exp_b.get("experiment_id")
    # default=str keeps serialization from failing on non-JSON values.
    return json.dumps(
        {
            "experiment_a": id_a,
            "experiment_b": id_b,
            "primary_metric": primary_metric,
            "config_diff": config_diff,
            "metric_diff": metric_diff,
        },
        indent=2,
        default=str,
    )
441
+
442
+
443
+ # ---------------------------------------------------------------------------
444
+ # CLI
445
+ # ---------------------------------------------------------------------------
446
+
447
def main() -> None:
    """CLI entry point for config and metric diffing."""
    parser = argparse.ArgumentParser(
        description=(
            "Compare configurations and metrics between two experiments. "
            'Use "best" as the second experiment ID to compare against the '
            "current champion."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "Examples:\n"
            " python scripts/diff_configs.py exp-005 exp-012\n"
            " python scripts/diff_configs.py exp-005 exp-012 --json\n"
            " python scripts/diff_configs.py exp-005 best\n"
        ),
    )
    parser.add_argument("exp_a", help='Baseline experiment ID (e.g. "exp-005")')
    parser.add_argument(
        "exp_b",
        help='Target experiment ID, or "best" to use the current champion',
    )
    parser.add_argument(
        "--json",
        dest="json_output",
        action="store_true",
        help="Output machine-readable JSON instead of a text table",
    )
    parser.add_argument(
        "--log",
        default=DEFAULT_LOG_PATH,
        help=f"Path to experiments/log.jsonl (default: {DEFAULT_LOG_PATH})",
    )
    parser.add_argument(
        "--config",
        default="config.yaml",
        help="Path to config.yaml (default: config.yaml)",
    )
    opts = parser.parse_args()

    # Project-level evaluation settings decide metric direction and "best".
    project_cfg = load_config(opts.config)
    eval_settings = project_cfg.get("evaluation", {})
    primary_metric: str = eval_settings.get("primary_metric", "accuracy")
    lower_is_better: bool = eval_settings.get("lower_is_better", False)

    # Baseline experiment must exist in the log.
    baseline = load_experiment(opts.log, opts.exp_a)
    if baseline is None:
        print(f"Error: experiment '{opts.exp_a}' not found in {opts.log}", file=sys.stderr)
        sys.exit(1)

    # Target experiment: honor the special "best" keyword.
    if opts.exp_b == "best":
        target = resolve_best(opts.log, primary_metric, lower_is_better)
        if target is None:
            print(
                f"Error: no kept experiments found in {opts.log} to use as 'best'.",
                file=sys.stderr,
            )
            sys.exit(1)
        if not opts.json_output:
            resolved_id = target.get("experiment_id", "?")
            print(f" Resolved 'best' → {resolved_id} (primary metric: {primary_metric})\n")
    else:
        target = load_experiment(opts.log, opts.exp_b)
        if target is None:
            print(f"Error: experiment '{opts.exp_b}' not found in {opts.log}", file=sys.stderr)
            sys.exit(1)

    # Diff configs and metrics of the two resolved experiments.
    config_diff = compute_config_diff(baseline.get("config", {}), target.get("config", {}))
    metric_diff = compute_metric_diff(
        baseline.get("metrics", {}),
        target.get("metrics", {}),
        primary_metric,
        lower_is_better,
    )

    # Render in the requested format.
    if opts.json_output:
        report = format_json_diff(baseline, target, config_diff, metric_diff, primary_metric)
    else:
        report = format_text_diff(
            baseline, target, config_diff, metric_diff, primary_metric, lower_is_better
        )
    print(report)


if __name__ == "__main__":
    main()