claude-turing 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. package/.claude-plugin/plugin.json +2 -2
  2. package/README.md +48 -7
  3. package/commands/brief.md +13 -1
  4. package/commands/card.md +36 -0
  5. package/commands/init.md +13 -0
  6. package/commands/train.md +16 -7
  7. package/commands/turing.md +4 -2
  8. package/package.json +1 -1
  9. package/src/install.js +1 -1
  10. package/src/verify.js +1 -0
  11. package/templates/model_contract.md +49 -0
  12. package/templates/model_registry.yaml +69 -0
  13. package/templates/program.md +2 -0
  14. package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc +0 -0
  15. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  16. package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
  17. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  18. package/templates/scripts/cleanup.py +599 -0
  19. package/templates/scripts/cost_frontier.py +292 -0
  20. package/templates/scripts/diff_configs.py +534 -0
  21. package/templates/scripts/export_results.py +457 -0
  22. package/templates/scripts/generate_brief.py +54 -0
  23. package/templates/scripts/generate_model_card.py +342 -0
  24. package/templates/scripts/leaderboard.py +508 -0
  25. package/templates/scripts/plot_trajectory.py +611 -0
  26. package/templates/scripts/scaffold.py +9 -0
  27. package/templates/scripts/show_metrics.py +23 -2
  28. package/templates/tests/__pycache__/__init__.cpython-314.pyc +0 -0
  29. package/templates/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
  30. package/templates/tests/__pycache__/test_cost_frontier.cpython-314-pytest-9.0.2.pyc +0 -0
  31. package/templates/tests/test_cost_frontier.py +222 -0
@@ -0,0 +1,292 @@
+ """Compute cost-performance frontier from experiment history.
+
+ Answers the question every ML team should ask: "Is that 2% improvement
+ worth 800x the compute?" Reads experiment log, identifies Pareto-optimal
+ experiments (best metric for each compute budget), and produces a report
+ showing the cost-performance tradeoff.
+
+ Usage:
+     python scripts/cost_frontier.py [--log experiments/log.jsonl] [--config config.yaml]
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ from dataclasses import dataclass
+
+ from scripts.turing_io import load_config, load_experiments
+
+ DEFAULT_LOG_PATH = "experiments/log.jsonl"
+
+
+ @dataclass
+ class CostRecord:
+     """A single experiment's cost-performance data point."""
+
+     experiment_id: str
+     metric_value: float
+     train_seconds: float
+     model_type: str
+
+
+ def load_cost_data(log_path: str, metric: str) -> list[CostRecord]:
+     """Extract cost-performance data from experiment log.
+
+     Only includes kept experiments that have both the requested metric
+     and train_seconds recorded in their metrics dict.
+
+     Args:
+         log_path: Path to experiments/log.jsonl.
+         metric: Name of the metric to use (e.g. "accuracy").
+
+     Returns:
+         List of CostRecord with (experiment_id, metric_value, train_seconds, model_type).
+     """
+     experiments = load_experiments(log_path)
+     records = []
+
+     for exp in experiments:
+         if exp.get("status") != "kept":
+             continue
+
+         metrics = exp.get("metrics", {})
+         metric_val = metrics.get(metric)
+         train_secs = metrics.get("train_seconds")
+
+         if metric_val is None or train_secs is None:
+             continue
+
+         records.append(CostRecord(
+             experiment_id=exp.get("experiment_id", "?"),
+             metric_value=float(metric_val),
+             train_seconds=float(train_secs),
+             model_type=exp.get("config", {}).get("model_type", "unknown"),
+         ))
+
+     return records
+
+
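The log schema itself isn't part of this diff, but load_cost_data implies one JSON object per line of experiments/log.jsonl, shaped roughly as follows. The field names come straight from the code above; the values and any surrounding fields are invented for illustration:

{"experiment_id": "exp-012", "status": "kept", "config": {"model_type": "xgboost"}, "metrics": {"accuracy": 0.91, "train_seconds": 420.0}}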
+ def compute_pareto_frontier(
+     data: list[CostRecord], lower_is_better: bool = False
+ ) -> list[CostRecord]:
+     """Find Pareto-optimal experiments on the cost-performance frontier.
+
+     An experiment is Pareto-optimal if no other experiment is both
+     faster AND has a better metric value.
+
+     Args:
+         data: List of CostRecord entries.
+         lower_is_better: True if lower metric values are better (e.g. MSE).
+
+     Returns:
+         List of CostRecord on the Pareto frontier, sorted by train_seconds.
+     """
+     if not data:
+         return []
+
+     frontier = []
+     for candidate in data:
+         dominated = False
+         for other in data:
+             if other is candidate:
+                 continue
+             # "other" dominates "candidate" if other is both faster AND better
+             faster = other.train_seconds < candidate.train_seconds
+             if lower_is_better:
+                 better_metric = other.metric_value < candidate.metric_value
+             else:
+                 better_metric = other.metric_value > candidate.metric_value
+             # Also dominate if equal time but better metric, or equal metric but faster
+             equal_time = other.train_seconds == candidate.train_seconds
+             equal_metric = other.metric_value == candidate.metric_value
+             if (faster and better_metric) or (faster and equal_metric) or (equal_time and better_metric):
+                 dominated = True
+                 break
+         if not dominated:
+             frontier.append(candidate)
+
+     frontier.sort(key=lambda r: r.train_seconds)
+     return frontier
+
+
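A quick way to sanity-check the dominance rule is a toy run like the sketch below. The values are made up, and it assumes the module is importable as scripts.cost_frontier, mirroring the package-style scripts.turing_io import the file itself uses:

from scripts.cost_frontier import CostRecord, compute_pareto_frontier

runs = [
    CostRecord("a", 0.90, 60.0, "logreg"),    # fastest run
    CostRecord("b", 0.92, 600.0, "xgboost"),  # best metric, 10x slower
    CostRecord("c", 0.89, 300.0, "mlp"),      # slower AND worse than "a"
]

print([r.experiment_id for r in compute_pareto_frontier(runs)])
# ['a', 'b'] -- "c" is dominated by "a", which is both faster and better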
+ def compute_cost_efficiency(
+     data: list[CostRecord], lower_is_better: bool = False
+ ) -> list[dict]:
+     """Compute metric improvement per second relative to baseline.
+
+     Baseline is the worst-performing experiment. For each experiment,
+     efficiency = (metric_improvement_over_baseline) / train_seconds.
+
+     Args:
+         data: List of CostRecord entries.
+         lower_is_better: True if lower metric values are better.
+
+     Returns:
+         List of dicts with experiment_id, metric_value, train_seconds,
+         model_type, improvement_over_baseline, and metric_per_second.
+         Sorted by efficiency descending.
+     """
+     if not data:
+         return []
+
+     if lower_is_better:
+         baseline = max(r.metric_value for r in data)
+     else:
+         baseline = min(r.metric_value for r in data)
+
+     results = []
+     for r in data:
+         improvement = abs(r.metric_value - baseline)
+         efficiency = improvement / r.train_seconds if r.train_seconds > 0 else 0.0
+         results.append({
+             "experiment_id": r.experiment_id,
+             "metric_value": r.metric_value,
+             "train_seconds": r.train_seconds,
+             "model_type": r.model_type,
+             "improvement_over_baseline": improvement,
+             "metric_per_second": efficiency,
+         })
+
+     results.sort(key=lambda x: -x["metric_per_second"])
+     return results
+
+
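Continuing the toy runs from the sketch above, the baseline is the worst metric ("c" at 0.89), so the ranking works out to:

ranking = compute_cost_efficiency(runs)
# "a": (0.90 - 0.89) / 60s  ~= 1.7e-4 metric points per second
# "b": (0.92 - 0.89) / 600s  = 5.0e-5 metric points per second
print(ranking[0]["experiment_id"])  # 'a' ranks first despite "b"'s better raw metric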
+ def _format_seconds(seconds: float) -> str:
+     """Format seconds into human-readable duration."""
+     if seconds < 60:
+         return f"{seconds:.1f}s"
+     elif seconds < 3600:
+         return f"{seconds / 60:.1f}m"
+     else:
+         return f"{seconds / 3600:.1f}h"
+
+
+ def format_cost_report(
+     data: list[CostRecord],
+     frontier: list[CostRecord],
+     metric_name: str,
+     lower_is_better: bool = False,
+ ) -> str:
+     """Format cost-performance analysis as a text report.
+
+     Shows all experiments with train_seconds, metric, and whether
+     each is on the Pareto frontier. Includes a summary comparing
+     the best metric vs the most cost-efficient alternative.
+
+     Args:
+         data: All CostRecord entries.
+         frontier: Pareto-optimal CostRecord entries.
+         metric_name: Name of the metric for display.
+         lower_is_better: True if lower metric values are better.
+
+     Returns:
+         Formatted text report.
+     """
+     if not data:
+         return "No cost-performance data available (no experiments have train_seconds)."
+
+     frontier_ids = {r.experiment_id for r in frontier}
+     efficiency = compute_cost_efficiency(data, lower_is_better)
+     eff_map = {e["experiment_id"]: e for e in efficiency}
+
+     lines = [
+         "## Cost-Performance Frontier",
+         "",
+     ]
+
+     # Table header
+     header = f"{'ID':<10} {'Model':<15} {metric_name:>12} {'Time':>10} {'Eff.':>10} {'Pareto':>8}"
+     lines.append(header)
+     lines.append("-" * len(header))
+
+     # Sort by metric (best first)
+     sorted_data = sorted(
+         data,
+         key=lambda r: r.metric_value,
+         reverse=not lower_is_better,
+     )
+
+     for r in sorted_data:
+         on_frontier = " YES" if r.experiment_id in frontier_ids else ""
+         eff = eff_map.get(r.experiment_id, {})
+         eff_val = eff.get("metric_per_second", 0.0)
+         lines.append(
+             f"{r.experiment_id:<10} {r.model_type:<15} {r.metric_value:>12.4f} "
+             f"{_format_seconds(r.train_seconds):>10} {eff_val:>10.6f} {on_frontier:>8}"
+         )
+
+     # Summary
+     if lower_is_better:
+         best_metric_exp = min(data, key=lambda r: r.metric_value)
+     else:
+         best_metric_exp = max(data, key=lambda r: r.metric_value)
+
+     most_efficient = efficiency[0] if efficiency else None
+     cheapest_frontier = frontier[0] if frontier else None
+
+     lines.extend(["", "### Summary", ""])
+     lines.append(
+         f"Best {metric_name}: {best_metric_exp.experiment_id} "
+         f"({best_metric_exp.metric_value:.4f}, "
+         f"{_format_seconds(best_metric_exp.train_seconds)})"
+     )
+
+     if most_efficient:
+         lines.append(
+             f"Best cost-efficiency: {most_efficient['experiment_id']} "
+             f"({most_efficient['metric_value']:.4f}, "
+             f"{_format_seconds(most_efficient['train_seconds'])})"
+         )
+
+     # Compute the "is it worth it?" comparison
+     if cheapest_frontier and best_metric_exp.experiment_id != cheapest_frontier.experiment_id:
+         if best_metric_exp.metric_value != cheapest_frontier.metric_value:
+             metric_diff = abs(best_metric_exp.metric_value - cheapest_frontier.metric_value)
+             # Relative difference against the cheaper run's metric; the same
+             # formula applies whether lower or higher is better.
+             pct_diff = metric_diff / cheapest_frontier.metric_value * 100
+
+             if cheapest_frontier.train_seconds > 0:
+                 compute_ratio = best_metric_exp.train_seconds / cheapest_frontier.train_seconds
+             else:
+                 compute_ratio = float("inf")
+
+             lines.append(
+                 f"The {pct_diff:.1f}% improvement costs "
+                 f"{compute_ratio:.0f}x more compute."
+             )
+
+     return "\n".join(lines)
+
+
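To make the summary arithmetic concrete with invented numbers: if the best run scores 0.9300 in 2.0h and the cheapest frontier point scores 0.9100 in 90s, then metric_diff = 0.02, pct_diff = 0.02 / 0.91 * 100 ~ 2.2, and compute_ratio = 7200 / 90 = 80, so the closing line reads "The 2.2% improvement costs 80x more compute."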
+ def main() -> None:
+     """CLI entry point for cost-performance analysis."""
+     parser = argparse.ArgumentParser(
+         description="Analyze cost-performance frontier from experiment log.",
+     )
+     parser.add_argument(
+         "--log",
+         default=DEFAULT_LOG_PATH,
+         help=f"Path to experiment log (default: {DEFAULT_LOG_PATH})",
+     )
+     parser.add_argument(
+         "--config",
+         default="config.yaml",
+         help="Path to config.yaml (default: config.yaml)",
+     )
+     args = parser.parse_args()
+
+     config = load_config(args.config)
+     eval_cfg = config.get("evaluation", {})
+     metric = eval_cfg.get("primary_metric", "accuracy")
+     lower_is_better = eval_cfg.get("lower_is_better", False)
+
+     data = load_cost_data(args.log, metric)
+     frontier = compute_pareto_frontier(data, lower_is_better)
+     report = format_cost_report(data, frontier, metric, lower_is_better)
+     print(report)
+
+
+ if __name__ == "__main__":
+     main()
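main() reads only two keys from config.yaml, so a minimal config for this script would look like the following (key names from the code above; accuracy is simply the fallback default):

evaluation:
  primary_metric: accuracy
  lower_is_better: false

One caveat, offered as an educated guess rather than confirmed behavior of this package: because the file imports scripts.turing_io absolutely, the docstring's python scripts/cost_frontier.py invocation works only when the project root is on sys.path; running python -m scripts.cost_frontier from the project root sidesteps that, assuming scripts/ is importable as a package.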