claude-turing 1.0.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +66 -3
- package/commands/card.md +36 -0
- package/commands/explore.md +107 -0
- package/commands/suggest.md +68 -4
- package/commands/turing.md +4 -0
- package/package.json +1 -1
- package/src/claude-md.js +1 -0
- package/src/install.js +2 -2
- package/src/verify.js +2 -0
- package/templates/requirements.txt +4 -0
- package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/treequest_suggest.cpython-314.pyc +0 -0
- package/templates/scripts/cleanup.py +599 -0
- package/templates/scripts/cost_frontier.py +292 -0
- package/templates/scripts/diff_configs.py +534 -0
- package/templates/scripts/export_results.py +457 -0
- package/templates/scripts/generate_brief.py +58 -3
- package/templates/scripts/generate_model_card.py +342 -0
- package/templates/scripts/leaderboard.py +508 -0
- package/templates/scripts/manage_hypotheses.py +2 -2
- package/templates/scripts/plot_trajectory.py +611 -0
- package/templates/scripts/scaffold.py +8 -0
- package/templates/scripts/show_metrics.py +23 -2
- package/templates/scripts/treequest_suggest.py +520 -0
- package/templates/tests/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
- package/templates/tests/__pycache__/test_cost_frontier.cpython-314-pytest-9.0.2.pyc +0 -0
- package/templates/tests/test_cost_frontier.py +222 -0
package/templates/scripts/cost_frontier.py
@@ -0,0 +1,292 @@
+"""Compute cost-performance frontier from experiment history.
+
+Answers the question every ML team should ask: "Is that 2% improvement
+worth 800x the compute?" Reads experiment log, identifies Pareto-optimal
+experiments (best metric for each compute budget), and produces a report
+showing the cost-performance tradeoff.
+
+Usage:
+    python scripts/cost_frontier.py [--log experiments/log.jsonl] [--config config.yaml]
+"""
+
+from __future__ import annotations
+
+import argparse
+from dataclasses import dataclass
+from pathlib import Path
+
+from scripts.turing_io import load_config, load_experiments
+
+DEFAULT_LOG_PATH = "experiments/log.jsonl"
+
+
+@dataclass
+class CostRecord:
+    """A single experiment's cost-performance data point."""
+
+    experiment_id: str
+    metric_value: float
+    train_seconds: float
+    model_type: str
+
+
+def load_cost_data(log_path: str, metric: str) -> list[CostRecord]:
+    """Extract cost-performance data from experiment log.
+
+    Only includes kept experiments that have both the requested metric
+    and train_seconds recorded in their metrics dict.
+
+    Args:
+        log_path: Path to experiments/log.jsonl.
+        metric: Name of the metric to use (e.g. "accuracy").
+
+    Returns:
+        List of CostRecord with (experiment_id, metric_value, train_seconds, model_type).
+    """
+    experiments = load_experiments(log_path)
+    records = []
+
+    for exp in experiments:
+        if exp.get("status") != "kept":
+            continue
+
+        metrics = exp.get("metrics", {})
+        metric_val = metrics.get(metric)
+        train_secs = metrics.get("train_seconds")
+
+        if metric_val is None or train_secs is None:
+            continue
+
+        records.append(CostRecord(
+            experiment_id=exp.get("experiment_id", "?"),
+            metric_value=float(metric_val),
+            train_seconds=float(train_secs),
+            model_type=exp.get("config", {}).get("model_type", "unknown"),
+        ))
+
+    return records
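
For orientation, here is a minimal sketch of the record shape this loader accepts. The field values are hypothetical; only the keys (status, metrics, experiment_id, config.model_type) come from the code above.

```python
# Hypothetical log.jsonl entries, as parsed dicts. Only keys that
# load_cost_data actually reads are shown; all values are invented.
kept = {
    "experiment_id": "exp-007",
    "status": "kept",
    "config": {"model_type": "xgboost"},
    "metrics": {"accuracy": 0.91, "train_seconds": 42.0},
}
skipped_no_time = {
    "experiment_id": "exp-008",
    "status": "kept",
    "config": {"model_type": "mlp"},
    "metrics": {"accuracy": 0.93},  # no train_seconds -> filtered out
}
skipped_status = {
    "experiment_id": "exp-009",
    "status": "discarded",  # only status == "kept" is considered
    "metrics": {"accuracy": 0.95, "train_seconds": 10.0},
}
```
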
+
+
+def compute_pareto_frontier(
+    data: list[CostRecord], lower_is_better: bool = False
+) -> list[CostRecord]:
+    """Find Pareto-optimal experiments on the cost-performance frontier.
+
+    An experiment is Pareto-optimal if no other experiment is both
+    faster AND has a better metric value.
+
+    Args:
+        data: List of CostRecord entries.
+        lower_is_better: True if lower metric values are better (e.g. MSE).
+
+    Returns:
+        List of CostRecord on the Pareto frontier, sorted by train_seconds.
+    """
+    if not data:
+        return []
+
+    frontier = []
+    for candidate in data:
+        dominated = False
+        for other in data:
+            if other is candidate:
+                continue
+            # "other" dominates "candidate" if other is both faster AND better
+            faster = other.train_seconds < candidate.train_seconds
+            if lower_is_better:
+                better_metric = other.metric_value < candidate.metric_value
+            else:
+                better_metric = other.metric_value > candidate.metric_value
+            # Also dominate if equal time but better metric, or equal metric but faster
+            equal_time = other.train_seconds == candidate.train_seconds
+            equal_metric = other.metric_value == candidate.metric_value
+            if (faster and better_metric) or (faster and equal_metric) or (equal_time and better_metric):
+                dominated = True
+                break
+        if not dominated:
+            frontier.append(candidate)
+
+    frontier.sort(key=lambda r: r.train_seconds)
+    return frontier
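
The domination check above is easiest to see on a toy example. A minimal sketch with hypothetical records, assuming the file is importable as scripts.cost_frontier (matching its own scripts.turing_io import):

```python
from scripts.cost_frontier import CostRecord, compute_pareto_frontier

# "b" is both faster and better than "c", so "c" is dominated.
# "a" is the fastest run, so nothing can dominate it.
a = CostRecord("a", metric_value=0.90, train_seconds=10.0, model_type="logreg")
b = CostRecord("b", metric_value=0.95, train_seconds=120.0, model_type="xgboost")
c = CostRecord("c", metric_value=0.93, train_seconds=600.0, model_type="mlp")

frontier = compute_pareto_frontier([a, b, c])
print([r.experiment_id for r in frontier])  # ['a', 'b'], sorted by train_seconds
```
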
+
+
+def compute_cost_efficiency(
+    data: list[CostRecord], lower_is_better: bool = False
+) -> list[dict]:
+    """Compute metric improvement per second relative to baseline.
+
+    Baseline is the worst-performing experiment. For each experiment,
+    efficiency = (metric_improvement_over_baseline) / train_seconds.
+
+    Args:
+        data: List of CostRecord entries.
+        lower_is_better: True if lower metric values are better.
+
+    Returns:
+        List of dicts with experiment_id, metric_value, train_seconds,
+        metric_per_second, and model_type. Sorted by efficiency descending.
+    """
+    if not data:
+        return []
+
+    if lower_is_better:
+        baseline = max(r.metric_value for r in data)
+    else:
+        baseline = min(r.metric_value for r in data)
+
+    results = []
+    for r in data:
+        improvement = abs(r.metric_value - baseline)
+        efficiency = improvement / r.train_seconds if r.train_seconds > 0 else 0.0
+        results.append({
+            "experiment_id": r.experiment_id,
+            "metric_value": r.metric_value,
+            "train_seconds": r.train_seconds,
+            "model_type": r.model_type,
+            "improvement_over_baseline": improvement,
+            "metric_per_second": efficiency,
+        })
+
+    results.sort(key=lambda x: -x["metric_per_second"])
+    return results
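
The efficiency formula reduces to simple arithmetic. A sketch with the same hypothetical records as above:

```python
from scripts.cost_frontier import CostRecord, compute_cost_efficiency

data = [
    CostRecord("a", 0.90, 10.0, "logreg"),
    CostRecord("b", 0.95, 120.0, "xgboost"),
    CostRecord("c", 0.93, 600.0, "mlp"),
]
# Baseline is the worst metric (0.90), so:
#   b: (0.95 - 0.90) / 120.0 ~= 0.000417 metric points per second
#   c: (0.93 - 0.90) / 600.0  = 0.000050
#   a: 0.0 (the baseline itself)
rows = compute_cost_efficiency(data)
print(rows[0]["experiment_id"])  # "b", the most cost-efficient run
```
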
+
+
+def _format_seconds(seconds: float) -> str:
+    """Format seconds into human-readable duration."""
+    if seconds < 60:
+        return f"{seconds:.1f}s"
+    elif seconds < 3600:
+        return f"{seconds / 60:.1f}m"
+    else:
+        return f"{seconds / 3600:.1f}h"
+
+
+def format_cost_report(
+    data: list[CostRecord],
+    frontier: list[CostRecord],
+    metric_name: str,
+    lower_is_better: bool = False,
+) -> str:
+    """Format cost-performance analysis as a text report.
+
+    Shows all experiments with train_seconds, metric, and whether
+    each is on the Pareto frontier. Includes a summary comparing
+    the best metric vs the most cost-efficient alternative.
+
+    Args:
+        data: All CostRecord entries.
+        frontier: Pareto-optimal CostRecord entries.
+        metric_name: Name of the metric for display.
+        lower_is_better: True if lower metric values are better.
+
+    Returns:
+        Formatted text report.
+    """
+    if not data:
+        return "No cost-performance data available (no experiments have train_seconds)."
+
+    frontier_ids = {r.experiment_id for r in frontier}
+    efficiency = compute_cost_efficiency(data, lower_is_better)
+    eff_map = {e["experiment_id"]: e for e in efficiency}
+
+    lines = [
+        "## Cost-Performance Frontier",
+        "",
+    ]
+
+    # Table header
+    header = f"{'ID':<10} {'Model':<15} {metric_name:>12} {'Time':>10} {'Eff.':>10} {'Pareto':>8}"
+    lines.append(header)
+    lines.append("-" * len(header))
+
+    # Sort by metric (best first)
+    sorted_data = sorted(
+        data,
+        key=lambda r: r.metric_value,
+        reverse=not lower_is_better,
+    )
+
+    for r in sorted_data:
+        on_frontier = " YES" if r.experiment_id in frontier_ids else ""
+        eff = eff_map.get(r.experiment_id, {})
+        eff_val = eff.get("metric_per_second", 0.0)
+        lines.append(
+            f"{r.experiment_id:<10} {r.model_type:<15} {r.metric_value:>12.4f} "
+            f"{_format_seconds(r.train_seconds):>10} {eff_val:>10.6f} {on_frontier:>8}"
+        )
+
+    # Summary
+    if lower_is_better:
+        best_metric_exp = min(data, key=lambda r: r.metric_value)
+    else:
+        best_metric_exp = max(data, key=lambda r: r.metric_value)
+
+    most_efficient = efficiency[0] if efficiency else None
+    cheapest_frontier = frontier[0] if frontier else None
+
+    lines.extend(["", "### Summary", ""])
+    lines.append(
+        f"Best {metric_name}: {best_metric_exp.experiment_id} "
+        f"({best_metric_exp.metric_value:.4f}, "
+        f"{_format_seconds(best_metric_exp.train_seconds)})"
+    )
+
+    if most_efficient:
+        lines.append(
+            f"Best cost-efficiency: {most_efficient['experiment_id']} "
+            f"({most_efficient['metric_value']:.4f}, "
+            f"{_format_seconds(most_efficient['train_seconds'])})"
+        )
+
+    # Compute the "is it worth it?" comparison
+    if cheapest_frontier and best_metric_exp.experiment_id != cheapest_frontier.experiment_id:
+        if best_metric_exp.metric_value != cheapest_frontier.metric_value:
+            metric_diff = abs(best_metric_exp.metric_value - cheapest_frontier.metric_value)
+            if lower_is_better:
+                pct_diff = metric_diff / cheapest_frontier.metric_value * 100
+            else:
+                pct_diff = metric_diff / cheapest_frontier.metric_value * 100
+
+            if cheapest_frontier.train_seconds > 0:
+                compute_ratio = best_metric_exp.train_seconds / cheapest_frontier.train_seconds
+            else:
+                compute_ratio = float("inf")
+
+            lines.append(
+                f"The {pct_diff:.1f}% improvement costs "
+                f"{compute_ratio:.0f}x more compute."
+            )
+
+    return "\n".join(lines)
+
+
+def main() -> None:
+    """CLI entry point for cost-performance analysis."""
+    parser = argparse.ArgumentParser(
+        description="Analyze cost-performance frontier from experiment log.",
+    )
+    parser.add_argument(
+        "--log",
+        default=DEFAULT_LOG_PATH,
+        help=f"Path to experiment log (default: {DEFAULT_LOG_PATH})",
+    )
+    parser.add_argument(
+        "--config",
+        default="config.yaml",
+        help="Path to config.yaml (default: config.yaml)",
+    )
+    args = parser.parse_args()
+
+    config = load_config(args.config)
+    eval_cfg = config.get("evaluation", {})
+    metric = eval_cfg.get("primary_metric", "accuracy")
+    lower_is_better = eval_cfg.get("lower_is_better", False)
+
+    data = load_cost_data(args.log, metric)
+    frontier = compute_pareto_frontier(data, lower_is_better)
+    report = format_cost_report(data, frontier, metric, lower_is_better)
+    print(report)
+
+
+if __name__ == "__main__":
+    main()
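
Putting it together: main() reads evaluation.primary_metric and evaluation.lower_is_better from config.yaml. A minimal usage sketch; the config fragment is inferred from those keys, not documented elsewhere in this diff:

```python
# Assumed config.yaml fragment (inferred from the keys main() reads):
#
#   evaluation:
#     primary_metric: accuracy
#     lower_is_better: false
#
# CLI equivalent, from the project root:
#
#   python scripts/cost_frontier.py --log experiments/log.jsonl --config config.yaml
#
# Programmatic equivalent of main(), skipping argparse:
from scripts.cost_frontier import (
    compute_pareto_frontier,
    format_cost_report,
    load_cost_data,
)

data = load_cost_data("experiments/log.jsonl", metric="accuracy")
frontier = compute_pareto_frontier(data, lower_is_better=False)
print(format_cost_report(data, frontier, "accuracy", lower_is_better=False))
```
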