claude-turing 1.0.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +66 -3
- package/commands/card.md +36 -0
- package/commands/explore.md +107 -0
- package/commands/suggest.md +68 -4
- package/commands/turing.md +4 -0
- package/package.json +1 -1
- package/src/claude-md.js +1 -0
- package/src/install.js +2 -2
- package/src/verify.js +2 -0
- package/templates/requirements.txt +4 -0
- package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/treequest_suggest.cpython-314.pyc +0 -0
- package/templates/scripts/cleanup.py +599 -0
- package/templates/scripts/cost_frontier.py +292 -0
- package/templates/scripts/diff_configs.py +534 -0
- package/templates/scripts/export_results.py +457 -0
- package/templates/scripts/generate_brief.py +58 -3
- package/templates/scripts/generate_model_card.py +342 -0
- package/templates/scripts/leaderboard.py +508 -0
- package/templates/scripts/manage_hypotheses.py +2 -2
- package/templates/scripts/plot_trajectory.py +611 -0
- package/templates/scripts/scaffold.py +8 -0
- package/templates/scripts/show_metrics.py +23 -2
- package/templates/scripts/treequest_suggest.py +520 -0
- package/templates/tests/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
- package/templates/tests/__pycache__/test_cost_frontier.cpython-314-pytest-9.0.2.pyc +0 -0
- package/templates/tests/test_cost_frontier.py +222 -0
package/templates/tests/test_cost_frontier.py
@@ -0,0 +1,222 @@
+"""Tests for the cost-performance frontier analysis module."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import pytest
+
+from scripts.cost_frontier import (
+    CostRecord,
+    compute_cost_efficiency,
+    compute_pareto_frontier,
+    format_cost_report,
+    load_cost_data,
+)
+
+
+def _make_record(exp_id: str, metric: float, seconds: float, model: str = "xgboost") -> CostRecord:
+    """Helper to create a CostRecord."""
+    return CostRecord(
+        experiment_id=exp_id,
+        metric_value=metric,
+        train_seconds=seconds,
+        model_type=model,
+    )
+
+
+def _write_log(tmp_path: Path, entries: list[dict]) -> Path:
+    """Write experiment entries to a JSONL file and return the path."""
+    log_path = tmp_path / "log.jsonl"
+    with open(log_path, "w") as f:
+        for entry in entries:
+            f.write(json.dumps(entry) + "\n")
+    return log_path
+
+
+class TestParetoFrontierBasic:
+    """test_pareto_frontier_basic — 3 experiments, identify the efficient frontier."""
+
+    def test_three_experiments(self):
+        data = [
+            _make_record("exp-001", 0.85, 3.0, "xgboost"),
+            _make_record("exp-002", 0.87, 3.0, "xgboost"),
+            _make_record("exp-003", 0.89, 2400.0, "neural_net"),
+        ]
+        frontier = compute_pareto_frontier(data, lower_is_better=False)
+        frontier_ids = {r.experiment_id for r in frontier}
+
+        # exp-002 dominates exp-001 (same time, better metric)
+        assert "exp-001" not in frontier_ids
+        # exp-002 and exp-003 are both on frontier (tradeoff)
+        assert "exp-002" in frontier_ids
+        assert "exp-003" in frontier_ids
+        assert len(frontier) == 2
+
+
+class TestParetoFrontierSingle:
+    """test_pareto_frontier_single — 1 experiment is always on frontier."""
+
+    def test_single_experiment(self):
+        data = [_make_record("exp-001", 0.90, 10.0)]
+        frontier = compute_pareto_frontier(data, lower_is_better=False)
+        assert len(frontier) == 1
+        assert frontier[0].experiment_id == "exp-001"
+
+    def test_single_experiment_lower_is_better(self):
+        data = [_make_record("exp-001", 0.15, 10.0)]
+        frontier = compute_pareto_frontier(data, lower_is_better=True)
+        assert len(frontier) == 1
+        assert frontier[0].experiment_id == "exp-001"
+
+
+class TestParetoFrontierDominated:
+    """test_pareto_frontier_dominated — dominated experiment excluded."""
+
+    def test_dominated_excluded(self):
+        data = [
+            _make_record("exp-001", 0.80, 100.0),  # slow AND worse
+            _make_record("exp-002", 0.90, 10.0),  # fast AND better
+        ]
+        frontier = compute_pareto_frontier(data, lower_is_better=False)
+        frontier_ids = {r.experiment_id for r in frontier}
+
+        assert "exp-001" not in frontier_ids
+        assert "exp-002" in frontier_ids
+        assert len(frontier) == 1
+
+    def test_dominated_lower_is_better(self):
+        data = [
+            _make_record("exp-001", 0.50, 100.0),  # slow AND worse (higher is worse)
+            _make_record("exp-002", 0.10, 10.0),  # fast AND better (lower is better)
+        ]
+        frontier = compute_pareto_frontier(data, lower_is_better=True)
+        frontier_ids = {r.experiment_id for r in frontier}
+
+        assert "exp-001" not in frontier_ids
+        assert "exp-002" in frontier_ids
+
+
+class TestCostEfficiency:
+    """test_cost_efficiency — metric improvement per second computed correctly."""
+
+    def test_efficiency_computation(self):
+        data = [
+            _make_record("exp-001", 0.80, 10.0),  # baseline (worst)
+            _make_record("exp-002", 0.85, 5.0),  # 0.05 improvement in 5s
+            _make_record("exp-003", 0.90, 100.0),  # 0.10 improvement in 100s
+        ]
+        results = compute_cost_efficiency(data, lower_is_better=False)
+
+        eff_map = {r["experiment_id"]: r for r in results}
+
+        # exp-002: improvement=0.05, time=5s -> 0.01/s
+        assert abs(eff_map["exp-002"]["metric_per_second"] - 0.01) < 1e-9
+        # exp-003: improvement=0.10, time=100s -> 0.001/s
+        assert abs(eff_map["exp-003"]["metric_per_second"] - 0.001) < 1e-9
+        # exp-001 is baseline: improvement=0, so efficiency=0
+        assert eff_map["exp-001"]["metric_per_second"] == 0.0
+
+        # Should be sorted by efficiency descending
+        assert results[0]["experiment_id"] == "exp-002"
+
+    def test_efficiency_lower_is_better(self):
+        data = [
+            _make_record("exp-001", 0.50, 10.0),  # worst (highest)
+            _make_record("exp-002", 0.30, 5.0),  # 0.20 improvement in 5s
+        ]
+        results = compute_cost_efficiency(data, lower_is_better=True)
+        eff_map = {r["experiment_id"]: r for r in results}
+        assert abs(eff_map["exp-002"]["metric_per_second"] - 0.04) < 1e-9
+
+
+class TestFormatCostReport:
+    """test_format_cost_report — report contains key experiments and summary."""
+
+    def test_report_content(self):
+        data = [
+            _make_record("exp-001", 0.87, 3.0, "xgboost"),
+            _make_record("exp-002", 0.89, 2400.0, "neural_net"),
+        ]
+        frontier = compute_pareto_frontier(data, lower_is_better=False)
+        report = format_cost_report(data, frontier, "accuracy", lower_is_better=False)
+
+        assert "exp-001" in report
+        assert "exp-002" in report
+        assert "xgboost" in report
+        assert "neural_net" in report
+        assert "Best accuracy" in report
+        assert "Best cost-efficiency" in report
+        assert "improvement costs" in report
+        assert "Pareto" in report
+
+    def test_report_with_single_experiment(self):
+        data = [_make_record("exp-001", 0.90, 10.0)]
+        frontier = compute_pareto_frontier(data, lower_is_better=False)
+        report = format_cost_report(data, frontier, "accuracy")
+        assert "exp-001" in report
+        assert "Best accuracy" in report
+
+
+class TestEmptyData:
+    """test_empty_data — handles empty log gracefully."""
+
+    def test_empty_pareto(self):
+        assert compute_pareto_frontier([], lower_is_better=False) == []
+
+    def test_empty_efficiency(self):
+        assert compute_cost_efficiency([], lower_is_better=False) == []
+
+    def test_empty_report(self):
+        report = format_cost_report([], [], "accuracy")
+        assert "No cost-performance data" in report
+
+    def test_load_from_empty_file(self, tmp_path):
+        log_path = tmp_path / "empty.jsonl"
+        log_path.touch()
+        records = load_cost_data(str(log_path), "accuracy")
+        assert records == []
+
+    def test_load_from_missing_file(self, tmp_path):
+        records = load_cost_data(str(tmp_path / "nonexistent.jsonl"), "accuracy")
+        assert records == []
+
+
+class TestLoadCostData:
+    """Test load_cost_data extracts records correctly from JSONL."""
+
+    def test_load_with_train_seconds(self, tmp_path):
+        entries = [
+            {
+                "experiment_id": "exp-001",
+                "status": "kept",
+                "config": {"model_type": "xgboost"},
+                "metrics": {"accuracy": 0.87, "train_seconds": 3.2},
+            },
+            {
+                "experiment_id": "exp-002",
+                "status": "kept",
+                "config": {"model_type": "neural_net"},
+                "metrics": {"accuracy": 0.89, "train_seconds": 2400.0},
+            },
+            {
+                "experiment_id": "exp-003",
+                "status": "discarded",
+                "config": {"model_type": "svm"},
+                "metrics": {"accuracy": 0.50, "train_seconds": 1.0},
+            },
+            {
+                "experiment_id": "exp-004",
+                "status": "kept",
+                "config": {"model_type": "rf"},
+                "metrics": {"accuracy": 0.85},
+            },
+        ]
+        log_path = _write_log(tmp_path, entries)
+        records = load_cost_data(str(log_path), "accuracy")
+
+        # exp-003 is discarded, exp-004 has no train_seconds
+        assert len(records) == 2
+        assert records[0].experiment_id == "exp-001"
+        assert records[1].experiment_id == "exp-002"