claude-turing 1.0.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/.claude-plugin/plugin.json +2 -2
  2. package/README.md +66 -3
  3. package/commands/card.md +36 -0
  4. package/commands/explore.md +107 -0
  5. package/commands/suggest.md +68 -4
  6. package/commands/turing.md +4 -0
  7. package/package.json +1 -1
  8. package/src/claude-md.js +1 -0
  9. package/src/install.js +2 -2
  10. package/src/verify.js +2 -0
  11. package/templates/requirements.txt +4 -0
  12. package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc +0 -0
  13. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  14. package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
  15. package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
  16. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  17. package/templates/scripts/__pycache__/treequest_suggest.cpython-314.pyc +0 -0
  18. package/templates/scripts/cleanup.py +599 -0
  19. package/templates/scripts/cost_frontier.py +292 -0
  20. package/templates/scripts/diff_configs.py +534 -0
  21. package/templates/scripts/export_results.py +457 -0
  22. package/templates/scripts/generate_brief.py +58 -3
  23. package/templates/scripts/generate_model_card.py +342 -0
  24. package/templates/scripts/leaderboard.py +508 -0
  25. package/templates/scripts/manage_hypotheses.py +2 -2
  26. package/templates/scripts/plot_trajectory.py +611 -0
  27. package/templates/scripts/scaffold.py +8 -0
  28. package/templates/scripts/show_metrics.py +23 -2
  29. package/templates/scripts/treequest_suggest.py +520 -0
  30. package/templates/tests/__pycache__/__init__.cpython-314.pyc +0 -0
  31. package/templates/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
  32. package/templates/tests/__pycache__/test_cost_frontier.cpython-314-pytest-9.0.2.pyc +0 -0
  33. package/templates/tests/test_cost_frontier.py +222 -0
@@ -0,0 +1,222 @@
1
+ """Tests for the cost-performance frontier analysis module."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+
8
+ import pytest
9
+
10
+ from scripts.cost_frontier import (
11
+ CostRecord,
12
+ compute_cost_efficiency,
13
+ compute_pareto_frontier,
14
+ format_cost_report,
15
+ load_cost_data,
16
+ )
17
+
18
+
19
def _make_record(exp_id: str, metric: float, seconds: float, model: str = "xgboost") -> CostRecord:
    """Construct a CostRecord test fixture (model defaults to xgboost)."""
    fields = {
        "experiment_id": exp_id,
        "metric_value": metric,
        "train_seconds": seconds,
        "model_type": model,
    }
    return CostRecord(**fields)
27
+
28
+
29
+ def _write_log(tmp_path: Path, entries: list[dict]) -> Path:
30
+ """Write experiment entries to a JSONL file and return the path."""
31
+ log_path = tmp_path / "log.jsonl"
32
+ with open(log_path, "w") as f:
33
+ for entry in entries:
34
+ f.write(json.dumps(entry) + "\n")
35
+ return log_path
36
+
37
+
38
class TestParetoFrontierBasic:
    """test_pareto_frontier_basic — 3 experiments, identify the efficient frontier."""

    def test_three_experiments(self):
        records = [
            _make_record("exp-001", 0.85, 3.0, "xgboost"),
            _make_record("exp-002", 0.87, 3.0, "xgboost"),
            _make_record("exp-003", 0.89, 2400.0, "neural_net"),
        ]
        frontier = compute_pareto_frontier(records, lower_is_better=False)
        surviving = {record.experiment_id for record in frontier}

        # exp-002 strictly dominates exp-001: identical cost, better metric.
        assert "exp-001" not in surviving
        # exp-002 (cheap) and exp-003 (accurate) form a genuine tradeoff,
        # so both must sit on the frontier.
        assert "exp-002" in surviving
        assert "exp-003" in surviving
        assert len(frontier) == 2
56
+
57
+
58
class TestParetoFrontierSingle:
    """test_pareto_frontier_single — 1 experiment is always on frontier."""

    def test_single_experiment(self):
        sole = _make_record("exp-001", 0.90, 10.0)
        frontier = compute_pareto_frontier([sole], lower_is_better=False)
        # A lone experiment can never be dominated.
        assert [r.experiment_id for r in frontier] == ["exp-001"]

    def test_single_experiment_lower_is_better(self):
        sole = _make_record("exp-001", 0.15, 10.0)
        frontier = compute_pareto_frontier([sole], lower_is_better=True)
        assert [r.experiment_id for r in frontier] == ["exp-001"]
72
+
73
+
74
class TestParetoFrontierDominated:
    """test_pareto_frontier_dominated — dominated experiment excluded."""

    def test_dominated_excluded(self):
        slow_and_worse = _make_record("exp-001", 0.80, 100.0)
        fast_and_better = _make_record("exp-002", 0.90, 10.0)
        frontier = compute_pareto_frontier(
            [slow_and_worse, fast_and_better], lower_is_better=False
        )
        survivors = {record.experiment_id for record in frontier}

        # exp-002 wins on both axes, so exp-001 must be dropped entirely.
        assert "exp-001" not in survivors
        assert "exp-002" in survivors
        assert len(frontier) == 1

    def test_dominated_lower_is_better(self):
        # With lower_is_better=True a higher metric is the worse one.
        slow_and_worse = _make_record("exp-001", 0.50, 100.0)
        fast_and_better = _make_record("exp-002", 0.10, 10.0)
        frontier = compute_pareto_frontier(
            [slow_and_worse, fast_and_better], lower_is_better=True
        )
        survivors = {record.experiment_id for record in frontier}

        assert "exp-001" not in survivors
        assert "exp-002" in survivors
99
+
100
+
101
class TestCostEfficiency:
    """test_cost_efficiency — metric improvement per second computed correctly."""

    def test_efficiency_computation(self):
        records = [
            _make_record("exp-001", 0.80, 10.0),   # baseline (worst metric)
            _make_record("exp-002", 0.85, 5.0),    # +0.05 over baseline in 5 s
            _make_record("exp-003", 0.90, 100.0),  # +0.10 over baseline in 100 s
        ]
        rows = compute_cost_efficiency(records, lower_is_better=False)
        by_id = {row["experiment_id"]: row for row in rows}

        # exp-002: 0.05 improvement / 5 s  -> 0.01 per second
        assert abs(by_id["exp-002"]["metric_per_second"] - 0.01) < 1e-9
        # exp-003: 0.10 improvement / 100 s -> 0.001 per second
        assert abs(by_id["exp-003"]["metric_per_second"] - 0.001) < 1e-9
        # The baseline improves on nothing, so its efficiency is exactly zero.
        assert by_id["exp-001"]["metric_per_second"] == 0.0

        # Output ordering: most efficient experiment first.
        assert rows[0]["experiment_id"] == "exp-002"

    def test_efficiency_lower_is_better(self):
        records = [
            _make_record("exp-001", 0.50, 10.0),  # baseline (highest = worst)
            _make_record("exp-002", 0.30, 5.0),   # 0.20 better in 5 s
        ]
        rows = compute_cost_efficiency(records, lower_is_better=True)
        by_id = {row["experiment_id"]: row for row in rows}
        assert abs(by_id["exp-002"]["metric_per_second"] - 0.04) < 1e-9
132
+
133
+
134
class TestFormatCostReport:
    """test_format_cost_report — report contains key experiments and summary."""

    def test_report_content(self):
        records = [
            _make_record("exp-001", 0.87, 3.0, "xgboost"),
            _make_record("exp-002", 0.89, 2400.0, "neural_net"),
        ]
        pareto = compute_pareto_frontier(records, lower_is_better=False)
        text = format_cost_report(records, pareto, "accuracy", lower_is_better=False)

        # Every experiment, model type, and summary section must be mentioned.
        expected_fragments = (
            "exp-001",
            "exp-002",
            "xgboost",
            "neural_net",
            "Best accuracy",
            "Best cost-efficiency",
            "improvement costs",
            "Pareto",
        )
        for fragment in expected_fragments:
            assert fragment in text

    def test_report_with_single_experiment(self):
        records = [_make_record("exp-001", 0.90, 10.0)]
        pareto = compute_pareto_frontier(records, lower_is_better=False)
        text = format_cost_report(records, pareto, "accuracy")
        assert "exp-001" in text
        assert "Best accuracy" in text
160
+
161
+
162
class TestEmptyData:
    """test_empty_data — handles empty log gracefully."""

    def test_empty_pareto(self):
        result = compute_pareto_frontier([], lower_is_better=False)
        assert result == []

    def test_empty_efficiency(self):
        result = compute_cost_efficiency([], lower_is_better=False)
        assert result == []

    def test_empty_report(self):
        text = format_cost_report([], [], "accuracy")
        assert "No cost-performance data" in text

    def test_load_from_empty_file(self, tmp_path):
        # A file that exists but has zero bytes must yield no records.
        empty = tmp_path / "empty.jsonl"
        empty.touch()
        assert load_cost_data(str(empty), "accuracy") == []

    def test_load_from_missing_file(self, tmp_path):
        missing = tmp_path / "nonexistent.jsonl"
        assert load_cost_data(str(missing), "accuracy") == []
184
+
185
+
186
class TestLoadCostData:
    """Test load_cost_data extracts records correctly from JSONL."""

    def test_load_with_train_seconds(self, tmp_path):
        def entry(exp_id: str, status: str, model: str, **metrics) -> dict:
            # Build one JSONL experiment entry in the log schema.
            return {
                "experiment_id": exp_id,
                "status": status,
                "config": {"model_type": model},
                "metrics": metrics,
            }

        entries = [
            entry("exp-001", "kept", "xgboost", accuracy=0.87, train_seconds=3.2),
            entry("exp-002", "kept", "neural_net", accuracy=0.89, train_seconds=2400.0),
            entry("exp-003", "discarded", "svm", accuracy=0.50, train_seconds=1.0),
            entry("exp-004", "kept", "rf", accuracy=0.85),  # no train_seconds
        ]
        log_path = _write_log(tmp_path, entries)
        records = load_cost_data(str(log_path), "accuracy")

        # Only exp-001/exp-002 qualify: exp-003 was discarded and exp-004
        # carries no train_seconds metric.
        assert [r.experiment_id for r in records] == ["exp-001", "exp-002"]