claude-turing 4.1.0 → 4.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +7 -2
- package/commands/counterfactual.md +27 -0
- package/commands/registry.md +31 -0
- package/commands/simulate.md +28 -0
- package/commands/turing.md +10 -0
- package/commands/update.md +27 -0
- package/commands/whatif.md +31 -0
- package/package.json +1 -1
- package/src/install.js +2 -0
- package/src/verify.js +5 -0
- package/templates/scripts/__pycache__/counterfactual_explanation.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_simulator.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/incremental_update.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/model_lifecycle.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/whatif_engine.cpython-314.pyc +0 -0
- package/templates/scripts/counterfactual_explanation.py +485 -0
- package/templates/scripts/experiment_simulator.py +463 -0
- package/templates/scripts/generate_brief.py +125 -0
- package/templates/scripts/generate_model_card.py +154 -3
- package/templates/scripts/incremental_update.py +586 -0
- package/templates/scripts/model_lifecycle.py +549 -0
- package/templates/scripts/scaffold.py +10 -0
- package/templates/scripts/whatif_engine.py +763 -0
|
@@ -0,0 +1,463 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Experiment outcome simulator for the autoresearch pipeline.
|
|
3
|
+
|
|
4
|
+
Predicts experiment outcomes from prior data using a surrogate model.
|
|
5
|
+
Pre-filters experiment configs to save budget — only run the ones
|
|
6
|
+
predicted to beat the current best.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python scripts/experiment_simulator.py --configs configs.yaml
|
|
10
|
+
python scripts/experiment_simulator.py --top-k 5
|
|
11
|
+
python scripts/experiment_simulator.py --threshold 0.001
|
|
12
|
+
python scripts/experiment_simulator.py --json
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import json
|
|
19
|
+
import sys
|
|
20
|
+
from datetime import datetime, timezone
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
import numpy as np
|
|
24
|
+
import yaml
|
|
25
|
+
|
|
26
|
+
from scripts.turing_io import load_config, load_experiments
|
|
27
|
+
|
|
28
|
+
DEFAULT_LOG_PATH = "experiments/log.jsonl"
|
|
29
|
+
DEFAULT_TOP_K = 5
|
|
30
|
+
DEFAULT_IMPROVEMENT_THRESHOLD = 0.0
|
|
31
|
+
NOVELTY_PENALTY_FACTOR = 0.1
|
|
32
|
+
MIN_HISTORY_FOR_SURROGATE = 5
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# --- Feature Extraction ---
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def extract_config_features(config: dict) -> dict[str, float]:
    """Extract numeric features from an experiment config.

    Flattens nested dicts depth-first (preserving key order) into a flat
    dict mapping dotted key paths to float values. Booleans and
    non-numeric leaves are dropped.
    """
    features: dict[str, float] = {}

    def walk(node: dict, prefix: str) -> None:
        # Depth-first traversal in dict insertion order, matching the
        # order downstream summaries rely on.
        for key, val in node.items():
            dotted = key if not prefix else f"{prefix}.{key}"
            if isinstance(val, bool):
                continue  # bool is an int subclass; exclude explicitly
            if isinstance(val, (int, float)):
                features[dotted] = float(val)
            elif isinstance(val, dict):
                walk(val, dotted)

    walk(config, "")
    return features
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _flatten(obj: dict, out: dict, prefix: str) -> None:
|
|
49
|
+
"""Recursively flatten a dict, keeping only numeric values."""
|
|
50
|
+
for key, val in obj.items():
|
|
51
|
+
full_key = f"{prefix}{key}" if not prefix else f"{prefix}.{key}"
|
|
52
|
+
if isinstance(val, (int, float)) and not isinstance(val, bool):
|
|
53
|
+
out[full_key] = float(val)
|
|
54
|
+
elif isinstance(val, dict):
|
|
55
|
+
_flatten(val, out, full_key)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def experiment_to_features(exp: dict) -> dict[str, float]:
    """Extract a numeric feature vector from an experiment log entry.

    Looks for hyperparameters under ``config.hyperparams`` (falling back
    to ``config.model.hyperparams``), then supplements them with a fixed
    set of well-known numeric fields read from the entry's top level or
    from the config.

    Args:
        exp: Experiment log entry.

    Returns:
        Flat dict of feature name -> float value.
    """
    features: dict[str, float] = {}

    config = exp.get("config", {})
    if not isinstance(config, dict):
        # Malformed entries may store config as a string/list; normalize to
        # an empty dict so the lookups below cannot raise AttributeError.
        config = {}

    hyperparams = config.get("hyperparams")
    if hyperparams is None:
        model = config.get("model")
        hyperparams = model.get("hyperparams", {}) if isinstance(model, dict) else {}
    if isinstance(hyperparams, dict):
        for k, v in hyperparams.items():
            if isinstance(v, (int, float)) and not isinstance(v, bool):
                features[k] = float(v)

    # Also check well-known numeric fields at the top level (then config).
    # Use explicit `is None` checks — not `or` — so a legitimate 0 value
    # (e.g. dropout=0) from the entry is kept instead of being overridden.
    for key in ("learning_rate", "lr", "max_depth", "depth", "n_estimators",
                "epochs", "batch_size", "hidden_size", "dropout"):
        val = exp.get(key)
        if val is None:
            val = config.get(key)
        if isinstance(val, (int, float)) and not isinstance(val, bool):
            features[key] = float(val)

    return features
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# --- Surrogate Model ---
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def build_surrogate(
    experiments: list[dict],
    primary_metric: str,
) -> dict:
    """Build a simple surrogate model from experiment history.

    Uses a weighted k-NN approach: for a new config, the metric is
    predicted as a weighted average of the k nearest experiments in
    config space.

    Args:
        experiments: Historical experiment log entries.
        primary_metric: Name of the metric to model.

    Returns:
        Surrogate model dict. ``status`` is "ready" when there is enough
        usable history, otherwise "insufficient".
    """
    data_points = []
    for exp in experiments:
        observed = exp.get("metrics", {}).get(primary_metric)
        if observed is None:
            continue
        feats = experiment_to_features(exp)
        if feats:
            data_points.append({"features": feats, "metric": observed})

    if len(data_points) < MIN_HISTORY_FOR_SURROGATE:
        return {
            "status": "insufficient",
            "n_points": len(data_points),
            "min_required": MIN_HISTORY_FOR_SURROGATE,
        }

    # Union of every feature name seen across the usable history.
    feature_names = sorted({name for dp in data_points for name in dp["features"]})

    return {
        "status": "ready",
        "data_points": data_points,
        "feature_names": feature_names,
        "n_points": len(data_points),
    }
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def predict_with_surrogate(
    surrogate: dict,
    config_features: dict[str, float],
    k: int = 3,
) -> dict:
    """Predict metric for a config using weighted k-NN surrogate.

    Args:
        surrogate: Built surrogate model (from ``build_surrogate``).
        config_features: Feature dict for the config to predict.
        k: Number of nearest neighbors.

    Returns:
        Prediction dict with predicted metric and uncertainty, or a dict
        with an "error" key and None values when no prediction is possible.
    """
    if surrogate.get("status") != "ready":
        return {"error": "Surrogate not ready", "predicted": None, "uncertainty": None}

    data_points = surrogate["data_points"]
    feature_names = surrogate["feature_names"]

    # Compute distance from the candidate config to every training point.
    # _config_distance returns inf when the two configs share no features.
    distances = []
    for dp in data_points:
        dist = _config_distance(config_features, dp["features"], feature_names)
        distances.append((dist, dp["metric"]))

    distances.sort(key=lambda x: x[0])
    neighbors = distances[:k]

    if not neighbors:
        return {"error": "No neighbors found", "predicted": None, "uncertainty": None}

    # Weighted average (inverse distance weighting)
    metrics = [m for _, m in neighbors]
    dists = [d for d, _ in neighbors]

    if all(d == 0 for d in dists):
        # Exact matches only: plain mean, no spread.
        predicted = np.mean(metrics)
        uncertainty = 0.0
    elif all(d == float("inf") for d in dists):
        # No overlapping features with any neighbor: fall back to the mean.
        predicted = np.mean(metrics)
        uncertainty = float(np.std(metrics)) if len(metrics) > 1 else 0.0
    else:
        # Mixed case: weight only the finite-distance neighbors.
        weights = [1.0 / (d + 1e-6) for d in dists if d != float("inf")]
        valid_metrics = [m for d, m in zip(dists, metrics) if d != float("inf")]
        if not weights:
            predicted = np.mean(metrics)
            uncertainty = float(np.std(metrics)) if len(metrics) > 1 else 0.0
        else:
            total_weight = sum(weights)
            predicted = sum(w * m for w, m in zip(weights, valid_metrics)) / total_weight
            uncertainty = float(np.std(valid_metrics)) if len(valid_metrics) > 1 else 0.0

    # Novelty penalty: discount if far from training distribution.
    # novelty = nearest finite distance relative to the average distance.
    finite_dists = [d for d, _ in distances if d != float("inf")]
    min_dist = min(finite_dists) if finite_dists else float("inf")
    avg_dist = float(np.mean(finite_dists)) if finite_dists else 1.0
    novelty = min_dist / avg_dist if avg_dist > 0 and min_dist != float("inf") else 1.0
    # NOTE(review): the penalty is always subtracted, which assumes a
    # higher-is-better metric; for lower-is-better campaigns subtracting
    # makes novel configs look BETTER — confirm intended behavior.
    novelty_penalty = novelty * NOVELTY_PENALTY_FACTOR

    return {
        "predicted": round(float(predicted - novelty_penalty), 6),
        "uncertainty": round(float(uncertainty), 6),
        "novelty_score": round(float(novelty), 4),
        "n_neighbors": len(neighbors),
        "nearest_distance": round(float(min_dist), 4),
    }
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _config_distance(
|
|
196
|
+
config_a: dict[str, float],
|
|
197
|
+
config_b: dict[str, float],
|
|
198
|
+
feature_names: list[str],
|
|
199
|
+
) -> float:
|
|
200
|
+
"""Compute normalized distance between two configs."""
|
|
201
|
+
total = 0.0
|
|
202
|
+
n = 0
|
|
203
|
+
for feat in feature_names:
|
|
204
|
+
a = config_a.get(feat)
|
|
205
|
+
b = config_b.get(feat)
|
|
206
|
+
if a is not None and b is not None:
|
|
207
|
+
# Normalize by max(|a|, |b|, 1) to handle different scales
|
|
208
|
+
scale = max(abs(a), abs(b), 1.0)
|
|
209
|
+
total += ((a - b) / scale) ** 2
|
|
210
|
+
n += 1
|
|
211
|
+
|
|
212
|
+
if n == 0:
|
|
213
|
+
return float("inf")
|
|
214
|
+
return float(np.sqrt(total / n))
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
# --- Simulation Pipeline ---
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def simulate_experiments(
    proposed_configs: list[dict],
    experiments: list[dict],
    primary_metric: str,
    top_k: int = DEFAULT_TOP_K,
    improvement_threshold: float = DEFAULT_IMPROVEMENT_THRESHOLD,
    lower_is_better: bool = False,
) -> dict:
    """Simulate proposed experiments and rank by predicted outcome.

    Args:
        proposed_configs: List of experiment configs to simulate.
        experiments: Historical experiment data.
        primary_metric: Metric to predict.
        top_k: Number of top configs to recommend running.
        improvement_threshold: Minimum predicted improvement over current best.
        lower_is_better: Whether lower metric is better.

    Returns:
        Simulation report with ranked configs and budget savings.
        Configs the surrogate cannot score are omitted from the ranking
        (and therefore from ``total_proposed``).
    """
    if not proposed_configs:
        return {"error": "No proposed configs to simulate"}

    surrogate = build_surrogate(experiments, primary_metric)
    if surrogate.get("status") != "ready":
        return {
            "error": f"Insufficient experiment history ({surrogate.get('n_points', 0)} experiments, "
                     f"need {MIN_HISTORY_FOR_SURROGATE})",
            "suggestion": "Run more experiments first to build a reliable surrogate model.",
        }

    # Current best observed value of the primary metric.
    best_metrics = [
        exp.get("metrics", {}).get(primary_metric)
        for exp in experiments
        if exp.get("metrics", {}).get(primary_metric) is not None
    ]
    if lower_is_better:
        current_best = min(best_metrics) if best_metrics else float("inf")
    else:
        current_best = max(best_metrics) if best_metrics else 0

    # Predict each proposed config with the surrogate.
    predictions = []
    for i, config in enumerate(proposed_configs):
        features = extract_config_features(config)
        pred = predict_with_surrogate(surrogate, features)
        predicted = pred.get("predicted")
        uncertainty = pred.get("uncertainty", 0)

        if predicted is None:
            continue  # surrogate could not score this config

        if lower_is_better:
            improvement = current_best - predicted
        else:
            improvement = predicted - current_best

        # Coarse uncertainty bucket for the report table.
        if uncertainty < 0.005:
            unc_level = "LOW"
        elif uncertainty < 0.015:
            unc_level = "MED"
        else:
            unc_level = "HIGH"

        verdict = "RUN" if improvement > improvement_threshold else "SKIP"

        predictions.append({
            "rank": 0,  # filled after sorting
            "config_index": i,
            "config_summary": _summarize_config(config),
            "predicted_metric": predicted,
            "uncertainty": uncertainty,
            "uncertainty_level": unc_level,
            "improvement": round(improvement, 6),
            "verdict": verdict,
            "novelty_score": pred.get("novelty_score", 0),
        })

    # Sort best-first by predicted metric and assign 1-based ranks.
    predictions.sort(
        key=lambda p: p["predicted_metric"],
        reverse=not lower_is_better,
    )
    for rank, p in enumerate(predictions, start=1):
        p["rank"] = rank

    # Keep at most top_k RUN verdicts; everything else is marked SKIP.
    # Partition by object identity instead of `p not in run_configs`:
    # `in` compares dicts by value (O(n*k) and only correct here because
    # ranks happen to be unique), while an id() set is O(n) and robust.
    run_configs = [p for p in predictions if p["verdict"] == "RUN"][:top_k]
    selected_ids = {id(p) for p in run_configs}
    skip_configs = [p for p in predictions if id(p) not in selected_ids]
    for p in skip_configs:
        p["verdict"] = "SKIP"

    total = len(predictions)
    n_run = len(run_configs)
    n_skip = total - n_run
    savings = round(n_skip / total * 100, 1) if total > 0 else 0

    return {
        "current_best": current_best,
        "primary_metric": primary_metric,
        "total_proposed": total,
        "run_count": n_run,
        "skip_count": n_skip,
        "budget_savings_pct": savings,
        "predictions": predictions,
        "surrogate_info": {
            "n_training_points": surrogate["n_points"],
            "n_features": len(surrogate["feature_names"]),
        },
        "generated_at": datetime.now(timezone.utc).isoformat(),
    }
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def _summarize_config(config: dict, max_items: int = 4) -> str:
    """Render a short "k=v, k=v, ..." summary of a config's numeric features.

    Shows at most *max_items* features (in extraction order) and appends
    "..." when more exist; returns "(empty config)" when there are none.
    """
    features = extract_config_features(config)
    shown = [f"{name}={value}" for name, value in list(features.items())[:max_items]]
    if len(features) > max_items:
        shown.append("...")
    return ", ".join(shown) if shown else "(empty config)"
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
# --- Report Formatting ---
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def save_simulation_report(report: dict, output_dir: str = "experiments/simulations") -> Path:
    """Persist a simulation report as a timestamped YAML file.

    Creates *output_dir* (and parents) if needed.

    Returns:
        Path of the file written.
    """
    directory = Path(output_dir)
    directory.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
    target = directory / f"simulation-{stamp}.yaml"
    with open(target, "w") as handle:
        yaml.dump(report, handle, default_flow_style=False, sort_keys=False)
    return target
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def format_simulation_report(report: dict) -> str:
    """Format a simulation report as readable markdown.

    Args:
        report: Report dict from ``simulate_experiments``. When it contains
            an "error" key, only the error (and optional suggestion) is
            rendered.

    Returns:
        Markdown string.
    """
    if "error" in report:
        lines = [f"ERROR: {report['error']}"]
        if "suggestion" in report:
            lines.append(f"\n{report['suggestion']}")
        return "\n".join(lines)

    lines = ["# Experiment Simulation", ""]
    lines.append(f"**Current best:** {report.get('current_best', 'N/A')}")
    lines.append(f"**Proposed configs:** {report.get('total_proposed', 0)}")
    lines.append(f"**Recommended to run:** {report.get('run_count', 0)}")
    lines.append(f"**Budget savings:** {report.get('budget_savings_pct', 0)}%")
    lines.append("")

    predictions = report.get("predictions", [])
    if predictions:
        lines.append("| Rank | Config Summary | Predicted | Uncertainty | Verdict |")
        lines.append("|------|---------------|-----------|-------------|---------|")
        for p in predictions:
            # The Uncertainty column shows the coarse level (LOW/MED/HIGH);
            # the numeric uncertainty remains available in the raw report.
            # (Removed a dead local that formatted "value ± uncertainty"
            # but was never used.)
            lines.append(
                f"| {p['rank']} | {p['config_summary'][:40]} | {p['predicted_metric']:.4f} "
                f"| {p['uncertainty_level']} | {p['verdict']} {'✓' if p['verdict'] == 'RUN' else '✗'} |"
            )

        lines.append("")
        rec_run = report.get("run_count", 0)
        rec_skip = report.get("skip_count", 0)
        lines.append(
            f"**Recommendation:** Run top {rec_run}, skip {rec_skip}. "
            f"Estimated budget savings: {report.get('budget_savings_pct', 0)}%."
        )

    lines.append("")
    lines.append(f"*Generated: {report.get('generated_at', 'N/A')}*")
    return "\n".join(lines)
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
# --- CLI ---
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
def main():
    """CLI entry point: load configs, run the simulation, print/save the report."""
    parser = argparse.ArgumentParser(
        description="Experiment outcome simulator — predict results before running"
    )
    parser.add_argument("--configs", help="YAML file with proposed experiment configs")
    parser.add_argument("--top-k", type=int, default=DEFAULT_TOP_K,
                        help="Number of top configs to recommend")
    parser.add_argument("--threshold", type=float, default=DEFAULT_IMPROVEMENT_THRESHOLD,
                        help="Minimum predicted improvement to recommend running")
    parser.add_argument("--config", default="config.yaml", help="Path to project config.yaml")
    parser.add_argument("--log", default=DEFAULT_LOG_PATH, help="Path to experiment log")
    parser.add_argument("--json", action="store_true", help="Output raw JSON")
    args = parser.parse_args()

    # Project-level evaluation settings drive the metric and its direction.
    project_cfg = load_config(args.config)
    eval_cfg = project_cfg.get("evaluation", {})
    primary_metric = eval_cfg.get("primary_metric", "accuracy")
    lower_is_better = eval_cfg.get("lower_is_better", False)

    history = load_experiments(args.log)

    # Proposed configs may be a list, a {"configs": [...]} mapping, or a
    # single config document.
    proposed = []
    if args.configs:
        with open(args.configs) as f:
            loaded = yaml.safe_load(f)
        if isinstance(loaded, list):
            proposed = loaded
        elif isinstance(loaded, dict) and "configs" in loaded:
            proposed = loaded["configs"]
        else:
            proposed = [loaded]

    if not proposed:
        print("No proposed configs provided. Use --configs <file.yaml>")
        sys.exit(1)

    report = simulate_experiments(
        proposed_configs=proposed,
        experiments=history,
        primary_metric=primary_metric,
        top_k=args.top_k,
        improvement_threshold=args.threshold,
        lower_is_better=lower_is_better,
    )

    if args.json:
        print(json.dumps(report, indent=2))
    else:
        print(format_simulation_report(report))

    # Only persist successful simulations.
    if "error" not in report:
        saved = save_simulation_report(report)
        if not args.json:
            print(f"\nSaved: {saved}")
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
# Allow running as a script: python scripts/experiment_simulator.py ...
if __name__ == "__main__":
    main()
|
|
@@ -371,6 +371,71 @@ def load_audit_report(audit_dir: str = "experiments/audits") -> dict | None:
|
|
|
371
371
|
return None
|
|
372
372
|
|
|
373
373
|
|
|
374
|
+
def load_whatif_results(whatif_dir: str = "experiments/whatif") -> list[dict]:
    """Load recent what-if analysis results (up to the last five files).

    Files that cannot be read or parsed, or that do not contain a mapping,
    are silently skipped (best-effort loading for the briefing).
    """
    directory = Path(whatif_dir)
    if not directory.exists():
        return []

    loaded: list[dict] = []
    for candidate in sorted(directory.glob("whatif-*.yaml"))[-5:]:
        try:
            with open(candidate) as fh:
                parsed = yaml.safe_load(fh)
        except (yaml.YAMLError, OSError):
            continue
        if isinstance(parsed, dict):
            loaded.append(parsed)
    return loaded
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
def load_simulation_results(sim_dir: str = "experiments/simulations") -> dict | None:
    """Load the most recent simulation result, or None if unavailable.

    Returns None when the directory or files are missing, unreadable,
    or the latest file does not contain a mapping.
    """
    directory = Path(sim_dir)
    if not directory.exists():
        return None

    candidates = sorted(directory.glob("simulation-*.yaml"))
    if not candidates:
        return None

    try:
        with open(candidates[-1]) as fh:
            parsed = yaml.safe_load(fh)
    except (yaml.YAMLError, OSError):
        return None
    return parsed if isinstance(parsed, dict) else None
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def load_registry_summary(registry_path: str = "experiments/registry.yaml") -> dict | None:
    """Load the model registry summary for the briefing.

    Returns None when the file is absent, unreadable, not a mapping, or
    has no (truthy) "models" entry.
    """
    registry = Path(registry_path)
    if not registry.exists():
        return None

    try:
        with open(registry) as fh:
            parsed = yaml.safe_load(fh)
    except (yaml.YAMLError, OSError):
        return None

    if isinstance(parsed, dict) and parsed.get("models"):
        return parsed
    return None
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
def load_update_history(update_dir: str = "experiments/updates") -> list[dict]:
    """Load up to the three most recent incremental update reports.

    Unreadable or non-mapping files are skipped (best-effort loading).
    """
    directory = Path(update_dir)
    if not directory.exists():
        return []

    reports: list[dict] = []
    for candidate in sorted(directory.glob("*-update-*.yaml"))[-3:]:
        try:
            with open(candidate) as fh:
                parsed = yaml.safe_load(fh)
        except (yaml.YAMLError, OSError):
            continue
        if isinstance(parsed, dict):
            reports.append(parsed)
    return reports
|
|
437
|
+
|
|
438
|
+
|
|
374
439
|
def format_brief(
|
|
375
440
|
campaign: dict,
|
|
376
441
|
best: dict | None,
|
|
@@ -393,6 +458,10 @@ def format_brief(
|
|
|
393
458
|
budget_status: dict | None = None,
|
|
394
459
|
scaling_results: list[dict] | None = None,
|
|
395
460
|
audit_report: dict | None = None,
|
|
461
|
+
whatif_results: list[dict] | None = None,
|
|
462
|
+
simulation_result: dict | None = None,
|
|
463
|
+
registry_summary: dict | None = None,
|
|
464
|
+
update_history: list[dict] | None = None,
|
|
396
465
|
) -> str:
|
|
397
466
|
"""Format the research briefing as markdown."""
|
|
398
467
|
direction = "lower" if lower_is_better else "higher"
|
|
@@ -698,6 +767,54 @@ def format_brief(
|
|
|
698
767
|
total = len(regression_checks)
|
|
699
768
|
lines.append(f"\n*{passed}/{total} regression checks passed.*")
|
|
700
769
|
|
|
770
|
+
# What-If & Simulation section
|
|
771
|
+
if whatif_results or simulation_result:
|
|
772
|
+
lines.extend(["", "## What-If Analysis & Simulation", ""])
|
|
773
|
+
|
|
774
|
+
if whatif_results:
|
|
775
|
+
lines.append(f"**Recent what-if queries:** {len(whatif_results)}")
|
|
776
|
+
for wf in whatif_results[-3:]:
|
|
777
|
+
q = wf.get("question", "N/A")
|
|
778
|
+
route = wf.get("route", "?")
|
|
779
|
+
result = wf.get("result", {})
|
|
780
|
+
est = result.get("estimate")
|
|
781
|
+
conf = result.get("confidence", "?")
|
|
782
|
+
if est is not None:
|
|
783
|
+
lines.append(f"- *\"{q}\"* → {est} ({conf} confidence) [{route}]")
|
|
784
|
+
elif "error" in result:
|
|
785
|
+
lines.append(f"- *\"{q}\"* → {result['error']} [{route}]")
|
|
786
|
+
lines.append("")
|
|
787
|
+
|
|
788
|
+
if simulation_result and "error" not in simulation_result:
|
|
789
|
+
run = simulation_result.get("run_count", 0)
|
|
790
|
+
skip = simulation_result.get("skip_count", 0)
|
|
791
|
+
savings = simulation_result.get("budget_savings_pct", 0)
|
|
792
|
+
lines.append(f"**Last simulation:** {run} configs recommended, {skip} skipped ({savings}% budget savings)")
|
|
793
|
+
lines.append("")
|
|
794
|
+
|
|
795
|
+
# Model Lifecycle section
|
|
796
|
+
if registry_summary or update_history:
|
|
797
|
+
lines.extend(["", "## Model Lifecycle", ""])
|
|
798
|
+
|
|
799
|
+
if registry_summary:
|
|
800
|
+
models = registry_summary.get("models", [])
|
|
801
|
+
for m in models:
|
|
802
|
+
if m.get("stage") != "archived":
|
|
803
|
+
metric = f"{m['metric']:.4f}" if m.get("metric") is not None else "—"
|
|
804
|
+
lines.append(f"- **{m['stage']}:** {m['exp_id']} ({m.get('version', '?')}, {m.get('metric_name', 'metric')}={metric})")
|
|
805
|
+
if not any(m.get("stage") != "archived" for m in models):
|
|
806
|
+
lines.append("- All models archived — register a new candidate with `/turing:registry register`")
|
|
807
|
+
lines.append("")
|
|
808
|
+
|
|
809
|
+
if update_history:
|
|
810
|
+
lines.append(f"**Recent updates:** {len(update_history)}")
|
|
811
|
+
for u in update_history[-2:]:
|
|
812
|
+
verdict = u.get("verdict", "?")
|
|
813
|
+
exp_id = u.get("experiment_id", "?")
|
|
814
|
+
strategy = u.get("plan", {}).get("strategy", "?")
|
|
815
|
+
lines.append(f"- {exp_id}: {strategy} — {verdict}")
|
|
816
|
+
lines.append("")
|
|
817
|
+
|
|
701
818
|
lines.extend([
|
|
702
819
|
"",
|
|
703
820
|
"## Recommendations",
|
|
@@ -768,6 +885,10 @@ def generate_brief(
|
|
|
768
885
|
budget_status = load_budget_status(log_path=log_path)
|
|
769
886
|
scaling_results = load_scaling_results()
|
|
770
887
|
audit_report = load_audit_report()
|
|
888
|
+
whatif_results = load_whatif_results()
|
|
889
|
+
simulation_result = load_simulation_results()
|
|
890
|
+
registry_summary = load_registry_summary()
|
|
891
|
+
update_history = load_update_history()
|
|
771
892
|
|
|
772
893
|
return format_brief(
|
|
773
894
|
campaign, best, trajectory, model_types, hypotheses,
|
|
@@ -784,6 +905,10 @@ def generate_brief(
|
|
|
784
905
|
budget_status=budget_status,
|
|
785
906
|
scaling_results=scaling_results if scaling_results else None,
|
|
786
907
|
audit_report=audit_report,
|
|
908
|
+
whatif_results=whatif_results if whatif_results else None,
|
|
909
|
+
simulation_result=simulation_result,
|
|
910
|
+
registry_summary=registry_summary,
|
|
911
|
+
update_history=update_history if update_history else None,
|
|
787
912
|
)
|
|
788
913
|
|
|
789
914
|
|