claude-turing 4.0.0 → 4.2.0
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +8 -2
- package/commands/counterfactual.md +27 -0
- package/commands/onboard.md +20 -0
- package/commands/review.md +20 -0
- package/commands/share.md +20 -0
- package/commands/simulate.md +28 -0
- package/commands/turing.md +12 -0
- package/commands/whatif.md +31 -0
- package/package.json +1 -1
- package/src/install.js +2 -0
- package/src/verify.js +6 -0
- package/templates/scripts/__pycache__/counterfactual_explanation.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_simulator.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_onboarding.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/package_experiments.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/simulate_review.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/whatif_engine.cpython-314.pyc +0 -0
- package/templates/scripts/counterfactual_explanation.py +485 -0
- package/templates/scripts/experiment_simulator.py +463 -0
- package/templates/scripts/generate_brief.py +64 -0
- package/templates/scripts/generate_onboarding.py +284 -0
- package/templates/scripts/package_experiments.py +285 -0
- package/templates/scripts/scaffold.py +11 -0
- package/templates/scripts/simulate_review.py +342 -0
- package/templates/scripts/whatif_engine.py +763 -0
package/templates/scripts/experiment_simulator.py
@@ -0,0 +1,463 @@
+#!/usr/bin/env python3
+"""Experiment outcome simulator for the autoresearch pipeline.
+
+Predicts experiment outcomes from prior data using a surrogate model.
+Pre-filters experiment configs to save budget — only run the ones
+predicted to beat the current best.
+
+Usage:
+    python scripts/experiment_simulator.py --configs configs.yaml
+    python scripts/experiment_simulator.py --top-k 5
+    python scripts/experiment_simulator.py --threshold 0.001
+    python scripts/experiment_simulator.py --json
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+import numpy as np
+import yaml
+
+from scripts.turing_io import load_config, load_experiments
+
+DEFAULT_LOG_PATH = "experiments/log.jsonl"
+DEFAULT_TOP_K = 5
+DEFAULT_IMPROVEMENT_THRESHOLD = 0.0
+NOVELTY_PENALTY_FACTOR = 0.1
+MIN_HISTORY_FOR_SURROGATE = 5
+
+
+# --- Feature Extraction ---
+
+
+def extract_config_features(config: dict) -> dict[str, float]:
+    """Extract numeric features from an experiment config.
+
+    Flattens nested config into a flat dict of numeric values.
+    """
+    features = {}
+    _flatten(config, features, prefix="")
+    return features
+
+
+def _flatten(obj: dict, out: dict, prefix: str) -> None:
+    """Recursively flatten a dict, keeping only numeric values."""
+    for key, val in obj.items():
+        full_key = f"{prefix}{key}" if not prefix else f"{prefix}.{key}"
+        if isinstance(val, (int, float)) and not isinstance(val, bool):
+            out[full_key] = float(val)
+        elif isinstance(val, dict):
+            _flatten(val, out, full_key)
+
+
+def experiment_to_features(exp: dict) -> dict[str, float]:
+    """Extract feature vector from an experiment log entry."""
+    features = {}
+
+    # Extract from config
+    config = exp.get("config", {})
+    if isinstance(config, dict):
+        hyperparams = config.get("hyperparams", config.get("model", {}).get("hyperparams", {}))
+        if isinstance(hyperparams, dict):
+            for k, v in hyperparams.items():
+                if isinstance(v, (int, float)) and not isinstance(v, bool):
+                    features[k] = float(v)
+
+    # Also check top-level numeric fields
+    for key in ("learning_rate", "lr", "max_depth", "depth", "n_estimators",
+                "epochs", "batch_size", "hidden_size", "dropout"):
+        val = exp.get(key) or config.get(key)
+        if isinstance(val, (int, float)):
+            features[key] = float(val)
+
+    return features
+
+
+# --- Surrogate Model ---
+
+
+def build_surrogate(
+    experiments: list[dict],
+    primary_metric: str,
+) -> dict:
+    """Build a simple surrogate model from experiment history.
+
+    Uses a weighted k-NN approach: for a new config, predict the metric
+    as a weighted average of the k nearest experiments in config space.
+
+    Returns:
+        Surrogate model dict with training data and feature info.
+    """
+    data_points = []
+    for exp in experiments:
+        metric = exp.get("metrics", {}).get(primary_metric)
+        if metric is None:
+            continue
+        features = experiment_to_features(exp)
+        if features:
+            data_points.append({"features": features, "metric": metric})
+
+    if len(data_points) < MIN_HISTORY_FOR_SURROGATE:
+        return {
+            "status": "insufficient",
+            "n_points": len(data_points),
+            "min_required": MIN_HISTORY_FOR_SURROGATE,
+        }
+
+    # Collect all feature names
+    all_features = set()
+    for dp in data_points:
+        all_features.update(dp["features"].keys())
+
+    return {
+        "status": "ready",
+        "data_points": data_points,
+        "feature_names": sorted(all_features),
+        "n_points": len(data_points),
+    }
+
+
+def predict_with_surrogate(
+    surrogate: dict,
+    config_features: dict[str, float],
+    k: int = 3,
+) -> dict:
+    """Predict metric for a config using weighted k-NN surrogate.
+
+    Args:
+        surrogate: Built surrogate model.
+        config_features: Feature dict for the config to predict.
+        k: Number of nearest neighbors.
+
+    Returns:
+        Prediction dict with predicted metric and uncertainty.
+    """
+    if surrogate.get("status") != "ready":
+        return {"error": "Surrogate not ready", "predicted": None, "uncertainty": None}
+
+    data_points = surrogate["data_points"]
+    feature_names = surrogate["feature_names"]
+
+    # Compute distances
+    distances = []
+    for dp in data_points:
+        dist = _config_distance(config_features, dp["features"], feature_names)
+        distances.append((dist, dp["metric"]))
+
+    distances.sort(key=lambda x: x[0])
+    neighbors = distances[:k]
+
+    if not neighbors:
+        return {"error": "No neighbors found", "predicted": None, "uncertainty": None}
+
+    # Weighted average (inverse distance weighting)
+    metrics = [m for _, m in neighbors]
+    dists = [d for d, _ in neighbors]
+
+    if all(d == 0 for d in dists):
+        predicted = np.mean(metrics)
+        uncertainty = 0.0
+    elif all(d == float("inf") for d in dists):
+        predicted = np.mean(metrics)
+        uncertainty = float(np.std(metrics)) if len(metrics) > 1 else 0.0
+    else:
+        weights = [1.0 / (d + 1e-6) for d in dists if d != float("inf")]
+        valid_metrics = [m for d, m in zip(dists, metrics) if d != float("inf")]
+        if not weights:
+            predicted = np.mean(metrics)
+            uncertainty = float(np.std(metrics)) if len(metrics) > 1 else 0.0
+        else:
+            total_weight = sum(weights)
+            predicted = sum(w * m for w, m in zip(weights, valid_metrics)) / total_weight
+            uncertainty = float(np.std(valid_metrics)) if len(valid_metrics) > 1 else 0.0
+
+    # Novelty penalty: discount if far from training distribution
+    finite_dists = [d for d, _ in distances if d != float("inf")]
+    min_dist = min(finite_dists) if finite_dists else float("inf")
+    avg_dist = float(np.mean(finite_dists)) if finite_dists else 1.0
+    novelty = min_dist / avg_dist if avg_dist > 0 and min_dist != float("inf") else 1.0
+    novelty_penalty = novelty * NOVELTY_PENALTY_FACTOR
+
+    return {
+        "predicted": round(float(predicted - novelty_penalty), 6),
+        "uncertainty": round(float(uncertainty), 6),
+        "novelty_score": round(float(novelty), 4),
+        "n_neighbors": len(neighbors),
+        "nearest_distance": round(float(min_dist), 4),
+    }
+
+
+def _config_distance(
+    config_a: dict[str, float],
+    config_b: dict[str, float],
+    feature_names: list[str],
+) -> float:
+    """Compute normalized distance between two configs."""
+    total = 0.0
+    n = 0
+    for feat in feature_names:
+        a = config_a.get(feat)
+        b = config_b.get(feat)
+        if a is not None and b is not None:
+            # Normalize by max(|a|, |b|, 1) to handle different scales
+            scale = max(abs(a), abs(b), 1.0)
+            total += ((a - b) / scale) ** 2
+            n += 1
+
+    if n == 0:
+        return float("inf")
+    return float(np.sqrt(total / n))
+
+
+# --- Simulation Pipeline ---
+
+
+def simulate_experiments(
+    proposed_configs: list[dict],
+    experiments: list[dict],
+    primary_metric: str,
+    top_k: int = DEFAULT_TOP_K,
+    improvement_threshold: float = DEFAULT_IMPROVEMENT_THRESHOLD,
+    lower_is_better: bool = False,
+) -> dict:
+    """Simulate proposed experiments and rank by predicted outcome.
+
+    Args:
+        proposed_configs: List of experiment configs to simulate.
+        experiments: Historical experiment data.
+        primary_metric: Metric to predict.
+        top_k: Number of top configs to recommend running.
+        improvement_threshold: Minimum predicted improvement over current best.
+        lower_is_better: Whether lower metric is better.
+
+    Returns:
+        Simulation report with ranked configs and budget savings.
+    """
+    if not proposed_configs:
+        return {"error": "No proposed configs to simulate"}
+
+    surrogate = build_surrogate(experiments, primary_metric)
+    if surrogate.get("status") != "ready":
+        return {
+            "error": f"Insufficient experiment history ({surrogate.get('n_points', 0)} experiments, "
+                     f"need {MIN_HISTORY_FOR_SURROGATE})",
+            "suggestion": "Run more experiments first to build a reliable surrogate model.",
+        }
+
+    # Get current best
+    best_metrics = [
+        exp.get("metrics", {}).get(primary_metric)
+        for exp in experiments
+        if exp.get("metrics", {}).get(primary_metric) is not None
+    ]
+    if lower_is_better:
+        current_best = min(best_metrics) if best_metrics else float("inf")
+    else:
+        current_best = max(best_metrics) if best_metrics else 0
+
+    # Predict each config
+    predictions = []
+    for i, config in enumerate(proposed_configs):
+        features = extract_config_features(config)
+        pred = predict_with_surrogate(surrogate, features)
+        predicted = pred.get("predicted")
+        uncertainty = pred.get("uncertainty", 0)
+
+        if predicted is not None:
+            if lower_is_better:
+                improvement = current_best - predicted
+            else:
+                improvement = predicted - current_best
+
+            # Classify uncertainty
+            if uncertainty < 0.005:
+                unc_level = "LOW"
+            elif uncertainty < 0.015:
+                unc_level = "MED"
+            else:
+                unc_level = "HIGH"
+
+            verdict = "RUN" if improvement > improvement_threshold else "SKIP"
+
+            predictions.append({
+                "rank": 0,  # filled later
+                "config_index": i,
+                "config_summary": _summarize_config(config),
+                "predicted_metric": predicted,
+                "uncertainty": uncertainty,
+                "uncertainty_level": unc_level,
+                "improvement": round(improvement, 6),
+                "verdict": verdict,
+                "novelty_score": pred.get("novelty_score", 0),
+            })
+
+    # Sort by predicted metric
+    predictions.sort(
+        key=lambda p: p["predicted_metric"],
+        reverse=not lower_is_better,
+    )
+
+    # Assign ranks
+    for i, p in enumerate(predictions):
+        p["rank"] = i + 1
+
+    # Apply top-k
+    run_configs = [p for p in predictions if p["verdict"] == "RUN"][:top_k]
+    skip_configs = [p for p in predictions if p not in run_configs]
+
+    # Mark skipped
+    for p in skip_configs:
+        p["verdict"] = "SKIP"
+
+    total = len(predictions)
+    n_run = len(run_configs)
+    n_skip = total - n_run
+    savings = round(n_skip / total * 100, 1) if total > 0 else 0
+
+    return {
+        "current_best": current_best,
+        "primary_metric": primary_metric,
+        "total_proposed": total,
+        "run_count": n_run,
+        "skip_count": n_skip,
+        "budget_savings_pct": savings,
+        "predictions": predictions,
+        "surrogate_info": {
+            "n_training_points": surrogate["n_points"],
+            "n_features": len(surrogate["feature_names"]),
+        },
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+    }
+
+
+def _summarize_config(config: dict, max_items: int = 4) -> str:
+    """Create a short summary of a config dict."""
+    features = extract_config_features(config)
+    items = list(features.items())[:max_items]
+    parts = [f"{k}={v}" for k, v in items]
+    if len(features) > max_items:
+        parts.append("...")
+    return ", ".join(parts) if parts else "(empty config)"
+
+
+# --- Report Formatting ---
+
+
+def save_simulation_report(report: dict, output_dir: str = "experiments/simulations") -> Path:
+    """Save simulation report to YAML."""
+    out_path = Path(output_dir)
+    out_path.mkdir(parents=True, exist_ok=True)
+    ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
+    filepath = out_path / f"simulation-{ts}.yaml"
+    with open(filepath, "w") as f:
+        yaml.dump(report, f, default_flow_style=False, sort_keys=False)
+    return filepath
+
+
+def format_simulation_report(report: dict) -> str:
+    """Format simulation report as readable markdown."""
+    if "error" in report:
+        lines = [f"ERROR: {report['error']}"]
+        if "suggestion" in report:
+            lines.append(f"\n{report['suggestion']}")
+        return "\n".join(lines)
+
+    lines = ["# Experiment Simulation", ""]
+    lines.append(f"**Current best:** {report.get('current_best', 'N/A')}")
+    lines.append(f"**Proposed configs:** {report.get('total_proposed', 0)}")
+    lines.append(f"**Recommended to run:** {report.get('run_count', 0)}")
+    lines.append(f"**Budget savings:** {report.get('budget_savings_pct', 0)}%")
+    lines.append("")
+
+    predictions = report.get("predictions", [])
+    if predictions:
+        lines.append("| Rank | Config Summary | Predicted | Uncertainty | Verdict |")
+        lines.append("|------|---------------|-----------|-------------|---------|")
+        for p in predictions:
+            unc = f"{p['predicted_metric']:.4f} \u00b1 {p['uncertainty']:.4f}"
+            lines.append(
+                f"| {p['rank']} | {p['config_summary'][:40]} | {p['predicted_metric']:.4f} "
+                f"| {p['uncertainty_level']} | {p['verdict']} {'✓' if p['verdict'] == 'RUN' else '✗'} |"
+            )
+
+    lines.append("")
+    rec_run = report.get("run_count", 0)
+    rec_skip = report.get("skip_count", 0)
+    lines.append(
+        f"**Recommendation:** Run top {rec_run}, skip {rec_skip}. "
+        f"Estimated budget savings: {report.get('budget_savings_pct', 0)}%."
+    )
+
+    lines.append("")
+    lines.append(f"*Generated: {report.get('generated_at', 'N/A')}*")
+    return "\n".join(lines)
+
+
+# --- CLI ---
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Experiment outcome simulator — predict results before running"
+    )
+    parser.add_argument("--configs", help="YAML file with proposed experiment configs")
+    parser.add_argument("--top-k", type=int, default=DEFAULT_TOP_K,
+                        help="Number of top configs to recommend")
+    parser.add_argument("--threshold", type=float, default=DEFAULT_IMPROVEMENT_THRESHOLD,
+                        help="Minimum predicted improvement to recommend running")
+    parser.add_argument("--config", default="config.yaml", help="Path to project config.yaml")
+    parser.add_argument("--log", default=DEFAULT_LOG_PATH, help="Path to experiment log")
+    parser.add_argument("--json", action="store_true", help="Output raw JSON")
+
+    args = parser.parse_args()
+
+    config = load_config(args.config)
+    eval_cfg = config.get("evaluation", {})
+    primary_metric = eval_cfg.get("primary_metric", "accuracy")
+    lower_is_better = eval_cfg.get("lower_is_better", False)
+
+    experiments = load_experiments(args.log)
+
+    # Load proposed configs
+    proposed = []
+    if args.configs:
+        with open(args.configs) as f:
+            data = yaml.safe_load(f)
+        if isinstance(data, list):
+            proposed = data
+        elif isinstance(data, dict) and "configs" in data:
+            proposed = data["configs"]
+        else:
+            proposed = [data]

+    if not proposed:
+        print("No proposed configs provided. Use --configs <file.yaml>")
+        sys.exit(1)
+
+    report = simulate_experiments(
+        proposed_configs=proposed,
+        experiments=experiments,
+        primary_metric=primary_metric,
+        top_k=args.top_k,
+        improvement_threshold=args.threshold,
+        lower_is_better=lower_is_better,
+    )
+
+    if args.json:
+        print(json.dumps(report, indent=2))
+    else:
+        print(format_simulation_report(report))
+
+    if "error" not in report:
+        saved = save_simulation_report(report)
+        if not args.json:
+            print(f"\nSaved: {saved}")
+
+
+if __name__ == "__main__":
+    main()
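A quick way to sanity-check the new simulator is to call simulate_experiments directly, since everything is plain dicts in and out. The sketch below is hypothetical: the toy history and candidate configs are invented for illustration, and the import assumes the template is installed as scripts/experiment_simulator.py in a scaffolded project.

# Hypothetical usage sketch with invented toy data; not part of the package.
from scripts.experiment_simulator import simulate_experiments

# Fake history: at least MIN_HISTORY_FOR_SURROGATE (5) entries that carry the
# primary metric, with hyperparams nested where experiment_to_features looks.
history = [
    {"config": {"hyperparams": {"lr": 0.05 * (i + 1), "max_depth": 3 + i}},
     "metrics": {"accuracy": 0.80 + 0.01 * i}}
    for i in range(6)
]

# Candidate configs kept flat so their flattened feature names ("lr",
# "max_depth") line up with the feature names extracted from the history.
proposed = [
    {"lr": 0.20, "max_depth": 6},
    {"lr": 0.90, "max_depth": 12},
]

report = simulate_experiments(
    proposed_configs=proposed,
    experiments=history,
    primary_metric="accuracy",
    top_k=1,
)
for p in report["predictions"]:
    print(p["rank"], p["config_summary"], p["predicted_metric"], p["verdict"])

One subtlety worth knowing: extract_config_features flattens nested keys into dotted names (hyperparams.lr), while experiment_to_features records bare hyperparameter names (lr), so nested candidate configs would share no feature names with this history and compare at infinite distance.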
@@ -371,6 +371,39 @@ def load_audit_report(audit_dir: str = "experiments/audits") -> dict | None:
|
|
|
371
371
|
return None
|
|
372
372
|
|
|
373
373
|
|
|
374
|
+
def load_whatif_results(whatif_dir: str = "experiments/whatif") -> list[dict]:
|
|
375
|
+
"""Load recent what-if analysis results."""
|
|
376
|
+
path = Path(whatif_dir)
|
|
377
|
+
if not path.exists():
|
|
378
|
+
return []
|
|
379
|
+
results = []
|
|
380
|
+
for f in sorted(path.glob("whatif-*.yaml"))[-5:]: # Last 5
|
|
381
|
+
try:
|
|
382
|
+
with open(f) as fh:
|
|
383
|
+
data = yaml.safe_load(fh)
|
|
384
|
+
if isinstance(data, dict):
|
|
385
|
+
results.append(data)
|
|
386
|
+
except (yaml.YAMLError, OSError):
|
|
387
|
+
continue
|
|
388
|
+
return results
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
def load_simulation_results(sim_dir: str = "experiments/simulations") -> dict | None:
|
|
392
|
+
"""Load the most recent simulation result."""
|
|
393
|
+
path = Path(sim_dir)
|
|
394
|
+
if not path.exists():
|
|
395
|
+
return None
|
|
396
|
+
files = sorted(path.glob("simulation-*.yaml"))
|
|
397
|
+
if not files:
|
|
398
|
+
return None
|
|
399
|
+
try:
|
|
400
|
+
with open(files[-1]) as f:
|
|
401
|
+
data = yaml.safe_load(f)
|
|
402
|
+
return data if isinstance(data, dict) else None
|
|
403
|
+
except (yaml.YAMLError, OSError):
|
|
404
|
+
return None
|
|
405
|
+
|
|
406
|
+
|
|
374
407
|
def format_brief(
|
|
375
408
|
campaign: dict,
|
|
376
409
|
best: dict | None,
|
|
@@ -393,6 +426,8 @@ def format_brief(
|
|
|
393
426
|
budget_status: dict | None = None,
|
|
394
427
|
scaling_results: list[dict] | None = None,
|
|
395
428
|
audit_report: dict | None = None,
|
|
429
|
+
whatif_results: list[dict] | None = None,
|
|
430
|
+
simulation_result: dict | None = None,
|
|
396
431
|
) -> str:
|
|
397
432
|
"""Format the research briefing as markdown."""
|
|
398
433
|
direction = "lower" if lower_is_better else "higher"
|
|
@@ -698,6 +733,31 @@ def format_brief(
|
|
|
698
733
|
total = len(regression_checks)
|
|
699
734
|
lines.append(f"\n*{passed}/{total} regression checks passed.*")
|
|
700
735
|
|
|
736
|
+
# What-If & Simulation section
|
|
737
|
+
if whatif_results or simulation_result:
|
|
738
|
+
lines.extend(["", "## What-If Analysis & Simulation", ""])
|
|
739
|
+
|
|
740
|
+
if whatif_results:
|
|
741
|
+
lines.append(f"**Recent what-if queries:** {len(whatif_results)}")
|
|
742
|
+
for wf in whatif_results[-3:]:
|
|
743
|
+
q = wf.get("question", "N/A")
|
|
744
|
+
route = wf.get("route", "?")
|
|
745
|
+
result = wf.get("result", {})
|
|
746
|
+
est = result.get("estimate")
|
|
747
|
+
conf = result.get("confidence", "?")
|
|
748
|
+
if est is not None:
|
|
749
|
+
lines.append(f"- *\"{q}\"* → {est} ({conf} confidence) [{route}]")
|
|
750
|
+
elif "error" in result:
|
|
751
|
+
lines.append(f"- *\"{q}\"* → {result['error']} [{route}]")
|
|
752
|
+
lines.append("")
|
|
753
|
+
|
|
754
|
+
if simulation_result and "error" not in simulation_result:
|
|
755
|
+
run = simulation_result.get("run_count", 0)
|
|
756
|
+
skip = simulation_result.get("skip_count", 0)
|
|
757
|
+
savings = simulation_result.get("budget_savings_pct", 0)
|
|
758
|
+
lines.append(f"**Last simulation:** {run} configs recommended, {skip} skipped ({savings}% budget savings)")
|
|
759
|
+
lines.append("")
|
|
760
|
+
|
|
701
761
|
lines.extend([
|
|
702
762
|
"",
|
|
703
763
|
"## Recommendations",
|
|
@@ -768,6 +828,8 @@ def generate_brief(
|
|
|
768
828
|
budget_status = load_budget_status(log_path=log_path)
|
|
769
829
|
scaling_results = load_scaling_results()
|
|
770
830
|
audit_report = load_audit_report()
|
|
831
|
+
whatif_results = load_whatif_results()
|
|
832
|
+
simulation_result = load_simulation_results()
|
|
771
833
|
|
|
772
834
|
return format_brief(
|
|
773
835
|
campaign, best, trajectory, model_types, hypotheses,
|
|
@@ -784,6 +846,8 @@ def generate_brief(
|
|
|
784
846
|
budget_status=budget_status,
|
|
785
847
|
scaling_results=scaling_results if scaling_results else None,
|
|
786
848
|
audit_report=audit_report,
|
|
849
|
+
whatif_results=whatif_results if whatif_results else None,
|
|
850
|
+
simulation_result=simulation_result,
|
|
787
851
|
)
|
|
788
852
|
|
|
789
853
|
|
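The two loaders are also usable on their own, outside generate_brief. Below is a small hypothetical sketch: the directories are the defaults hard-coded in the functions above, and the import assumes the template lives at scripts/generate_brief.py in a scaffolded project.

# Hypothetical standalone use of the new loaders; mirrors what generate_brief() does.
from scripts.generate_brief import load_whatif_results, load_simulation_results

whatif_results = load_whatif_results()         # last 5 experiments/whatif/whatif-*.yaml by sorted filename
simulation_result = load_simulation_results()  # latest experiments/simulations/simulation-*.yaml, or None

if simulation_result and "error" not in simulation_result:
    print(f"{simulation_result.get('run_count', 0)} configs recommended, "
          f"{simulation_result.get('skip_count', 0)} skipped "
          f"({simulation_result.get('budget_savings_pct', 0)}% budget savings)")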