claude-turing 4.2.0 → 4.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +7 -2
- package/commands/doctor.md +30 -0
- package/commands/plan.md +27 -0
- package/commands/postmortem.md +28 -0
- package/commands/registry.md +31 -0
- package/commands/turing.md +10 -0
- package/commands/update.md +27 -0
- package/package.json +1 -1
- package/src/install.js +2 -0
- package/src/verify.js +5 -0
- package/templates/scripts/__pycache__/failure_postmortem.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/harness_doctor.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/incremental_update.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/model_lifecycle.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/research_planner.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/failure_postmortem.py +510 -0
- package/templates/scripts/generate_brief.py +122 -0
- package/templates/scripts/generate_model_card.py +154 -3
- package/templates/scripts/harness_doctor.py +466 -0
- package/templates/scripts/incremental_update.py +586 -0
- package/templates/scripts/model_lifecycle.py +549 -0
- package/templates/scripts/research_planner.py +470 -0
- package/templates/scripts/scaffold.py +10 -0
--- /dev/null
+++ package/templates/scripts/failure_postmortem.py
@@ -0,0 +1,510 @@
+#!/usr/bin/env python3
+"""Automated failure postmortem for the autoresearch pipeline.
+
+When experiments stop improving, diagnoses the root cause: search space
+exhaustion, systematic config error, data issue, metric ceiling, or
+noise floor. Produces actionable next steps.
+
+Usage:
+    python scripts/failure_postmortem.py
+    python scripts/failure_postmortem.py --window 10
+    python scripts/failure_postmortem.py --auto-trigger 5
+    python scripts/failure_postmortem.py --json
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+import numpy as np
+import yaml
+
+from scripts.turing_io import load_config, load_experiments
+
+DEFAULT_LOG_PATH = "experiments/log.jsonl"
+DEFAULT_WINDOW = 10
+DEFAULT_AUTO_TRIGGER = 5
+
+DIAGNOSIS_TYPES = [
+    "search_space_exhaustion",
+    "systematic_config_error",
+    "data_issue",
+    "metric_ceiling",
+    "noise_floor",
+]
+
+
+# --- Streak Detection ---
+
+
+def detect_failure_streak(
+    experiments: list[dict],
+    primary_metric: str,
+    lower_is_better: bool = False,
+) -> dict:
+    """Detect how many consecutive experiments failed to improve.
+
+    Returns:
+        Streak info with count, best metric, streak experiments.
+    """
+    if not experiments:
+        return {"streak_length": 0, "best_metric": None, "streak_experiments": []}
+
+    # Find the best metric value
+    best_val = None
+    for exp in experiments:
+        val = exp.get("metrics", {}).get(primary_metric)
+        if val is None:
+            continue
+        if best_val is None:
+            best_val = val
+        elif (lower_is_better and val < best_val) or (not lower_is_better and val > best_val):
+            best_val = val
+
+    if best_val is None:
+        return {"streak_length": len(experiments), "best_metric": None, "streak_experiments": experiments}
+
+    # Count consecutive non-improvements from the end
+    streak = []
+    best_so_far = None
+
+    for exp in experiments:
+        val = exp.get("metrics", {}).get(primary_metric)
+        if val is None:
+            continue
+        if best_so_far is None:
+            best_so_far = val
+        elif (lower_is_better and val < best_so_far) or (not lower_is_better and val > best_so_far):
+            best_so_far = val
+            streak = []  # Reset streak on improvement
+
+        streak.append(exp)
+
+    # The streak is from last improvement to end
+    # Remove the improving experiment itself if it's the first
+    if streak and streak[0].get("metrics", {}).get(primary_metric) == best_so_far:
+        streak = streak[1:]
+
+    return {
+        "streak_length": len(streak),
+        "best_metric": best_val,
+        "streak_experiments": streak,
+    }
+
+
+# --- Diagnosis Functions ---
+
+
+def diagnose_search_space_exhaustion(
+    streak_experiments: list[dict],
+    primary_metric: str,
+) -> dict:
+    """Check if experiments cluster in a small config region."""
+    if len(streak_experiments) < 3:
+        return {"score": 0, "evidence": "Too few experiments for diagnosis"}
+
+    # Extract hyperparameters from streak
+    all_params = {}
+    for exp in streak_experiments:
+        config = exp.get("config", {})
+        hyperparams = config.get("hyperparams", {})
+        for k, v in hyperparams.items():
+            if isinstance(v, (int, float)) and not isinstance(v, bool):
+                all_params.setdefault(k, []).append(float(v))
+
+    if not all_params:
+        return {"score": 0, "evidence": "No numeric hyperparameters found"}
+
+    # Measure coefficient of variation for each param
+    low_variance_params = []
+    for param, values in all_params.items():
+        if len(values) < 2:
+            continue
+        mean = np.mean(values)
+        if abs(mean) < 1e-10:
+            continue
+        cv = np.std(values) / abs(mean)
+        if cv < 0.15:  # Less than 15% variation
+            low_variance_params.append({"param": param, "cv": round(float(cv), 4), "mean": round(float(mean), 4)})
+
+    # Check family diversity
+    families = set()
+    for exp in streak_experiments:
+        family = exp.get("family", exp.get("config", {}).get("family", "unknown"))
+        families.add(family)
+
+    score = 0
+    evidence = []
+
+    if len(low_variance_params) > len(all_params) * 0.5:
+        score += 0.4
+        evidence.append(f"Config variance LOW: {len(low_variance_params)}/{len(all_params)} params within ±15%")
+
+    if len(families) <= 1:
+        score += 0.3
+        evidence.append(f"All experiments in same family: {families}")
+
+    # Check if metrics are clustered
+    metrics = [exp.get("metrics", {}).get(primary_metric) for exp in streak_experiments
+               if exp.get("metrics", {}).get(primary_metric) is not None]
+    if len(metrics) >= 2:
+        metric_cv = np.std(metrics) / abs(np.mean(metrics)) if abs(np.mean(metrics)) > 0 else 0
+        if metric_cv < 0.02:
+            score += 0.3
+            evidence.append(f"Metric range very tight (CV={metric_cv:.4f})")
+
+    return {
+        "score": round(score, 2),
+        "evidence": evidence if evidence else ["No strong evidence of exhaustion"],
+        "low_variance_params": low_variance_params,
+        "families": list(families),
+    }
+
+
+def diagnose_systematic_config_error(
+    streak_experiments: list[dict],
+    primary_metric: str,
+    best_metric: float | None,
+) -> dict:
+    """Check if all experiments share a common bad config."""
+    if len(streak_experiments) < 3:
+        return {"score": 0, "evidence": "Too few experiments"}
+
+    # Find params that are identical across all streak experiments
+    common_params = {}
+    first_config = streak_experiments[0].get("config", {}).get("hyperparams", {})
+
+    for k, v in first_config.items():
+        if not isinstance(v, (int, float, str)):
+            continue
+        all_same = all(
+            exp.get("config", {}).get("hyperparams", {}).get(k) == v
+            for exp in streak_experiments[1:]
+        )
+        if all_same:
+            common_params[k] = v
+
+    score = 0
+    evidence = []
+
+    if common_params:
+        ratio = len(common_params) / max(len(first_config), 1)
+        if ratio > 0.5:
+            score += 0.5
+            evidence.append(f"{len(common_params)} params unchanged across all {len(streak_experiments)} experiments")
+            evidence.append(f"Common: {common_params}")
+
+    # Check if all experiments are significantly worse than best
+    if best_metric is not None:
+        streak_metrics = [exp.get("metrics", {}).get(primary_metric) for exp in streak_experiments
+                          if exp.get("metrics", {}).get(primary_metric) is not None]
+        if streak_metrics:
+            avg_gap = abs(np.mean(streak_metrics) - best_metric)
+            if avg_gap > 0.02:
+                score += 0.3
+                evidence.append(f"Average gap from best: {avg_gap:.4f}")
+
+    return {"score": round(score, 2), "evidence": evidence or ["No common config error detected"], "common_params": common_params}
+
+
+def diagnose_data_issue(
+    streak_experiments: list[dict],
+    primary_metric: str,
+) -> dict:
+    """Check if all models fail similarly regardless of type."""
+    if len(streak_experiments) < 3:
+        return {"score": 0, "evidence": "Too few experiments"}
+
+    # Check model type diversity
+    model_types = set()
+    for exp in streak_experiments:
+        mt = exp.get("config", {}).get("model_type", "unknown")
+        model_types.add(mt)
+
+    score = 0
+    evidence = []
+
+    # If multiple model types all fail similarly → data issue
+    if len(model_types) >= 2:
+        metrics = [exp.get("metrics", {}).get(primary_metric) for exp in streak_experiments
+                   if exp.get("metrics", {}).get(primary_metric) is not None]
+        if len(metrics) >= 2:
+            cv = np.std(metrics) / abs(np.mean(metrics)) if abs(np.mean(metrics)) > 0 else 0
+            if cv < 0.03:
+                score += 0.6
+                evidence.append(f"{len(model_types)} different model types all perform similarly (CV={cv:.4f})")
+                evidence.append(f"Model types: {model_types}")
+
+    return {"score": round(score, 2), "evidence": evidence or ["No data issue pattern detected"], "model_types": list(model_types)}
+
+
+def diagnose_metric_ceiling(
+    streak_experiments: list[dict],
+    primary_metric: str,
+    best_metric: float | None,
+) -> dict:
+    """Check if metrics are plateauing near a theoretical limit."""
+    if best_metric is None:
+        return {"score": 0, "evidence": "No best metric available"}
+
+    score = 0
+    evidence = []
+
+    # Check if best metric is very high (suggesting ceiling)
+    if best_metric > 0.95:
+        score += 0.4
+        evidence.append(f"Current best {primary_metric}={best_metric:.4f} — near theoretical maximum")
+
+    # Check improvement rate (are improvements getting tiny?)
+    metrics = sorted([
+        exp.get("metrics", {}).get(primary_metric)
+        for exp in streak_experiments
+        if exp.get("metrics", {}).get(primary_metric) is not None
+    ])
+    if len(metrics) >= 3:
+        range_val = max(metrics) - min(metrics)
+        if range_val < 0.005:
+            score += 0.3
+            evidence.append(f"Metric range in streak: {range_val:.4f} (< 0.005)")
+
+    return {"score": round(score, 2), "evidence": evidence or ["No ceiling pattern detected"]}
+
+
+def diagnose_noise_floor(
+    streak_experiments: list[dict],
+    primary_metric: str,
+    seed_dir: str = "experiments/seed_studies",
+) -> dict:
+    """Check if improvements are within seed variance."""
+    score = 0
+    evidence = []
+
+    # Check seed study data for variance estimate
+    seed_path = Path(seed_dir)
+    seed_variance = None
+    if seed_path.exists():
+        for f in sorted(seed_path.glob("*.yaml")):
+            try:
+                with open(f) as fh:
+                    data = yaml.safe_load(fh)
+                if isinstance(data, dict) and "std" in data:
+                    seed_variance = data["std"]
+            except (yaml.YAMLError, OSError):
+                continue
+
+    metrics = [exp.get("metrics", {}).get(primary_metric) for exp in streak_experiments
+               if exp.get("metrics", {}).get(primary_metric) is not None]
+
+    if len(metrics) >= 2:
+        streak_range = max(metrics) - min(metrics)
+        if seed_variance is not None:
+            if streak_range < seed_variance * 2:
+                score += 0.7
+                evidence.append(f"Streak range ({streak_range:.4f}) < 2x seed std ({seed_variance:.4f})")
+        else:
+            streak_std = float(np.std(metrics))
+            if streak_std < 0.005:
+                score += 0.3
+                evidence.append(f"Streak std ({streak_std:.4f}) very low — may be noise")
+
+    return {"score": round(score, 2), "evidence": evidence or ["No noise floor pattern detected"], "seed_variance": seed_variance}
+
+
+# --- Main Pipeline ---
+
+
+def run_postmortem(
+    window: int = DEFAULT_WINDOW,
+    config_path: str = "config.yaml",
+    log_path: str = DEFAULT_LOG_PATH,
+    seed_dir: str = "experiments/seed_studies",
+) -> dict:
+    """Run failure postmortem analysis.
+
+    Args:
+        window: Number of recent experiments to analyze.
+        config_path: Path to config.yaml.
+        log_path: Path to experiment log.
+        seed_dir: Path to seed study directory.
+
+    Returns:
+        Postmortem report with diagnosis, evidence, and recommendations.
+    """
+    config = load_config(config_path)
+    eval_cfg = config.get("evaluation", {})
+    primary_metric = eval_cfg.get("primary_metric", "accuracy")
+    lower_is_better = eval_cfg.get("lower_is_better", False)
+
+    experiments = load_experiments(log_path)
+
+    if not experiments:
+        return {"error": "No experiments found"}
+
+    # Use last N experiments
+    recent = experiments[-window:]
+
+    streak_info = detect_failure_streak(recent, primary_metric, lower_is_better)
+    streak_exps = streak_info["streak_experiments"]
+    best_metric = streak_info["best_metric"]
+    streak_len = streak_info["streak_length"]
+
+    if streak_len < 2:
+        return {
+            "streak_length": streak_len,
+            "message": "No significant failure streak detected",
+            "best_metric": best_metric,
+            "generated_at": datetime.now(timezone.utc).isoformat(),
+        }
+
+    # Run all diagnoses
+    diagnoses = {
+        "search_space_exhaustion": diagnose_search_space_exhaustion(streak_exps, primary_metric),
+        "systematic_config_error": diagnose_systematic_config_error(streak_exps, primary_metric, best_metric),
+        "data_issue": diagnose_data_issue(streak_exps, primary_metric),
+        "metric_ceiling": diagnose_metric_ceiling(streak_exps, primary_metric, best_metric),
+        "noise_floor": diagnose_noise_floor(streak_exps, primary_metric, seed_dir),
+    }
+
+    # Pick the highest-scoring diagnosis
+    primary_diagnosis = max(diagnoses.items(), key=lambda d: d[1]["score"])
+    diagnosis_name = primary_diagnosis[0]
+    diagnosis_data = primary_diagnosis[1]
+
+    # Generate recommendations
+    recommendations = _generate_recommendations(diagnosis_name, diagnosis_data, streak_len)
+
+    return {
+        "streak_length": streak_len,
+        "window": window,
+        "best_metric": best_metric,
+        "primary_metric": primary_metric,
+        "primary_diagnosis": diagnosis_name,
+        "diagnosis_score": diagnosis_data["score"],
+        "diagnosis_evidence": diagnosis_data["evidence"],
+        "all_diagnoses": {k: {"score": v["score"]} for k, v in diagnoses.items()},
+        "recommendations": recommendations,
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+    }
+
+
+def _generate_recommendations(diagnosis: str, data: dict, streak_len: int) -> list[str]:
+    """Generate actionable recommendations based on diagnosis."""
+    recs = {
+        "search_space_exhaustion": [
+            "Stop tuning hyperparameters — switch to `/turing:feature` for feature engineering",
+            "Try `/turing:ensemble` — combine existing models instead of building new ones",
+            "Run `/turing:scale --axis data` — check if more data would help",
+        ],
+        "systematic_config_error": [
+            "Run `/turing:sensitivity` — identify which params actually matter",
+            "Check the common config values against sensitivity analysis",
+            "Try resetting to the best experiment's config and vary one param at a time",
+        ],
+        "data_issue": [
+            "Run `/turing:leak` — check for data leakage masking real performance",
+            "Run `/turing:sanity` — verify data pipeline integrity",
+            "Inspect the raw data for quality issues or distribution shift",
+        ],
+        "metric_ceiling": [
+            "Run `/turing:scale` to confirm you've hit the ceiling",
+            "Consider shifting to a different metric or task formulation",
+            "Try ensemble methods for marginal gains: `/turing:ensemble`",
+        ],
+        "noise_floor": [
+            "Run `/turing:seed` with more seeds to measure true variance",
+            "Increase n_runs for each experiment to reduce noise",
+            "Consider whether the current metric resolution is sufficient",
+        ],
+    }
+    return recs.get(diagnosis, [f"Investigate the last {streak_len} experiments manually"])
+
+
+# --- Report Formatting ---
+
+
+def save_postmortem_report(report: dict, output_dir: str = "experiments/postmortems") -> Path:
+    """Save postmortem report to YAML."""
+    out_path = Path(output_dir)
+    out_path.mkdir(parents=True, exist_ok=True)
+    ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
+    filepath = out_path / f"postmortem-{ts}.yaml"
+    with open(filepath, "w") as f:
+        yaml.dump(report, f, default_flow_style=False, sort_keys=False)
+    return filepath
+
+
+def format_postmortem_report(report: dict) -> str:
+    """Format postmortem report as readable markdown."""
+    if "error" in report:
+        return f"ERROR: {report['error']}"
+
+    if "message" in report:
+        return f"No failure streak: {report['message']} (best {report.get('best_metric', 'N/A')})"
+
+    lines = [
+        f"# Failure Postmortem (last {report.get('streak_length', '?')} experiments, 0 improvements)",
+        "",
+        f"**Diagnosis:** {report.get('primary_diagnosis', 'unknown').upper().replace('_', ' ')}",
+        f"**Confidence:** {report.get('diagnosis_score', 0):.0%}",
+        "",
+        "## Evidence",
+        "",
+    ]
+
+    for e in report.get("diagnosis_evidence", []):
+        lines.append(f"- {e}")
+
+    lines.extend(["", "## All Diagnoses", ""])
+    for name, data in report.get("all_diagnoses", {}).items():
+        score = data.get("score", 0)
+        marker = "◀" if name == report.get("primary_diagnosis") else ""
+        lines.append(f"- {name.replace('_', ' ')}: {score:.0%} {marker}")
+
+    lines.extend(["", "## Recommended Actions", ""])
+    for i, rec in enumerate(report.get("recommendations", []), 1):
+        lines.append(f"{i}. {rec}")
+
+    lines.extend(["", f"*Generated: {report.get('generated_at', 'N/A')}*"])
+    return "\n".join(lines)
+
+
+# --- CLI ---
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Failure postmortem — diagnose why experiments stopped improving"
+    )
+    parser.add_argument("--window", type=int, default=DEFAULT_WINDOW,
+                        help="Number of recent experiments to analyze")
+    parser.add_argument("--auto-trigger", type=int, default=DEFAULT_AUTO_TRIGGER,
+                        help="Minimum streak length to trigger postmortem")
+    parser.add_argument("--config", default="config.yaml", help="Path to config.yaml")
+    parser.add_argument("--log", default=DEFAULT_LOG_PATH, help="Path to experiment log")
+    parser.add_argument("--json", action="store_true", help="Output raw JSON")
+
+    args = parser.parse_args()
+
+    report = run_postmortem(
+        window=args.window,
+        config_path=args.config,
+        log_path=args.log,
+    )
+
+    if args.json:
+        print(json.dumps(report, indent=2))
+    else:
+        print(format_postmortem_report(report))
+
+    if "error" not in report and "message" not in report:
+        saved = save_postmortem_report(report)
+        if not args.json:
+            print(f"\nSaved: {saved}")
+
+
+if __name__ == "__main__":
+    main()
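For orientation, here is a minimal sketch of the streak semantics implemented by `detect_failure_streak` above: the experiment that set the current best is dropped from the streak, and ties count as non-improvements. It assumes the script is importable as `scripts.failure_postmortem` (so that its own `scripts.turing_io` import resolves, which depends on the scaffolded project layout); the log entries are illustrative.

```python
# Hypothetical usage of detect_failure_streak; the experiment dicts below
# follow the {"metrics": {...}} shape the script reads, but are made up.
from scripts.failure_postmortem import detect_failure_streak

experiments = [
    {"id": "exp-1", "metrics": {"accuracy": 0.80}},
    {"id": "exp-2", "metrics": {"accuracy": 0.85}},  # sets the best
    {"id": "exp-3", "metrics": {"accuracy": 0.84}},  # no improvement
    {"id": "exp-4", "metrics": {"accuracy": 0.85}},  # tie: still no improvement
]

info = detect_failure_streak(experiments, primary_metric="accuracy")
print(info["streak_length"], info["best_metric"])  # 2 0.85
```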
--- package/templates/scripts/generate_brief.py
+++ package/templates/scripts/generate_brief.py
@@ -404,6 +404,70 @@ def load_simulation_results(sim_dir: str = "experiments/simulations") -> dict | None:
     return None
 
 
+def load_registry_summary(registry_path: str = "experiments/registry.yaml") -> dict | None:
+    """Load model registry summary for briefing."""
+    path = Path(registry_path)
+    if not path.exists():
+        return None
+    try:
+        with open(path) as f:
+            data = yaml.safe_load(f)
+        if isinstance(data, dict) and data.get("models"):
+            return data
+    except (yaml.YAMLError, OSError):
+        pass
+    return None
+
+
+def load_update_history(update_dir: str = "experiments/updates") -> list[dict]:
+    """Load recent incremental update reports."""
+    path = Path(update_dir)
+    if not path.exists():
+        return []
+    results = []
+    for f in sorted(path.glob("*-update-*.yaml"))[-3:]:
+        try:
+            with open(f) as fh:
+                data = yaml.safe_load(fh)
+            if isinstance(data, dict):
+                results.append(data)
+        except (yaml.YAMLError, OSError):
+            continue
+    return results
+
+
+def load_postmortem_result(postmortem_dir: str = "experiments/postmortems") -> dict | None:
+    """Load the most recent postmortem result."""
+    path = Path(postmortem_dir)
+    if not path.exists():
+        return None
+    files = sorted(path.glob("postmortem-*.yaml"))
+    if not files:
+        return None
+    try:
+        with open(files[-1]) as f:
+            data = yaml.safe_load(f)
+        return data if isinstance(data, dict) else None
+    except (yaml.YAMLError, OSError):
+        return None
+
+
+def load_research_plan(plan_dir: str = "experiments/plans") -> dict | None:
+    """Load the most recent research plan."""
+    path = Path(plan_dir)
+    if not path.exists():
+        return None
+    files = sorted(path.glob("plan-*.yaml"))
+    if not files:
+        return None
+    try:
+        with open(files[-1]) as f:
+            data = yaml.safe_load(f)
+        return data if isinstance(data, dict) else None
+    except (yaml.YAMLError, OSError):
+        return None
+
+
 def format_brief(
     campaign: dict,
     best: dict | None,
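These loaders all fail soft: a missing directory, unparsable YAML, or a non-dict payload yields `None` (or `[]`), so the brief simply omits the corresponding section. Below is a sketch of a registry file that `load_registry_summary` would accept; the field names are inferred from how the Model Lifecycle hunk further down reads each model entry, and the values are hypothetical.

```python
# Writes a minimal experiments/registry.yaml; keys are inferred from the
# format_brief code below, not from package documentation.
import yaml

registry = {
    "models": [
        {"exp_id": "exp-042", "stage": "production", "version": "v3",
         "metric_name": "accuracy", "metric": 0.912},
        {"exp_id": "exp-017", "stage": "archived", "version": "v1",
         "metric_name": "accuracy", "metric": 0.871},
    ],
}

with open("experiments/registry.yaml", "w") as f:
    yaml.dump(registry, f, sort_keys=False)
```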
@@ -428,6 +492,10 @@ def format_brief(
     audit_report: dict | None = None,
     whatif_results: list[dict] | None = None,
     simulation_result: dict | None = None,
+    registry_summary: dict | None = None,
+    update_history: list[dict] | None = None,
+    postmortem_result: dict | None = None,
+    research_plan: dict | None = None,
 ) -> str:
     """Format the research briefing as markdown."""
     direction = "lower" if lower_is_better else "higher"
@@ -758,6 +826,52 @@ def format_brief(
         lines.append(f"**Last simulation:** {run} configs recommended, {skip} skipped ({savings}% budget savings)")
         lines.append("")
 
+    # Model Lifecycle section
+    if registry_summary or update_history:
+        lines.extend(["", "## Model Lifecycle", ""])
+
+        if registry_summary:
+            models = registry_summary.get("models", [])
+            for m in models:
+                if m.get("stage") != "archived":
+                    metric = f"{m['metric']:.4f}" if m.get("metric") is not None else "—"
+                    lines.append(f"- **{m['stage']}:** {m['exp_id']} ({m.get('version', '?')}, {m.get('metric_name', 'metric')}={metric})")
+            if not any(m.get("stage") != "archived" for m in models):
+                lines.append("- All models archived — register a new candidate with `/turing:registry register`")
+            lines.append("")
+
+        if update_history:
+            lines.append(f"**Recent updates:** {len(update_history)}")
+            for u in update_history[-2:]:
+                verdict = u.get("verdict", "?")
+                exp_id = u.get("experiment_id", "?")
+                strategy = u.get("plan", {}).get("strategy", "?")
+                lines.append(f"- {exp_id}: {strategy} — {verdict}")
+            lines.append("")
+
+    # Operational Intelligence section
+    if postmortem_result or research_plan:
+        lines.extend(["", "## Operational Intelligence", ""])
+
+        if postmortem_result and "primary_diagnosis" in postmortem_result:
+            diagnosis = postmortem_result["primary_diagnosis"].replace("_", " ").title()
+            streak = postmortem_result.get("streak_length", "?")
+            score = postmortem_result.get("diagnosis_score", 0)
+            lines.append(f"**Failure postmortem:** {diagnosis} ({score:.0%} confidence, {streak} experiment streak)")
+            recs = postmortem_result.get("recommendations", [])
+            if recs:
+                lines.append(f"  Action: {recs[0]}")
+            lines.append("")
+
+        if research_plan and "plan" in research_plan:
+            plan = research_plan["plan"]
+            n = plan.get("total_experiments", 0)
+            gain = plan.get("expected_gain", 0)
+            lines.append(f"**Active research plan:** {n} experiments planned (+{gain} expected gain)")
+            for phase in plan.get("phases", [])[:3]:
+                lines.append(f"  - {phase['label']}: {phase['n_experiments']} experiments")
+            lines.append("")
+
     lines.extend([
         "",
         "## Recommendations",
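To see what one registry entry renders to, here is the bullet-building expression from the hunk above, lifted out and run on a sample model dict (the sample values are invented, matching the earlier registry sketch):

```python
# Mirrors the Model Lifecycle loop body above on a single sample entry.
m = {"exp_id": "exp-042", "stage": "production", "version": "v3",
     "metric_name": "accuracy", "metric": 0.912}

metric = f"{m['metric']:.4f}" if m.get("metric") is not None else "—"
print(f"- **{m['stage']}:** {m['exp_id']} "
      f"({m.get('version', '?')}, {m.get('metric_name', 'metric')}={metric})")
# -> - **production:** exp-042 (v3, accuracy=0.9120)
```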
@@ -830,6 +944,10 @@ def generate_brief(
    audit_report = load_audit_report()
    whatif_results = load_whatif_results()
    simulation_result = load_simulation_results()
+    registry_summary = load_registry_summary()
+    update_history = load_update_history()
+    postmortem_result = load_postmortem_result()
+    research_plan = load_research_plan()
 
     return format_brief(
         campaign, best, trajectory, model_types, hypotheses,
@@ -848,6 +966,10 @@ def generate_brief(
         audit_report=audit_report,
         whatif_results=whatif_results if whatif_results else None,
         simulation_result=simulation_result,
+        registry_summary=registry_summary,
+        update_history=update_history if update_history else None,
+        postmortem_result=postmortem_result,
+        research_plan=research_plan,
     )
 
 
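Taken together, the two scripts form a loop: the postmortem CLI writes a timestamped `experiments/postmortems/postmortem-*.yaml`, and the next briefing picks up the newest one via `load_postmortem_result`. A sketch of that round trip, assuming both modules are importable from the scaffolded `scripts/` package (paths are the defaults shown above):

```python
# Hypothetical round trip: run the postmortem CLI, then read its report back
# the same way generate_brief does when assembling the next briefing.
import subprocess

from scripts.generate_brief import load_postmortem_result

subprocess.run(["python", "scripts/failure_postmortem.py", "--window", "10"], check=True)

pm = load_postmortem_result()  # newest experiments/postmortems/postmortem-*.yaml
if pm and "primary_diagnosis" in pm:
    print(pm["primary_diagnosis"], f"{pm.get('diagnosis_score', 0):.0%}")
```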