claude-turing 4.2.0 → 4.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,510 @@
1
+ #!/usr/bin/env python3
2
+ """Automated failure postmortem for the autoresearch pipeline.
3
+
4
+ When experiments stop improving, diagnoses the root cause: search space
5
+ exhaustion, systematic config error, data issue, metric ceiling, or
6
+ noise floor. Produces actionable next steps.
7
+
8
+ Usage:
9
+ python scripts/failure_postmortem.py
10
+ python scripts/failure_postmortem.py --window 10
11
+ python scripts/failure_postmortem.py --auto-trigger 5
12
+ python scripts/failure_postmortem.py --json
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import argparse
18
+ import json
19
+ import sys
20
+ from datetime import datetime, timezone
21
+ from pathlib import Path
22
+
23
+ import numpy as np
24
+ import yaml
25
+
26
+ from scripts.turing_io import load_config, load_experiments
27
+
28
# Default location of the JSONL experiment log read by the postmortem.
DEFAULT_LOG_PATH = "experiments/log.jsonl"
# Default number of most recent experiments inspected per run.
DEFAULT_WINDOW = 10
# Default value of the --auto-trigger CLI option (minimum streak length).
DEFAULT_AUTO_TRIGGER = 5

# Root-cause labels; one per diagnose_* function defined in this module.
DIAGNOSIS_TYPES = [
    "search_space_exhaustion",
    "systematic_config_error",
    "data_issue",
    "metric_ceiling",
    "noise_floor",
]
39
+
40
+
41
+ # --- Streak Detection ---
42
+
43
+
44
def detect_failure_streak(
    experiments: list[dict],
    primary_metric: str,
    lower_is_better: bool = False,
) -> dict:
    """Find the run of experiments recorded since the metric last improved.

    Experiments without a value for ``primary_metric`` are ignored while
    scanning. Ties with the current best do not count as improvements.

    Args:
        experiments: Experiment records in chronological order; each may
            carry a ``metrics`` mapping.
        primary_metric: Key looked up inside each record's ``metrics``.
        lower_is_better: When True a smaller value counts as an improvement.

    Returns:
        Dict with ``streak_length``, ``best_metric`` and
        ``streak_experiments``. When no record has the metric at all, every
        experiment is reported as part of the streak and ``best_metric`` is
        None.
    """
    if not experiments:
        return {"streak_length": 0, "best_metric": None, "streak_experiments": []}

    def _beats(candidate, incumbent):
        # Strict comparison: equalling the best is not an improvement.
        if lower_is_better:
            return candidate < incumbent
        return candidate > incumbent

    running_best = None
    since_best = []
    for record in experiments:
        value = record.get("metrics", {}).get(primary_metric)
        if value is None:
            continue
        if running_best is None:
            running_best = value
        elif _beats(value, running_best):
            running_best = value
            # A new best starts a fresh streak (the improver is added below).
            since_best = []
        since_best.append(record)

    if running_best is None:
        # No experiment reported the metric: treat the whole input as failing.
        return {
            "streak_length": len(experiments),
            "best_metric": None,
            "streak_experiments": experiments,
        }

    # The head of the list is the experiment that set the best value;
    # it is not itself a failure, so drop it.
    if since_best and since_best[0].get("metrics", {}).get(primary_metric) == running_best:
        since_best = since_best[1:]

    return {
        "streak_length": len(since_best),
        "best_metric": running_best,
        "streak_experiments": since_best,
    }
97
+
98
+
99
+ # --- Diagnosis Functions ---
100
+
101
+
102
def diagnose_search_space_exhaustion(
    streak_experiments: list[dict],
    primary_metric: str,
) -> dict:
    """Score how strongly the streak looks like an exhausted search space.

    Three signals each add to the score: most numeric hyperparameters vary
    by less than 15% (coefficient of variation), all experiments belong to
    one family, and the primary metric itself is tightly clustered.

    Args:
        streak_experiments: Experiments in the failure streak.
        primary_metric: Key looked up inside each record's ``metrics``.

    Returns:
        Dict with ``score`` (0 to 1), ``evidence`` (always a list of
        strings), ``low_variance_params`` and ``families``. Every return
        path carries the same keys.
    """
    if len(streak_experiments) < 3:
        return {
            "score": 0,
            "evidence": ["Too few experiments for diagnosis"],
            "low_variance_params": [],
            "families": [],
        }

    # Family may live at the top level or inside the config.
    families = {
        exp.get("family", exp.get("config", {}).get("family", "unknown"))
        for exp in streak_experiments
    }
    # Sorted so the saved report is reproducible across runs.
    family_list = sorted(families, key=str)

    # Collect numeric hyperparameter values (bools are not hyperparameter magnitudes).
    all_params = {}
    for exp in streak_experiments:
        hyperparams = exp.get("config", {}).get("hyperparams", {})
        for name, value in hyperparams.items():
            if isinstance(value, (int, float)) and not isinstance(value, bool):
                all_params.setdefault(name, []).append(float(value))

    if not all_params:
        return {
            "score": 0,
            "evidence": ["No numeric hyperparameters found"],
            "low_variance_params": [],
            "families": family_list,
        }

    # Coefficient of variation per parameter; near-zero means are skipped
    # because the ratio is meaningless there.
    low_variance_params = []
    for param, values in all_params.items():
        if len(values) < 2:
            continue
        mean = np.mean(values)
        if abs(mean) < 1e-10:
            continue
        cv = np.std(values) / abs(mean)
        if cv < 0.15:  # Less than 15% variation
            low_variance_params.append(
                {"param": param, "cv": round(float(cv), 4), "mean": round(float(mean), 4)}
            )

    score = 0
    evidence = []

    if len(low_variance_params) > len(all_params) * 0.5:
        score += 0.4
        evidence.append(f"Config variance LOW: {len(low_variance_params)}/{len(all_params)} params within ±15%")

    if len(families) <= 1:
        score += 0.3
        evidence.append(f"All experiments in same family: {families}")

    # Check if metrics are clustered
    raw_metrics = (exp.get("metrics", {}).get(primary_metric) for exp in streak_experiments)
    metrics = [m for m in raw_metrics if m is not None]
    if len(metrics) >= 2:
        center = abs(float(np.mean(metrics)))
        spread = float(np.std(metrics))
        if center > 0:
            metric_cv = spread / center
        else:
            # A zero mean only indicates clustering when there is no spread;
            # otherwise (e.g. -1 and 1) the values are anything but tight.
            metric_cv = 0.0 if spread == 0 else float("inf")
        if metric_cv < 0.02:
            score += 0.3
            evidence.append(f"Metric range very tight (CV={metric_cv:.4f})")

    return {
        "score": round(score, 2),
        "evidence": evidence if evidence else ["No strong evidence of exhaustion"],
        "low_variance_params": low_variance_params,
        "families": family_list,
    }
166
+
167
+
168
def diagnose_systematic_config_error(
    streak_experiments: list[dict],
    primary_metric: str,
    best_metric: float | None,
) -> dict:
    """Score whether the streak shares one (possibly bad) configuration.

    Adds to the score when more than half of the first experiment's
    hyperparameters are identical across the whole streak, and when the
    streak's mean metric sits more than 0.02 away from ``best_metric``.

    Args:
        streak_experiments: Experiments in the failure streak.
        primary_metric: Key looked up inside each record's ``metrics``.
        best_metric: Best value seen so far, or None when unknown.

    Returns:
        Dict with ``score``, ``evidence`` (always a list of strings) and
        ``common_params``; every return path carries the same keys.
    """
    if len(streak_experiments) < 3:
        return {"score": 0, "evidence": ["Too few experiments"], "common_params": {}}

    # Scalar hyperparameters of the first experiment that never change
    # anywhere else in the streak.
    first_config = streak_experiments[0].get("config", {}).get("hyperparams", {})
    common_params = {}
    for key, value in first_config.items():
        if not isinstance(value, (int, float, str)):
            continue
        unchanged = all(
            exp.get("config", {}).get("hyperparams", {}).get(key) == value
            for exp in streak_experiments[1:]
        )
        if unchanged:
            common_params[key] = value

    score = 0
    evidence = []

    if common_params:
        ratio = len(common_params) / max(len(first_config), 1)
        if ratio > 0.5:
            score += 0.5
            evidence.append(f"{len(common_params)} params unchanged across all {len(streak_experiments)} experiments")
            evidence.append(f"Common: {common_params}")

    # Check if all experiments are significantly worse than best
    if best_metric is not None:
        raw_metrics = (exp.get("metrics", {}).get(primary_metric) for exp in streak_experiments)
        streak_metrics = [m for m in raw_metrics if m is not None]
        if streak_metrics:
            avg_gap = abs(np.mean(streak_metrics) - best_metric)
            if avg_gap > 0.02:
                score += 0.3
                evidence.append(f"Average gap from best: {avg_gap:.4f}")

    return {
        "score": round(score, 2),
        "evidence": evidence or ["No common config error detected"],
        "common_params": common_params,
    }
212
+
213
+
214
def diagnose_data_issue(
    streak_experiments: list[dict],
    primary_metric: str,
) -> dict:
    """Score whether different model types all fail in the same way.

    When at least two model types appear in the streak and their metrics
    are nearly identical (coefficient of variation below 3%), the data is
    the more likely culprit than any single model.

    Args:
        streak_experiments: Experiments in the failure streak.
        primary_metric: Key looked up inside each record's ``metrics``.

    Returns:
        Dict with ``score``, ``evidence`` (always a list of strings) and
        ``model_types`` (sorted for reproducible reports).
    """
    if len(streak_experiments) < 3:
        return {"score": 0, "evidence": ["Too few experiments"], "model_types": []}

    # Check model type diversity
    model_types = {
        exp.get("config", {}).get("model_type", "unknown")
        for exp in streak_experiments
    }

    score = 0
    evidence = []

    # If multiple model types all fail similarly → data issue
    if len(model_types) >= 2:
        raw_metrics = (exp.get("metrics", {}).get(primary_metric) for exp in streak_experiments)
        metrics = [m for m in raw_metrics if m is not None]
        if len(metrics) >= 2:
            center = abs(float(np.mean(metrics)))
            spread = float(np.std(metrics))
            if center > 0:
                cv = spread / center
            else:
                # Zero mean: only identical values count as "similar";
                # spread-out values averaging to zero must not score.
                cv = 0.0 if spread == 0 else float("inf")
            if cv < 0.03:
                score += 0.6
                evidence.append(f"{len(model_types)} different model types all perform similarly (CV={cv:.4f})")
                evidence.append(f"Model types: {model_types}")

    return {
        "score": round(score, 2),
        "evidence": evidence or ["No data issue pattern detected"],
        "model_types": sorted(model_types, key=str),
    }
243
+
244
+
245
def diagnose_metric_ceiling(
    streak_experiments: list[dict],
    primary_metric: str,
    best_metric: float | None,
) -> dict:
    """Score whether the metric has plateaued near its upper bound.

    Two signals add to the score: the best value lies in (0.95, 1.0], and
    the streak's metric values span less than 0.005.

    Args:
        streak_experiments: Experiments in the failure streak.
        primary_metric: Key looked up inside each record's ``metrics``.
        best_metric: Best value seen so far, or None when unknown.

    Returns:
        Dict with ``score`` and ``evidence`` (always a list of strings).
    """
    if best_metric is None:
        return {"score": 0, "evidence": ["No best metric available"]}

    score = 0
    evidence = []

    # The 0.95 threshold only makes sense for metrics bounded by 1.0.
    # Values above 1.0 (losses, counts, percent-scale scores) are not
    # "near a theoretical maximum" just because they exceed 0.95.
    if 0.95 < best_metric <= 1.0:
        score += 0.4
        evidence.append(f"Current best {primary_metric}={best_metric:.4f} — near theoretical maximum")

    # A very narrow spread across the streak suggests a plateau.
    raw_metrics = (exp.get("metrics", {}).get(primary_metric) for exp in streak_experiments)
    metrics = [m for m in raw_metrics if m is not None]
    if len(metrics) >= 3:
        range_val = max(metrics) - min(metrics)
        if range_val < 0.005:
            score += 0.3
            evidence.append(f"Metric range in streak: {range_val:.4f} (< 0.005)")

    return {"score": round(score, 2), "evidence": evidence or ["No ceiling pattern detected"]}
275
+
276
+
277
def diagnose_noise_floor(
    streak_experiments: list[dict],
    primary_metric: str,
    seed_dir: str = "experiments/seed_studies",
) -> dict:
    """Score whether the streak's metric differences are just seed noise.

    If a seed study provides a numeric ``std``, the streak's metric range
    is compared against twice that value. Without one, a very small
    standard deviation within the streak is used as a weaker signal.

    Args:
        streak_experiments: Experiments in the failure streak.
        primary_metric: Key looked up inside each record's ``metrics``.
        seed_dir: Directory scanned for ``*.yaml`` seed study files.

    Returns:
        Dict with ``score``, ``evidence`` (list of strings) and
        ``seed_variance`` (the seed std used, or None if none was found).
    """
    score = 0
    evidence = []

    # Files are visited in sorted name order, so the last file with a
    # usable ``std`` wins.
    seed_path = Path(seed_dir)
    seed_variance = None
    if seed_path.exists():
        for seed_file in sorted(seed_path.glob("*.yaml")):
            try:
                with open(seed_file) as fh:
                    data = yaml.safe_load(fh)
            except (yaml.YAMLError, OSError, UnicodeDecodeError):
                # Best effort: an unreadable study must not abort the diagnosis.
                continue
            std = data.get("std") if isinstance(data, dict) else None
            # Only accept real numbers: a null or textual ``std`` would
            # otherwise break the comparison and formatting below, or wipe
            # out a valid value read from an earlier file.
            if isinstance(std, (int, float)) and not isinstance(std, bool):
                seed_variance = float(std)

    raw_metrics = (exp.get("metrics", {}).get(primary_metric) for exp in streak_experiments)
    metrics = [m for m in raw_metrics if m is not None]

    if len(metrics) >= 2:
        streak_range = max(metrics) - min(metrics)
        if seed_variance is not None:
            if streak_range < seed_variance * 2:
                score += 0.7
                evidence.append(f"Streak range ({streak_range:.4f}) < 2x seed std ({seed_variance:.4f})")
        else:
            streak_std = float(np.std(metrics))
            if streak_std < 0.005:
                score += 0.3
                evidence.append(f"Streak std ({streak_std:.4f}) very low — may be noise")

    return {
        "score": round(score, 2),
        "evidence": evidence or ["No noise floor pattern detected"],
        "seed_variance": seed_variance,
    }
315
+
316
+
317
+ # --- Main Pipeline ---
318
+
319
+
320
def run_postmortem(
    window: int = DEFAULT_WINDOW,
    config_path: str = "config.yaml",
    log_path: str = DEFAULT_LOG_PATH,
    seed_dir: str = "experiments/seed_studies",
) -> dict:
    """Run failure postmortem analysis.

    Only the last ``window`` experiments are examined, so ``best_metric``
    and the streak are both relative to that window, not to the full log.

    Args:
        window: Number of recent experiments to analyze.
        config_path: Path to config.yaml.
        log_path: Path to experiment log.
        seed_dir: Path to seed study directory.

    Returns:
        ``{"error": ...}`` when the log is empty; a short dict containing
        ``message`` when the streak is shorter than 2; otherwise the full
        report with diagnosis, evidence, and recommendations. When no
        diagnosis gathers any evidence the primary diagnosis is
        ``"inconclusive"`` rather than an arbitrary zero-score cause.
    """
    config = load_config(config_path)
    eval_cfg = config.get("evaluation", {})
    primary_metric = eval_cfg.get("primary_metric", "accuracy")
    lower_is_better = eval_cfg.get("lower_is_better", False)

    experiments = load_experiments(log_path)

    if not experiments:
        return {"error": "No experiments found"}

    # Use last N experiments
    recent = experiments[-window:]

    streak_info = detect_failure_streak(recent, primary_metric, lower_is_better)
    streak_exps = streak_info["streak_experiments"]
    best_metric = streak_info["best_metric"]
    streak_len = streak_info["streak_length"]

    if streak_len < 2:
        return {
            "streak_length": streak_len,
            "message": "No significant failure streak detected",
            "best_metric": best_metric,
            "generated_at": datetime.now(timezone.utc).isoformat(),
        }

    # Run all diagnoses
    diagnoses = {
        "search_space_exhaustion": diagnose_search_space_exhaustion(streak_exps, primary_metric),
        "systematic_config_error": diagnose_systematic_config_error(streak_exps, primary_metric, best_metric),
        "data_issue": diagnose_data_issue(streak_exps, primary_metric),
        "metric_ceiling": diagnose_metric_ceiling(streak_exps, primary_metric, best_metric),
        "noise_floor": diagnose_noise_floor(streak_exps, primary_metric, seed_dir),
    }

    # Pick the highest-scoring diagnosis
    diagnosis_name, diagnosis_data = max(diagnoses.items(), key=lambda item: item[1]["score"])

    if diagnosis_data["score"] <= 0:
        # Every diagnosis came back empty. max() would still return the
        # first entry, which would then be reported as the root cause at
        # 0% confidence together with cause-specific advice.
        diagnosis_name = "inconclusive"
        diagnosis_data = {
            "score": 0,
            "evidence": ["No diagnosis found supporting evidence in this streak"],
        }

    # Some diagnoses report a single message as a bare string; consumers
    # iterate the evidence, so always hand them a list.
    evidence = diagnosis_data["evidence"]
    if isinstance(evidence, str):
        evidence = [evidence]

    # An unrecognised diagnosis name yields the generic manual-review advice.
    recommendations = _generate_recommendations(diagnosis_name, diagnosis_data, streak_len)

    return {
        "streak_length": streak_len,
        "window": window,
        "best_metric": best_metric,
        "primary_metric": primary_metric,
        "primary_diagnosis": diagnosis_name,
        "diagnosis_score": diagnosis_data["score"],
        "diagnosis_evidence": evidence,
        "all_diagnoses": {k: {"score": v["score"]} for k, v in diagnoses.items()},
        "recommendations": recommendations,
        "generated_at": datetime.now(timezone.utc).isoformat(),
    }
392
+
393
+
394
+ def _generate_recommendations(diagnosis: str, data: dict, streak_len: int) -> list[str]:
395
+ """Generate actionable recommendations based on diagnosis."""
396
+ recs = {
397
+ "search_space_exhaustion": [
398
+ "Stop tuning hyperparameters — switch to `/turing:feature` for feature engineering",
399
+ "Try `/turing:ensemble` — combine existing models instead of building new ones",
400
+ "Run `/turing:scale --axis data` — check if more data would help",
401
+ ],
402
+ "systematic_config_error": [
403
+ "Run `/turing:sensitivity` — identify which params actually matter",
404
+ "Check the common config values against sensitivity analysis",
405
+ "Try resetting to the best experiment's config and vary one param at a time",
406
+ ],
407
+ "data_issue": [
408
+ "Run `/turing:leak` — check for data leakage masking real performance",
409
+ "Run `/turing:sanity` — verify data pipeline integrity",
410
+ "Inspect the raw data for quality issues or distribution shift",
411
+ ],
412
+ "metric_ceiling": [
413
+ "Run `/turing:scale` to confirm you've hit the ceiling",
414
+ "Consider shifting to a different metric or task formulation",
415
+ "Try ensemble methods for marginal gains: `/turing:ensemble`",
416
+ ],
417
+ "noise_floor": [
418
+ "Run `/turing:seed` with more seeds to measure true variance",
419
+ "Increase n_runs for each experiment to reduce noise",
420
+ "Consider whether the current metric resolution is sufficient",
421
+ ],
422
+ }
423
+ return recs.get(diagnosis, [f"Investigate the last {streak_len} experiments manually"])
424
+
425
+
426
+ # --- Report Formatting ---
427
+
428
+
429
def save_postmortem_report(report: dict, output_dir: str = "experiments/postmortems") -> Path:
    """Write the report as a timestamped YAML file and return its path.

    The output directory is created if missing. The file name carries a
    UTC timestamp with one-second resolution.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    stamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
    filepath = target_dir / f"postmortem-{stamp}.yaml"

    with filepath.open("w") as handle:
        # Keep the report's own key order instead of sorting alphabetically.
        yaml.dump(report, handle, default_flow_style=False, sort_keys=False)
    return filepath
438
+
439
+
440
def format_postmortem_report(report: dict) -> str:
    """Format postmortem report as readable markdown.

    Args:
        report: A dict with an ``error`` key, a ``message`` key, or the
            full report fields (diagnosis, evidence, recommendations).

    Returns:
        A one-line string for error/no-streak reports, otherwise a
        markdown document.
    """
    if "error" in report:
        return f"ERROR: {report['error']}"

    if "message" in report:
        return f"No failure streak: {report['message']} (best {report.get('best_metric', 'N/A')})"

    # Evidence may arrive as a single string; iterating that directly
    # would emit one bullet per character.
    evidence = report.get("diagnosis_evidence", [])
    if isinstance(evidence, str):
        evidence = [evidence]
    elif evidence is None:
        evidence = []

    # A null score (e.g. from a hand-edited report) must not break "%".
    confidence = report.get("diagnosis_score", 0) or 0

    lines = [
        f"# Failure Postmortem (last {report.get('streak_length', '?')} experiments, 0 improvements)",
        "",
        f"**Diagnosis:** {report.get('primary_diagnosis', 'unknown').upper().replace('_', ' ')}",
        f"**Confidence:** {confidence:.0%}",
        "",
        "## Evidence",
        "",
    ]
    lines.extend(f"- {item}" for item in evidence)

    lines.extend(["", "## All Diagnoses", ""])
    for name, data in report.get("all_diagnoses", {}).items():
        score = data.get("score", 0) or 0
        # Arrow marks the diagnosis chosen as primary.
        marker = "◀" if name == report.get("primary_diagnosis") else ""
        lines.append(f"- {name.replace('_', ' ')}: {score:.0%} {marker}")

    lines.extend(["", "## Recommended Actions", ""])
    for i, rec in enumerate(report.get("recommendations", []), 1):
        lines.append(f"{i}. {rec}")

    lines.extend(["", f"*Generated: {report.get('generated_at', 'N/A')}*"])
    return "\n".join(lines)
473
+
474
+
475
+ # --- CLI ---
476
+
477
+
478
def main():
    """Command-line entry point: run the postmortem, print it, save it."""
    cli = argparse.ArgumentParser(
        description="Failure postmortem — diagnose why experiments stopped improving"
    )
    cli.add_argument(
        "--window",
        type=int,
        default=DEFAULT_WINDOW,
        help="Number of recent experiments to analyze",
    )
    cli.add_argument(
        "--auto-trigger",
        type=int,
        default=DEFAULT_AUTO_TRIGGER,
        help="Minimum streak length to trigger postmortem",
    )
    cli.add_argument("--config", default="config.yaml", help="Path to config.yaml")
    cli.add_argument("--log", default=DEFAULT_LOG_PATH, help="Path to experiment log")
    cli.add_argument("--json", action="store_true", help="Output raw JSON")

    args = cli.parse_args()

    # NOTE(review): args.auto_trigger is parsed but never read below; the
    # streak threshold is fixed inside run_postmortem — confirm intent.
    report = run_postmortem(
        window=args.window,
        config_path=args.config,
        log_path=args.log,
    )

    rendered = json.dumps(report, indent=2) if args.json else format_postmortem_report(report)
    print(rendered)

    # Only full reports are persisted; error and no-streak results are not.
    is_full_report = "error" not in report and "message" not in report
    if is_full_report:
        saved = save_postmortem_report(report)
        if not args.json:
            # Keep --json output machine-readable: no trailing status line.
            print(f"\nSaved: {saved}")


if __name__ == "__main__":
    main()
@@ -404,6 +404,70 @@ def load_simulation_results(sim_dir: str = "experiments/simulations") -> dict |
404
404
  return None
405
405
 
406
406
 
407
def load_registry_summary(registry_path: str = "experiments/registry.yaml") -> dict | None:
    """Read the model registry file for the briefing.

    Returns the parsed mapping only when it contains a non-empty
    ``models`` entry; a missing, unreadable or otherwise-shaped file
    yields None.
    """
    registry_file = Path(registry_path)
    if not registry_file.exists():
        return None

    try:
        with open(registry_file) as handle:
            loaded = yaml.safe_load(handle)
    except (yaml.YAMLError, OSError):
        # Best effort: the briefing simply omits the registry section.
        return None

    has_models = isinstance(loaded, dict) and loaded.get("models")
    return loaded if has_models else None
420
+
421
+
422
def load_update_history(update_dir: str = "experiments/updates") -> list[dict]:
    """Collect the newest incremental update reports.

    Considers the last three ``*-update-*.yaml`` files in sorted name
    order and returns those that parse to a mapping. Unreadable or
    malformed files are skipped; a missing directory yields ``[]``.
    """
    directory = Path(update_dir)
    if not directory.exists():
        return []

    newest_files = sorted(directory.glob("*-update-*.yaml"))[-3:]
    reports = []
    for report_file in newest_files:
        try:
            with open(report_file) as handle:
                parsed = yaml.safe_load(handle)
        except (yaml.YAMLError, OSError):
            continue
        if isinstance(parsed, dict):
            reports.append(parsed)
    return reports
437
+
438
+
439
def load_postmortem_result(postmortem_dir: str = "experiments/postmortems") -> dict | None:
    """Read the latest postmortem report.

    "Latest" is the last ``postmortem-*.yaml`` file in sorted name order.
    Returns None when the directory or files are missing, the file cannot
    be read or parsed, or its content is not a mapping.
    """
    directory = Path(postmortem_dir)
    if not directory.exists():
        return None

    # The greatest path is the same file sorted()[-1] would pick.
    newest = max(directory.glob("postmortem-*.yaml"), default=None)
    if newest is None:
        return None

    try:
        with open(newest) as handle:
            payload = yaml.safe_load(handle)
    except (yaml.YAMLError, OSError):
        return None

    if isinstance(payload, dict):
        return payload
    return None
453
+
454
+
455
def load_research_plan(plan_dir: str = "experiments/plans") -> dict | None:
    """Read the latest research plan.

    "Latest" is the last ``plan-*.yaml`` file in sorted name order.
    Returns None when the directory or files are missing, the file cannot
    be read or parsed, or its content is not a mapping.
    """
    directory = Path(plan_dir)
    if not directory.exists():
        return None

    # The greatest path is the same file sorted()[-1] would pick.
    newest = max(directory.glob("plan-*.yaml"), default=None)
    if newest is None:
        return None

    try:
        with open(newest) as handle:
            payload = yaml.safe_load(handle)
    except (yaml.YAMLError, OSError):
        return None

    if isinstance(payload, dict):
        return payload
    return None
469
+
470
+
407
471
  def format_brief(
408
472
  campaign: dict,
409
473
  best: dict | None,
@@ -428,6 +492,10 @@ def format_brief(
428
492
  audit_report: dict | None = None,
429
493
  whatif_results: list[dict] | None = None,
430
494
  simulation_result: dict | None = None,
495
+ registry_summary: dict | None = None,
496
+ update_history: list[dict] | None = None,
497
+ postmortem_result: dict | None = None,
498
+ research_plan: dict | None = None,
431
499
  ) -> str:
432
500
  """Format the research briefing as markdown."""
433
501
  direction = "lower" if lower_is_better else "higher"
@@ -758,6 +826,52 @@ def format_brief(
758
826
  lines.append(f"**Last simulation:** {run} configs recommended, {skip} skipped ({savings}% budget savings)")
759
827
  lines.append("")
760
828
 
829
+ # Model Lifecycle section
830
+ if registry_summary or update_history:
831
+ lines.extend(["", "## Model Lifecycle", ""])
832
+
833
+ if registry_summary:
834
+ models = registry_summary.get("models", [])
835
+ for m in models:
836
+ if m.get("stage") != "archived":
837
+ metric = f"{m['metric']:.4f}" if m.get("metric") is not None else "—"
838
+ lines.append(f"- **{m['stage']}:** {m['exp_id']} ({m.get('version', '?')}, {m.get('metric_name', 'metric')}={metric})")
839
+ if not any(m.get("stage") != "archived" for m in models):
840
+ lines.append("- All models archived — register a new candidate with `/turing:registry register`")
841
+ lines.append("")
842
+
843
+ if update_history:
844
+ lines.append(f"**Recent updates:** {len(update_history)}")
845
+ for u in update_history[-2:]:
846
+ verdict = u.get("verdict", "?")
847
+ exp_id = u.get("experiment_id", "?")
848
+ strategy = u.get("plan", {}).get("strategy", "?")
849
+ lines.append(f"- {exp_id}: {strategy} — {verdict}")
850
+ lines.append("")
851
+
852
+ # Operational Intelligence section
853
+ if postmortem_result or research_plan:
854
+ lines.extend(["", "## Operational Intelligence", ""])
855
+
856
+ if postmortem_result and "primary_diagnosis" in postmortem_result:
857
+ diagnosis = postmortem_result["primary_diagnosis"].replace("_", " ").title()
858
+ streak = postmortem_result.get("streak_length", "?")
859
+ score = postmortem_result.get("diagnosis_score", 0)
860
+ lines.append(f"**Failure postmortem:** {diagnosis} ({score:.0%} confidence, {streak} experiment streak)")
861
+ recs = postmortem_result.get("recommendations", [])
862
+ if recs:
863
+ lines.append(f" Action: {recs[0]}")
864
+ lines.append("")
865
+
866
+ if research_plan and "plan" in research_plan:
867
+ plan = research_plan["plan"]
868
+ n = plan.get("total_experiments", 0)
869
+ gain = plan.get("expected_gain", 0)
870
+ lines.append(f"**Active research plan:** {n} experiments planned (+{gain} expected gain)")
871
+ for phase in plan.get("phases", [])[:3]:
872
+ lines.append(f" - {phase['label']}: {phase['n_experiments']} experiments")
873
+ lines.append("")
874
+
761
875
  lines.extend([
762
876
  "",
763
877
  "## Recommendations",
@@ -830,6 +944,10 @@ def generate_brief(
830
944
  audit_report = load_audit_report()
831
945
  whatif_results = load_whatif_results()
832
946
  simulation_result = load_simulation_results()
947
+ registry_summary = load_registry_summary()
948
+ update_history = load_update_history()
949
+ postmortem_result = load_postmortem_result()
950
+ research_plan = load_research_plan()
833
951
 
834
952
  return format_brief(
835
953
  campaign, best, trajectory, model_types, hypotheses,
@@ -848,6 +966,10 @@ def generate_brief(
848
966
  audit_report=audit_report,
849
967
  whatif_results=whatif_results if whatif_results else None,
850
968
  simulation_result=simulation_result,
969
+ registry_summary=registry_summary,
970
+ update_history=update_history if update_history else None,
971
+ postmortem_result=postmortem_result,
972
+ research_plan=research_plan,
851
973
  )
852
974
 
853
975