claude-turing 2.4.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +7 -2
- package/commands/audit.md +56 -0
- package/commands/budget.md +52 -0
- package/commands/distill.md +56 -0
- package/commands/scale.md +55 -0
- package/commands/transfer.md +54 -0
- package/commands/turing.md +10 -0
- package/package.json +1 -1
- package/src/install.js +2 -0
- package/src/verify.js +5 -0
- package/templates/scripts/__pycache__/budget_manager.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/knowledge_transfer.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/methodology_audit.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/model_distiller.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaling_estimator.cpython-314.pyc +0 -0
- package/templates/scripts/budget_manager.py +419 -0
- package/templates/scripts/generate_brief.py +101 -0
- package/templates/scripts/knowledge_transfer.py +618 -0
- package/templates/scripts/methodology_audit.py +451 -0
- package/templates/scripts/model_distiller.py +478 -0
- package/templates/scripts/scaffold.py +9 -0
- package/templates/scripts/scaling_estimator.py +523 -0
|
@@ -0,0 +1,419 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Compute budget manager for the autoresearch pipeline.
|
|
3
|
+
|
|
4
|
+
Sets a total compute budget (experiments, hours, or cost) and allocates
|
|
5
|
+
across exploration vs exploitation. Auto-shifts to exploit mode when
|
|
6
|
+
budget runs low. Prevents runaway compute spend.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python scripts/budget_manager.py set --experiments 50 --hours 8
|
|
10
|
+
python scripts/budget_manager.py status
|
|
11
|
+
python scripts/budget_manager.py reset
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import json
|
|
18
|
+
import sys
|
|
19
|
+
from datetime import datetime, timezone
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
import yaml
|
|
23
|
+
|
|
24
|
+
from scripts.turing_io import load_config, load_experiments
|
|
25
|
+
|
|
26
|
+
DEFAULT_LOG_PATH = "experiments/log.jsonl"
|
|
27
|
+
DEFAULT_STATE_PATH = "experiment_state.yaml"
|
|
28
|
+
|
|
29
|
+
# Budget allocation policy thresholds
|
|
30
|
+
EXPLORE_PHASE_END = 0.50 # 0-50% budget: explore
|
|
31
|
+
MIXED_PHASE_END = 0.80 # 50-80%: mixed
|
|
32
|
+
# 80-100%: exploit only
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# --- Budget State ---
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def load_budget(state_path: str = DEFAULT_STATE_PATH) -> dict | None:
    """Load the budget section from the experiment state file.

    Args:
        state_path: Path to the YAML experiment state file.

    Returns:
        The mapping stored under the top-level ``budget`` key, or ``None``
        when the file does not exist, is empty, is not a YAML mapping, or
        holds no mapping under ``budget``.
    """
    path = Path(state_path)
    if not path.exists():
        return None

    with open(path) as f:
        state = yaml.safe_load(f)

    # A hand-edited or truncated state file can parse to a list or scalar;
    # treat that as "no budget" instead of failing on ``.get``.
    if not isinstance(state, dict):
        return None

    # Callers immediately call ``.get`` on the result, so only hand back a
    # mapping.
    budget = state.get("budget")
    return budget if isinstance(budget, dict) else None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def save_budget(budget: dict, state_path: str = DEFAULT_STATE_PATH) -> None:
    """Store ``budget`` under the ``budget`` key of the experiment state file.

    Whatever the file already contains is loaded first so that only the
    ``budget`` entry is replaced. Missing parent directories are created
    before the file is rewritten.

    Args:
        budget: Budget mapping to persist.
        state_path: Path to the YAML experiment state file.
    """
    target = Path(state_path)

    # Start from the current file contents (if any) so other keys survive.
    existing = {}
    if target.exists():
        with target.open() as reader:
            existing = yaml.safe_load(reader) or {}
    existing["budget"] = budget

    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w") as writer:
        yaml.dump(existing, writer, default_flow_style=False, sort_keys=False)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# --- Budget Operations ---
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def set_budget(
    max_experiments: int | None = None,
    max_hours: float | None = None,
    state_path: str = DEFAULT_STATE_PATH,
) -> dict:
    """Set a compute budget.

    Args:
        max_experiments: Maximum number of experiments.
        max_hours: Maximum wall-clock hours.
        state_path: Path to experiment state file.

    Returns:
        ``{"action": "set", "budget": ..., "message": ...}`` on success, or
        ``{"error": ...}`` when no constraint is given or a constraint is
        not greater than zero. Nothing is written in the error case.
    """
    if max_experiments is None and max_hours is None:
        return {"error": "Specify at least one constraint: --experiments or --hours"}

    # get_budget_status() only counts limits that are > 0, so a zero or
    # negative limit would produce an "active" budget that never exhausts.
    # Reject it up front instead of silently storing it.
    for flag, limit in (("--experiments", max_experiments), ("--hours", max_hours)):
        if limit is not None and limit <= 0:
            return {"error": f"{flag} must be greater than zero (got {limit})"}

    budget = {
        "set_at": datetime.now(timezone.utc).isoformat(),
        "max_experiments": max_experiments,
        "max_hours": max_hours,
        "active": True,
    }

    save_budget(budget, state_path)

    return {
        "action": "set",
        "budget": budget,
        "message": _format_budget_constraints(budget),
    }
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def get_budget_status(
    state_path: str = DEFAULT_STATE_PATH,
    log_path: str = DEFAULT_LOG_PATH,
) -> dict:
    """Get current budget status with usage and projections.

    Args:
        state_path: Path to experiment state file.
        log_path: Path to experiment log.

    Returns:
        ``{"error": ...}`` when there is no active budget. Otherwise a dict
        with the keys ``action``, ``budget``, ``usage``, ``phase``,
        ``recommended_mode``, ``allocation``, ``burn_rate``,
        ``projected_exhaustion_hours`` and ``exhausted``, plus ``warning``
        once the budget is exhausted.
    """
    budget = load_budget(state_path)
    if not budget or not budget.get("active"):
        return {"error": "No active budget. Use `/turing:budget set` first."}

    experiments = load_experiments(log_path)

    # Only experiments logged at or after the budget was set count against
    # it. The comparison is lexicographic on the timestamp strings.
    # NOTE(review): assumes log timestamps use the same ISO-8601 layout as
    # ``set_at`` — confirm against the log writer.
    # ``or ""`` tolerates an explicit null timestamp in the log or state.
    budget_set_at = budget.get("set_at") or ""
    experiments_since = [
        e for e in experiments
        if (e.get("timestamp") or "") >= budget_set_at
    ]

    used_experiments = len(experiments_since)
    max_experiments = budget.get("max_experiments")

    # Time usage is the sum of numeric ``metrics.train_seconds`` values.
    # Entries whose ``metrics`` is missing, null or not a mapping are skipped
    # rather than crashing the whole status report.
    train_seconds = []
    for e in experiments_since:
        metrics = e.get("metrics")
        seconds = metrics.get("train_seconds") if isinstance(metrics, dict) else None
        if isinstance(seconds, (int, float)):
            train_seconds.append(seconds)
    used_hours = sum(train_seconds) / 3600
    max_hours = budget.get("max_hours")

    # The budget fraction is driven by whichever constraint is closest to
    # (or furthest past) its limit.
    fractions = []
    if max_experiments and max_experiments > 0:
        fractions.append(used_experiments / max_experiments)
    if max_hours and max_hours > 0:
        fractions.append(used_hours / max_hours)

    budget_used = max(fractions) if fractions else 0.0

    # Determine current phase and recommended mode
    phase = determine_phase(budget_used)
    recommended_mode = phase_to_mode(phase)

    # Burn rate: experiments per hour of recorded training time.
    burn_rate = None
    if used_hours > 0 and used_experiments > 0:
        burn_rate = used_experiments / used_hours

    # Remaining amounts may go negative when a limit has been overshot.
    remaining_experiments = (max_experiments - used_experiments) if max_experiments else None
    remaining_hours = (max_hours - used_hours) if max_hours else None

    # Project time-to-exhaustion only while experiments actually remain;
    # an overshot budget would otherwise yield a negative projection.
    projected_exhaustion_hours = None
    if burn_rate and remaining_experiments is not None and remaining_experiments > 0:
        projected_exhaustion_hours = remaining_experiments / burn_rate

    exhausted = budget_used >= 1.0

    # Allocation breakdown
    explore_count = sum(1 for e in experiments_since if _is_explore(e))
    exploit_count = used_experiments - explore_count

    status = {
        "action": "status",
        "budget": budget,
        "usage": {
            "experiments_used": used_experiments,
            "experiments_max": max_experiments,
            "experiments_remaining": remaining_experiments,
            "hours_used": round(used_hours, 2),
            "hours_max": max_hours,
            "hours_remaining": round(remaining_hours, 2) if remaining_hours is not None else None,
            "budget_fraction": round(budget_used, 4),
        },
        "phase": phase,
        "recommended_mode": recommended_mode,
        "allocation": {
            "explore": explore_count,
            "exploit": exploit_count,
        },
        "burn_rate": round(burn_rate, 2) if burn_rate else None,
        "projected_exhaustion_hours": (
            round(projected_exhaustion_hours, 2)
            if projected_exhaustion_hours is not None
            else None
        ),
        "exhausted": exhausted,
    }

    if exhausted:
        status["warning"] = "Budget exhausted. /turing:train will refuse to start new experiments."

    return status
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def reset_budget(state_path: str = DEFAULT_STATE_PATH) -> dict:
    """Deactivate the stored budget, if there is one.

    The budget entry is kept in the state file; it is marked inactive and
    stamped with a UTC ``reset_at`` time.

    Args:
        state_path: Path to experiment state file.

    Returns:
        ``{"action": "reset", "message": ...}`` describing what happened.
    """
    current = load_budget(state_path)

    if current:
        current["active"] = False
        current["reset_at"] = datetime.now(timezone.utc).isoformat()
        save_budget(current, state_path)
        outcome = "Budget deactivated."
    else:
        outcome = "No budget to reset."

    return {"action": "reset", "message": outcome}
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def check_budget_allows(state_path: str = DEFAULT_STATE_PATH, log_path: str = DEFAULT_LOG_PATH) -> dict:
    """Decide whether the budget permits starting another experiment.

    Args:
        state_path: Path to experiment state file.
        log_path: Path to experiment log.

    Returns:
        Dict with ``allowed`` (bool) and ``reason`` (str). When a budget is
        active and not exhausted, ``recommended_mode`` is included as well.
    """
    stored = load_budget(state_path)
    if not (stored and stored.get("active")):
        return {"allowed": True, "reason": "No active budget"}

    snapshot = get_budget_status(state_path, log_path)
    if "error" in snapshot:
        # Fail open: an unreadable status does not block experiments.
        return {"allowed": True, "reason": "Budget status unavailable"}

    fraction = snapshot['usage']['budget_fraction']

    if snapshot.get("exhausted"):
        verdict = {
            "allowed": False,
            "reason": f"Budget exhausted ({fraction:.0%} used)",
        }
    else:
        verdict = {
            "allowed": True,
            "reason": f"Budget at {fraction:.0%}",
            "recommended_mode": snapshot.get("recommended_mode"),
        }
    return verdict
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
# --- Phase Logic ---
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def determine_phase(budget_fraction: float) -> str:
    """Map the fraction of budget used onto a budget phase.

    Args:
        budget_fraction: Fraction of the budget consumed so far.

    Returns:
        ``'explore'`` below ``EXPLORE_PHASE_END``, ``'mixed'`` below
        ``MIXED_PHASE_END``, and ``'exploit'`` otherwise.
    """
    if budget_fraction < EXPLORE_PHASE_END:
        return "explore"
    return "mixed" if budget_fraction < MIXED_PHASE_END else "exploit"
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def phase_to_mode(phase: str) -> str:
    """Translate a budget phase into the recommended research mode.

    Args:
        phase: Budget phase name.

    Returns:
        ``'exploit'`` for the exploit phase; ``'explore'`` for the explore
        and mixed phases and for any unrecognised phase name.
    """
    mode_by_phase = {
        "explore": "explore",
        # Mixed still recommends exploring, while exploitation begins.
        "mixed": "explore",
        "exploit": "exploit",
    }
    try:
        return mode_by_phase[phase]
    except KeyError:
        return "explore"
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _is_explore(experiment: dict) -> bool:
|
|
267
|
+
"""Heuristic: classify experiment as explore vs exploit."""
|
|
268
|
+
config = experiment.get("config", {})
|
|
269
|
+
# New model types or significantly different configs = explore
|
|
270
|
+
# Similar to prior experiments = exploit
|
|
271
|
+
# Simple heuristic: if experiment has a novel model_type, it's exploration
|
|
272
|
+
return config.get("model_type", "") != config.get("base_model_type", config.get("model_type", ""))
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def _format_budget_constraints(budget: dict) -> str:
|
|
276
|
+
"""Format budget constraints as human-readable string."""
|
|
277
|
+
parts = []
|
|
278
|
+
if budget.get("max_experiments"):
|
|
279
|
+
parts.append(f"{budget['max_experiments']} experiments")
|
|
280
|
+
if budget.get("max_hours"):
|
|
281
|
+
parts.append(f"{budget['max_hours']} hours")
|
|
282
|
+
return "Budget set: " + ", ".join(parts) if parts else "Budget set (no constraints)"
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
# --- Report Formatting ---
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def format_budget_report(report: dict) -> str:
    """Format a budget report dict as markdown.

    Handles the result shapes produced by this module: error dicts,
    ``set`` / ``reset`` / ``status`` reports (selected by their ``action``
    key), and the result of ``check_budget_allows`` (which has an
    ``allowed`` key and no ``action``).

    Args:
        report: Result dict from one of the budget operations.

    Returns:
        Markdown text, ``"ERROR: ..."`` for error dicts, or
        ``"Unknown action: ..."`` for an unrecognised report.
    """
    if "error" in report:
        return f"ERROR: {report['error']}"

    # check_budget_allows() results carry no "action" key; without this
    # branch the CLI's `check` command would print "Unknown action: ?".
    if "action" not in report and "allowed" in report:
        check_lines = [
            "# Budget Check",
            "",
            f"**Allowed:** {'yes' if report['allowed'] else 'no'}",
            f"**Reason:** {report.get('reason', '')}",
        ]
        if report.get("recommended_mode"):
            check_lines.append(f"**Recommended mode:** {report['recommended_mode']}")
        return "\n".join(check_lines)

    action = report.get("action", "?")

    if action == "set":
        return f"# Budget Set\n\n{report.get('message', '')}"

    if action == "reset":
        return f"# Budget Reset\n\n{report.get('message', '')}"

    if action != "status":
        return f"Unknown action: {action}"

    usage = report.get("usage", {})
    phase = report.get("phase", "?")

    lines = [
        "# Budget Status",
        "",
    ]

    # Experiments (only when an experiment limit is set)
    if usage.get("experiments_max"):
        pct = usage["experiments_used"] / usage["experiments_max"] * 100
        lines.append(
            f"**Experiments:** {usage['experiments_used']}/{usage['experiments_max']} "
            f"used ({pct:.0f}%), {usage.get('experiments_remaining', 0)} remaining"
        )

    # Time (only when an hours limit is set)
    if usage.get("hours_max"):
        pct = usage["hours_used"] / usage["hours_max"] * 100
        # ``or 0`` keeps the float format valid if the value is null.
        hours_remaining = usage.get("hours_remaining") or 0
        lines.append(
            f"**Time:** {usage['hours_used']:.1f}/{usage['hours_max']:.1f}h "
            f"used ({pct:.0f}%), {hours_remaining:.1f}h remaining"
        )

    # Burn rate
    burn = report.get("burn_rate")
    if burn:
        lines.append(f"**Burn rate:** {burn:.1f} experiments/hour")

    proj = report.get("projected_exhaustion_hours")
    if proj:
        lines.append(f"**Projected exhaustion:** ~{proj:.1f} hours")

    lines.append("")

    # Phase
    phase_labels = {
        "explore": "EXPLORE (try diverse hypotheses)",
        "mixed": "MIXED (explore promising, exploit best)",
        "exploit": "EXPLOIT ONLY (refine the winner)",
    }
    lines.append(f"**Phase:** {phase_labels.get(phase, phase)}")
    lines.append(f"**Recommended mode:** {report.get('recommended_mode', '?')}")

    # Allocation
    alloc = report.get("allocation", {})
    if alloc:
        lines.extend([
            "",
            "**Allocation:**",
            f"- Explore: {alloc.get('explore', 0)} experiments",
            f"- Exploit: {alloc.get('exploit', 0)} experiments",
        ])

    # Warning
    if report.get("warning"):
        lines.extend(["", f"**WARNING:** {report['warning']}"])

    # Auto-mode shift info. The explore phase ends (and mixed begins) at
    # EXPLORE_PHASE_END, so that is the boundary to announce. The limits may
    # be None for a budget that constrains only one dimension, so each is
    # checked before multiplying.
    if phase == "explore":
        experiments_max = usage.get("experiments_max")
        hours_max = usage.get("hours_max")
        if experiments_max:
            shift_at = experiments_max * EXPLORE_PHASE_END
            lines.extend(["", f"*Auto-shift to mixed mode at experiment {shift_at:.0f}*"])
        elif hours_max:
            shift_at_hours = hours_max * EXPLORE_PHASE_END
            lines.extend(["", f"*Auto-shift to mixed mode at {shift_at_hours:.1f}h*"])

    return "\n".join(lines)
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
def main() -> None:
    """Parse the command line, run the requested budget action, print it.

    The positional ``action`` selects one of set / status / reset / check.
    The result is printed as the formatted report, or as indented JSON when
    ``--json`` is given.
    """
    cli = argparse.ArgumentParser(description="Compute budget manager")
    cli.add_argument("action", choices=["set", "status", "reset", "check"], help="Budget action")
    cli.add_argument("--experiments", type=int, help="Max experiments budget")
    cli.add_argument("--hours", type=float, help="Max hours budget")
    cli.add_argument(
        "--state",
        default=DEFAULT_STATE_PATH,
        help=f"Path to experiment state (default: {DEFAULT_STATE_PATH})",
    )
    cli.add_argument("--log", default=DEFAULT_LOG_PATH, help="Path to experiment log")
    cli.add_argument("--json", action="store_true", help="Output raw JSON instead of formatted report")
    opts = cli.parse_args()

    # One zero-argument callable per action; only the selected one runs.
    handlers = {
        "set": lambda: set_budget(opts.experiments, opts.hours, opts.state),
        "status": lambda: get_budget_status(opts.state, opts.log),
        "reset": lambda: reset_budget(opts.state),
        "check": lambda: check_budget_allows(opts.state, opts.log),
    }
    handler = handlers.get(opts.action)
    if handler is None:
        result = {"error": f"Unknown action: {opts.action}"}
    else:
        result = handler()

    if opts.json:
        rendered = json.dumps(result, indent=2, default=str)
    else:
        rendered = format_budget_report(result)
    print(rendered)
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
if __name__ == "__main__":
|
|
419
|
+
main()
|
|
@@ -326,6 +326,51 @@ def load_ensemble_results(ensemble_dir: str = "experiments/ensembles") -> list[d
|
|
|
326
326
|
return reports
|
|
327
327
|
|
|
328
328
|
|
|
329
|
+
def load_budget_status(state_path: str = "experiment_state.yaml", log_path: str = "experiments/log.jsonl") -> dict | None:
|
|
330
|
+
"""Load budget status if active."""
|
|
331
|
+
try:
|
|
332
|
+
from scripts.budget_manager import get_budget_status
|
|
333
|
+
result = get_budget_status(state_path, log_path)
|
|
334
|
+
if "error" not in result:
|
|
335
|
+
return result
|
|
336
|
+
except (ImportError, Exception):
|
|
337
|
+
pass
|
|
338
|
+
return None
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
def load_scaling_results(scaling_dir: str = "experiments/scaling") -> list[dict]:
    """Collect scaling study reports from ``scale-*.yaml`` files.

    Files are visited in sorted path order. A file is kept only when it
    parses to a non-empty mapping containing a ``verdict`` key; files that
    cannot be read or parsed are skipped.

    Args:
        scaling_dir: Directory holding the scaling study YAML files.

    Returns:
        The accepted reports, or an empty list when the directory does not
        exist or holds no usable file.
    """
    root = Path(scaling_dir)
    studies = []
    if not root.exists():
        return studies

    for candidate in sorted(root.glob("scale-*.yaml")):
        try:
            with candidate.open() as handle:
                parsed = yaml.safe_load(handle)
        except (yaml.YAMLError, OSError):
            continue
        if isinstance(parsed, dict) and parsed and "verdict" in parsed:
            studies.append(parsed)

    return studies
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def load_audit_report(audit_dir: str = "experiments/audits") -> dict | None:
|
|
359
|
+
"""Load the most recent audit report."""
|
|
360
|
+
path = Path(audit_dir)
|
|
361
|
+
if not path.exists():
|
|
362
|
+
return None
|
|
363
|
+
files = sorted(path.glob("audit-*.yaml"))
|
|
364
|
+
if not files:
|
|
365
|
+
return None
|
|
366
|
+
try:
|
|
367
|
+
with open(files[-1]) as f:
|
|
368
|
+
report = yaml.safe_load(f)
|
|
369
|
+
return report if isinstance(report, dict) else None
|
|
370
|
+
except (yaml.YAMLError, OSError):
|
|
371
|
+
return None
|
|
372
|
+
|
|
373
|
+
|
|
329
374
|
def format_brief(
|
|
330
375
|
campaign: dict,
|
|
331
376
|
best: dict | None,
|
|
@@ -345,6 +390,9 @@ def format_brief(
|
|
|
345
390
|
queue_summary: dict | None = None,
|
|
346
391
|
regression_checks: list[dict] | None = None,
|
|
347
392
|
ensemble_results: list[dict] | None = None,
|
|
393
|
+
budget_status: dict | None = None,
|
|
394
|
+
scaling_results: list[dict] | None = None,
|
|
395
|
+
audit_report: dict | None = None,
|
|
348
396
|
) -> str:
|
|
349
397
|
"""Format the research briefing as markdown."""
|
|
350
398
|
direction = "lower" if lower_is_better else "higher"
|
|
@@ -579,6 +627,53 @@ def format_brief(
|
|
|
579
627
|
else:
|
|
580
628
|
lines.append(f"- {n_models}-model ensemble: no improvement over best single")
|
|
581
629
|
|
|
630
|
+
# Budget status
|
|
631
|
+
if budget_status and budget_status.get("usage"):
|
|
632
|
+
usage = budget_status["usage"]
|
|
633
|
+
phase = budget_status.get("phase", "?")
|
|
634
|
+
lines.extend(["", "## Budget", ""])
|
|
635
|
+
if usage.get("experiments_max"):
|
|
636
|
+
lines.append(
|
|
637
|
+
f"- **Experiments:** {usage['experiments_used']}/{usage['experiments_max']} "
|
|
638
|
+
f"({usage['budget_fraction']:.0%} used)"
|
|
639
|
+
)
|
|
640
|
+
if usage.get("hours_max"):
|
|
641
|
+
lines.append(f"- **Time:** {usage['hours_used']:.1f}/{usage['hours_max']:.1f}h")
|
|
642
|
+
lines.append(f"- **Phase:** {phase}")
|
|
643
|
+
if budget_status.get("exhausted"):
|
|
644
|
+
lines.append("- **STATUS: EXHAUSTED** — no more experiments will run")
|
|
645
|
+
|
|
646
|
+
# Scaling predictions
|
|
647
|
+
if scaling_results:
|
|
648
|
+
lines.extend(["", "## Scaling Predictions", ""])
|
|
649
|
+
for study in scaling_results:
|
|
650
|
+
verdict = study.get("verdict", {})
|
|
651
|
+
v = verdict.get("verdict", "?")
|
|
652
|
+
reason = verdict.get("reason", "")
|
|
653
|
+
lines.append(f"- **{v.upper()}**: {reason}")
|
|
654
|
+
|
|
655
|
+
# Methodology audit
|
|
656
|
+
if audit_report and audit_report.get("score"):
|
|
657
|
+
score = audit_report["score"]
|
|
658
|
+
verdict = audit_report.get("verdict", "?")
|
|
659
|
+
verdict_labels = {
|
|
660
|
+
"pass": "PASS",
|
|
661
|
+
"pass_with_warnings": "PASS (warnings)",
|
|
662
|
+
"needs_work": "NEEDS WORK",
|
|
663
|
+
"fail": "FAIL",
|
|
664
|
+
}
|
|
665
|
+
lines.extend(["", "## Methodology Audit", ""])
|
|
666
|
+
lines.append(
|
|
667
|
+
f"**{verdict_labels.get(verdict, verdict.upper())}** — "
|
|
668
|
+
f"{score.get('pass', 0)}/{score.get('checkable', 0)} checks passed, "
|
|
669
|
+
f"{score.get('fail', 0)} failure(s)"
|
|
670
|
+
)
|
|
671
|
+
actions = audit_report.get("actions", [])
|
|
672
|
+
if actions:
|
|
673
|
+
lines.append("")
|
|
674
|
+
for a in actions[:3]:
|
|
675
|
+
lines.append(f"- Fix: `{a['fix']}` ({a['check']})")
|
|
676
|
+
|
|
582
677
|
# Regression check history (stability)
|
|
583
678
|
if regression_checks:
|
|
584
679
|
lines.extend(["", "## Stability", ""])
|
|
@@ -670,6 +765,9 @@ def generate_brief(
|
|
|
670
765
|
queue_summary = load_queue_summary()
|
|
671
766
|
regression_checks = load_regression_checks()
|
|
672
767
|
ensemble_results = load_ensemble_results()
|
|
768
|
+
budget_status = load_budget_status(log_path=log_path)
|
|
769
|
+
scaling_results = load_scaling_results()
|
|
770
|
+
audit_report = load_audit_report()
|
|
673
771
|
|
|
674
772
|
return format_brief(
|
|
675
773
|
campaign, best, trajectory, model_types, hypotheses,
|
|
@@ -683,6 +781,9 @@ def generate_brief(
|
|
|
683
781
|
queue_summary=queue_summary,
|
|
684
782
|
regression_checks=regression_checks if regression_checks else None,
|
|
685
783
|
ensemble_results=ensemble_results if ensemble_results else None,
|
|
784
|
+
budget_status=budget_status,
|
|
785
|
+
scaling_results=scaling_results if scaling_results else None,
|
|
786
|
+
audit_report=audit_report,
|
|
686
787
|
)
|
|
687
788
|
|
|
688
789
|
|