claude-turing 2.5.0 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +7 -2
- package/commands/audit.md +56 -0
- package/commands/baseline.md +45 -0
- package/commands/leak.md +47 -0
- package/commands/sanity.md +48 -0
- package/commands/transfer.md +54 -0
- package/commands/turing.md +10 -0
- package/package.json +1 -1
- package/src/install.js +2 -0
- package/src/verify.js +5 -0
- package/templates/scripts/__pycache__/generate_baselines.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/knowledge_transfer.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/leakage_detector.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/methodology_audit.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/sanity_checks.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/generate_baselines.py +423 -0
- package/templates/scripts/generate_brief.py +41 -0
- package/templates/scripts/knowledge_transfer.py +618 -0
- package/templates/scripts/leakage_detector.py +402 -0
- package/templates/scripts/methodology_audit.py +451 -0
- package/templates/scripts/sanity_checks.py +503 -0
- package/templates/scripts/scaffold.py +10 -0
package/templates/scripts/generate_baselines.py (new file)

@@ -0,0 +1,423 @@
+#!/usr/bin/env python3
+"""Automatic baseline generation for the autoresearch pipeline.
+
+Auto-generates trivial baselines (majority, mean, random, linear, k-NN)
+so every experiment has a "is this better than dumb?" reference point.
+
+Usage:
+    python scripts/generate_baselines.py
+    python scripts/generate_baselines.py --methods all
+    python scripts/generate_baselines.py --methods simple
+    python scripts/generate_baselines.py --json
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+import numpy as np
+import yaml
+
+from scripts.turing_io import load_config, load_experiments
+
+DEFAULT_LOG_PATH = "experiments/log.jsonl"
+
+# Baseline method groups
+SIMPLE_METHODS = ["random", "majority_or_mean"]
+LINEAR_METHODS = ["linear"]
+ALL_METHODS = ["random", "majority_or_mean", "stratified_or_median", "linear", "knn"]
+
+
+# --- Baseline Methods ---
+
+
+def random_baseline(y: np.ndarray, task_type: str = "classification") -> np.ndarray:
+    """Random predictions."""
+    n = len(y)
+    if task_type == "classification":
+        classes = np.unique(y)
+        return np.random.choice(classes, size=n)
+    else:
+        return np.random.uniform(np.min(y), np.max(y), size=n)
+
+
+def majority_or_mean_baseline(y: np.ndarray, task_type: str = "classification") -> np.ndarray:
+    """Majority class (classification) or mean (regression)."""
+    n = len(y)
+    if task_type == "classification":
+        from scipy import stats as scipy_stats
+        mode_result = scipy_stats.mode(y, keepdims=False)
+        majority = mode_result.mode
+        return np.full(n, majority)
+    else:
+        return np.full(n, np.mean(y))
+
+
+def stratified_or_median_baseline(y: np.ndarray, task_type: str = "classification") -> np.ndarray:
+    """Stratified random (classification) or median (regression)."""
+    n = len(y)
+    if task_type == "classification":
+        classes, counts = np.unique(y, return_counts=True)
+        probs = counts / counts.sum()
+        return np.random.choice(classes, size=n, p=probs)
+    else:
+        return np.full(n, np.median(y))
+
+
+def linear_baseline(
+    X: np.ndarray,
+    y: np.ndarray,
+    task_type: str = "classification",
+) -> dict:
+    """Linear model baseline (LogisticRegression / Ridge)."""
+    from sklearn.linear_model import LogisticRegression, Ridge
+
+    n_samples = X.shape[0]
+    split = int(n_samples * 0.7)
+    X_train, X_test = X[:split], X[split:]
+    y_train, y_test = y[:split], y[split:]
+
+    if task_type == "classification":
+        model = LogisticRegression(max_iter=1000, solver="lbfgs")
+    else:
+        model = Ridge(alpha=1.0)
+
+    model.fit(X_train, y_train)
+    predictions = model.predict(X_test)
+
+    return {
+        "predictions": predictions,
+        "labels": y_test,
+        "model_name": "LogisticRegression" if task_type == "classification" else "Ridge",
+    }
+
+
+def knn_baseline(
+    X: np.ndarray,
+    y: np.ndarray,
+    task_type: str = "classification",
+    n_neighbors: int = 5,
+) -> dict:
+    """k-NN baseline."""
+    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
+
+    n_samples = X.shape[0]
+    split = int(n_samples * 0.7)
+    X_train, X_test = X[:split], X[split:]
+    y_train, y_test = y[:split], y[split:]
+
+    k = min(n_neighbors, len(X_train))
+    if task_type == "classification":
+        model = KNeighborsClassifier(n_neighbors=k)
+    else:
+        model = KNeighborsRegressor(n_neighbors=k)
+
+    model.fit(X_train, y_train)
+    predictions = model.predict(X_test)
+
+    return {
+        "predictions": predictions,
+        "labels": y_test,
+        "model_name": f"k-NN (k={k})",
+    }
+
+
+# --- Evaluation ---
+
+
+def evaluate_predictions(
+    predictions: np.ndarray,
+    labels: np.ndarray,
+    task_type: str = "classification",
+    primary_metric: str = "accuracy",
+) -> dict:
+    """Evaluate baseline predictions."""
+    min_len = min(len(predictions), len(labels))
+    predictions = predictions[:min_len]
+    labels = labels[:min_len]
+
+    if task_type == "classification":
+        accuracy = float(np.mean(predictions == labels))
+        return {"accuracy": round(accuracy, 6), "n_samples": min_len}
+    else:
+        mse = float(np.mean((predictions - labels) ** 2))
+        rmse = float(np.sqrt(mse))
+        return {"mse": round(mse, 6), "rmse": round(rmse, 6), "n_samples": min_len}
+
+
+# --- Full Pipeline ---
+
+
+def generate_baselines(
+    methods: str = "all",
+    config_path: str = "config.yaml",
+    log_path: str = DEFAULT_LOG_PATH,
+    data_path: str | None = None,
+) -> dict:
+    """Generate baseline results.
+
+    Args:
+        methods: Method group (all, simple, linear).
+        config_path: Path to config.yaml.
+        log_path: Path to experiment log.
+        data_path: Path to data (optional, for linear/knn).
+
+    Returns:
+        Baseline report dict.
+    """
+    config = load_config(config_path)
+    eval_cfg = config.get("evaluation", {})
+    primary_metric = eval_cfg.get("primary_metric", "accuracy")
+    task_type = config.get("task", {}).get("type", "classification")
+
+    experiments = load_experiments(log_path)
+
+    # Find current best for comparison
+    kept = [e for e in experiments if e.get("status") == "kept"]
+    current_best_value = None
+    if kept:
+        best = max(kept, key=lambda e: e.get("metrics", {}).get(primary_metric, 0))
+        current_best_value = best.get("metrics", {}).get(primary_metric)
+
+    # Select methods
+    if methods == "simple":
+        method_list = SIMPLE_METHODS
+    elif methods == "linear":
+        method_list = LINEAR_METHODS
+    else:
+        method_list = ALL_METHODS
+
+    # For methods that need data, check if data is available
+    has_data = data_path is not None and Path(data_path).exists()
+
+    report = {
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+        "task_type": task_type,
+        "primary_metric": primary_metric,
+        "methods_requested": methods,
+        "baselines": [],
+        "current_best": current_best_value,
+        "data_available": has_data,
+    }
+
+    # Generate synthetic labels for demo if no data
+    # In real use, evaluate.py would provide these
+    if not has_data:
+        report["note"] = "No data loaded — baseline plan generated. Run with --data to compute actual scores."
+        for method in method_list:
+            report["baselines"].append({
+                "method": _method_display_name(method, task_type),
+                "metric_value": None,
+                "notes": "Requires data",
+            })
+        return report
+
+    # Load data
+    try:
+        data = np.load(data_path, allow_pickle=True)
+        X = data.get("X", data.get("features"))
+        y = data.get("y", data.get("labels", data.get("target")))
+        if X is None or y is None:
+            return {"error": f"Data file {data_path} missing X/y arrays"}
+    except Exception as e:
+        return {"error": f"Failed to load data: {e}"}
+
+    # Run baselines
+    for method in method_list:
+        result = _run_baseline(method, X, y, task_type, primary_metric)
+        report["baselines"].append(result)
+
+    # Add current best for comparison
+    if current_best_value is not None:
+        report["baselines"].append({
+            "method": "Current best",
+            "metric_value": current_best_value,
+            "notes": "",
+        })
+
+    # Compute improvement over linear baseline
+    linear_result = next((b for b in report["baselines"] if "linear" in b.get("method", "").lower()), None)
+    if linear_result and linear_result.get("metric_value") and current_best_value:
+        improvement = current_best_value - linear_result["metric_value"]
+        report["improvement_over_linear"] = round(improvement, 6)
+
+    return report
+
+
+def _method_display_name(method: str, task_type: str) -> str:
+    """Human-readable method name."""
+    names = {
+        "random": "Random",
+        "majority_or_mean": "Majority class" if task_type == "classification" else "Mean predictor",
+        "stratified_or_median": "Stratified random" if task_type == "classification" else "Median predictor",
+        "linear": "Logistic Regression" if task_type == "classification" else "Ridge Regression",
+        "knn": "k-NN (k=5)",
+    }
+    return names.get(method, method)
+
+
+def _run_baseline(
+    method: str,
+    X: np.ndarray,
+    y: np.ndarray,
+    task_type: str,
+    primary_metric: str,
+) -> dict:
+    """Run a single baseline method."""
+    try:
+        if method == "random":
+            preds = random_baseline(y, task_type)
+            eval_result = evaluate_predictions(preds, y, task_type, primary_metric)
+            return {
+                "method": "Random",
+                "metric_value": eval_result.get(primary_metric, eval_result.get("accuracy", eval_result.get("rmse"))),
+                "notes": "Floor — below this = bug",
+            }
+
+        elif method == "majority_or_mean":
+            preds = majority_or_mean_baseline(y, task_type)
+            eval_result = evaluate_predictions(preds, y, task_type, primary_metric)
+            name = "Majority class" if task_type == "classification" else "Mean predictor"
+            return {
+                "method": name,
+                "metric_value": eval_result.get(primary_metric, eval_result.get("accuracy", eval_result.get("rmse"))),
+                "notes": "Naive floor",
+            }
+
+        elif method == "stratified_or_median":
+            preds = stratified_or_median_baseline(y, task_type)
+            eval_result = evaluate_predictions(preds, y, task_type, primary_metric)
+            name = "Stratified random" if task_type == "classification" else "Median predictor"
+            return {
+                "method": name,
+                "metric_value": eval_result.get(primary_metric, eval_result.get("accuracy", eval_result.get("rmse"))),
+                "notes": "",
+            }
+
+        elif method == "linear":
+            result = linear_baseline(X, y, task_type)
+            eval_result = evaluate_predictions(result["predictions"], result["labels"], task_type, primary_metric)
+            return {
+                "method": result["model_name"],
+                "metric_value": eval_result.get(primary_metric, eval_result.get("accuracy", eval_result.get("rmse"))),
+                "notes": "Linear ceiling",
+            }
+
+        elif method == "knn":
+            result = knn_baseline(X, y, task_type)
+            eval_result = evaluate_predictions(result["predictions"], result["labels"], task_type, primary_metric)
+            return {
+                "method": result["model_name"],
+                "metric_value": eval_result.get(primary_metric, eval_result.get("accuracy", eval_result.get("rmse"))),
+                "notes": "Non-parametric reference",
+            }
+
+    except Exception as e:
+        return {"method": method, "metric_value": None, "notes": f"Error: {e}"}
+
+    return {"method": method, "metric_value": None, "notes": "Unknown method"}
+
+
+# --- Report Formatting ---
+
+
+def save_baseline_report(report: dict, output_dir: str = "experiments/baselines") -> Path:
+    """Save baseline report to YAML."""
+    out_path = Path(output_dir)
+    out_path.mkdir(parents=True, exist_ok=True)
+
+    date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+    filepath = out_path / f"baselines-{date}.yaml"
+
+    with open(filepath, "w") as f:
+        yaml.dump(report, f, default_flow_style=False, sort_keys=False)
+
+    return filepath
+
+
+def format_baseline_report(report: dict) -> str:
+    """Format baseline report as markdown."""
+    if "error" in report:
+        return f"ERROR: {report['error']}"
+
+    task = report.get("task_type", "?")
+    metric = report.get("primary_metric", "metric")
+
+    lines = [
+        f"# Baselines for {task} ({metric})",
+        "",
+        f"*Generated {report.get('generated_at', 'N/A')[:19]}*",
+        "",
+    ]
+
+    baselines = report.get("baselines", [])
+    if baselines:
+        lines.append(f"| Method | {metric} | Notes |")
+        lines.append("|--------|--------|-------|")
+        for b in baselines:
+            val = b.get("metric_value")
+            val_str = f"{val:.4f}" if isinstance(val, (int, float)) else str(val or "N/A")
+            lines.append(f"| {b.get('method', '?')} | {val_str} | {b.get('notes', '')} |")
+        lines.append("")
+
+    improvement = report.get("improvement_over_linear")
+    if improvement is not None:
+        lines.append(f"**Your model beats the linear baseline by {improvement:+.4f} ({improvement / report.get('current_best', 1) * 100:.1f}%)**")
+        lines.append("")
+
+    if report.get("note"):
+        lines.append(f"*{report['note']}*")
+
+    return "\n".join(lines)
+
+
+def main() -> None:
+    """CLI entry point."""
+    parser = argparse.ArgumentParser(
+        description="Automatic baseline generation",
+    )
+    parser.add_argument(
+        "--methods", choices=["all", "simple", "linear"], default="all",
+        help="Baseline method group (default: all)",
+    )
+    parser.add_argument(
+        "--data",
+        help="Path to data file (.npz with X and y arrays)",
+    )
+    parser.add_argument(
+        "--config", default="config.yaml",
+        help="Path to config.yaml",
+    )
+    parser.add_argument(
+        "--log", default=DEFAULT_LOG_PATH,
+        help="Path to experiment log",
+    )
+    parser.add_argument(
+        "--json", action="store_true",
+        help="Output raw JSON instead of formatted report",
+    )
+    args = parser.parse_args()
+
+    report = generate_baselines(
+        methods=args.methods,
+        config_path=args.config,
+        log_path=args.log,
+        data_path=args.data,
+    )
+
+    if "error" not in report:
+        filepath = save_baseline_report(report)
+        print(f"Saved to {filepath}", file=sys.stderr)
+
+    if args.json:
+        print(json.dumps(report, indent=2, default=str))
+    else:
+        print(format_baseline_report(report))


+if __name__ == "__main__":
+    main()
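For orientation, a minimal smoke test for the new script, assuming numpy, scipy, scikit-learn, and PyYAML are installed and that the config.yaml / experiments/log.jsonl layout the script expects is present. The file name demo.npz and all values below are hypothetical, not part of the package.

# Hypothetical smoke test for scripts/generate_baselines.py (illustration only).
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))        # 200 samples, 5 features
y = (X[:, 0] > 0).astype(int)        # binary labels derived from the first feature
np.savez("demo.npz", X=X, y=y)       # --data expects an .npz holding X and y arrays

# Then, from the project root:
#   python scripts/generate_baselines.py --data demo.npz --json
#   python scripts/generate_baselines.py --methods simple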
package/templates/scripts/generate_brief.py

@@ -355,6 +355,22 @@ def load_scaling_results(scaling_dir: str = "experiments/scaling") -> list[dict]
    return reports


+def load_audit_report(audit_dir: str = "experiments/audits") -> dict | None:
+    """Load the most recent audit report."""
+    path = Path(audit_dir)
+    if not path.exists():
+        return None
+    files = sorted(path.glob("audit-*.yaml"))
+    if not files:
+        return None
+    try:
+        with open(files[-1]) as f:
+            report = yaml.safe_load(f)
+        return report if isinstance(report, dict) else None
+    except (yaml.YAMLError, OSError):
+        return None
+
+
def format_brief(
    campaign: dict,
    best: dict | None,

@@ -376,6 +392,7 @@ def format_brief
    ensemble_results: list[dict] | None = None,
    budget_status: dict | None = None,
    scaling_results: list[dict] | None = None,
+    audit_report: dict | None = None,
) -> str:
    """Format the research briefing as markdown."""
    direction = "lower" if lower_is_better else "higher"

@@ -635,6 +652,28 @@ def format_brief
        reason = verdict.get("reason", "")
        lines.append(f"- **{v.upper()}**: {reason}")

+    # Methodology audit
+    if audit_report and audit_report.get("score"):
+        score = audit_report["score"]
+        verdict = audit_report.get("verdict", "?")
+        verdict_labels = {
+            "pass": "PASS",
+            "pass_with_warnings": "PASS (warnings)",
+            "needs_work": "NEEDS WORK",
+            "fail": "FAIL",
+        }
+        lines.extend(["", "## Methodology Audit", ""])
+        lines.append(
+            f"**{verdict_labels.get(verdict, verdict.upper())}** — "
+            f"{score.get('pass', 0)}/{score.get('checkable', 0)} checks passed, "
+            f"{score.get('fail', 0)} failure(s)"
+        )
+        actions = audit_report.get("actions", [])
+        if actions:
+            lines.append("")
+            for a in actions[:3]:
+                lines.append(f"- Fix: `{a['fix']}` ({a['check']})")
+
    # Regression check history (stability)
    if regression_checks:
        lines.extend(["", "## Stability", ""])

@@ -728,6 +767,7 @@ def generate_brief(
    ensemble_results = load_ensemble_results()
    budget_status = load_budget_status(log_path=log_path)
    scaling_results = load_scaling_results()
+    audit_report = load_audit_report()

    return format_brief(
        campaign, best, trajectory, model_types, hypotheses,

@@ -743,6 +783,7 @@ def generate_brief(
        ensemble_results=ensemble_results if ensemble_results else None,
        budget_status=budget_status,
        scaling_results=scaling_results if scaling_results else None,
+        audit_report=audit_report,
    )

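Judging only from the keys read above (verdict, score['pass'] / score['checkable'] / score['fail'], and actions[].fix / actions[].check), the audit-*.yaml files that load_audit_report picks up from experiments/audits/ appear to follow a shape like the sketch below. Every concrete value is hypothetical and not taken from the package.

# Hypothetical audit report, shown as the dict yaml.safe_load would return from
# experiments/audits/audit-2025-01-01.yaml; field names inferred from the hunks above.
audit_report = {
    "verdict": "pass_with_warnings",   # rendered via verdict_labels in the brief
    "score": {"pass": 11, "checkable": 13, "fail": 1},
    "actions": [                       # only the first three are rendered in the brief
        {"check": "seed logging", "fix": "record the RNG seed in experiments/log.jsonl"},
    ],
}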