claude-turing 3.0.0 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +5 -2
- package/commands/baseline.md +45 -0
- package/commands/leak.md +47 -0
- package/commands/sanity.md +48 -0
- package/commands/turing.md +6 -0
- package/package.json +1 -1
- package/src/install.js +1 -0
- package/src/verify.js +3 -0
- package/templates/scripts/__pycache__/generate_baselines.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/leakage_detector.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/sanity_checks.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/generate_baselines.py +423 -0
- package/templates/scripts/leakage_detector.py +402 -0
- package/templates/scripts/sanity_checks.py +503 -0
- package/templates/scripts/scaffold.py +6 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "turing",
|
|
3
|
-
"version": "3.
|
|
4
|
-
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol.
|
|
3
|
+
"version": "3.1.0",
|
|
4
|
+
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 44 commands, 2 specialized agents, pre-training intelligence (sanity checks + baseline generation + leakage detection), meta-intelligence (cross-project knowledge transfer + methodology audit), scaling & efficiency (scaling laws + compute budget + model distillation), model composition (ensemble + pipeline stitch + warm-start), deep analysis (experiment diff + live training monitor + regression gate), experiment orchestration (batch queue + smart retry + branching), literature integration + paper drafting, production model export, performance profiling, smart checkpoints, experiment intelligence, statistical rigor, tree-search hypothesis exploration, cost-performance frontier, model cards, model registry, hypothesis database with novelty guard, anti-cheating guardrails, and the taste-leverage loop. Inspired by Karpathy's autoresearch and the scientific method itself.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "pragnition"
|
|
7
7
|
},
|
package/README.md
CHANGED
|
@@ -352,6 +352,9 @@ The index (`hypotheses.yaml`) is the lightweight queue. The detail files (`hypot
|
|
|
352
352
|
| `/turing:distill <exp-id>` | Model compression — distill teacher into smaller student with accuracy/size tradeoff |
|
|
353
353
|
| `/turing:transfer [--from]` | Cross-project knowledge transfer — find similar projects, surface what worked |
|
|
354
354
|
| `/turing:audit [--strict]` | Pre-submission methodology audit — data leakage, baselines, seeds, ablations, reproducibility |
|
|
355
|
+
| `/turing:sanity [--quick]` | Pre-training sanity checks — initial loss, single-batch overfit, gradient flow, output validation |
|
|
356
|
+
| `/turing:baseline [--methods]` | Automatic baseline generation — random, majority/mean, linear, k-NN |
|
|
357
|
+
| `/turing:leak [--deep]` | Targeted leakage detection — single-feature tests, correlation, train/test overlap |
|
|
355
358
|
|
|
356
359
|
And for fully hands-off operation:
|
|
357
360
|
|
|
@@ -536,11 +539,11 @@ Each project gets independent config, data, experiments, models, and agent memor
|
|
|
536
539
|
|
|
537
540
|
## Architecture of Turing Itself
|
|
538
541
|
|
|
539
|
-
|
|
542
|
+
44 commands, 2 agents, 10 config files, 63 template scripts, model registry, artifact contract, cost-performance frontier, model cards, tree-search exploration, statistical rigor, experiment intelligence, performance profiling, smart checkpoints, production model export, literature integration, paper section drafting, experiment orchestration (queue + retry + fork), deep analysis (diff + watch + regress), model composition (ensemble + stitch + warm), scaling & efficiency (scale + budget + distill), meta-intelligence (transfer + audit), pre-training intelligence (sanity + baseline + leak), 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.
|
|
540
543
|
|
|
541
544
|
```
|
|
542
545
|
turing/
|
|
543
|
-
├── commands/
|
|
546
|
+
├── commands/ 44 skill files (core + taste-leverage + reporting + exploration + statistical rigor + experiment intelligence + performance + deployment + research workflow + orchestration + deep analysis + model composition + scaling & efficiency + meta-intelligence + pre-training intelligence)
|
|
544
547
|
├── agents/ 2 agents (researcher: read/write, evaluator: read-only)
|
|
545
548
|
├── config/ 8 files (lifecycle, taxonomy, archetypes, novelty aliases)
|
|
546
549
|
├── templates/ Scaffolded into user projects by /turing:init
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: baseline
|
|
3
|
+
description: Automatic baseline generation — random, majority/mean, linear, k-NN baselines in 60 seconds. Every experiment needs a "is this better than dumb?" reference.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[--methods all|simple|linear] [--data data.npz]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Generate trivial baselines so you always know if your model is meaningfully better than simple approaches.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- `--methods all|simple|linear` — baseline group (default: all)
|
|
20
|
+
- `--data data.npz` — data file with X and y arrays
|
|
21
|
+
- `--json` — raw JSON output
|
|
22
|
+
|
|
23
|
+
3. **Run baseline generation:**
|
|
24
|
+
```bash
|
|
25
|
+
python scripts/generate_baselines.py $ARGUMENTS
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
4. **Baselines generated:**
|
|
29
|
+
- **Classification:** Random, Majority class, Stratified random, Logistic Regression, k-NN
|
|
30
|
+
- **Regression:** Random, Mean predictor, Median predictor, Ridge Regression, k-NN
|
|
31
|
+
- Each evaluated with the same protocol as real experiments
|
|
32
|
+
|
|
33
|
+
5. **Report includes:** comparison table with metric values and notes (floor, ceiling, reference)
|
|
34
|
+
|
|
35
|
+
6. **Integration:** satisfies the "baseline comparison" check in `/turing:audit`
|
|
36
|
+
|
|
37
|
+
7. **Saved output:** report in `experiments/baselines/baselines-*.yaml`
|
|
38
|
+
|
|
39
|
+
## Examples
|
|
40
|
+
|
|
41
|
+
```
|
|
42
|
+
/turing:baseline # All baselines
|
|
43
|
+
/turing:baseline --methods simple # Just random + majority
|
|
44
|
+
/turing:baseline --data data/processed.npz # With actual data
|
|
45
|
+
```
|
package/commands/leak.md
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: leak
|
|
3
|
+
description: Targeted leakage detection — probe for data leakage with single-feature tests, correlation checks, and train/test overlap detection.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[--deep] [--features feature_1,feature_2]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Actively probe for data leakage. The #1 cause of "too good to be true" results.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- `--deep` — run full single-feature analysis (slow but thorough)
|
|
20
|
+
- `--features "feat_1,feat_2"` — check specific features
|
|
21
|
+
- `--json` — raw JSON output
|
|
22
|
+
|
|
23
|
+
3. **Run leakage scan:**
|
|
24
|
+
```bash
|
|
25
|
+
python scripts/leakage_detector.py $ARGUMENTS
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
4. **Checks performed:**
|
|
29
|
+
- **Feature-target correlation:** flag features with >0.95 correlation to target
|
|
30
|
+
- **Single-feature predictiveness (--deep):** train on each feature alone, flag any that achieve >80% of full model performance
|
|
31
|
+
- **Train/test overlap:** hash-based deduplication across splits
|
|
32
|
+
|
|
33
|
+
5. **Verdicts:**
|
|
34
|
+
- **CLEAN** — no leakage detected
|
|
35
|
+
- **SUSPICIOUS** — warnings to review
|
|
36
|
+
- **LEAKAGE DETECTED** — critical flags found
|
|
37
|
+
|
|
38
|
+
6. **Integration:** satisfies the "data leakage" check in `/turing:audit`
|
|
39
|
+
|
|
40
|
+
7. **Saved output:** report in `experiments/leakage/leak-*.yaml`
|
|
41
|
+
|
|
42
|
+
## Examples
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
/turing:leak # Standard correlation + overlap checks
|
|
46
|
+
/turing:leak --deep # Full single-feature analysis
|
|
47
|
+
```
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: sanity
|
|
3
|
+
description: Pre-training sanity checks — catch broken data loaders, misconfigured losses, and dead gradients in 30 seconds before wasting hours.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[--quick] [--verbose]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Run a battery of fast checks before committing to a full training run. Catches wiring bugs in seconds.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- `--quick` — skip single-batch overfit test (fastest, ~5 seconds)
|
|
20
|
+
- `--verbose` — show detailed check output
|
|
21
|
+
- `--json` — raw JSON output
|
|
22
|
+
|
|
23
|
+
3. **Run sanity checks:**
|
|
24
|
+
```bash
|
|
25
|
+
python scripts/sanity_checks.py $ARGUMENTS
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
4. **Checks performed:**
|
|
29
|
+
- **Data pipeline** (critical): first batch loads, shapes match, no NaN/Inf
|
|
30
|
+
- **Initial loss** (high): loss at initialization matches theory (e.g., -log(1/C) for cross-entropy)
|
|
31
|
+
- **Gradient flow** (high): all parameters have non-zero, non-exploding gradients
|
|
32
|
+
- **Single-batch overfit** (critical): model can memorize 1 batch in 50 steps — if not, something is broken
|
|
33
|
+
- **Output validation** (high): predictions are non-NaN, non-constant, reasonable range
|
|
34
|
+
- **Config consistency** (medium): learning rate, batch size in reasonable ranges
|
|
35
|
+
|
|
36
|
+
5. **Verdicts:**
|
|
37
|
+
- **PASS** — safe to proceed
|
|
38
|
+
- **PASS (with warnings)** — review before training
|
|
39
|
+
- **FAIL** — do not proceed, fix issues first
|
|
40
|
+
|
|
41
|
+
6. **Saved output:** report in `experiments/sanity/sanity-*.yaml`
|
|
42
|
+
|
|
43
|
+
## Examples
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
/turing:sanity # Full check (~30 seconds)
|
|
47
|
+
/turing:sanity --quick # Skip overfit test (~5 seconds)
|
|
48
|
+
```
|
package/commands/turing.md
CHANGED
|
@@ -50,6 +50,9 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
50
50
|
| "distill", "compress", "smaller model", "student model", "knowledge distillation", "model compression" | `/turing:distill` | Deploy |
|
|
51
51
|
| "transfer", "what worked before", "similar project", "cross-project", "institutional knowledge", "prior projects" | `/turing:transfer` | Research |
|
|
52
52
|
| "audit", "methodology check", "pre-submission", "reviewer checklist", "data leakage", "missing baselines" | `/turing:audit` | Validate |
|
|
53
|
+
| "sanity", "sanity check", "pre-training", "is it broken", "before training", "quick check" | `/turing:sanity` | Check |
|
|
54
|
+
| "baseline", "baselines", "trivial baseline", "majority class", "is it better than random" | `/turing:baseline` | Analyze |
|
|
55
|
+
| "leak", "leakage", "data leakage scan", "suspicious feature", "train test overlap" | `/turing:leak` | Validate |
|
|
53
56
|
|
|
54
57
|
## Sub-commands
|
|
55
58
|
|
|
@@ -96,6 +99,9 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
96
99
|
| `/turing:distill <exp-id>` | Model compression: distill teacher into smaller student model | (inline) |
|
|
97
100
|
| `/turing:transfer [--from]` | Cross-project knowledge transfer: find similar prior projects, surface what worked | (inline) |
|
|
98
101
|
| `/turing:audit [--strict]` | Pre-submission methodology audit: data leakage, baselines, seeds, ablations, reproducibility | (inline) |
|
|
102
|
+
| `/turing:sanity [--quick]` | Pre-training sanity checks: initial loss, overfit test, gradient flow, output validation | (inline) |
|
|
103
|
+
| `/turing:baseline [--methods]` | Automatic baseline generation: random, majority/mean, linear, k-NN | (inline) |
|
|
104
|
+
| `/turing:leak [--deep]` | Targeted leakage detection: single-feature tests, correlation, train/test overlap | (inline) |
|
|
99
105
|
|
|
100
106
|
## Proactive Detection
|
|
101
107
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "claude-turing",
|
|
3
|
-
"version": "3.
|
|
3
|
+
"version": "3.1.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Autonomous ML research harness for Claude Code. The autoresearch loop as a formal protocol — iteratively trains, evaluates, and improves ML models with structured experiment tracking, convergence detection, immutable evaluation infrastructure, and safety guardrails.",
|
|
6
6
|
"bin": {
|
package/src/install.js
CHANGED
package/src/verify.js
CHANGED
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,423 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Automatic baseline generation for the autoresearch pipeline.
|
|
3
|
+
|
|
4
|
+
Auto-generates trivial baselines (majority, mean, random, linear, k-NN)
|
|
5
|
+
so every experiment has a "is this better than dumb?" reference point.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
python scripts/generate_baselines.py
|
|
9
|
+
python scripts/generate_baselines.py --methods all
|
|
10
|
+
python scripts/generate_baselines.py --methods simple
|
|
11
|
+
python scripts/generate_baselines.py --json
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import json
|
|
18
|
+
import sys
|
|
19
|
+
from datetime import datetime, timezone
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
import numpy as np
|
|
23
|
+
import yaml
|
|
24
|
+
|
|
25
|
+
from scripts.turing_io import load_config, load_experiments
|
|
26
|
+
|
|
27
|
+
DEFAULT_LOG_PATH = "experiments/log.jsonl"
|
|
28
|
+
|
|
29
|
+
# Baseline method groups
|
|
30
|
+
SIMPLE_METHODS = ["random", "majority_or_mean"]
|
|
31
|
+
LINEAR_METHODS = ["linear"]
|
|
32
|
+
ALL_METHODS = ["random", "majority_or_mean", "stratified_or_median", "linear", "knn"]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# --- Baseline Methods ---
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def random_baseline(y: np.ndarray, task_type: str = "classification") -> np.ndarray:
    """Produce uniformly random predictions over the labels' observed support.

    Classification draws uniformly from the set of observed classes;
    regression draws uniformly from the interval [min(y), max(y)].
    Uses the global NumPy RNG state (unseeded by design).
    """
    n_samples = len(y)
    if task_type != "classification":
        # Regression: uniform draws over the observed label range.
        return np.random.uniform(np.min(y), np.max(y), size=n_samples)
    # Classification: uniform draws over the observed class set.
    return np.random.choice(np.unique(y), size=n_samples)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def majority_or_mean_baseline(y: np.ndarray, task_type: str = "classification") -> np.ndarray:
    """Predict a single constant: the majority class (classification) or the mean (regression).

    Ties between equally frequent classes resolve to the smallest class value
    (np.unique sorts ascending, so argmax picks the first — i.e. smallest —
    modal value, matching scipy.stats.mode semantics).
    """
    n_samples = len(y)
    if task_type == "classification":
        values, counts = np.unique(y, return_counts=True)
        return np.full(n_samples, values[np.argmax(counts)])
    return np.full(n_samples, np.mean(y))
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def stratified_or_median_baseline(y: np.ndarray, task_type: str = "classification") -> np.ndarray:
    """Predict by label-frequency sampling (classification) or the median (regression)."""
    n_samples = len(y)
    if task_type != "classification":
        return np.full(n_samples, np.median(y))
    values, counts = np.unique(y, return_counts=True)
    # Sample classes proportionally to their empirical frequencies.
    return np.random.choice(values, size=n_samples, p=counts / counts.sum())
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def linear_baseline(
    X: np.ndarray,
    y: np.ndarray,
    task_type: str = "classification",
) -> dict:
    """Fit a linear reference model on a 70/30 head/tail split and predict the tail.

    Uses LogisticRegression for classification, Ridge for regression.
    NOTE(review): the split is positional, not shuffled — assumes rows are not
    ordered by label; confirm upstream shuffling.

    Returns:
        dict with "predictions", held-out "labels", and "model_name".
    """
    from sklearn.linear_model import LogisticRegression, Ridge

    cut = int(X.shape[0] * 0.7)
    train_X, test_X = X[:cut], X[cut:]
    train_y, test_y = y[:cut], y[cut:]

    is_clf = task_type == "classification"
    estimator = (
        LogisticRegression(max_iter=1000, solver="lbfgs")
        if is_clf
        else Ridge(alpha=1.0)
    )
    estimator.fit(train_X, train_y)

    return {
        "predictions": estimator.predict(test_X),
        "labels": test_y,
        "model_name": "LogisticRegression" if is_clf else "Ridge",
    }
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def knn_baseline(
    X: np.ndarray,
    y: np.ndarray,
    task_type: str = "classification",
    n_neighbors: int = 5,
) -> dict:
    """Fit a k-NN reference model on a 70/30 head/tail split and predict the tail.

    k is clamped to the training-set size so tiny datasets still run.
    NOTE(review): the split is positional, not shuffled — assumes rows are not
    ordered by label; confirm upstream shuffling.

    Returns:
        dict with "predictions", held-out "labels", and "model_name".
    """
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

    cut = int(X.shape[0] * 0.7)
    train_X, test_X = X[:cut], X[cut:]
    train_y, test_y = y[:cut], y[cut:]

    k = min(n_neighbors, len(train_X))
    model_cls = KNeighborsClassifier if task_type == "classification" else KNeighborsRegressor
    estimator = model_cls(n_neighbors=k)
    estimator.fit(train_X, train_y)

    return {
        "predictions": estimator.predict(test_X),
        "labels": test_y,
        "model_name": f"k-NN (k={k})",
    }
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
# --- Evaluation ---
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def evaluate_predictions(
    predictions: np.ndarray,
    labels: np.ndarray,
    task_type: str = "classification",
    primary_metric: str = "accuracy",
) -> dict:
    """Score baseline predictions against labels.

    Both arrays are truncated to their common length before scoring.
    ``primary_metric`` is accepted for interface parity, but the metric set is
    fixed by ``task_type``: accuracy for classification, mse/rmse for regression.
    All metric values are rounded to 6 decimal places.
    """
    n = min(len(predictions), len(labels))
    preds, truth = predictions[:n], labels[:n]

    if task_type != "classification":
        mse = float(np.mean((preds - truth) ** 2))
        return {
            "mse": round(mse, 6),
            "rmse": round(float(np.sqrt(mse)), 6),
            "n_samples": n,
        }

    return {"accuracy": round(float(np.mean(preds == truth)), 6), "n_samples": n}
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# --- Full Pipeline ---
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def generate_baselines(
    methods: str = "all",
    config_path: str = "config.yaml",
    log_path: str = DEFAULT_LOG_PATH,
    data_path: str | None = None,
) -> dict:
    """Generate baseline results.

    Args:
        methods: Method group (all, simple, linear).
        config_path: Path to config.yaml.
        log_path: Path to experiment log.
        data_path: Path to data (optional, for linear/knn).

    Returns:
        Baseline report dict, or ``{"error": ...}`` when data loading fails.
    """
    config = load_config(config_path)
    eval_cfg = config.get("evaluation", {})
    primary_metric = eval_cfg.get("primary_metric", "accuracy")
    task_type = config.get("task", {}).get("type", "classification")

    experiments = load_experiments(log_path)

    # Find current best for comparison.
    # NOTE(review): max() assumes higher-is-better; for loss-like metrics this
    # picks the worst run — confirm primary_metric orientation in config.
    kept = [e for e in experiments if e.get("status") == "kept"]
    current_best_value = None
    if kept:
        best = max(kept, key=lambda e: e.get("metrics", {}).get(primary_metric, 0))
        current_best_value = best.get("metrics", {}).get(primary_metric)

    # Select methods
    if methods == "simple":
        method_list = SIMPLE_METHODS
    elif methods == "linear":
        method_list = LINEAR_METHODS
    else:
        method_list = ALL_METHODS

    # For methods that need data, check if data is available
    has_data = data_path is not None and Path(data_path).exists()

    report = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "task_type": task_type,
        "primary_metric": primary_metric,
        "methods_requested": methods,
        "baselines": [],
        "current_best": current_best_value,
        "data_available": has_data,
    }

    # Without data we can only emit the plan, not actual scores.
    if not has_data:
        report["note"] = "No data loaded — baseline plan generated. Run with --data to compute actual scores."
        for method in method_list:
            report["baselines"].append({
                "method": _method_display_name(method, task_type),
                "metric_value": None,
                "notes": "Requires data",
            })
        return report

    # Load data (errors are reported, not raised, so callers get a dict either way)
    try:
        data = np.load(data_path, allow_pickle=True)
        X = data.get("X", data.get("features"))
        y = data.get("y", data.get("labels", data.get("target")))
        if X is None or y is None:
            return {"error": f"Data file {data_path} missing X/y arrays"}
    except Exception as e:
        return {"error": f"Failed to load data: {e}"}

    # Run baselines
    for method in method_list:
        report["baselines"].append(_run_baseline(method, X, y, task_type, primary_metric))

    # Add current best for comparison
    if current_best_value is not None:
        report["baselines"].append({
            "method": "Current best",
            "metric_value": current_best_value,
            "notes": "",
        })

    # Compute improvement over the linear baseline.
    # Fixed: the linear metric is compared with `is not None` so a legitimate
    # score of 0.0 no longer silently suppresses the improvement figure.
    # current_best_value stays a truthiness check: it must be nonzero anyway,
    # since the report formatter divides by it for the relative percentage.
    linear_result = next(
        (b for b in report["baselines"] if "linear" in b.get("method", "").lower()),
        None,
    )
    if (
        linear_result is not None
        and linear_result.get("metric_value") is not None
        and current_best_value
    ):
        improvement = current_best_value - linear_result["metric_value"]
        report["improvement_over_linear"] = round(improvement, 6)

    return report
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def _method_display_name(method: str, task_type: str) -> str:
|
|
252
|
+
"""Human-readable method name."""
|
|
253
|
+
names = {
|
|
254
|
+
"random": "Random",
|
|
255
|
+
"majority_or_mean": "Majority class" if task_type == "classification" else "Mean predictor",
|
|
256
|
+
"stratified_or_median": "Stratified random" if task_type == "classification" else "Median predictor",
|
|
257
|
+
"linear": "Logistic Regression" if task_type == "classification" else "Ridge Regression",
|
|
258
|
+
"knn": "k-NN (k=5)",
|
|
259
|
+
}
|
|
260
|
+
return names.get(method, method)
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def _run_baseline(
    method: str,
    X: np.ndarray,
    y: np.ndarray,
    task_type: str,
    primary_metric: str,
) -> dict:
    """Run a single baseline method and package its score for the report.

    Returns a {"method", "metric_value", "notes"} dict. Failures land in
    ``notes`` rather than raising, so one broken baseline never aborts the run.
    """

    def _metric(scores):
        # Prefer the configured primary metric, then the task-type defaults.
        return scores.get(primary_metric, scores.get("accuracy", scores.get("rmse")))

    try:
        if method == "random":
            scores = evaluate_predictions(random_baseline(y, task_type), y, task_type, primary_metric)
            return {"method": "Random", "metric_value": _metric(scores), "notes": "Floor — below this = bug"}

        if method == "majority_or_mean":
            scores = evaluate_predictions(majority_or_mean_baseline(y, task_type), y, task_type, primary_metric)
            label = "Majority class" if task_type == "classification" else "Mean predictor"
            return {"method": label, "metric_value": _metric(scores), "notes": "Naive floor"}

        if method == "stratified_or_median":
            scores = evaluate_predictions(stratified_or_median_baseline(y, task_type), y, task_type, primary_metric)
            label = "Stratified random" if task_type == "classification" else "Median predictor"
            return {"method": label, "metric_value": _metric(scores), "notes": ""}

        if method == "linear":
            fit = linear_baseline(X, y, task_type)
            scores = evaluate_predictions(fit["predictions"], fit["labels"], task_type, primary_metric)
            return {"method": fit["model_name"], "metric_value": _metric(scores), "notes": "Linear ceiling"}

        if method == "knn":
            fit = knn_baseline(X, y, task_type)
            scores = evaluate_predictions(fit["predictions"], fit["labels"], task_type, primary_metric)
            return {"method": fit["model_name"], "metric_value": _metric(scores), "notes": "Non-parametric reference"}

    except Exception as e:
        return {"method": method, "metric_value": None, "notes": f"Error: {e}"}

    return {"method": method, "metric_value": None, "notes": "Unknown method"}
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
# --- Report Formatting ---
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def save_baseline_report(report: dict, output_dir: str = "experiments/baselines") -> Path:
    """Persist the baseline report as YAML and return the file path.

    Creates ``output_dir`` (and parents) if needed.
    NOTE(review): the filename is date-granular, so a second run on the same
    UTC day overwrites the earlier report — confirm that is intended.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    report_path = target_dir / f"baselines-{stamp}.yaml"

    with open(report_path, "w") as fh:
        yaml.dump(report, fh, default_flow_style=False, sort_keys=False)

    return report_path
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def format_baseline_report(report: dict) -> str:
    """Format a baseline report dict as a markdown table with summary lines.

    Error reports ({"error": ...}) are rendered as a single ERROR line.
    """
    if "error" in report:
        return f"ERROR: {report['error']}"

    task = report.get("task_type", "?")
    metric = report.get("primary_metric", "metric")

    lines = [
        f"# Baselines for {task} ({metric})",
        "",
        # Guarded: an explicit None generated_at no longer crashes the slice.
        f"*Generated {(report.get('generated_at') or 'N/A')[:19]}*",
        "",
    ]

    baselines = report.get("baselines", [])
    if baselines:
        lines.append(f"| Method | {metric} | Notes |")
        lines.append("|--------|--------|-------|")
        for b in baselines:
            val = b.get("metric_value")
            val_str = f"{val:.4f}" if isinstance(val, (int, float)) else str(val or "N/A")
            lines.append(f"| {b.get('method', '?')} | {val_str} | {b.get('notes', '')} |")
        lines.append("")

    improvement = report.get("improvement_over_linear")
    if improvement is not None:
        # Fixed: the relative-% computation previously did
        # `improvement / report.get('current_best', 1)`, which raised TypeError
        # when current_best was explicitly None, raised ZeroDivisionError when
        # it was 0, and silently divided by the default 1 (a misleading figure)
        # when the key was absent. Only show the percentage when it is valid.
        best = report.get("current_best")
        if isinstance(best, (int, float)) and best != 0:
            pct = improvement / best * 100
            lines.append(f"**Your model beats the linear baseline by {improvement:+.4f} ({pct:.1f}%)**")
        else:
            lines.append(f"**Your model beats the linear baseline by {improvement:+.4f}**")
        lines.append("")

    if report.get("note"):
        lines.append(f"*{report['note']}*")

    return "\n".join(lines)
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def main() -> None:
    """CLI entry point: parse args, generate baselines, save and print the report."""
    parser = argparse.ArgumentParser(description="Automatic baseline generation")
    parser.add_argument(
        "--methods", choices=["all", "simple", "linear"], default="all",
        help="Baseline method group (default: all)",
    )
    parser.add_argument("--data", help="Path to data file (.npz with X and y arrays)")
    parser.add_argument("--config", default="config.yaml", help="Path to config.yaml")
    parser.add_argument("--log", default=DEFAULT_LOG_PATH, help="Path to experiment log")
    parser.add_argument(
        "--json", action="store_true",
        help="Output raw JSON instead of formatted report",
    )
    args = parser.parse_args()

    report = generate_baselines(
        methods=args.methods,
        config_path=args.config,
        log_path=args.log,
        data_path=args.data,
    )

    # Persist successful reports; the path goes to stderr so stdout stays parseable.
    if "error" not in report:
        saved_path = save_baseline_report(report)
        print(f"Saved to {saved_path}", file=sys.stderr)

    if args.json:
        print(json.dumps(report, indent=2, default=str))
    else:
        print(format_baseline_report(report))


if __name__ == "__main__":
    main()
|