claude-turing 3.1.0 → 3.2.0
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +5 -2
- package/commands/calibrate.md +47 -0
- package/commands/sensitivity.md +41 -0
- package/commands/turing.md +6 -0
- package/commands/xray.md +43 -0
- package/package.json +1 -1
- package/src/install.js +1 -0
- package/src/verify.js +3 -0
- package/templates/scripts/__pycache__/calibration.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/model_xray.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/sensitivity_analysis.cpython-314.pyc +0 -0
- package/templates/scripts/calibration.py +364 -0
- package/templates/scripts/model_xray.py +317 -0
- package/templates/scripts/scaffold.py +6 -0
- package/templates/scripts/sensitivity_analysis.py +335 -0
package/.claude-plugin/plugin.json
CHANGED

@@ -1,7 +1,7 @@
 {
   "name": "turing",
-  "version": "3.1.0",
-  "description": "Autonomous ML research harness — the autoresearch loop as a formal protocol.
+  "version": "3.2.0",
+  "description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 47 commands, 2 specialized agents, model debugging (xray + sensitivity + calibration), pre-training intelligence (sanity checks + baseline generation + leakage detection), meta-intelligence (cross-project knowledge transfer + methodology audit), scaling & efficiency (scaling laws + compute budget + model distillation), model composition (ensemble + pipeline stitch + warm-start), deep analysis (experiment diff + live training monitor + regression gate), experiment orchestration (batch queue + smart retry + branching), literature integration + paper drafting, production model export, performance profiling, smart checkpoints, experiment intelligence, statistical rigor, tree-search hypothesis exploration, cost-performance frontier, model cards, model registry, hypothesis database with novelty guard, anti-cheating guardrails, and the taste-leverage loop. Inspired by Karpathy's autoresearch and the scientific method itself.",
   "author": {
     "name": "pragnition"
   },
package/README.md
CHANGED

@@ -355,6 +355,9 @@ The index (`hypotheses.yaml`) is the lightweight queue. The detail files (`hypot
 | `/turing:sanity [--quick]` | Pre-training sanity checks — initial loss, single-batch overfit, gradient flow, output validation |
 | `/turing:baseline [--methods]` | Automatic baseline generation — random, majority/mean, linear, k-NN |
 | `/turing:leak [--deep]` | Targeted leakage detection — single-feature tests, correlation, train/test overlap |
+| `/turing:xray [exp-id]` | Internal model diagnostics — gradient flow, dead neurons, weight distributions, tree analysis |
+| `/turing:sensitivity [exp-id]` | Hyperparameter sensitivity — rank parameters by impact, detect non-monotonic responses |
+| `/turing:calibrate [exp-id]` | Probability calibration — ECE/MCE, reliability diagrams, Platt/isotonic/temperature scaling |
 
 And for fully hands-off operation:
 
@@ -539,11 +542,11 @@ Each project gets independent config, data, experiments, models, and agent memor
 
 ## Architecture of Turing Itself
 
-
+47 commands, 2 agents, 10 config files, 66 template scripts, model registry, artifact contract, cost-performance frontier, model cards, tree-search exploration, statistical rigor, experiment intelligence, performance profiling, smart checkpoints, production model export, literature integration, paper section drafting, experiment orchestration (queue + retry + fork), deep analysis (diff + watch + regress), model composition (ensemble + stitch + warm), scaling & efficiency (scale + budget + distill), meta-intelligence (transfer + audit), pre-training intelligence (sanity + baseline + leak), model debugging (xray + sensitivity + calibrate), 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.
 
 ```
 turing/
-├── commands/
+├── commands/    46 skill files (core + taste-leverage + reporting + exploration + statistical rigor + experiment intelligence + performance + deployment + research workflow + orchestration + deep analysis + model composition + scaling & efficiency + meta-intelligence + pre-training intelligence + model debugging)
 ├── agents/      2 agents (researcher: read/write, evaluator: read-only)
 ├── config/      8 files (lifecycle, taxonomy, archetypes, novelty aliases)
 ├── templates/   Scaffolded into user projects by /turing:init
package/commands/calibrate.md
ADDED

@@ -0,0 +1,47 @@
+---
+name: calibrate
+description: Probability calibration — measure ECE, plot reliability diagrams, apply Platt scaling or isotonic regression.
+disable-model-invocation: true
+argument-hint: "[exp-id] [--method platt|isotonic|temperature|auto]"
+allowed-tools: Read, Bash(*), Grep, Glob
+---
+
+Make model probabilities trustworthy. Does 80% confidence actually mean 80% correct?
+
+## Steps
+
+1. **Activate environment:**
+   ```bash
+   source .venv/bin/activate
+   ```
+
+2. **Parse arguments from `$ARGUMENTS`:**
+   - Optional experiment ID
+   - `--method platt|isotonic|temperature|auto` — calibration method (default: auto)
+   - `--json` — raw JSON output
+
+3. **Run calibration:**
+   ```bash
+   python scripts/calibration.py $ARGUMENTS
+   ```
+
+4. **Report includes:**
+   - ECE/MCE before calibration
+   - Reliability diagram (predicted vs actual per bin)
+   - Calibration method comparison table
+   - Verdict: ALREADY CALIBRATED / IMPROVED / NO IMPROVEMENT
+
+5. **Methods:**
+   - **Platt:** logistic regression on logits
+   - **Isotonic:** non-parametric (more flexible, needs more data)
+   - **Temperature:** single scalar T parameter
+   - **Auto:** tries all, picks lowest ECE
+
+6. **Saved output:** report in `experiments/calibration/<exp-id>-calibration.yaml`
+
+## Examples
+
+```
+/turing:calibrate exp-042                 # Auto-select best method
+/turing:calibrate exp-042 --method platt  # Platt scaling only
+```
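The ECE that the report leads with is simple enough to check by hand. A minimal standalone sketch of the bin-wise computation, with toy probabilities and labels and two bins instead of the script's default ten (this mirrors the `compute_ece` added in `scripts/calibration.py` later in this diff, but is not that code):

```python
import numpy as np

# Toy binary-classifier outputs: predicted P(y=1) and true labels.
probs = np.array([0.9, 0.8, 0.7, 0.3, 0.2, 0.95])
labels = np.array([1, 1, 0, 0, 0, 1])

# Two bins for brevity: [0, 0.5) and [0.5, 1.0].
ece = 0.0
for lo, hi in [(0.0, 0.5), (0.5, 1.0)]:
    mask = (probs >= lo) & ((probs < hi) if hi < 1.0 else (probs <= hi))
    if mask.sum() == 0:
        continue
    confidence = probs[mask].mean()  # average predicted probability in the bin
    accuracy = labels[mask].mean()   # observed frequency of the positive class
    ece += abs(accuracy - confidence) * mask.sum() / len(probs)

print(f"ECE = {ece:.4f}")  # ~0.142 here: noticeably miscalibrated
```

A well-calibrated model drives every per-bin |accuracy − confidence| gap toward zero, which is exactly what the post-hoc methods in step 5 try to achieve.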
package/commands/sensitivity.md
ADDED

@@ -0,0 +1,41 @@
+---
+name: sensitivity
+description: Hyperparameter sensitivity analysis — rank parameters by impact, identify which matter and which are noise.
+disable-model-invocation: true
+argument-hint: "[exp-id] [--params learning_rate,max_depth]"
+allowed-tools: Read, Bash(*), Grep, Glob
+---
+
+Which hyperparameters actually matter? Stop wasting time on the ones that don't.
+
+## Steps
+
+1. **Activate environment:**
+   ```bash
+   source .venv/bin/activate
+   ```
+
+2. **Parse arguments from `$ARGUMENTS`:**
+   - Optional experiment ID
+   - `--params "learning_rate,max_depth"` — specific parameters to analyze
+   - `--json` — raw JSON output
+
+3. **Run sensitivity analysis:**
+   ```bash
+   python scripts/sensitivity_analysis.py $ARGUMENTS
+   ```
+
+4. **Report includes:**
+   - Per-parameter sensitivity ranking: HIGH / MED / LOW / NONE
+   - Metric range for each parameter sweep
+   - Monotonicity detection (is there a sweet spot?)
+   - Recommendations: focus tuning on X, stop tuning Y
+
+5. **Saved output:** report in `experiments/sensitivity/<exp-id>-sensitivity.yaml`
+
+## Examples
+
+```
+/turing:sensitivity exp-042                              # All tunable params
+/turing:sensitivity --params "learning_rate,max_depth"   # Specific params
+```
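The HIGH/MED/LOW/NONE ranking in step 4 reduces to one normalized number per parameter. A standalone sketch of that scoring rule, with invented sweep values and the thresholds copied from the `sensitivity_analysis.py` script added later in this diff:

```python
import numpy as np

# One-at-a-time sweep of learning_rate; primary metric is accuracy.
sweep_metrics = np.array([0.81, 0.84, 0.86, 0.83, 0.78])

# Sensitivity = metric range across the sweep, normalized by the mean metric.
sensitivity = (sweep_metrics.max() - sweep_metrics.min()) / abs(sweep_metrics.mean())

# Thresholds mirror the script's defaults: HIGH > 0.02, MED > 0.005, LOW > 0.002.
level = ("HIGH" if sensitivity > 0.02 else
         "MED" if sensitivity > 0.005 else
         "LOW" if sensitivity > 0.002 else "NONE")
print(f"learning_rate: sensitivity={sensitivity:.4f} -> {level}")
# A rise-then-fall sweep like this one is also flagged non-monotonic:
# the best value sits inside the tested range, not at its edge.
```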
package/commands/turing.md
CHANGED

@@ -53,6 +53,9 @@ You are the Turing ML research router. Detect the user's intent and route to the
 | "sanity", "sanity check", "pre-training", "is it broken", "before training", "quick check" | `/turing:sanity` | Check |
 | "baseline", "baselines", "trivial baseline", "majority class", "is it better than random" | `/turing:baseline` | Analyze |
 | "leak", "leakage", "data leakage scan", "suspicious feature", "train test overlap" | `/turing:leak` | Validate |
+| "xray", "model internals", "dead neurons", "gradient flow", "weight distribution", "inside the model" | `/turing:xray` | Analyze |
+| "sensitivity", "which params matter", "hyperparameter importance", "parameter ranking" | `/turing:sensitivity` | Analyze |
+| "calibrate", "calibration", "ECE", "reliability diagram", "overconfident", "probability calibration" | `/turing:calibrate` | Analyze |
 
 ## Sub-commands
 
@@ -102,6 +105,9 @@ You are the Turing ML research router. Detect the user's intent and route to the
 | `/turing:sanity [--quick]` | Pre-training sanity checks: initial loss, overfit test, gradient flow, output validation | (inline) |
 | `/turing:baseline [--methods]` | Automatic baseline generation: random, majority/mean, linear, k-NN | (inline) |
 | `/turing:leak [--deep]` | Targeted leakage detection: single-feature tests, correlation, train/test overlap | (inline) |
+| `/turing:xray [exp-id]` | Internal model diagnostics: gradient flow, dead neurons, weight distributions, tree analysis | (inline) |
+| `/turing:sensitivity [exp-id]` | Hyperparameter sensitivity analysis: rank parameters by impact, detect non-monotonic responses | (inline) |
+| `/turing:calibrate [exp-id]` | Probability calibration: ECE/MCE, reliability diagrams, Platt/isotonic/temperature scaling | (inline) |
 
 ## Proactive Detection
 
package/commands/xray.md
ADDED

@@ -0,0 +1,43 @@
+---
+name: xray
+description: Internal model diagnostics — gradient flow, dead neurons, activation stats, weight distributions, tree depth analysis.
+disable-model-invocation: true
+argument-hint: "[exp-id] [--layer encoder.layer.2] [--compare exp-a exp-b]"
+allowed-tools: Read, Bash(*), Grep, Glob
+---
+
+See inside the model. When it underperforms, the fix depends on *why*.
+
+## Steps
+
+1. **Activate environment:**
+   ```bash
+   source .venv/bin/activate
+   ```
+
+2. **Parse arguments from `$ARGUMENTS`:**
+   - Optional experiment ID
+   - `--layer "name"` — focus on specific layer
+   - `--compare exp-a exp-b` — side-by-side diagnostics
+   - `--json` — raw JSON output
+
+3. **Run model diagnostics:**
+   ```bash
+   python scripts/model_xray.py $ARGUMENTS
+   ```
+
+4. **Diagnostics by model type:**
+   - **Neural networks:** gradient magnitudes, activation stats, dead neuron %, weight distributions, gradient-to-weight ratio
+   - **Tree models:** depth utilization, leaf purity, feature split dominance
+   - **scikit-learn:** coefficient magnitudes, feature importance concentration
+
+5. **Issues detected:** dead gradients, vanishing/exploding gradients, dead neurons, sparse weights, feature dominance, overfitting risk
+
+6. **Saved output:** report in `experiments/xrays/<exp-id>-xray.yaml`
+
+## Examples
+
+```
+/turing:xray exp-042    # Full diagnostics
+/turing:xray            # Best experiment
+```
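For the neural-network branch of step 4, the checks operate on per-layer summary statistics rather than on live tensors. A minimal sketch of two of those checks with invented layer stats (the real thresholds and issue taxonomy live in `scripts/model_xray.py`, added later in this diff):

```python
# Invented per-layer summaries of the kind the xray script consumes.
layer_stats = [
    {"name": "fc1", "grad_mean": 3e-4, "grad_max": 2e-3, "dead_pct": 2},
    {"name": "fc2", "grad_mean": 0.0, "grad_max": 0.0, "dead_pct": 35},
]

for layer in layer_stats:
    # Zero gradients everywhere means the layer receives no learning signal.
    if layer["grad_mean"] == 0 and layer["grad_max"] == 0:
        print(f"{layer['name']}: dead gradient — layer is not learning")
    # A large fraction of inactive units suggests dying-ReLU problems or excess width.
    if layer["dead_pct"] > 20:
        print(f"{layer['name']}: {layer['dead_pct']}% dead neurons")
```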
package/package.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "claude-turing",
-  "version": "3.1.0",
+  "version": "3.2.0",
   "type": "module",
   "description": "Autonomous ML research harness for Claude Code. The autoresearch loop as a formal protocol — iteratively trains, evaluates, and improves ML models with structured experiment tracking, convergence detection, immutable evaluation infrastructure, and safety guardrails.",
   "bin": {

package/src/install.js
CHANGED

package/src/verify.js
CHANGED

package/templates/scripts/__pycache__/calibration.cpython-314.pyc
Binary file

package/templates/scripts/__pycache__/model_xray.cpython-314.pyc
Binary file

package/templates/scripts/__pycache__/scaffold.cpython-314.pyc
Binary file

package/templates/scripts/__pycache__/sensitivity_analysis.cpython-314.pyc
Binary file
package/templates/scripts/calibration.py
ADDED

@@ -0,0 +1,364 @@
+#!/usr/bin/env python3
+"""Probability calibration for the autoresearch pipeline.
+
+Measures whether model probabilities are well-calibrated, computes ECE/MCE,
+generates reliability diagrams, and applies post-hoc calibration (Platt
+scaling, isotonic regression, temperature scaling).
+
+Usage:
+    python scripts/calibration.py exp-042
+    python scripts/calibration.py exp-042 --method platt
+    python scripts/calibration.py exp-042 --method auto
+    python scripts/calibration.py --json
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+import numpy as np
+import yaml
+
+from scripts.turing_io import load_config, load_experiments
+
+DEFAULT_LOG_PATH = "experiments/log.jsonl"
+DEFAULT_N_BINS = 10
+CALIBRATION_METHODS = ["platt", "isotonic", "temperature"]
+
+
+# --- Calibration Metrics ---
+
+
+def compute_ece(
+    probabilities: np.ndarray,
+    labels: np.ndarray,
+    n_bins: int = DEFAULT_N_BINS,
+) -> float:
+    """Compute Expected Calibration Error.
+
+    ECE = sum(|bin_accuracy - bin_confidence| * bin_size / total)
+    """
+    if len(probabilities) == 0:
+        return 0.0
+
+    bin_boundaries = np.linspace(0, 1, n_bins + 1)
+    ece = 0.0
+
+    for i in range(n_bins):
+        mask = (probabilities >= bin_boundaries[i]) & (probabilities < bin_boundaries[i + 1])
+        if i == n_bins - 1:
+            mask = (probabilities >= bin_boundaries[i]) & (probabilities <= bin_boundaries[i + 1])
+
+        bin_size = np.sum(mask)
+        if bin_size == 0:
+            continue
+
+        bin_accuracy = np.mean(labels[mask])
+        bin_confidence = np.mean(probabilities[mask])
+        ece += abs(bin_accuracy - bin_confidence) * bin_size / len(probabilities)
+
+    return round(float(ece), 6)
+
+
+def compute_mce(
+    probabilities: np.ndarray,
+    labels: np.ndarray,
+    n_bins: int = DEFAULT_N_BINS,
+) -> float:
+    """Compute Maximum Calibration Error."""
+    if len(probabilities) == 0:
+        return 0.0
+
+    bin_boundaries = np.linspace(0, 1, n_bins + 1)
+    max_gap = 0.0
+
+    for i in range(n_bins):
+        mask = (probabilities >= bin_boundaries[i]) & (probabilities < bin_boundaries[i + 1])
+        if i == n_bins - 1:
+            mask = (probabilities >= bin_boundaries[i]) & (probabilities <= bin_boundaries[i + 1])
+
+        if np.sum(mask) == 0:
+            continue
+
+        bin_accuracy = np.mean(labels[mask])
+        bin_confidence = np.mean(probabilities[mask])
+        max_gap = max(max_gap, abs(bin_accuracy - bin_confidence))
+
+    return round(float(max_gap), 6)
+
+
+def compute_reliability_diagram(
+    probabilities: np.ndarray,
+    labels: np.ndarray,
+    n_bins: int = DEFAULT_N_BINS,
+) -> list[dict]:
+    """Compute reliability diagram data."""
+    if len(probabilities) == 0:
+        return []
+
+    bin_boundaries = np.linspace(0, 1, n_bins + 1)
+    bins = []
+
+    for i in range(n_bins):
+        lo = bin_boundaries[i]
+        hi = bin_boundaries[i + 1]
+        mask = (probabilities >= lo) & (probabilities < hi)
+        if i == n_bins - 1:
+            mask = (probabilities >= lo) & (probabilities <= hi)
+
+        bin_size = int(np.sum(mask))
+        if bin_size == 0:
+            bins.append({"bin": f"[{lo:.1f}-{hi:.1f}]", "predicted": None,
+                         "actual": None, "gap": None, "n": 0})
+            continue
+
+        predicted = float(np.mean(probabilities[mask]))
+        actual = float(np.mean(labels[mask]))
+        gap = actual - predicted
+
+        bins.append({
+            "bin": f"[{lo:.1f}-{hi:.1f}]",
+            "predicted": round(predicted, 4),
+            "actual": round(actual, 4),
+            "gap": round(gap, 4),
+            "n": bin_size,
+        })
+
+    return bins
+
+
+# --- Calibration Methods ---
+
+
+def platt_scaling(
+    logits: np.ndarray,
+    labels: np.ndarray,
+) -> dict:
+    """Apply Platt scaling (logistic regression on logits)."""
+    from scipy.special import expit
+
+    # Fit logistic regression: P(y=1|f) = sigmoid(a*f + b)
+    # Simple gradient descent for a, b
+    a, b = 1.0, 0.0
+    lr = 0.01
+    for _ in range(1000):
+        pred = expit(a * logits + b)
+        pred = np.clip(pred, 1e-7, 1 - 1e-7)
+        grad_a = np.mean((pred - labels) * logits)
+        grad_b = np.mean(pred - labels)
+        a -= lr * grad_a
+        b -= lr * grad_b
+
+    calibrated = expit(a * logits + b)
+    return {"method": "platt", "params": {"a": round(float(a), 6), "b": round(float(b), 6)},
+            "calibrated_probabilities": calibrated}
+
+
+def isotonic_calibration(
+    probabilities: np.ndarray,
+    labels: np.ndarray,
+) -> dict:
+    """Apply isotonic regression calibration."""
+    from sklearn.isotonic import IsotonicRegression
+
+    iso = IsotonicRegression(out_of_bounds="clip")
+    calibrated = iso.fit_transform(probabilities, labels)
+    return {"method": "isotonic", "params": {},
+            "calibrated_probabilities": np.clip(calibrated, 0, 1)}
+
+
+def temperature_scaling(
+    logits: np.ndarray,
+    labels: np.ndarray,
+) -> dict:
+    """Apply temperature scaling (single parameter T)."""
+    from scipy.special import expit
+
+    best_t = 1.0
+    best_ece = float("inf")
+
+    for t in np.arange(0.5, 5.0, 0.1):
+        scaled = expit(logits / t)
+        ece = compute_ece(scaled, labels)
+        if ece < best_ece:
+            best_ece = ece
+            best_t = t
+
+    calibrated = expit(logits / best_t)
+    return {"method": "temperature", "params": {"T": round(float(best_t), 2)},
+            "calibrated_probabilities": calibrated}
+
+
+# --- Full Pipeline ---
+
+
+def calibrate_model(
+    probabilities: np.ndarray | None = None,
+    logits: np.ndarray | None = None,
+    labels: np.ndarray | None = None,
+    method: str = "auto",
+    exp_id: str | None = None,
+    config_path: str = "config.yaml",
+) -> dict:
+    """Run calibration analysis and optionally apply post-hoc calibration."""
+    if (probabilities is None and logits is None) or labels is None:
+        return {"error": "Provide probabilities (or logits) and labels for calibration"}
+
+    if probabilities is None and logits is not None:
+        from scipy.special import expit
+        probabilities = expit(logits)
+
+    # Before calibration
+    ece_before = compute_ece(probabilities, labels)
+    mce_before = compute_mce(probabilities, labels)
+    reliability = compute_reliability_diagram(probabilities, labels)
+
+    # Determine overconfidence
+    overconfident_bins = [b for b in reliability if b.get("gap") is not None and b["gap"] < -0.05 and b["n"] > 0]
+
+    report = {
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+        "experiment_id": exp_id,
+        "before": {"ece": ece_before, "mce": mce_before},
+        "reliability_diagram": reliability,
+        "overconfident_bins": len(overconfident_bins),
+    }
+
+    # Apply calibration
+    methods_to_try = CALIBRATION_METHODS if method == "auto" else [method]
+    results = []
+
+    for m in methods_to_try:
+        try:
+            if m == "platt" and logits is not None:
+                cal = platt_scaling(logits, labels)
+            elif m == "isotonic":
+                cal = isotonic_calibration(probabilities, labels)
+            elif m == "temperature" and logits is not None:
+                cal = temperature_scaling(logits, labels)
+            else:
+                continue
+
+            ece_after = compute_ece(cal["calibrated_probabilities"], labels)
+            results.append({
+                "method": m,
+                "ece_after": ece_after,
+                "improvement": round(ece_before - ece_after, 6),
+                "params": cal.get("params", {}),
+            })
+        except Exception:
+            continue
+
+    # Find best method
+    best = None
+    if results:
+        best = min(results, key=lambda r: r["ece_after"])
+
+    report["calibration_results"] = results
+    report["best_method"] = best
+
+    # Verdict
+    if ece_before < 0.02:
+        report["verdict"] = "already_calibrated"
+        report["reason"] = f"ECE {ece_before:.4f} is already low — calibration not needed"
+    elif best and best["improvement"] > 0.01:
+        report["verdict"] = "improved"
+        report["reason"] = f"{best['method']} reduces ECE from {ece_before:.4f} to {best['ece_after']:.4f}"
+    elif best:
+        report["verdict"] = "marginal_improvement"
+        report["reason"] = f"Best method ({best['method']}) improves ECE by only {best['improvement']:.4f}"
+    else:
+        report["verdict"] = "no_improvement"
+        report["reason"] = "No calibration method improved ECE"
+
+    return report
+
+
+# --- Report Formatting ---
+
+
+def save_calibration_report(report: dict, output_dir: str = "experiments/calibration") -> Path:
+    out_path = Path(output_dir)
+    out_path.mkdir(parents=True, exist_ok=True)
+    exp_id = report.get("experiment_id", "unknown")
+    filepath = out_path / f"{exp_id}-calibration.yaml"
+    clean = json.loads(json.dumps(report, default=str))
+    with open(filepath, "w") as f:
+        yaml.dump(clean, f, default_flow_style=False, sort_keys=False)
+    return filepath
+
+
+def format_calibration_report(report: dict) -> str:
+    if "error" in report:
+        return f"ERROR: {report['error']}"
+
+    exp_id = report.get("experiment_id", "?")
+    before = report.get("before", {})
+
+    lines = [f"# Calibration: {exp_id}", "",
+             f"*Generated {report.get('generated_at', 'N/A')[:19]}*", "",
+             f"**ECE before:** {before.get('ece', '?')}",
+             f"**MCE before:** {before.get('mce', '?')}", ""]
+
+    # Reliability diagram
+    diagram = report.get("reliability_diagram", [])
+    if diagram:
+        lines.extend(["## Reliability Diagram", "",
+                      "| Bin | Predicted | Actual | Gap |",
+                      "|-----|-----------|--------|-----|"])
+        for b in diagram:
+            if b["predicted"] is not None:
+                gap_marker = " overconfident" if b["gap"] is not None and b["gap"] < -0.05 else ""
+                lines.append(f"| {b['bin']} | {b['predicted']:.4f} | {b['actual']:.4f} | {b['gap']:+.4f}{gap_marker} |")
+        lines.append("")
+
+    # Calibration results
+    results = report.get("calibration_results", [])
+    if results:
+        lines.extend(["## Calibration Methods", "",
+                      "| Method | ECE After | Improvement |",
+                      "|--------|-----------|-------------|"])
+        best = report.get("best_method", {})
+        for r in results:
+            marker = " BEST" if r["method"] == best.get("method") else ""
+            lines.append(f"| {r['method']} | {r['ece_after']:.4f} | {r['improvement']:+.4f} |{marker}")
+        lines.append("")
+
+    # Verdict
+    verdict = report.get("verdict", "?")
+    labels = {"already_calibrated": "ALREADY CALIBRATED", "improved": "IMPROVED",
+              "marginal_improvement": "MARGINAL IMPROVEMENT", "no_improvement": "NO IMPROVEMENT"}
+    lines.extend(["## Verdict", "", f"**{labels.get(verdict, verdict.upper())}**", "",
+                  report.get("reason", "")])
+
+    return "\n".join(lines)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Probability calibration")
+    parser.add_argument("exp_id", nargs="?", help="Experiment ID")
+    parser.add_argument("--method", choices=CALIBRATION_METHODS + ["auto"], default="auto")
+    parser.add_argument("--config", default="config.yaml")
+    parser.add_argument("--log", default=DEFAULT_LOG_PATH)
+    parser.add_argument("--json", action="store_true")
+    args = parser.parse_args()
+
+    # Without data, show usage
+    report = calibrate_model(exp_id=args.exp_id, method=args.method, config_path=args.config)
+
+    if "error" not in report:
+        filepath = save_calibration_report(report)
+        print(f"Saved to {filepath}", file=sys.stderr)
+
+    if args.json:
+        print(json.dumps(report, indent=2, default=str))
+    else:
+        print(format_calibration_report(report))
+
+
+if __name__ == "__main__":
+    main()
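As shipped, `main()` runs without predictions wired in (the `# Without data, show usage` branch always hits the error path), so the module is most useful imported. A minimal sketch, assuming the scaffolded project layout in which `scripts.calibration` is importable and scipy/scikit-learn are installed; the synthetic logits are deliberately overconfident:

```python
import numpy as np

from scripts.calibration import calibrate_model, format_calibration_report

rng = np.random.default_rng(0)
labels = rng.integers(0, 2, size=500)
# Scale the true signal up before the sigmoid to simulate overconfidence.
logits = 4.0 * (labels - 0.5) + rng.normal(0.0, 2.0, size=500)

report = calibrate_model(logits=logits, labels=labels, method="auto", exp_id="exp-demo")
print(format_calibration_report(report))  # reliability table, method comparison, verdict
```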
package/templates/scripts/model_xray.py
ADDED

@@ -0,0 +1,317 @@
+#!/usr/bin/env python3
+"""Internal model diagnostics for the autoresearch pipeline.
+
+Inspects model internals: gradient flow per layer, activation statistics,
+dead neurons, weight distributions, decision path analysis. Answers
+"what is the model doing internally?" rather than "what are its predictions?"
+
+Usage:
+    python scripts/model_xray.py exp-042
+    python scripts/model_xray.py exp-042 --layer "encoder.layer.2"
+    python scripts/model_xray.py --compare exp-042 exp-053
+    python scripts/model_xray.py --json
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+import numpy as np
+import yaml
+
+from scripts.turing_io import load_config, load_experiments
+
+DEFAULT_LOG_PATH = "experiments/log.jsonl"
+DEAD_NEURON_THRESHOLD = 0.01  # Activation below this = dead
+EXPLODING_GRADIENT_RATIO = 100  # Gradient > N * mean = exploding
+NEAR_ZERO_WEIGHT = 0.001  # Weight below this = pruning candidate
+
+
+# --- Neural Network Diagnostics ---
+
+
+def diagnose_neural_layers(layer_stats: list[dict]) -> dict:
+    """Analyze neural network layer statistics.
+
+    Args:
+        layer_stats: List of dicts with name, grad_mean, grad_max, act_mean,
+            act_std, dead_pct, weight_mean, weight_std.
+
+    Returns:
+        Diagnosis dict with per-layer analysis and detected issues.
+    """
+    if not layer_stats:
+        return {"layers": [], "issues": [], "model_type": "neural"}
+
+    issues = []
+    analyzed = []
+
+    # Compute global gradient mean for relative comparison
+    grad_means = [abs(l.get("grad_mean", 0)) for l in layer_stats if l.get("grad_mean") is not None]
+    global_grad_mean = np.mean(grad_means) if grad_means else 0
+
+    for layer in layer_stats:
+        name = layer.get("name", "?")
+        analysis = {"name": name}
+
+        # Gradient analysis
+        grad_mean = abs(layer.get("grad_mean", 0))
+        grad_max = abs(layer.get("grad_max", 0))
+        analysis["grad_mean"] = grad_mean
+        analysis["grad_max"] = grad_max
+
+        if grad_mean == 0 and grad_max == 0:
+            issues.append({"layer": name, "issue": "dead_gradient", "severity": "high",
+                           "message": f"{name}: zero gradients — layer is not learning"})
+        elif global_grad_mean > 0 and grad_mean < global_grad_mean / EXPLODING_GRADIENT_RATIO:
+            ratio = global_grad_mean / grad_mean if grad_mean > 0 else float("inf")
+            issues.append({"layer": name, "issue": "vanishing_gradient", "severity": "high",
+                           "message": f"{name}: gradient {ratio:.0f}x weaker than average — possible vanishing gradient"})
+        elif global_grad_mean > 0 and grad_max > EXPLODING_GRADIENT_RATIO * global_grad_mean:
+            issues.append({"layer": name, "issue": "exploding_gradient", "severity": "critical",
+                           "message": f"{name}: gradient max {grad_max:.2e} is {grad_max/global_grad_mean:.0f}x the average — exploding gradient"})
+
+        # Activation analysis
+        dead_pct = layer.get("dead_pct", 0)
+        analysis["dead_pct"] = dead_pct
+        if dead_pct > 20:
+            issues.append({"layer": name, "issue": "dead_neurons", "severity": "high",
+                           "message": f"{name}: {dead_pct:.0f}% dead neurons — consider batch norm or layer width reduction"})
+        elif dead_pct > 5:
+            issues.append({"layer": name, "issue": "dying_neurons", "severity": "medium",
+                           "message": f"{name}: {dead_pct:.0f}% near-dead neurons"})
+
+        # Weight analysis
+        weight_std = layer.get("weight_std", 0)
+        near_zero_pct = layer.get("near_zero_pct", 0)
+        analysis["weight_std"] = weight_std
+        analysis["near_zero_pct"] = near_zero_pct
+        if near_zero_pct > 50:
+            issues.append({"layer": name, "issue": "sparse_weights", "severity": "medium",
+                           "message": f"{name}: {near_zero_pct:.0f}% near-zero weights — pruning candidate"})
+
+        analyzed.append(analysis)
+
+    return {"layers": analyzed, "issues": issues, "model_type": "neural"}
+
+
+# --- Tree Model Diagnostics ---
+
+
+def diagnose_tree_model(tree_stats: dict) -> dict:
+    """Analyze tree-based model statistics.
+
+    Args:
+        tree_stats: Dict with n_trees, avg_depth, max_depth_allowed,
+            feature_split_freq, leaf_purity.
+
+    Returns:
+        Diagnosis dict.
+    """
+    issues = []
+
+    n_trees = tree_stats.get("n_trees", 0)
+    avg_depth = tree_stats.get("avg_depth", 0)
+    max_depth = tree_stats.get("max_depth_allowed", 0)
+    feature_splits = tree_stats.get("feature_split_freq", {})
+    leaf_purity = tree_stats.get("leaf_purity", 0)
+
+    # Depth utilization
+    if max_depth > 0 and avg_depth > 0:
+        utilization = avg_depth / max_depth
+        if utilization < 0.5:
+            issues.append({"issue": "underutilized_depth", "severity": "medium",
+                           "message": f"Trees use only {utilization:.0%} of allowed depth ({avg_depth:.1f}/{max_depth}) — consider reducing max_depth"})
+        elif utilization > 0.95:
+            issues.append({"issue": "depth_saturated", "severity": "medium",
+                           "message": f"Trees use {utilization:.0%} of allowed depth — consider increasing max_depth"})
+
+    # Feature dominance
+    if feature_splits:
+        total_splits = sum(feature_splits.values())
+        if total_splits > 0:
+            top_feature = max(feature_splits, key=feature_splits.get)
+            top_pct = feature_splits[top_feature] / total_splits
+            if top_pct > 0.5:
+                issues.append({"issue": "feature_dominance", "severity": "medium",
+                               "message": f"Feature '{top_feature}' dominates {top_pct:.0%} of splits — check for leakage or engineering opportunity"})
+
+    # Leaf purity
+    if leaf_purity > 0.99:
+        issues.append({"issue": "overfitting_risk", "severity": "medium",
+                       "message": f"Leaf purity {leaf_purity:.4f} — model may be overfitting"})
+
+    return {
+        "model_type": "tree",
+        "n_trees": n_trees,
+        "avg_depth": avg_depth,
+        "max_depth_allowed": max_depth,
+        "depth_utilization": round(avg_depth / max_depth, 3) if max_depth > 0 else None,
+        "feature_split_freq": feature_splits,
+        "leaf_purity": leaf_purity,
+        "issues": issues,
+    }
+
+
+# --- sklearn Diagnostics ---
+
+
+def diagnose_sklearn_model(model_stats: dict) -> dict:
+    """Analyze scikit-learn model statistics.
+
+    Args:
+        model_stats: Dict with model_type, coefficients, feature_importances.
+    """
+    issues = []
+    model_type = model_stats.get("model_type", "unknown")
+
+    coefficients = model_stats.get("coefficients", [])
+    if coefficients:
+        coef_arr = np.array(coefficients)
+        max_coef = float(np.max(np.abs(coef_arr)))
+        near_zero = float(np.mean(np.abs(coef_arr) < NEAR_ZERO_WEIGHT))
+
+        if max_coef > 100:
+            issues.append({"issue": "large_coefficients", "severity": "high",
+                           "message": f"Max coefficient magnitude {max_coef:.1f} — consider regularization"})
+        if near_zero > 0.5:
+            issues.append({"issue": "sparse_coefficients", "severity": "medium",
+                           "message": f"{near_zero:.0%} near-zero coefficients — feature selection may help"})
+
+    importances = model_stats.get("feature_importances", [])
+    if importances:
+        imp_arr = np.array(importances)
+        if len(imp_arr) > 0 and np.std(imp_arr) > 0:
+            top_k = min(3, len(imp_arr))
+            top_indices = np.argsort(imp_arr)[-top_k:]
+            top_total = float(np.sum(imp_arr[top_indices]))
+            if top_total > 0.8:
+                issues.append({"issue": "importance_concentrated", "severity": "medium",
+                               "message": f"Top {top_k} features account for {top_total:.0%} of importance"})
+
+    return {
+        "model_type": model_type,
+        "n_coefficients": len(coefficients),
+        "n_importances": len(importances),
+        "issues": issues,
+    }
+
+
+# --- Full X-Ray Pipeline ---
+
+
+def xray_model(
+    exp_id: str | None = None,
+    layer_stats: list[dict] | None = None,
+    tree_stats: dict | None = None,
+    sklearn_stats: dict | None = None,
+    config_path: str = "config.yaml",
+    log_path: str = DEFAULT_LOG_PATH,
+) -> dict:
+    """Run model diagnostics."""
+    config = load_config(config_path)
+    model_type_hint = config.get("model", {}).get("type", "")
+
+    diagnosis = None
+    if layer_stats is not None:
+        diagnosis = diagnose_neural_layers(layer_stats)
+    elif tree_stats is not None:
+        diagnosis = diagnose_tree_model(tree_stats)
+    elif sklearn_stats is not None:
+        diagnosis = diagnose_sklearn_model(sklearn_stats)
+    else:
+        diagnosis = {"model_type": "unknown", "issues": [],
+                     "note": "No model stats provided. Run with model-specific stats for full diagnostics."}
+
+    return {
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+        "experiment_id": exp_id,
+        "diagnosis": diagnosis,
+        "n_issues": len(diagnosis.get("issues", [])),
+    }
+
+
+# --- Report Formatting ---
+
+
+def save_xray_report(report: dict, output_dir: str = "experiments/xrays") -> Path:
+    out_path = Path(output_dir)
+    out_path.mkdir(parents=True, exist_ok=True)
+    exp_id = report.get("experiment_id", "unknown")
+    filepath = out_path / f"{exp_id}-xray.yaml"
+    with open(filepath, "w") as f:
+        yaml.dump(report, f, default_flow_style=False, sort_keys=False)
+    return filepath
+
+
+def format_xray_report(report: dict) -> str:
+    if "error" in report:
+        return f"ERROR: {report['error']}"
+
+    diag = report.get("diagnosis", {})
+    model_type = diag.get("model_type", "?")
+    exp_id = report.get("experiment_id", "?")
+    issues = diag.get("issues", [])
+
+    lines = [f"# X-Ray: {exp_id} ({model_type})", "",
+             f"*Generated {report.get('generated_at', 'N/A')[:19]}*", ""]
+
+    # Neural layer table
+    layers = diag.get("layers", [])
+    if layers:
+        lines.extend(["## Layer Analysis", "",
+                      "| Layer | Grad Mean | Grad Max | Dead % | Weight Std |",
+                      "|-------|-----------|----------|--------|------------|"])
+        for l in layers:
+            lines.append(f"| {l['name']} | {l.get('grad_mean', 0):.2e} | {l.get('grad_max', 0):.2e} | {l.get('dead_pct', 0):.0f}% | {l.get('weight_std', 0):.4f} |")
+        lines.append("")
+
+    # Tree stats
+    if model_type == "tree":
+        lines.extend(["## Tree Statistics", "",
+                      f"- **Trees:** {diag.get('n_trees', '?')}",
+                      f"- **Avg depth:** {diag.get('avg_depth', '?')}/{diag.get('max_depth_allowed', '?')}",
+                      f"- **Leaf purity:** {diag.get('leaf_purity', '?')}", ""])
+
+    # Issues
+    if issues:
+        lines.extend(["## Issues Detected", ""])
+        for i in issues:
+            sev = i.get("severity", "?").upper()
+            lines.append(f"- **[{sev}]** {i.get('message', 'N/A')}")
+    else:
+        lines.extend(["## Issues Detected", "", "No issues found."])
+
+    if diag.get("note"):
+        lines.extend(["", f"*{diag['note']}*"])
+
+    return "\n".join(lines)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Internal model diagnostics")
+    parser.add_argument("exp_id", nargs="?", help="Experiment ID")
+    parser.add_argument("--config", default="config.yaml")
+    parser.add_argument("--log", default=DEFAULT_LOG_PATH)
+    parser.add_argument("--json", action="store_true")
+    args = parser.parse_args()
+
+    report = xray_model(exp_id=args.exp_id, config_path=args.config, log_path=args.log)
+
+    if "error" not in report:
+        filepath = save_xray_report(report)
+        print(f"Saved to {filepath}", file=sys.stderr)
+
+    if args.json:
+        print(json.dumps(report, indent=2, default=str))
+    else:
+        print(format_xray_report(report))
+
+
+if __name__ == "__main__":
+    main()
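Because the diagnostics consume pre-extracted summary statistics, the tree path can be exercised without a trained model or a project config. A minimal sketch (invented stats; assumes the scaffolded layout so `scripts.model_xray` is importable):

```python
from scripts.model_xray import diagnose_tree_model

# Invented gradient-boosting summary: near-saturated depth, one dominant feature.
tree_stats = {
    "n_trees": 200,
    "avg_depth": 5.8,
    "max_depth_allowed": 6,
    "feature_split_freq": {"f0": 260, "f1": 30, "f2": 10},
    "leaf_purity": 0.995,
}

diagnosis = diagnose_tree_model(tree_stats)
for issue in diagnosis["issues"]:
    print(f"[{issue['severity'].upper()}] {issue['message']}")
# Expect: depth_saturated, feature_dominance (f0 at ~87% of splits), overfitting_risk.
```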
package/templates/scripts/scaffold.py
CHANGED

@@ -121,6 +121,9 @@ TEMPLATE_DIRS = {
         "sanity_checks.py",
         "generate_baselines.py",
         "leakage_detector.py",
+        "model_xray.py",
+        "sensitivity_analysis.py",
+        "calibration.py",
     ],
     "tests": ["__init__.py", "conftest.py"],
 }

@@ -154,6 +157,9 @@ DIRECTORIES_TO_CREATE = [
     "experiments/sanity",
     "experiments/baselines",
     "experiments/leakage",
+    "experiments/xrays",
+    "experiments/sensitivity",
+    "experiments/calibration",
     "experiments/logs",
     "models/best",
     "models/archive",
package/templates/scripts/sensitivity_analysis.py
ADDED

@@ -0,0 +1,335 @@
+#!/usr/bin/env python3
+"""Hyperparameter sensitivity analysis for the autoresearch pipeline.
+
+Varies each hyperparameter individually while holding others fixed,
+measures the metric response, and ranks hyperparameters by sensitivity.
+Answers "which hyperparameters actually matter?"
+
+Usage:
+    python scripts/sensitivity_analysis.py exp-042
+    python scripts/sensitivity_analysis.py --params "learning_rate,max_depth"
+    python scripts/sensitivity_analysis.py --json
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+import numpy as np
+import yaml
+
+from scripts.turing_io import load_config, load_experiments
+
+DEFAULT_LOG_PATH = "experiments/log.jsonl"
+DEFAULT_N_POINTS = 5
+SENSITIVITY_THRESHOLDS = {"HIGH": 0.02, "MED": 0.005, "LOW": 0.002}
+DEFAULT_MULTIPLIERS = [0.5, 0.75, 1.0, 1.5, 2.0]
+
+
+# --- Sweep Generation ---
+
+
+def generate_sweep(
+    param_name: str,
+    current_value: float | int,
+    n_points: int = DEFAULT_N_POINTS,
+    multipliers: list[float] | None = None,
+) -> list[dict]:
+    """Generate sweep values for a hyperparameter.
+
+    Returns list of {value, multiplier} dicts.
+    """
+    if multipliers is None:
+        multipliers = DEFAULT_MULTIPLIERS[:n_points]
+
+    points = []
+    for m in multipliers:
+        if isinstance(current_value, int):
+            val = max(1, int(current_value * m))
+        else:
+            val = current_value * m
+        points.append({
+            "value": val,
+            "multiplier": round(m, 2),
+            "is_current": abs(m - 1.0) < 0.01,
+        })
+
+    return points
+
+
+def extract_tunable_params(config: dict) -> dict:
+    """Extract tunable hyperparameters from config."""
+    hyperparams = config.get("model", {}).get("hyperparams", {})
+
+    tunable = {}
+    for key, val in hyperparams.items():
+        if isinstance(val, (int, float)) and key not in ("seed", "random_state", "verbose"):
+            tunable[key] = val
+
+    return tunable
+
+
+# --- Sensitivity Scoring ---
+
+
+def compute_sensitivity(
+    param_name: str,
+    sweep_results: list[dict],
+    primary_metric: str,
+) -> dict:
+    """Compute sensitivity score for a hyperparameter.
+
+    Args:
+        param_name: Hyperparameter name.
+        sweep_results: List of {value, metric_value} dicts.
+        primary_metric: Name of the primary metric.
+
+    Returns:
+        Sensitivity dict with score, level, range, best value, monotonicity.
+    """
+    if not sweep_results or len(sweep_results) < 2:
+        return {"param": param_name, "sensitivity": 0, "level": "NONE",
+                "reason": "Insufficient sweep data"}
+
+    # Keep values and metrics aligned: drop entries with missing metrics
+    # so best_idx indexes the same sweep point in both lists.
+    valid = [r for r in sweep_results if r.get("metric_value") is not None]
+    values = [r.get("value") for r in valid]
+    metrics = [r.get("metric_value") for r in valid]
+
+    if len(metrics) < 2:
+        return {"param": param_name, "sensitivity": 0, "level": "NONE",
+                "reason": "Insufficient metric data"}
+
+    metric_range = max(metrics) - min(metrics)
+    metric_mean = np.mean(metrics)
+
+    # Normalized sensitivity
+    sensitivity = metric_range / abs(metric_mean) if metric_mean != 0 else metric_range
+
+    # Classify level
+    if sensitivity > SENSITIVITY_THRESHOLDS["HIGH"]:
+        level = "HIGH"
+    elif sensitivity > SENSITIVITY_THRESHOLDS["MED"]:
+        level = "MED"
+    elif sensitivity > SENSITIVITY_THRESHOLDS["LOW"]:
+        level = "LOW"
+    else:
+        level = "NONE"
+
+    # Check monotonicity
+    monotonic = _check_monotonicity(metrics)
+
+    # Best value
+    best_idx = np.argmax(metrics)
+    best_value = values[best_idx] if best_idx < len(values) else None
+
+    return {
+        "param": param_name,
+        "current_value": next((r["value"] for r in sweep_results if r.get("is_current")), None),
+        "sensitivity": round(float(sensitivity), 6),
+        "metric_range": round(float(metric_range), 6),
+        "metric_min": round(float(min(metrics)), 6),
+        "metric_max": round(float(max(metrics)), 6),
+        "level": level,
+        "best_value": best_value,
+        "monotonic": monotonic,
+    }
+
+
+def _check_monotonicity(values: list[float]) -> str:
+    """Check if values are monotonically increasing, decreasing, or non-monotonic."""
+    if len(values) < 2:
+        return "unknown"
+
+    diffs = [values[i + 1] - values[i] for i in range(len(values) - 1)]
+    all_pos = all(d >= 0 for d in diffs)
+    all_neg = all(d <= 0 for d in diffs)
+
+    if all_pos:
+        return "increasing"
+    elif all_neg:
+        return "decreasing"
+    else:
+        return "non_monotonic"
+
+
+def rank_sensitivities(sensitivities: list[dict]) -> list[dict]:
+    """Rank parameters by sensitivity (highest first)."""
+    return sorted(sensitivities, key=lambda s: s.get("sensitivity", 0), reverse=True)
+
+
+# --- Recommendations ---
+
+
+def generate_recommendations(ranked: list[dict]) -> list[str]:
+    """Generate tuning recommendations from sensitivity ranking."""
+    recs = []
+
+    high = [s for s in ranked if s["level"] == "HIGH"]
+    none = [s for s in ranked if s["level"] == "NONE"]
+
+    if high:
+        names = ", ".join(s["param"] for s in high)
+        recs.append(f"Focus tuning on {names}")
+
+    if none:
+        names = ", ".join(s["param"] for s in none)
+        recs.append(f"Stop tuning {names} — they don't matter for this model")
+
+    non_mono = [s for s in ranked if s.get("monotonic") == "non_monotonic" and s["level"] in ("HIGH", "MED")]
+    if non_mono:
+        for s in non_mono:
+            recs.append(f"{s['param']} has a non-monotonic relationship — there's an optimal sweet spot around {s.get('best_value')}")
+
+    return recs
+
+
+# --- Full Pipeline ---
+
+
+def sensitivity_analysis(
+    exp_id: str | None = None,
+    params: list[str] | None = None,
+    sweep_data: dict[str, list[dict]] | None = None,
+    config_path: str = "config.yaml",
+    log_path: str = DEFAULT_LOG_PATH,
+) -> dict:
+    """Run sensitivity analysis.
+
+    Args:
+        exp_id: Experiment ID to analyze.
+        params: Specific parameters to analyze.
+        sweep_data: Pre-computed sweep results {param: [{value, metric_value}]}.
+        config_path: Path to config.yaml.
+        log_path: Path to experiment log.
+
+    Returns:
+        Sensitivity analysis report.
+    """
+    config = load_config(config_path)
+    eval_cfg = config.get("evaluation", {})
+    primary_metric = eval_cfg.get("primary_metric", "accuracy")
+
+    sensitivities = []
+
+    if sweep_data:
+        # Analyze pre-computed sweep data
+        for param, results in sweep_data.items():
+            sens = compute_sensitivity(param, results, primary_metric)
+            sensitivities.append(sens)
+    else:
+        # Generate sweep plan (actual execution done by agent)
+        tunable = extract_tunable_params(config)
+        if params:
+            tunable = {k: v for k, v in tunable.items() if k in params}
+
+        if not tunable:
+            return {"error": "No tunable hyperparameters found in config"}
+
+        sweep_plans = {}
+        for param, value in tunable.items():
+            sweep_plans[param] = generate_sweep(param, value)
+
+        return {
+            "action": "plan",
+            "generated_at": datetime.now(timezone.utc).isoformat(),
+            "primary_metric": primary_metric,
+            "experiment_id": exp_id,
+            "sweep_plans": sweep_plans,
+            "n_experiments_needed": sum(len(s) for s in sweep_plans.values()),
+            "message": f"Sweep {len(sweep_plans)} parameters × {DEFAULT_N_POINTS} values each",
+        }
+
+    ranked = rank_sensitivities(sensitivities)
+    recommendations = generate_recommendations(ranked)
+
+    return {
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+        "primary_metric": primary_metric,
+        "experiment_id": exp_id,
+        "sensitivities": ranked,
+        "recommendations": recommendations,
+    }
+
+
+# --- Report Formatting ---
+
+
+def save_sensitivity_report(report: dict, output_dir: str = "experiments/sensitivity") -> Path:
+    out_path = Path(output_dir)
+    out_path.mkdir(parents=True, exist_ok=True)
+    exp_id = report.get("experiment_id", "unknown")
+    filepath = out_path / f"{exp_id}-sensitivity.yaml"
+    with open(filepath, "w") as f:
+        yaml.dump(report, f, default_flow_style=False, sort_keys=False)
+    return filepath
+
+
+def format_sensitivity_report(report: dict) -> str:
+    if "error" in report:
+        return f"ERROR: {report['error']}"
+
+    if report.get("action") == "plan":
+        plans = report.get("sweep_plans", {})
+        lines = ["# Sensitivity Analysis Plan", "",
+                 f"**{report.get('n_experiments_needed', 0)} experiments** needed for {len(plans)} parameters", ""]
+        for param, points in plans.items():
+            vals = ", ".join(str(p["value"]) for p in points)
+            lines.append(f"- **{param}:** [{vals}]")
+        return "\n".join(lines)
+
+    metric = report.get("primary_metric", "metric")
+    exp_id = report.get("experiment_id", "?")
+
+    lines = [f"# Hyperparameter Sensitivity Analysis ({exp_id})", "",
+             f"*Generated {report.get('generated_at', 'N/A')[:19]}*", "",
+             f"| Parameter | Current | Range Tested | {metric} Range | Sensitivity |",
+             "|-----------|---------|--------------|----------------|-------------|"]
+
+    for s in report.get("sensitivities", []):
+        current = s.get("current_value", "?")
+        metric_range = f"{s['metric_min']:.4f}–{s['metric_max']:.4f}" if s.get("metric_min") is not None else "N/A"
+        sens = f"{s['level']} ({s['sensitivity']:.4f})"
+        lines.append(f"| {s['param']} | {current} | — | {metric_range} | {sens} |")
+
+    recs = report.get("recommendations", [])
+    if recs:
+        lines.extend(["", "## Recommendations", ""])
+        for r in recs:
+            lines.append(f"- {r}")
+
+    return "\n".join(lines)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Hyperparameter sensitivity analysis")
+    parser.add_argument("exp_id", nargs="?", help="Experiment ID")
+    parser.add_argument("--params", help="Comma-separated parameter names")
+    parser.add_argument("--config", default="config.yaml")
+    parser.add_argument("--log", default=DEFAULT_LOG_PATH)
+    parser.add_argument("--json", action="store_true")
+    args = parser.parse_args()
+
+    params = [p.strip() for p in args.params.split(",")] if args.params else None
+
+    report = sensitivity_analysis(
+        exp_id=args.exp_id, params=params,
+        config_path=args.config, log_path=args.log,
+    )
+
+    if "error" not in report:
+        filepath = save_sensitivity_report(report)
+        print(f"Saved to {filepath}", file=sys.stderr)
+
+    if args.json:
+        print(json.dumps(report, indent=2, default=str))
+    else:
+        print(format_sensitivity_report(report))
+
+
+if __name__ == "__main__":
+    main()
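When sweep results already exist, the scoring functions can be driven directly, bypassing the planning branch and its config.yaml lookup. A minimal sketch (invented sweep numbers; assumes the scaffolded layout so `scripts.sensitivity_analysis` is importable):

```python
from scripts.sensitivity_analysis import (
    compute_sensitivity,
    generate_recommendations,
    rank_sensitivities,
)

# Invented one-at-a-time sweeps: learning_rate moves the metric, min_child_weight doesn't.
sweep_data = {
    "learning_rate": [
        {"value": 0.005, "metric_value": 0.81},
        {"value": 0.01, "metric_value": 0.86, "is_current": True},
        {"value": 0.02, "metric_value": 0.78},
    ],
    "min_child_weight": [
        {"value": 1, "metric_value": 0.8590, "is_current": True},
        {"value": 2, "metric_value": 0.8595},
        {"value": 4, "metric_value": 0.8592},
    ],
}

ranked = rank_sensitivities(
    [compute_sensitivity(param, results, "accuracy") for param, results in sweep_data.items()]
)
for rec in generate_recommendations(ranked):
    print(f"- {rec}")
# Expect: focus tuning on learning_rate (non-monotonic, sweet spot near 0.01);
# stop tuning min_child_weight.
```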