@zigrivers/scaffold 3.13.0 → 3.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -10
- package/content/knowledge/research/research-architecture.md +385 -0
- package/content/knowledge/research/research-conventions.md +248 -0
- package/content/knowledge/research/research-dev-environment.md +303 -0
- package/content/knowledge/research/research-experiment-loop.md +429 -0
- package/content/knowledge/research/research-experiment-tracking.md +336 -0
- package/content/knowledge/research/research-ml-architecture-search.md +383 -0
- package/content/knowledge/research/research-ml-evaluation.md +407 -0
- package/content/knowledge/research/research-ml-experiment-tracking.md +466 -0
- package/content/knowledge/research/research-ml-training-patterns.md +413 -0
- package/content/knowledge/research/research-observability.md +395 -0
- package/content/knowledge/research/research-overfitting-prevention.md +306 -0
- package/content/knowledge/research/research-project-structure.md +264 -0
- package/content/knowledge/research/research-quant-backtesting.md +326 -0
- package/content/knowledge/research/research-quant-market-data.md +366 -0
- package/content/knowledge/research/research-quant-metrics.md +335 -0
- package/content/knowledge/research/research-quant-requirements.md +223 -0
- package/content/knowledge/research/research-quant-risk.md +469 -0
- package/content/knowledge/research/research-quant-strategy-patterns.md +412 -0
- package/content/knowledge/research/research-requirements.md +201 -0
- package/content/knowledge/research/research-security.md +374 -0
- package/content/knowledge/research/research-sim-compute-management.md +538 -0
- package/content/knowledge/research/research-sim-engine-patterns.md +448 -0
- package/content/knowledge/research/research-sim-parameter-spaces.md +425 -0
- package/content/knowledge/research/research-sim-validation.md +456 -0
- package/content/knowledge/research/research-testing.md +334 -0
- package/content/methodology/research-ml-research.yml +23 -0
- package/content/methodology/research-overlay.yml +65 -0
- package/content/methodology/research-quant-finance.yml +29 -0
- package/content/methodology/research-simulation.yml +23 -0
- package/dist/cli/commands/adopt.d.ts.map +1 -1
- package/dist/cli/commands/adopt.js +30 -8
- package/dist/cli/commands/adopt.js.map +1 -1
- package/dist/cli/commands/adopt.serialization.test.js +49 -0
- package/dist/cli/commands/adopt.serialization.test.js.map +1 -1
- package/dist/cli/commands/adopt.test.js +8 -0
- package/dist/cli/commands/adopt.test.js.map +1 -1
- package/dist/cli/commands/build.d.ts.map +1 -1
- package/dist/cli/commands/build.js +191 -180
- package/dist/cli/commands/build.js.map +1 -1
- package/dist/cli/commands/complete.d.ts.map +1 -1
- package/dist/cli/commands/complete.js +16 -12
- package/dist/cli/commands/complete.js.map +1 -1
- package/dist/cli/commands/complete.test.js +14 -5
- package/dist/cli/commands/complete.test.js.map +1 -1
- package/dist/cli/commands/init.d.ts +4 -0
- package/dist/cli/commands/init.d.ts.map +1 -1
- package/dist/cli/commands/init.js +75 -51
- package/dist/cli/commands/init.js.map +1 -1
- package/dist/cli/commands/init.test.js +33 -27
- package/dist/cli/commands/init.test.js.map +1 -1
- package/dist/cli/commands/reset.d.ts.map +1 -1
- package/dist/cli/commands/reset.js +44 -40
- package/dist/cli/commands/reset.js.map +1 -1
- package/dist/cli/commands/reset.test.js +42 -20
- package/dist/cli/commands/reset.test.js.map +1 -1
- package/dist/cli/commands/rework.d.ts.map +1 -1
- package/dist/cli/commands/rework.js +16 -12
- package/dist/cli/commands/rework.js.map +1 -1
- package/dist/cli/commands/rework.test.js +12 -3
- package/dist/cli/commands/rework.test.js.map +1 -1
- package/dist/cli/commands/run.d.ts.map +1 -1
- package/dist/cli/commands/run.js +318 -298
- package/dist/cli/commands/run.js.map +1 -1
- package/dist/cli/commands/run.test.js +92 -120
- package/dist/cli/commands/run.test.js.map +1 -1
- package/dist/cli/commands/skip.d.ts.map +1 -1
- package/dist/cli/commands/skip.js +19 -15
- package/dist/cli/commands/skip.js.map +1 -1
- package/dist/cli/commands/skip.test.js +22 -11
- package/dist/cli/commands/skip.test.js.map +1 -1
- package/dist/cli/commands/update.d.ts.map +1 -1
- package/dist/cli/commands/update.js +3 -1
- package/dist/cli/commands/update.js.map +1 -1
- package/dist/cli/commands/update.test.js +8 -4
- package/dist/cli/commands/update.test.js.map +1 -1
- package/dist/cli/commands/version.d.ts.map +1 -1
- package/dist/cli/commands/version.js +3 -1
- package/dist/cli/commands/version.js.map +1 -1
- package/dist/cli/commands/version.test.js +9 -5
- package/dist/cli/commands/version.test.js.map +1 -1
- package/dist/cli/index.d.ts.map +1 -1
- package/dist/cli/index.js +2 -0
- package/dist/cli/index.js.map +1 -1
- package/dist/cli/init-flag-families.d.ts +6 -1
- package/dist/cli/init-flag-families.d.ts.map +1 -1
- package/dist/cli/init-flag-families.js +32 -1
- package/dist/cli/init-flag-families.js.map +1 -1
- package/dist/cli/init-flag-families.test.js +47 -0
- package/dist/cli/init-flag-families.test.js.map +1 -1
- package/dist/cli/output/interactive.d.ts +1 -0
- package/dist/cli/output/interactive.d.ts.map +1 -1
- package/dist/cli/output/interactive.js +5 -0
- package/dist/cli/output/interactive.js.map +1 -1
- package/dist/cli/shutdown.d.ts +51 -0
- package/dist/cli/shutdown.d.ts.map +1 -0
- package/dist/cli/shutdown.js +199 -0
- package/dist/cli/shutdown.js.map +1 -0
- package/dist/cli/shutdown.test.d.ts +2 -0
- package/dist/cli/shutdown.test.d.ts.map +1 -0
- package/dist/cli/shutdown.test.js +316 -0
- package/dist/cli/shutdown.test.js.map +1 -0
- package/dist/config/schema.d.ts +272 -16
- package/dist/config/schema.d.ts.map +1 -1
- package/dist/config/schema.js +25 -1
- package/dist/config/schema.js.map +1 -1
- package/dist/config/schema.test.js +103 -3
- package/dist/config/schema.test.js.map +1 -1
- package/dist/core/assembly/overlay-loader.d.ts +12 -0
- package/dist/core/assembly/overlay-loader.d.ts.map +1 -1
- package/dist/core/assembly/overlay-loader.js +30 -0
- package/dist/core/assembly/overlay-loader.js.map +1 -1
- package/dist/core/assembly/overlay-loader.test.js +66 -1
- package/dist/core/assembly/overlay-loader.test.js.map +1 -1
- package/dist/core/assembly/overlay-state-resolver.d.ts.map +1 -1
- package/dist/core/assembly/overlay-state-resolver.js +48 -19
- package/dist/core/assembly/overlay-state-resolver.js.map +1 -1
- package/dist/core/assembly/overlay-state-resolver.test.js +80 -0
- package/dist/core/assembly/overlay-state-resolver.test.js.map +1 -1
- package/dist/e2e/init.test.js +5 -4
- package/dist/e2e/init.test.js.map +1 -1
- package/dist/e2e/project-type-overlays.test.js +119 -0
- package/dist/e2e/project-type-overlays.test.js.map +1 -1
- package/dist/project/adopt.d.ts.map +1 -1
- package/dist/project/adopt.js +3 -1
- package/dist/project/adopt.js.map +1 -1
- package/dist/project/detectors/disambiguate.js +1 -1
- package/dist/project/detectors/disambiguate.js.map +1 -1
- package/dist/project/detectors/index.d.ts.map +1 -1
- package/dist/project/detectors/index.js +2 -1
- package/dist/project/detectors/index.js.map +1 -1
- package/dist/project/detectors/ml.d.ts.map +1 -1
- package/dist/project/detectors/ml.js +2 -6
- package/dist/project/detectors/ml.js.map +1 -1
- package/dist/project/detectors/research.d.ts +4 -0
- package/dist/project/detectors/research.d.ts.map +1 -0
- package/dist/project/detectors/research.js +141 -0
- package/dist/project/detectors/research.js.map +1 -0
- package/dist/project/detectors/research.test.d.ts +2 -0
- package/dist/project/detectors/research.test.d.ts.map +1 -0
- package/dist/project/detectors/research.test.js +235 -0
- package/dist/project/detectors/research.test.js.map +1 -0
- package/dist/project/detectors/shared-signals.d.ts +3 -0
- package/dist/project/detectors/shared-signals.d.ts.map +1 -0
- package/dist/project/detectors/shared-signals.js +9 -0
- package/dist/project/detectors/shared-signals.js.map +1 -0
- package/dist/project/detectors/types.d.ts +6 -2
- package/dist/project/detectors/types.d.ts.map +1 -1
- package/dist/project/detectors/types.js.map +1 -1
- package/dist/state/lock-manager.d.ts +1 -0
- package/dist/state/lock-manager.d.ts.map +1 -1
- package/dist/state/lock-manager.js +1 -1
- package/dist/state/lock-manager.js.map +1 -1
- package/dist/types/config.d.ts +7 -1
- package/dist/types/config.d.ts.map +1 -1
- package/dist/wizard/copy/core.d.ts.map +1 -1
- package/dist/wizard/copy/core.js +4 -0
- package/dist/wizard/copy/core.js.map +1 -1
- package/dist/wizard/copy/index.d.ts.map +1 -1
- package/dist/wizard/copy/index.js +2 -0
- package/dist/wizard/copy/index.js.map +1 -1
- package/dist/wizard/copy/research.d.ts +3 -0
- package/dist/wizard/copy/research.d.ts.map +1 -0
- package/dist/wizard/copy/research.js +27 -0
- package/dist/wizard/copy/research.js.map +1 -0
- package/dist/wizard/copy/types.d.ts +5 -1
- package/dist/wizard/copy/types.d.ts.map +1 -1
- package/dist/wizard/flags.d.ts +7 -1
- package/dist/wizard/flags.d.ts.map +1 -1
- package/dist/wizard/questions.d.ts +4 -2
- package/dist/wizard/questions.d.ts.map +1 -1
- package/dist/wizard/questions.js +27 -1
- package/dist/wizard/questions.js.map +1 -1
- package/dist/wizard/questions.test.js +51 -0
- package/dist/wizard/questions.test.js.map +1 -1
- package/dist/wizard/wizard.d.ts +3 -2
- package/dist/wizard/wizard.d.ts.map +1 -1
- package/dist/wizard/wizard.js +3 -1
- package/dist/wizard/wizard.js.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,407 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: research-ml-evaluation
|
|
3
|
+
description: Research evaluation patterns including ablation studies, statistical significance testing, multiple comparison correction, effect sizes, learning curve analysis, and efficiency frontiers
|
|
4
|
+
topics: [research, ml-research, evaluation, ablation, significance, bootstrap, bonferroni, effect-size, learning-curve, pareto]
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
Research evaluation differs fundamentally from production ML evaluation. In production, you measure a single model on held-out data and report aggregate metrics. In research, you compare multiple methods, ablate components to understand contributions, test whether differences are statistically significant rather than due to random variation, and characterize the efficiency frontier (what accuracy is achievable at what compute cost). Reporting a number without confidence intervals, without significance testing against baselines, and without ablations is not research evaluation -- it is anecdote.
|
|
8
|
+
|
|
9
|
+
## Summary
|
|
10
|
+
|
|
11
|
+
Evaluate research results with statistical rigor: run multiple seeds per configuration and report mean with confidence intervals, use paired statistical tests (paired t-test, Wilcoxon signed-rank, bootstrap) to determine whether improvements are significant, apply multiple comparison correction (Bonferroni, Holm-Bonferroni) when comparing against many baselines, report effect sizes (Cohen's d) alongside p-values, conduct systematic ablation studies to attribute performance to specific components, analyze learning curves to understand sample efficiency, and plot efficiency frontiers (accuracy vs compute) to characterize the cost-performance tradeoff.
|
|
12
|
+
|
|
13
|
+
## Deep Guidance
|
|
14
|
+
|
|
15
|
+
### Ablation Studies
|
|
16
|
+
|
|
17
|
+
Ablations remove or disable one component at a time to measure its contribution. Without ablations, you cannot claim which parts of your method actually matter:
|
|
18
|
+
|
|
19
|
+
```python
|
|
20
|
+
# src/evaluation/ablation.py
|
|
21
|
+
from dataclasses import dataclass, field
|
|
22
|
+
from typing import Any, Callable
|
|
23
|
+
|
|
24
|
+
@dataclass
class AblationConfig:
    """Describe an ablation study: a baseline config plus per-component overrides."""

    # Complete configuration of the un-ablated (baseline) model.
    full_config: dict[str, Any]
    # Component name -> value that switches that component off,
    # e.g. {"attention": False, "residual": False, "dropout": 0.0}.
    ablation_map: dict[str, Any] = field(default_factory=dict)
|
|
31
|
+
|
|
32
|
+
@dataclass
class AblationResult:
    """Score of the full model next to the score with one component disabled."""

    component_removed: str
    full_score: float
    ablated_score: float

    @property
    def contribution(self) -> float:
        """Score lost when the component is removed (full minus ablated)."""
        return self.full_score - self.ablated_score

    @property
    def relative_contribution(self) -> float:
        """Contribution divided by the full score; 0.0 when the full score is 0."""
        reference = self.full_score
        return self.contribution / reference if reference != 0 else 0.0
|
|
49
|
+
|
|
50
|
+
def run_ablation_study(
    full_config: dict[str, Any],
    ablation_map: dict[str, Any],
    train_and_eval_fn: Callable[[dict], float],
    n_seeds: int = 5,
) -> list[AblationResult]:
    """Run an ablation study, averaging every configuration over several seeds.

    The full configuration is evaluated once per seed. Then, for each entry
    of ``ablation_map``, the configuration with that single key overridden
    is evaluated over the same seeds.

    Args:
        full_config: Baseline configuration. It is never mutated; each run
            receives a fresh dict with a ``"seed"`` key added.
        ablation_map: Component name -> value that disables that component.
        train_and_eval_fn: Callable that takes a config dict and returns a score.
        n_seeds: Number of seeds (``0 .. n_seeds - 1``) per configuration.

    Returns:
        One ``AblationResult`` per entry of ``ablation_map``, sorted by
        contribution with the largest first.

    Raises:
        ValueError: If ``n_seeds`` is smaller than 1.
    """
    import numpy as np

    if n_seeds < 1:
        # The mean of zero runs is NaN, which would silently poison every
        # contribution and make the final sort meaningless.
        raise ValueError(f"n_seeds must be at least 1, got {n_seeds}")

    def mean_score(config: dict[str, Any]) -> float:
        # Average the score of one configuration over all seeds.
        per_seed = [
            train_and_eval_fn({**config, "seed": seed})
            for seed in range(n_seeds)
        ]
        # Cast so results carry a plain float, as AblationResult declares.
        return float(np.mean(per_seed))

    # Evaluate full model
    full_mean = mean_score(full_config)

    results = []
    for component, disabled_value in ablation_map.items():
        # Create config with this component disabled
        ablated_config = {**full_config, component: disabled_value}
        results.append(AblationResult(
            component_removed=component,
            full_score=full_mean,
            ablated_score=mean_score(ablated_config),
        ))

    # Sort by contribution (most important first)
    results.sort(key=lambda r: r.contribution, reverse=True)
    return results
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Statistical Significance Testing
|
|
87
|
+
|
|
88
|
+
Never report that method A outperforms method B without testing statistical significance. A difference of 0.5% accuracy on a single seed is noise, not signal:
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
# src/evaluation/significance.py
|
|
92
|
+
import numpy as np
|
|
93
|
+
from scipy import stats
|
|
94
|
+
from dataclasses import dataclass
|
|
95
|
+
|
|
96
|
+
@dataclass
class SignificanceResult:
    """Outcome of a significance test comparing two sets of scores."""

    test_name: str
    statistic: float
    p_value: float
    is_significant: bool  # Decision at the alpha level stored below
    alpha: float
    effect_size: float  # Cohen's d
    confidence_interval: tuple[float, float]
|
|
105
|
+
|
|
106
|
+
def paired_t_test(
    scores_a: list[float],
    scores_b: list[float],
    alpha: float = 0.05,
) -> SignificanceResult:
    """Compare two methods scored on the same seeds/folds with a paired t-test.

    The result holds the ``scipy.stats.ttest_rel`` statistic and p-value,
    Cohen's d of the paired differences, and a ``1 - alpha`` t-interval for
    the mean of ``scores_a - scores_b``. ``is_significant`` is ``p < alpha``.
    """
    assert len(scores_a) == len(scores_b), "Must have paired observations"
    first = np.array(scores_a)
    second = np.array(scores_b)

    t_stat, p = stats.ttest_rel(first, second)
    d = cohens_d_paired(first, second)

    # Confidence interval on the mean of the per-pair differences
    deltas = first - second
    interval = stats.t.interval(
        1 - alpha,
        df=len(deltas) - 1,
        loc=np.mean(deltas),
        scale=stats.sem(deltas),
    )

    return SignificanceResult(
        test_name="paired_t_test",
        statistic=t_stat,
        p_value=p,
        is_significant=p < alpha,
        alpha=alpha,
        effect_size=d,
        confidence_interval=interval,
    )
|
|
137
|
+
|
|
138
|
+
def bootstrap_ci(
    scores_a: list[float],
    scores_b: list[float],
    n_bootstrap: int = 10000,
    alpha: float = 0.05,
    seed: int = 42,
) -> SignificanceResult:
    """Paired bootstrap confidence interval for the difference in means.

    Each resample draws the same indices from both score lists, so the two
    inputs must be paired (same seeds/folds, same length).

    Args:
        scores_a: Scores of the first method.
        scores_b: Scores of the second method, paired with ``scores_a``.
        n_bootstrap: Number of bootstrap resamples.
        alpha: Significance level; the interval covers ``1 - alpha``.
        seed: Seed for the resampling generator (default keeps results
            reproducible from call to call).

    Returns:
        A ``SignificanceResult`` whose ``statistic`` is the observed
        difference in means, whose ``confidence_interval`` is the percentile
        interval of the resampled differences, and whose ``p_value`` is the
        two-sided bootstrap p-value.

    Raises:
        ValueError: If the two score lists do not have the same shape.
    """
    a = np.array(scores_a)
    b = np.array(scores_b)
    if a.shape != b.shape:
        # Shared indices only make sense for paired observations; without
        # this check a length mismatch fails late or broadcasts silently.
        raise ValueError("Must have paired observations")
    observed_diff = np.mean(a) - np.mean(b)

    rng = np.random.default_rng(seed)
    n = len(a)
    bootstrap_diffs = np.empty(n_bootstrap)
    for i in range(n_bootstrap):
        idx = rng.integers(0, n, size=n)
        bootstrap_diffs[i] = np.mean(a[idx]) - np.mean(b[idx])

    ci_low = np.percentile(bootstrap_diffs, 100 * alpha / 2)
    ci_high = np.percentile(bootstrap_diffs, 100 * (1 - alpha / 2))

    # Significant if CI does not contain 0
    is_significant = ci_low > 0 or ci_high < 0

    # Two-sided p-value: double the smaller tail so that ``p_value < alpha``
    # agrees with the two-sided interval used for ``is_significant``.
    smaller_tail = min(np.mean(bootstrap_diffs <= 0), np.mean(bootstrap_diffs >= 0))
    p_value = min(1.0, 2.0 * float(smaller_tail))

    return SignificanceResult(
        test_name="bootstrap_ci",
        statistic=observed_diff,
        p_value=p_value,
        is_significant=is_significant,
        alpha=alpha,
        effect_size=cohens_d_paired(a, b),
        confidence_interval=(ci_low, ci_high),
    )
|
|
172
|
+
|
|
173
|
+
def cohens_d_paired(a: np.ndarray, b: np.ndarray) -> float:
    """Cohen's d for paired samples.

    Computed as the mean of the per-pair differences ``a - b`` divided by
    their sample standard deviation (``ddof=1``).

    Returns:
        The effect size as a plain float. Degenerate inputs are handled
        explicitly instead of dividing by zero:

        * fewer than two pairs -> ``nan`` (the sample std is undefined);
        * all differences zero -> ``0.0`` (no effect);
        * constant non-zero difference -> ``inf`` with the sign of the
          difference.
    """
    diff = np.asarray(a) - np.asarray(b)
    if diff.size < 2:
        return float("nan")

    mean_diff = float(np.mean(diff))
    spread = float(np.std(diff, ddof=1))
    if spread == 0.0:
        if mean_diff == 0.0:
            # Identical scores: 0/0 would be NaN, but the effect is simply zero.
            return 0.0
        return float("inf") if mean_diff > 0 else float("-inf")
    return mean_diff / spread
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### Multiple Comparison Correction
|
|
180
|
+
|
|
181
|
+
When comparing against multiple baselines, the probability of at least one false positive grows. Correct for this:
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
# src/evaluation/multiple_comparisons.py
|
|
185
|
+
import numpy as np
|
|
186
|
+
|
|
187
|
+
def bonferroni_correction(p_values: list[float], alpha: float = 0.05) -> list[bool]:
    """Bonferroni correction: divide alpha by number of comparisons.

    Args:
        p_values: Raw p-values, one per comparison.
        alpha: Family-wise significance level.

    Returns:
        One flag per p-value, in input order; ``True`` where the p-value is
        below ``alpha / len(p_values)``. An empty input yields an empty list.
    """
    n_comparisons = len(p_values)
    if n_comparisons == 0:
        # Nothing to correct; avoids dividing alpha by zero.
        return []
    adjusted_alpha = alpha / n_comparisons
    return [p < adjusted_alpha for p in p_values]
|
|
191
|
+
|
|
192
|
+
def holm_bonferroni(p_values: list[float], alpha: float = 0.05) -> list[bool]:
    """Holm-Bonferroni step-down procedure (more powerful than Bonferroni).

    P-values are visited from smallest to largest; the k-th smallest
    (0-based) is compared against ``alpha / (n - k)``. The first p-value
    that is not below its threshold ends the procedure, leaving it and all
    larger p-values unrejected. Flags are returned in input order.
    """
    total = len(p_values)
    rejected = [False] * total

    # Positions ordered by ascending p-value (ties keep their input order)
    ascending = sorted(range(total), key=lambda position: p_values[position])

    for step, position in enumerate(ascending):
        threshold = alpha / (total - step)
        if not p_values[position] < threshold:
            # Step-down: the first failure stops all further rejections
            break
        rejected[position] = True

    return rejected
|
|
208
|
+
|
|
209
|
+
def format_comparison_table(
    method_names: list[str],
    scores: list[list[float]],  # [method][seed]
    baseline_idx: int = 0,
    alpha: float = 0.05,
) -> str:
    """Render a plain-text table comparing every method against a baseline.

    Each row shows a method's mean and standard deviation. Non-baseline rows
    also show the difference in means to the baseline and a ``*`` when the
    paired t-test against the baseline stays significant after
    Holm-Bonferroni correction at level ``alpha``. The baseline row comes
    first among the data rows; the others follow in input order.
    """
    from src.evaluation.significance import paired_t_test

    baseline_scores = scores[baseline_idx]
    lines = [
        f"{'Method':<20} {'Mean':>8} {'Std':>8} {'vs Baseline':>12} {'Sig?':>6}",
        "-" * 60,
    ]

    # First pass: summarise every method and test the non-baseline ones
    compared = []
    for position, (name, method_scores) in enumerate(zip(method_names, scores)):
        mean = np.mean(method_scores)
        std = np.std(method_scores)
        if position != baseline_idx:
            outcome = paired_t_test(method_scores, baseline_scores, alpha)
            compared.append((name, mean, std, outcome))
            continue
        lines.append(f"{name:<20} {mean:>8.4f} {std:>8.4f} {'(baseline)':>12} {'---':>6}")

    # Second pass: correct all p-values together, then emit the rows
    raw_p_values = [outcome.p_value for _, _, _, outcome in compared]
    corrected = holm_bonferroni(raw_p_values, alpha)
    for (name, mean, std, _outcome), is_sig in zip(compared, corrected):
        diff = mean - np.mean(baseline_scores)
        sig_marker = "*" if is_sig else ""
        lines.append(
            f"{name:<20} {mean:>8.4f} {std:>8.4f} {diff:>+12.4f} {sig_marker:>6}"
        )

    return "\n".join(lines)
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
### Effect Sizes
|
|
247
|
+
|
|
248
|
+
P-values tell you how unlikely the observed difference would be if there were no real difference; effect sizes tell you whether the difference is large enough to matter:
|
|
249
|
+
|
|
250
|
+
```python
|
|
251
|
+
# src/evaluation/effect_size.py
|
|
252
|
+
import numpy as np
|
|
253
|
+
|
|
254
|
+
def interpret_cohens_d(d: float) -> str:
    """Interpret Cohen's d magnitude (Cohen 1988 conventions).

    Args:
        d: Effect size; only its absolute value is used.

    Returns:
        ``"negligible"`` (< 0.2), ``"small"`` (< 0.5), ``"medium"`` (< 0.8)
        or ``"large"``; ``"undefined"`` when ``d`` is NaN.
    """
    abs_d = abs(d)
    if np.isnan(abs_d):
        # NaN fails every "<" comparison below and would otherwise be
        # reported as "large".
        return "undefined"
    if abs_d < 0.2:
        return "negligible"
    elif abs_d < 0.5:
        return "small"
    elif abs_d < 0.8:
        return "medium"
    else:
        return "large"
|
|
265
|
+
|
|
266
|
+
def common_language_effect(scores_a: list[float], scores_b: list[float]) -> float:
    """Estimate how often a score drawn from A beats a score drawn from B.

    Every (a, b) pair is compared once; ties count as half a win. The
    result is the win count divided by the number of pairs.
    """
    a = np.array(scores_a)
    b = np.array(scores_b)

    wins = 0
    draws = 0
    for ai in a:
        for bi in b:
            if ai > bi:
                wins += 1
            elif ai == bi:
                draws += 1

    return (wins + 0.5 * draws) / (len(a) * len(b))
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
### Learning Curve Analysis
|
|
276
|
+
|
|
277
|
+
Learning curves reveal sample efficiency -- how much data a method needs to reach a given performance level:
|
|
278
|
+
|
|
279
|
+
```python
|
|
280
|
+
# src/evaluation/learning_curves.py
|
|
281
|
+
import numpy as np
|
|
282
|
+
from dataclasses import dataclass
|
|
283
|
+
|
|
284
|
+
@dataclass
class LearningCurvePoint:
    """Train/validation scores aggregated at one training-set size."""

    train_size: int
    # Mean scores over the runs made at this size
    train_score: float
    val_score: float
    # Spread of those scores over the same runs
    train_score_std: float
    val_score_std: float
|
|
291
|
+
|
|
292
|
+
def compute_learning_curve(
    train_and_eval_fn,
    total_samples: int,
    fractions: list[float] | None = None,
    n_seeds: int = 5,
) -> list[LearningCurvePoint]:
    """Measure train/validation scores at several training-set sizes.

    For each fraction, the training size is ``int(total_samples * frac)``
    and ``train_and_eval_fn(train_size=..., seed=...)`` is called once per
    seed in ``range(n_seeds)``. Each call must return a mapping with
    ``"train_score"`` and ``"val_score"``. One point per fraction is
    returned, holding the mean and standard deviation over seeds.

    ``fractions`` defaults to ``[0.1, 0.2, 0.3, 0.5, 0.7, 1.0]``.
    """
    schedule = [0.1, 0.2, 0.3, 0.5, 0.7, 1.0] if fractions is None else fractions

    curve = []
    for frac in schedule:
        train_size = int(total_samples * frac)

        # Collect one train/validation score pair per seed
        train_scores = []
        val_scores = []
        for seed in range(n_seeds):
            outcome = train_and_eval_fn(train_size=train_size, seed=seed)
            train_scores.append(outcome["train_score"])
            val_scores.append(outcome["val_score"])

        summary = LearningCurvePoint(
            train_size=train_size,
            train_score=np.mean(train_scores),
            val_score=np.mean(val_scores),
            train_score_std=np.std(train_scores),
            val_score_std=np.std(val_scores),
        )
        curve.append(summary)

    return curve
|
|
322
|
+
|
|
323
|
+
def extrapolate_performance(
    curve: list[LearningCurvePoint],
    target_score: float,
) -> int | None:
    """Estimate how many samples are needed to reach target score.

    Uses power-law extrapolation: score = a - b * n^(-c), fitted to the
    ``train_size`` / ``val_score`` pairs of ``curve``.

    Args:
        curve: Learning-curve points; at least three are needed to fit the
            three parameters.
        target_score: Validation score to reach.

    Returns:
        The estimated number of samples (rounded up), or ``None`` when no
        estimate can be made: too few points, the fit fails, the fitted
        curve is not an increasing, saturating one (``b <= 0`` or
        ``c <= 0``), the target is at or above the fitted asymptote ``a``,
        or the inverted value is not finite.
    """
    from scipy.optimize import curve_fit

    # With fewer points than parameters curve_fit raises TypeError, which
    # the handler below would not catch.
    if len(curve) < 3:
        return None

    sizes = np.array([p.train_size for p in curve], dtype=float)
    scores = np.array([p.val_score for p in curve], dtype=float)

    def power_law(n, a, b, c):
        return a - b * np.power(n, -c)

    try:
        params, _ = curve_fit(power_law, sizes, scores, p0=[1.0, 1.0, 0.5], maxfev=5000)
    except (RuntimeError, ValueError):
        return None

    a, b, c = params
    if b <= 0 or c <= 0:
        # "a" is only an upper asymptote when b and c are positive; any
        # other fit cannot be inverted meaningfully.
        return None
    if target_score >= a:
        return None  # Asymptote is below target

    needed = (b / (a - target_score)) ** (1 / c)
    if not np.isfinite(needed):
        # int() of inf/nan would raise; report "no estimate" instead
        return None
    return int(np.ceil(needed))
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
### Efficiency Frontiers
|
|
351
|
+
|
|
352
|
+
Plot accuracy against compute cost to understand cost-performance tradeoffs:
|
|
353
|
+
|
|
354
|
+
```python
|
|
355
|
+
# src/evaluation/efficiency.py
|
|
356
|
+
from dataclasses import dataclass
|
|
357
|
+
|
|
358
|
+
@dataclass
class EfficiencyPoint:
    """Accuracy of one method together with its compute-cost figures."""

    method_name: str
    accuracy: float
    # Cost measures recorded for the method
    gpu_hours: float
    params_millions: float
    flops_per_sample: float
|
|
365
|
+
|
|
366
|
+
def compute_efficiency_frontier(
    points: list[EfficiencyPoint],
) -> list[EfficiencyPoint]:
    """Extract Pareto frontier of accuracy vs compute.

    A point is kept when its accuracy is strictly higher than that of every
    point costing the same or fewer ``gpu_hours``.

    Args:
        points: Candidate methods; the input list is not modified.

    Returns:
        The frontier points ordered by increasing ``gpu_hours`` (and
        therefore increasing accuracy).
    """
    # Sort by compute; among equal compute put the most accurate first so a
    # dominated point at the same cost can never enter the frontier.
    sorted_points = sorted(points, key=lambda p: (p.gpu_hours, -p.accuracy))
    frontier = []
    best_accuracy = -float("inf")

    for point in sorted_points:
        if point.accuracy > best_accuracy:
            frontier.append(point)
            best_accuracy = point.accuracy

    return frontier
|
|
381
|
+
|
|
382
|
+
def compute_efficiency_ratio(point: EfficiencyPoint, baseline: EfficiencyPoint) -> dict:
    """Compare efficiency of a method against a baseline.

    Args:
        point: The method being assessed.
        baseline: The reference method.

    Returns:
        A dict with:

        * ``"accuracy_gain"``: ``point.accuracy - baseline.accuracy``;
        * ``"compute_ratio"``: ``point.gpu_hours / baseline.gpu_hours``, or
          ``inf`` when the baseline reports no positive GPU time;
        * ``"accuracy_per_gpu_hour"``: gain divided by ``point.gpu_hours``,
          or ``0`` when the point reports no positive GPU time;
        * ``"is_efficient"``: whether the gain per unit of compute ratio is
          positive.
    """
    accuracy_gain = point.accuracy - baseline.accuracy
    if baseline.gpu_hours > 0:
        compute_ratio = point.gpu_hours / baseline.gpu_hours
    else:
        # Mirror the guard used for point.gpu_hours below instead of
        # raising ZeroDivisionError on a zero-cost baseline.
        compute_ratio = float("inf")
    return {
        "accuracy_gain": accuracy_gain,
        "compute_ratio": compute_ratio,
        "accuracy_per_gpu_hour": accuracy_gain / point.gpu_hours if point.gpu_hours > 0 else 0,
        # The denominator is clamped to a positive value, so for a finite
        # ratio this is simply the sign of the accuracy gain.
        "is_efficient": accuracy_gain / max(compute_ratio, 1e-6) > 0,
    }
|
|
392
|
+
```
|
|
393
|
+
|
|
394
|
+
### Minimum Reporting Standard
|
|
395
|
+
|
|
396
|
+
Every research evaluation must report at minimum:
|
|
397
|
+
|
|
398
|
+
| Element | Required | Purpose |
|
|
399
|
+
|---------|----------|---------|
|
|
400
|
+
| Mean metric (N seeds) | Yes | Central estimate |
|
|
401
|
+
| Standard deviation / CI | Yes | Uncertainty quantification |
|
|
402
|
+
| Significance test vs baseline | Yes | Is the improvement real? |
|
|
403
|
+
| Effect size | Yes | Is the improvement meaningful? |
|
|
404
|
+
| Number of seeds/folds | Yes | Reproducibility verification |
|
|
405
|
+
| Compute cost (GPU-hours) | Yes | Efficiency context |
|
|
406
|
+
| Ablation table | For new methods | Component contributions |
|
|
407
|
+
| Learning curve | For data-sensitive claims | Sample efficiency |
|