@zigrivers/scaffold 3.14.0 → 3.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. package/README.md +31 -9
  2. package/content/knowledge/research/research-architecture.md +385 -0
  3. package/content/knowledge/research/research-conventions.md +248 -0
  4. package/content/knowledge/research/research-dev-environment.md +303 -0
  5. package/content/knowledge/research/research-experiment-loop.md +429 -0
  6. package/content/knowledge/research/research-experiment-tracking.md +336 -0
  7. package/content/knowledge/research/research-ml-architecture-search.md +383 -0
  8. package/content/knowledge/research/research-ml-evaluation.md +407 -0
  9. package/content/knowledge/research/research-ml-experiment-tracking.md +466 -0
  10. package/content/knowledge/research/research-ml-training-patterns.md +413 -0
  11. package/content/knowledge/research/research-observability.md +395 -0
  12. package/content/knowledge/research/research-overfitting-prevention.md +306 -0
  13. package/content/knowledge/research/research-project-structure.md +264 -0
  14. package/content/knowledge/research/research-quant-backtesting.md +326 -0
  15. package/content/knowledge/research/research-quant-market-data.md +366 -0
  16. package/content/knowledge/research/research-quant-metrics.md +335 -0
  17. package/content/knowledge/research/research-quant-requirements.md +223 -0
  18. package/content/knowledge/research/research-quant-risk.md +469 -0
  19. package/content/knowledge/research/research-quant-strategy-patterns.md +412 -0
  20. package/content/knowledge/research/research-requirements.md +201 -0
  21. package/content/knowledge/research/research-security.md +374 -0
  22. package/content/knowledge/research/research-sim-compute-management.md +538 -0
  23. package/content/knowledge/research/research-sim-engine-patterns.md +448 -0
  24. package/content/knowledge/research/research-sim-parameter-spaces.md +425 -0
  25. package/content/knowledge/research/research-sim-validation.md +456 -0
  26. package/content/knowledge/research/research-testing.md +334 -0
  27. package/content/methodology/research-ml-research.yml +23 -0
  28. package/content/methodology/research-overlay.yml +65 -0
  29. package/content/methodology/research-quant-finance.yml +29 -0
  30. package/content/methodology/research-simulation.yml +23 -0
  31. package/dist/cli/commands/adopt.d.ts.map +1 -1
  32. package/dist/cli/commands/adopt.js +22 -1
  33. package/dist/cli/commands/adopt.js.map +1 -1
  34. package/dist/cli/commands/adopt.serialization.test.js +41 -0
  35. package/dist/cli/commands/adopt.serialization.test.js.map +1 -1
  36. package/dist/cli/commands/init.d.ts +4 -0
  37. package/dist/cli/commands/init.d.ts.map +1 -1
  38. package/dist/cli/commands/init.js +32 -2
  39. package/dist/cli/commands/init.js.map +1 -1
  40. package/dist/cli/init-flag-families.d.ts +6 -1
  41. package/dist/cli/init-flag-families.d.ts.map +1 -1
  42. package/dist/cli/init-flag-families.js +32 -1
  43. package/dist/cli/init-flag-families.js.map +1 -1
  44. package/dist/cli/init-flag-families.test.js +47 -0
  45. package/dist/cli/init-flag-families.test.js.map +1 -1
  46. package/dist/config/schema.d.ts +272 -16
  47. package/dist/config/schema.d.ts.map +1 -1
  48. package/dist/config/schema.js +25 -1
  49. package/dist/config/schema.js.map +1 -1
  50. package/dist/config/schema.test.js +103 -3
  51. package/dist/config/schema.test.js.map +1 -1
  52. package/dist/core/assembly/overlay-loader.d.ts +12 -0
  53. package/dist/core/assembly/overlay-loader.d.ts.map +1 -1
  54. package/dist/core/assembly/overlay-loader.js +30 -0
  55. package/dist/core/assembly/overlay-loader.js.map +1 -1
  56. package/dist/core/assembly/overlay-loader.test.js +66 -1
  57. package/dist/core/assembly/overlay-loader.test.js.map +1 -1
  58. package/dist/core/assembly/overlay-state-resolver.d.ts.map +1 -1
  59. package/dist/core/assembly/overlay-state-resolver.js +48 -19
  60. package/dist/core/assembly/overlay-state-resolver.js.map +1 -1
  61. package/dist/core/assembly/overlay-state-resolver.test.js +80 -0
  62. package/dist/core/assembly/overlay-state-resolver.test.js.map +1 -1
  63. package/dist/e2e/project-type-overlays.test.js +119 -0
  64. package/dist/e2e/project-type-overlays.test.js.map +1 -1
  65. package/dist/project/adopt.d.ts.map +1 -1
  66. package/dist/project/adopt.js +3 -1
  67. package/dist/project/adopt.js.map +1 -1
  68. package/dist/project/detectors/disambiguate.js +1 -1
  69. package/dist/project/detectors/disambiguate.js.map +1 -1
  70. package/dist/project/detectors/index.d.ts.map +1 -1
  71. package/dist/project/detectors/index.js +2 -1
  72. package/dist/project/detectors/index.js.map +1 -1
  73. package/dist/project/detectors/ml.d.ts.map +1 -1
  74. package/dist/project/detectors/ml.js +2 -6
  75. package/dist/project/detectors/ml.js.map +1 -1
  76. package/dist/project/detectors/research.d.ts +4 -0
  77. package/dist/project/detectors/research.d.ts.map +1 -0
  78. package/dist/project/detectors/research.js +141 -0
  79. package/dist/project/detectors/research.js.map +1 -0
  80. package/dist/project/detectors/research.test.d.ts +2 -0
  81. package/dist/project/detectors/research.test.d.ts.map +1 -0
  82. package/dist/project/detectors/research.test.js +235 -0
  83. package/dist/project/detectors/research.test.js.map +1 -0
  84. package/dist/project/detectors/shared-signals.d.ts +3 -0
  85. package/dist/project/detectors/shared-signals.d.ts.map +1 -0
  86. package/dist/project/detectors/shared-signals.js +9 -0
  87. package/dist/project/detectors/shared-signals.js.map +1 -0
  88. package/dist/project/detectors/types.d.ts +6 -2
  89. package/dist/project/detectors/types.d.ts.map +1 -1
  90. package/dist/project/detectors/types.js.map +1 -1
  91. package/dist/types/config.d.ts +7 -1
  92. package/dist/types/config.d.ts.map +1 -1
  93. package/dist/wizard/copy/core.d.ts.map +1 -1
  94. package/dist/wizard/copy/core.js +4 -0
  95. package/dist/wizard/copy/core.js.map +1 -1
  96. package/dist/wizard/copy/index.d.ts.map +1 -1
  97. package/dist/wizard/copy/index.js +2 -0
  98. package/dist/wizard/copy/index.js.map +1 -1
  99. package/dist/wizard/copy/research.d.ts +3 -0
  100. package/dist/wizard/copy/research.d.ts.map +1 -0
  101. package/dist/wizard/copy/research.js +27 -0
  102. package/dist/wizard/copy/research.js.map +1 -0
  103. package/dist/wizard/copy/types.d.ts +5 -1
  104. package/dist/wizard/copy/types.d.ts.map +1 -1
  105. package/dist/wizard/flags.d.ts +7 -1
  106. package/dist/wizard/flags.d.ts.map +1 -1
  107. package/dist/wizard/questions.d.ts +4 -2
  108. package/dist/wizard/questions.d.ts.map +1 -1
  109. package/dist/wizard/questions.js +27 -1
  110. package/dist/wizard/questions.js.map +1 -1
  111. package/dist/wizard/questions.test.js +51 -0
  112. package/dist/wizard/questions.test.js.map +1 -1
  113. package/dist/wizard/wizard.d.ts +3 -2
  114. package/dist/wizard/wizard.d.ts.map +1 -1
  115. package/dist/wizard/wizard.js +3 -1
  116. package/dist/wizard/wizard.js.map +1 -1
  117. package/package.json +1 -1
@@ -0,0 +1,407 @@
1
+ ---
2
+ name: research-ml-evaluation
3
+ description: Research evaluation patterns including ablation studies, statistical significance testing, multiple comparison correction, effect sizes, learning curve analysis, and efficiency frontiers
4
+ topics: [research, ml-research, evaluation, ablation, significance, bootstrap, bonferroni, effect-size, learning-curve, pareto]
5
+ ---
6
+
7
+ Research evaluation differs fundamentally from production ML evaluation. In production, you measure a single model on held-out data and report aggregate metrics. In research, you compare multiple methods, ablate components to understand contributions, test whether differences are statistically significant rather than due to random variation, and characterize the efficiency frontier (what accuracy is achievable at what compute cost). Reporting a number without confidence intervals, without significance testing against baselines, and without ablations is not research evaluation -- it is anecdote.
8
+
9
+ ## Summary
10
+
11
+ Evaluate research results with statistical rigor: run multiple seeds per configuration and report mean with confidence intervals, use paired statistical tests (paired t-test, Wilcoxon signed-rank, bootstrap) to determine whether improvements are significant, apply multiple comparison correction (Bonferroni, Holm-Bonferroni) when comparing against many baselines, report effect sizes (Cohen's d) alongside p-values, conduct systematic ablation studies to attribute performance to specific components, analyze learning curves to understand sample efficiency, and plot efficiency frontiers (accuracy vs compute) to characterize the cost-performance tradeoff.
12
+
13
+ ## Deep Guidance
14
+
15
+ ### Ablation Studies
16
+
17
+ Ablations remove or disable one component at a time to measure its contribution. Without ablations, you cannot claim which parts of your method actually matter:
18
+
19
+ ```python
20
+ # src/evaluation/ablation.py
21
+ from dataclasses import dataclass, field
22
+ from typing import Any, Callable
23
+
24
@dataclass
class AblationConfig:
    """Describe an ablation study: a baseline config plus per-component overrides.

    Attributes:
        full_config: The complete configuration, i.e. the un-ablated baseline.
        ablation_map: Maps a component name to the value that disables it,
            e.g. ``{"attention": False, "residual": False, "dropout": 0.0}``.
            Defaults to a fresh empty dict per instance.
    """

    full_config: dict[str, Any]
    ablation_map: dict[str, Any] = field(default_factory=dict)
31
+
32
@dataclass
class AblationResult:
    """Outcome of disabling one component, compared against the full model.

    Attributes:
        component_removed: Name of the component that was disabled.
        full_score: Score of the full (un-ablated) configuration.
        ablated_score: Score with the component disabled.
    """

    component_removed: str
    full_score: float
    ablated_score: float

    @property
    def contribution(self) -> float:
        """Score lost by removing the component (full minus ablated)."""
        return self.full_score - self.ablated_score

    @property
    def relative_contribution(self) -> float:
        """Contribution as a fraction of the full score.

        Returns 0.0 when the full score is zero so the division is never
        attempted with a zero denominator.
        """
        baseline = self.full_score
        return self.contribution / baseline if baseline != 0 else 0.0
49
+
50
def run_ablation_study(
    full_config: dict[str, Any],
    ablation_map: dict[str, Any],
    train_and_eval_fn: Callable[[dict], float],
    n_seeds: int = 5,
) -> list[AblationResult]:
    """Run an ablation study, averaging every configuration over several seeds.

    The full configuration and each ablated configuration are evaluated with
    the same seeds ``0 .. n_seeds - 1`` (injected under the ``"seed"`` key), so
    the resulting means are comparable.

    Args:
        full_config: Baseline configuration with every component enabled.
        ablation_map: Maps a component name to the value that disables it.
            Each entry overrides (or adds) that key in a copy of
            ``full_config``; ``full_config`` itself is never mutated.
        train_and_eval_fn: Trains and evaluates one configuration dict and
            returns its score.
        n_seeds: Number of seeds to average over; must be at least 1.

    Returns:
        One ``AblationResult`` per entry of ``ablation_map``, sorted by
        contribution with the largest first.

    Raises:
        ValueError: If ``n_seeds`` is less than 1. Averaging zero runs would
            otherwise produce NaN scores silently.
    """
    import numpy as np

    if n_seeds < 1:
        raise ValueError(f"n_seeds must be at least 1, got {n_seeds}")

    def mean_score(config: dict[str, Any]) -> float:
        # One evaluation per seed; plain float keeps NumPy scalars out of results.
        return float(np.mean([
            train_and_eval_fn({**config, "seed": seed})
            for seed in range(n_seeds)
        ]))

    full_mean = mean_score(full_config)

    results = [
        AblationResult(
            component_removed=component,
            full_score=full_mean,
            ablated_score=mean_score({**full_config, component: disabled_value}),
        )
        for component, disabled_value in ablation_map.items()
    ]

    # Most important component (largest score drop when removed) first.
    results.sort(key=lambda r: r.contribution, reverse=True)
    return results
84
+ ```
85
+
86
+ ### Statistical Significance Testing
87
+
88
+ Never report that method A outperforms method B without testing statistical significance. A difference of 0.5% accuracy on a single seed is noise, not signal:
89
+
90
+ ```python
91
+ # src/evaluation/significance.py
92
+ import numpy as np
93
+ from scipy import stats
94
+ from dataclasses import dataclass
95
+
96
@dataclass
class SignificanceResult:
    """Result record shared by the significance tests in this module.

    Attributes:
        test_name: Identifier of the test that produced the result.
        statistic: The test statistic.
        p_value: The p-value reported by the test.
        is_significant: Whether the result is significant at ``alpha``.
        alpha: Significance level the decision was made at.
        effect_size: Cohen's d.
        confidence_interval: ``(low, high)`` bounds of the interval.
    """

    test_name: str
    statistic: float
    p_value: float
    is_significant: bool
    alpha: float
    effect_size: float
    confidence_interval: tuple[float, float]
105
+
106
def paired_t_test(
    scores_a: list[float],
    scores_b: list[float],
    alpha: float = 0.05,
) -> SignificanceResult:
    """Compare two methods with a paired t-test over the same seeds/folds.

    Args:
        scores_a: Scores of method A, one per seed/fold.
        scores_b: Scores of method B, aligned element-wise with ``scores_a``.
        alpha: Significance level; also sets the ``1 - alpha`` coverage of the
            confidence interval on the mean difference ``a - b``.

    Returns:
        A ``SignificanceResult`` holding the t statistic, the p-value from
        ``scipy.stats.ttest_rel``, the decision at ``alpha``, the paired
        Cohen's d, and the t-based confidence interval on the mean difference.

    Raises:
        ValueError: If the two score lists differ in length.
    """
    # Raise instead of assert: asserts vanish under ``python -O``, after which
    # a length-1 input would be broadcast silently against the other sample.
    if len(scores_a) != len(scores_b):
        raise ValueError("Must have paired observations")

    a = np.asarray(scores_a, dtype=float)
    b = np.asarray(scores_b, dtype=float)

    statistic, p_value = stats.ttest_rel(a, b)

    # Confidence interval on the mean of the per-pair differences.
    diff = a - b
    ci_low, ci_high = stats.t.interval(
        1 - alpha,
        df=len(diff) - 1,
        loc=np.mean(diff),
        scale=stats.sem(diff),
    )

    # Cast NumPy scalars to builtins so the result matches its annotations.
    return SignificanceResult(
        test_name="paired_t_test",
        statistic=float(statistic),
        p_value=float(p_value),
        is_significant=bool(p_value < alpha),
        alpha=alpha,
        effect_size=cohens_d_paired(a, b),
        confidence_interval=(float(ci_low), float(ci_high)),
    )
137
+
138
def bootstrap_ci(
    scores_a: list[float],
    scores_b: list[float],
    n_bootstrap: int = 10000,
    alpha: float = 0.05,
) -> SignificanceResult:
    """Paired percentile-bootstrap interval for the difference in means.

    Each resample draws pair indices with replacement and applies the same
    indices to both score lists, so the pairing between A and B is preserved.
    The generator is seeded with a fixed value, making results reproducible.

    Args:
        scores_a: Scores of method A, one per seed/fold.
        scores_b: Scores of method B, aligned element-wise with ``scores_a``.
        n_bootstrap: Number of bootstrap resamples.
        alpha: Significance level; the interval spans the ``alpha / 2`` and
            ``1 - alpha / 2`` percentiles of the resampled differences.

    Returns:
        A ``SignificanceResult`` whose statistic is the observed mean
        difference ``mean(a) - mean(b)``, whose p-value is the two-sided
        bootstrap p-value, and which is significant when the interval
        excludes zero.

    Raises:
        ValueError: If the score lists are empty or differ in length.
    """
    a = np.asarray(scores_a, dtype=float)
    b = np.asarray(scores_b, dtype=float)
    # The same indices are applied to both arrays, so they must be aligned;
    # otherwise a longer ``b`` would be silently truncated.
    if len(a) != len(b):
        raise ValueError("Must have paired observations")
    if len(a) == 0:
        raise ValueError("Need at least one paired observation")

    observed_diff = np.mean(a) - np.mean(b)

    rng = np.random.default_rng(42)
    n = len(a)
    bootstrap_diffs = np.empty(n_bootstrap)
    for i in range(n_bootstrap):
        idx = rng.integers(0, n, size=n)
        bootstrap_diffs[i] = np.mean(a[idx]) - np.mean(b[idx])

    ci_low = np.percentile(bootstrap_diffs, 100 * alpha / 2)
    ci_high = np.percentile(bootstrap_diffs, 100 * (1 - alpha / 2))

    # Significant if the interval does not contain 0.
    is_significant = ci_low > 0 or ci_high < 0

    # Two-sided p-value, matching the two-sided interval above: twice the
    # smaller tail mass on either side of zero, capped at 1.
    tail_below = np.mean(bootstrap_diffs <= 0)
    tail_above = np.mean(bootstrap_diffs >= 0)
    p_value = min(1.0, 2.0 * float(min(tail_below, tail_above)))

    return SignificanceResult(
        test_name="bootstrap_ci",
        statistic=float(observed_diff),
        p_value=p_value,
        is_significant=bool(is_significant),
        alpha=alpha,
        effect_size=cohens_d_paired(a, b),
        confidence_interval=(float(ci_low), float(ci_high)),
    )
172
+
173
def cohens_d_paired(a: np.ndarray, b: np.ndarray) -> float:
    """Cohen's d for paired samples.

    Computed as the mean of the per-pair differences ``a - b`` divided by
    their sample standard deviation (``ddof=1``).

    Args:
        a: Scores of the first method.
        b: Scores of the second method, aligned element-wise with ``a``.

    Returns:
        The effect size as a plain float. Degenerate inputs are handled
        without emitting NumPy warnings:

        * fewer than two pairs -> NaN (the sample deviation is undefined);
        * all differences zero -> 0.0 (no effect);
        * constant non-zero differences -> +inf or -inf, signed like the
          mean difference.
    """
    diff = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    if diff.size < 2:
        return float("nan")

    mean_diff = float(np.mean(diff))
    sd_diff = float(np.std(diff, ddof=1))

    if sd_diff == 0.0:
        if mean_diff == 0.0:
            # 0/0 would be NaN; identical samples mean "no effect".
            return 0.0
        return float("inf") if mean_diff > 0 else float("-inf")

    return mean_diff / sd_diff
177
+ ```
178
+
179
+ ### Multiple Comparison Correction
180
+
181
+ When comparing against multiple baselines, the probability of at least one false positive grows. Correct for this:
182
+
183
+ ```python
184
+ # src/evaluation/multiple_comparisons.py
185
+ import numpy as np
186
+
187
def bonferroni_correction(p_values: list[float], alpha: float = 0.05) -> list[bool]:
    """Bonferroni correction: test each p-value against ``alpha / m``.

    Args:
        p_values: Raw p-values, one per comparison.
        alpha: Family-wise significance level.

    Returns:
        One flag per input p-value, in input order; True where the p-value is
        strictly below ``alpha`` divided by the number of comparisons. An
        empty input yields an empty list.
    """
    n_comparisons = len(p_values)
    if n_comparisons == 0:
        # Nothing to correct; avoids dividing alpha by zero.
        return []
    adjusted_alpha = alpha / n_comparisons
    return [p < adjusted_alpha for p in p_values]
191
+
192
def holm_bonferroni(p_values: list[float], alpha: float = 0.05) -> list[bool]:
    """Holm-Bonferroni step-down procedure (more powerful than Bonferroni).

    P-values are visited from smallest to largest; the k-th smallest
    (0-based) is compared against ``alpha / (m - k)``. The first p-value that
    is not strictly below its threshold ends the procedure, and it and every
    larger p-value stay non-significant.

    Args:
        p_values: Raw p-values, one per comparison.
        alpha: Family-wise significance level.

    Returns:
        One flag per input p-value, in the original input order.
    """
    total = len(p_values)
    rejected = [False] * total

    # Visit original positions in ascending p-value order (stable for ties).
    ascending = sorted(range(total), key=lambda position: p_values[position])

    for step, position in enumerate(ascending):
        threshold = alpha / (total - step)
        if not (p_values[position] < threshold):
            # Step-down: the first failure stops all further rejections.
            break
        rejected[position] = True

    return rejected
208
+
209
def format_comparison_table(
    method_names: list[str],
    scores: list[list[float]],  # [method][seed]
    baseline_idx: int = 0,
    alpha: float = 0.05,
) -> str:
    """Render a plain-text comparison table with significance markers.

    Every non-baseline method is compared with the baseline using a paired
    t-test; the resulting p-values are corrected jointly with Holm-Bonferroni
    and methods that remain significant are marked with ``*``.

    Args:
        method_names: Display name of each method.
        scores: Per-method score lists, indexed ``[method][seed]``.
        baseline_idx: Index of the baseline method within ``scores``.
        alpha: Significance level used for the tests and the correction.

    Returns:
        The table as one newline-joined string: header, separator, the
        baseline row, then one row per compared method.
    """
    from src.evaluation.significance import paired_t_test

    reference = scores[baseline_idx]
    rows = [
        f"{'Method':<20} {'Mean':>8} {'Std':>8} {'vs Baseline':>12} {'Sig?':>6}",
        "-" * 60,
    ]

    # Non-baseline methods are held back until every p-value is known,
    # because the correction needs the whole family at once.
    pending = []
    for position, (label, samples) in enumerate(zip(method_names, scores)):
        avg = np.mean(samples)
        spread = np.std(samples)
        if position == baseline_idx:
            rows.append(f"{label:<20} {avg:>8.4f} {spread:>8.4f} {'(baseline)':>12} {'---':>6}")
            continue
        outcome = paired_t_test(samples, reference, alpha)
        pending.append((label, avg, spread, outcome))

    # Apply Holm-Bonferroni correction across all baseline comparisons.
    flags = holm_bonferroni([entry[3].p_value for entry in pending], alpha)
    for (label, avg, spread, _outcome), flagged in zip(pending, flags):
        delta = avg - np.mean(reference)
        marker = "*" if flagged else ""
        rows.append(
            f"{label:<20} {avg:>8.4f} {spread:>8.4f} {delta:>+12.4f} {marker:>6}"
        )

    return "\n".join(rows)
244
+ ```
245
+
246
+ ### Effect Sizes
247
+
248
+ P-values tell you whether an observed difference is distinguishable from random variation; effect sizes tell you whether it is large enough to matter:
249
+
250
+ ```python
251
+ # src/evaluation/effect_size.py
252
+ import numpy as np
253
+
254
def interpret_cohens_d(d: float) -> str:
    """Label the magnitude of Cohen's d (Cohen 1988 conventions).

    Args:
        d: Effect size; only its absolute value is used.

    Returns:
        ``"negligible"`` below 0.2, ``"small"`` below 0.5, ``"medium"`` below
        0.8, and ``"large"`` otherwise.
    """
    magnitude = abs(d)
    # Exclusive upper bounds in ascending order; the first match wins.
    for upper_bound, label in ((0.2, "negligible"), (0.5, "small"), (0.8, "medium")):
        if magnitude < upper_bound:
            return label
    return "large"
265
+
266
def common_language_effect(scores_a: list[float], scores_b: list[float]) -> float:
    """Probability that a random score from A exceeds a random score from B.

    All ``len(a) * len(b)`` cross pairs are compared; a tie counts as half a
    win.

    Args:
        scores_a: Scores of method A.
        scores_b: Scores of method B.

    Returns:
        ``(wins + 0.5 * ties) / (len(a) * len(b))``.
    """
    first = np.array(scores_a)
    second = np.array(scores_b)

    # Column-vs-row broadcasting compares every A score with every B score.
    column = first[:, None]
    row = second[None, :]
    wins = int(np.count_nonzero(column > row))
    draws = int(np.count_nonzero(column == row))

    return (wins + 0.5 * draws) / (len(first) * len(second))
273
+ ```
274
+
275
+ ### Learning Curve Analysis
276
+
277
+ Learning curves reveal sample efficiency -- how much data a method needs to reach a given performance level:
278
+
279
+ ```python
280
+ # src/evaluation/learning_curves.py
281
+ import numpy as np
282
+ from dataclasses import dataclass
283
+
284
@dataclass
class LearningCurvePoint:
    """One point on a learning curve, aggregated over several runs.

    Attributes:
        train_size: Number of training samples used at this point.
        train_score: Mean training score across runs.
        val_score: Mean validation score across runs.
        train_score_std: Standard deviation of the training scores.
        val_score_std: Standard deviation of the validation scores.
    """

    train_size: int
    train_score: float
    val_score: float
    train_score_std: float
    val_score_std: float
291
+
292
def compute_learning_curve(
    train_and_eval_fn,
    total_samples: int,
    fractions: list[float] | None = None,
    n_seeds: int = 5,
) -> list[LearningCurvePoint]:
    """Measure train/validation scores at several training-set sizes.

    Args:
        train_and_eval_fn: Called as ``train_and_eval_fn(train_size=..., seed=...)``;
            must return a mapping with ``"train_score"`` and ``"val_score"``.
        total_samples: Size of the full training set.
        fractions: Fractions of ``total_samples`` to evaluate; defaults to
            ``[0.1, 0.2, 0.3, 0.5, 0.7, 1.0]``.
        n_seeds: Number of seeds (``0 .. n_seeds - 1``) run per size.

    Returns:
        One ``LearningCurvePoint`` per fraction, in the order given, holding
        the mean and standard deviation of both scores across seeds.
    """
    schedule = [0.1, 0.2, 0.3, 0.5, 0.7, 1.0] if fractions is None else fractions

    curve = []
    for share in schedule:
        # Truncates toward zero, so a fractional sample count rounds down.
        subset_size = int(total_samples * share)

        train_history = []
        val_history = []
        for seed in range(n_seeds):
            outcome = train_and_eval_fn(train_size=subset_size, seed=seed)
            train_history.append(outcome["train_score"])
            val_history.append(outcome["val_score"])

        curve.append(
            LearningCurvePoint(
                train_size=subset_size,
                train_score=np.mean(train_history),
                val_score=np.mean(val_history),
                train_score_std=np.std(train_history),
                val_score_std=np.std(val_history),
            )
        )

    return curve
322
+
323
def extrapolate_performance(
    curve: list[LearningCurvePoint],
    target_score: float,
) -> int | None:
    """Estimate how many samples are needed to reach a target validation score.

    Fits the power law ``score = a - b * n^(-c)`` to the ``(train_size,
    val_score)`` points of ``curve`` and solves it for ``n`` at
    ``target_score``.

    Args:
        curve: Learning-curve points; only ``train_size`` and ``val_score``
            are read.
        target_score: Validation score to reach.

    Returns:
        The estimated sample count rounded up, or ``None`` when no estimate
        can be made: fewer than three points (the model has three free
        parameters), a failed fit, a fitted asymptote ``a`` at or below the
        target, or a non-finite solution.
    """
    from scipy.optimize import curve_fit

    # curve_fit raises TypeError when there are fewer points than parameters.
    if len(curve) < 3:
        return None

    sizes = np.array([p.train_size for p in curve], dtype=float)
    scores = np.array([p.val_score for p in curve], dtype=float)

    def power_law(n, a, b, c):
        return a - b * np.power(n, -c)

    try:
        params, _ = curve_fit(power_law, sizes, scores, p0=[1.0, 1.0, 0.5], maxfev=5000)
    except (RuntimeError, ValueError, TypeError):
        return None

    a, b, c = params
    if target_score >= a:
        return None  # Asymptote is at or below the target

    with np.errstate(all="ignore"):
        needed = (b / (a - target_score)) ** (1 / c)

    # NaN (e.g. negative base) or inf cannot be converted to an int.
    if not np.isfinite(needed):
        return None
    return int(np.ceil(needed))
348
+ ```
349
+
350
+ ### Efficiency Frontiers
351
+
352
+ Plot accuracy against compute cost to understand cost-performance tradeoffs:
353
+
354
+ ```python
355
+ # src/evaluation/efficiency.py
356
+ from dataclasses import dataclass
357
+
358
@dataclass
class EfficiencyPoint:
    """Accuracy and cost measurements for one method.

    Attributes:
        method_name: Display name of the method.
        accuracy: Accuracy achieved by the method.
        gpu_hours: Compute cost in GPU-hours.
        params_millions: Parameter count, in millions.
        flops_per_sample: FLOPs spent per sample.
    """

    method_name: str
    accuracy: float
    gpu_hours: float
    params_millions: float
    flops_per_sample: float
365
+
366
def compute_efficiency_frontier(
    points: list[EfficiencyPoint],
) -> list[EfficiencyPoint]:
    """Extract the Pareto frontier of accuracy versus compute.

    A point is kept when it is strictly more accurate than every point that
    costs no more GPU-hours than it does.

    Args:
        points: Candidate methods; only ``gpu_hours`` and ``accuracy`` are
            read. The input list is not modified.

    Returns:
        The frontier points, ordered by increasing ``gpu_hours`` (and thus by
        strictly increasing accuracy).
    """
    # Sort by compute; among equal-cost points put the most accurate first so
    # a cheaper-or-equal but less accurate (dominated) point is never admitted.
    ordered = sorted(points, key=lambda p: (p.gpu_hours, -p.accuracy))

    frontier = []
    best_accuracy = -float("inf")
    for point in ordered:
        if point.accuracy > best_accuracy:
            frontier.append(point)
            best_accuracy = point.accuracy

    return frontier
381
+
382
def compute_efficiency_ratio(point: EfficiencyPoint, baseline: EfficiencyPoint) -> dict:
    """Compare the efficiency of a method against a baseline.

    Args:
        point: The method being assessed.
        baseline: The reference method.

    Returns:
        A dict with:

        * ``"accuracy_gain"``: ``point.accuracy - baseline.accuracy``.
        * ``"compute_ratio"``: ``point.gpu_hours / baseline.gpu_hours``, or
          ``inf`` when the baseline reports no positive compute.
        * ``"accuracy_per_gpu_hour"``: accuracy gain per GPU-hour spent by
          ``point``, or 0 when ``point.gpu_hours`` is not positive.
        * ``"is_efficient"``: whether the accuracy gain divided by the
          compute ratio (floored at 1e-6) is positive.
    """
    accuracy_gain = point.accuracy - baseline.accuracy

    # Guard the baseline the same way the per-GPU-hour figure guards the
    # point, so a zero-cost baseline does not raise ZeroDivisionError.
    if baseline.gpu_hours > 0:
        compute_ratio = point.gpu_hours / baseline.gpu_hours
    else:
        compute_ratio = float("inf")

    return {
        "accuracy_gain": accuracy_gain,
        "compute_ratio": compute_ratio,
        "accuracy_per_gpu_hour": accuracy_gain / point.gpu_hours if point.gpu_hours > 0 else 0,
        # NOTE(review): for any finite ratio the denominator is positive, so
        # this reduces to ``accuracy_gain > 0`` — confirm that is the intent.
        "is_efficient": accuracy_gain / max(compute_ratio, 1e-6) > 0,
    }
392
+ ```
393
+
394
+ ### Minimum Reporting Standard
395
+
396
+ Every research evaluation must report at minimum:
397
+
398
+ | Element | Required | Purpose |
399
+ |---------|----------|---------|
400
+ | Mean metric (N seeds) | Yes | Central estimate |
401
+ | Standard deviation / CI | Yes | Uncertainty quantification |
402
+ | Significance test vs baseline | Yes | Is the improvement real? |
403
+ | Effect size | Yes | Is the improvement meaningful? |
404
+ | Number of seeds/folds | Yes | Reproducibility verification |
405
+ | Compute cost (GPU-hours) | Yes | Efficiency context |
406
+ | Ablation table | For new methods | Component contributions |
407
+ | Learning curve | For data-sensitive claims | Sample efficiency |