alberta_framework-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alberta_framework/__init__.py +196 -0
- alberta_framework/core/__init__.py +27 -0
- alberta_framework/core/learners.py +530 -0
- alberta_framework/core/normalizers.py +192 -0
- alberta_framework/core/optimizers.py +422 -0
- alberta_framework/core/types.py +198 -0
- alberta_framework/py.typed +0 -0
- alberta_framework/streams/__init__.py +83 -0
- alberta_framework/streams/base.py +70 -0
- alberta_framework/streams/gymnasium.py +655 -0
- alberta_framework/streams/synthetic.py +995 -0
- alberta_framework/utils/__init__.py +113 -0
- alberta_framework/utils/experiments.py +334 -0
- alberta_framework/utils/export.py +509 -0
- alberta_framework/utils/metrics.py +112 -0
- alberta_framework/utils/statistics.py +527 -0
- alberta_framework/utils/timing.py +138 -0
- alberta_framework/utils/visualization.py +571 -0
- alberta_framework-0.1.0.dist-info/METADATA +198 -0
- alberta_framework-0.1.0.dist-info/RECORD +22 -0
- alberta_framework-0.1.0.dist-info/WHEEL +4 -0
- alberta_framework-0.1.0.dist-info/licenses/LICENSE +190 -0
alberta_framework/utils/statistics.py
@@ -0,0 +1,527 @@
"""Statistical analysis utilities for publication-quality experiments.

Provides functions for computing confidence intervals, significance tests,
effect sizes, and multiple comparison corrections.
"""

from typing import TYPE_CHECKING, NamedTuple

import numpy as np
from numpy.typing import NDArray

if TYPE_CHECKING:
    from alberta_framework.utils.experiments import AggregatedResults


class StatisticalSummary(NamedTuple):
    """Summary statistics for a set of values.

    Attributes:
        mean: Arithmetic mean
        std: Standard deviation
        sem: Standard error of the mean
        ci_lower: Lower bound of confidence interval
        ci_upper: Upper bound of confidence interval
        median: Median value
        iqr: Interquartile range
        n_seeds: Number of samples
    """

    mean: float
    std: float
    sem: float
    ci_lower: float
    ci_upper: float
    median: float
    iqr: float
    n_seeds: int


class SignificanceResult(NamedTuple):
    """Result of a statistical significance test.

    Attributes:
        test_name: Name of the test performed
        statistic: Test statistic value
        p_value: P-value of the test
        significant: Whether the result is significant at the given alpha
        alpha: Significance level used
        effect_size: Effect size (e.g., Cohen's d)
        method_a: Name of first method
        method_b: Name of second method
    """

    test_name: str
    statistic: float
    p_value: float
    significant: bool
    alpha: float
    effect_size: float
    method_a: str
    method_b: str


def compute_statistics(
    values: NDArray[np.float64] | list[float],
    confidence_level: float = 0.95,
) -> StatisticalSummary:
    """Compute comprehensive statistics for a set of values.

    Args:
        values: Array of values (e.g., final performance across seeds)
        confidence_level: Confidence level for CI (default 0.95)

    Returns:
        StatisticalSummary with all statistics
    """
    arr = np.asarray(values)
    n = len(arr)

    mean = float(np.mean(arr))
    std = float(np.std(arr, ddof=1)) if n > 1 else 0.0
    sem = std / np.sqrt(n) if n > 1 else 0.0
    median = float(np.median(arr))
    q75, q25 = np.percentile(arr, [75, 25])
    iqr = float(q75 - q25)

    # Compute confidence interval
    try:
        from scipy import stats

        if n > 1:
            t_value = float(stats.t.ppf((1 + confidence_level) / 2, n - 1))
            margin = t_value * sem
            ci_lower = mean - margin
            ci_upper = mean + margin
        else:
            ci_lower = ci_upper = mean
    except ImportError:
        # Fallback without scipy: use normal approximation
        z_value = 1.96 if confidence_level == 0.95 else 2.576  # 95% or 99%
        margin = z_value * sem
        ci_lower = mean - margin
        ci_upper = mean + margin

    return StatisticalSummary(
        mean=mean,
        std=std,
        sem=sem,
        ci_lower=float(ci_lower),
        ci_upper=float(ci_upper),
        median=median,
        iqr=iqr,
        n_seeds=n,
    )
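
# Usage sketch (editor's illustration; not part of the released file):
# summarising hypothetical per-seed final errors from five runs. With scipy
# installed the CI uses the t-distribution; otherwise the normal approximation
# above applies.
#
#     >>> summary = compute_statistics([0.12, 0.15, 0.11, 0.14, 0.13])
#     >>> summary.n_seeds
#     5
#     >>> summary.ci_lower <= summary.mean <= summary.ci_upper
#     True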


def compute_timeseries_statistics(
    metric_array: NDArray[np.float64],
    confidence_level: float = 0.95,
) -> tuple[NDArray[np.float64], NDArray[np.float64], NDArray[np.float64]]:
    """Compute mean and confidence intervals for timeseries data.

    Args:
        metric_array: Array of shape (n_seeds, n_steps)
        confidence_level: Confidence level for CI

    Returns:
        Tuple of (mean, ci_lower, ci_upper) arrays of shape (n_steps,)
    """
    n_seeds = metric_array.shape[0]
    mean = np.mean(metric_array, axis=0)
    std = np.std(metric_array, axis=0, ddof=1)
    sem = std / np.sqrt(n_seeds)

    try:
        from scipy import stats

        t_value = stats.t.ppf((1 + confidence_level) / 2, n_seeds - 1)
    except ImportError:
        t_value = 1.96 if confidence_level == 0.95 else 2.576

    margin = t_value * sem
    ci_lower = mean - margin
    ci_upper = mean + margin

    return mean, ci_lower, ci_upper
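
# Usage sketch (editor's illustration; not part of the released file): the
# per-step mean and CI band for a hypothetical (n_seeds, n_steps)
# learning-curve array, with shapes as described in the docstring above.
#
#     >>> curves = np.arange(12, dtype=np.float64).reshape(3, 4)  # 3 seeds, 4 steps
#     >>> m, lo, hi = compute_timeseries_statistics(curves)
#     >>> m.shape, lo.shape, hi.shape
#     ((4,), (4,), (4,))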


def cohens_d(
    values_a: NDArray[np.float64] | list[float],
    values_b: NDArray[np.float64] | list[float],
) -> float:
    """Compute Cohen's d effect size.

    Args:
        values_a: Values for first group
        values_b: Values for second group

    Returns:
        Cohen's d (positive means a > b)
    """
    a = np.asarray(values_a)
    b = np.asarray(values_b)

    mean_a = np.mean(a)
    mean_b = np.mean(b)

    n_a = len(a)
    n_b = len(b)

    # Pooled standard deviation
    var_a = np.var(a, ddof=1) if n_a > 1 else 0.0
    var_b = np.var(b, ddof=1) if n_b > 1 else 0.0

    pooled_std = np.sqrt(((n_a - 1) * var_a + (n_b - 1) * var_b) / (n_a + n_b - 2))

    if pooled_std == 0:
        return 0.0

    return float((mean_a - mean_b) / pooled_std)
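
# Usage sketch (editor's illustration; not part of the released file): a
# positive d means group A has the larger mean, expressed in units of the
# pooled standard deviation. The toy numbers below are hypothetical.
#
#     >>> cohens_d([2.0, 3.0, 4.0], [1.0, 2.0, 3.0])
#     1.0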


def ttest_comparison(
    values_a: NDArray[np.float64] | list[float],
    values_b: NDArray[np.float64] | list[float],
    paired: bool = True,
    alpha: float = 0.05,
    method_a: str = "A",
    method_b: str = "B",
) -> SignificanceResult:
    """Perform t-test comparison between two methods.

    Args:
        values_a: Values for first method
        values_b: Values for second method
        paired: Whether to use paired t-test (default True for same seeds)
        alpha: Significance level
        method_a: Name of first method
        method_b: Name of second method

    Returns:
        SignificanceResult with test results
    """
    a = np.asarray(values_a)
    b = np.asarray(values_b)

    try:
        from scipy import stats

        if paired:
            result = stats.ttest_rel(a, b)
            test_name = "paired t-test"
        else:
            result = stats.ttest_ind(a, b)
            test_name = "independent t-test"
        # scipy returns (statistic, pvalue) tuple
        stat_val = float(result[0])
        p_val = float(result[1])
    except ImportError:
        raise ImportError("scipy is required for t-test. Install with: pip install scipy")

    effect = cohens_d(a, b)

    return SignificanceResult(
        test_name=test_name,
        statistic=stat_val,
        p_value=p_val,
        significant=p_val < alpha,
        alpha=alpha,
        effect_size=effect,
        method_a=method_a,
        method_b=method_b,
    )
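
# Usage sketch (editor's illustration; not part of the released file): a
# paired comparison of two hypothetical methods evaluated on the same seeds.
# The values and method names below are made up, and scipy must be installed.
#
#     >>> res = ttest_comparison(
#     ...     [0.10, 0.12, 0.11, 0.13],
#     ...     [0.20, 0.25, 0.22, 0.26],
#     ...     method_a="method_a", method_b="method_b",
#     ... )
#     >>> res.test_name
#     'paired t-test'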


def mann_whitney_comparison(
    values_a: NDArray[np.float64] | list[float],
    values_b: NDArray[np.float64] | list[float],
    alpha: float = 0.05,
    method_a: str = "A",
    method_b: str = "B",
) -> SignificanceResult:
    """Perform Mann-Whitney U test (non-parametric).

    Args:
        values_a: Values for first method
        values_b: Values for second method
        alpha: Significance level
        method_a: Name of first method
        method_b: Name of second method

    Returns:
        SignificanceResult with test results
    """
    a = np.asarray(values_a)
    b = np.asarray(values_b)

    try:
        from scipy import stats

        result = stats.mannwhitneyu(a, b, alternative="two-sided")
        # scipy returns (statistic, pvalue) tuple
        stat_val = float(result[0])
        p_val = float(result[1])
    except ImportError:
        raise ImportError(
            "scipy is required for Mann-Whitney test. Install with: pip install scipy"
        )

    # Compute rank-biserial correlation as effect size
    n_a, n_b = len(a), len(b)
    r = 1 - (2 * stat_val) / (n_a * n_b)

    return SignificanceResult(
        test_name="Mann-Whitney U",
        statistic=stat_val,
        p_value=p_val,
        significant=p_val < alpha,
        alpha=alpha,
        effect_size=r,
        method_a=method_a,
        method_b=method_b,
    )


def wilcoxon_comparison(
    values_a: NDArray[np.float64] | list[float],
    values_b: NDArray[np.float64] | list[float],
    alpha: float = 0.05,
    method_a: str = "A",
    method_b: str = "B",
) -> SignificanceResult:
    """Perform Wilcoxon signed-rank test (paired non-parametric).

    Args:
        values_a: Values for first method
        values_b: Values for second method
        alpha: Significance level
        method_a: Name of first method
        method_b: Name of second method

    Returns:
        SignificanceResult with test results
    """
    a = np.asarray(values_a)
    b = np.asarray(values_b)

    try:
        from scipy import stats

        result = stats.wilcoxon(a, b, alternative="two-sided")
        # scipy returns (statistic, pvalue) tuple
        stat_val = float(result[0])
        p_val = float(result[1])
    except ImportError:
        raise ImportError(
            "scipy is required for Wilcoxon test. Install with: pip install scipy"
        )

    effect = cohens_d(a, b)

    return SignificanceResult(
        test_name="Wilcoxon signed-rank",
        statistic=stat_val,
        p_value=p_val,
        significant=p_val < alpha,
        alpha=alpha,
        effect_size=effect,
        method_a=method_a,
        method_b=method_b,
    )
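
# Editor's note (illustrative; not part of the released file): the three
# comparison helpers share one calling convention, so swapping the parametric
# paired t-test for Mann-Whitney (unpaired, rank-based) or Wilcoxon (paired,
# rank-based) is a one-line change. All require scipy; the values are made up.
#
#     >>> a, b = [0.10, 0.12, 0.11, 0.13, 0.09], [0.20, 0.25, 0.22, 0.26, 0.21]
#     >>> mann_whitney_comparison(a, b).test_name
#     'Mann-Whitney U'
#     >>> wilcoxon_comparison(a, b).test_name
#     'Wilcoxon signed-rank'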


def bonferroni_correction(
    p_values: list[float],
    alpha: float = 0.05,
) -> tuple[list[bool], float]:
    """Apply Bonferroni correction for multiple comparisons.

    Args:
        p_values: List of p-values from multiple tests
        alpha: Family-wise significance level

    Returns:
        Tuple of (list of significant booleans, corrected alpha)
    """
    n_tests = len(p_values)
    corrected_alpha = alpha / n_tests
    significant = [p < corrected_alpha for p in p_values]
    return significant, corrected_alpha
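
# Usage sketch (editor's illustration; not part of the released file): with
# three tests at a family-wise alpha of 0.05, every p-value is compared
# against 0.05 / 3.
#
#     >>> flags, corrected = bonferroni_correction([0.001, 0.02, 0.04], alpha=0.05)
#     >>> flags
#     [True, False, False]
#     >>> round(corrected, 4)
#     0.0167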


def holm_correction(
    p_values: list[float],
    alpha: float = 0.05,
) -> list[bool]:
    """Apply Holm-Bonferroni step-down correction.

    More powerful than Bonferroni while still controlling FWER.

    Args:
        p_values: List of p-values from multiple tests
        alpha: Family-wise significance level

    Returns:
        List of significant booleans
    """
    n_tests = len(p_values)

    # Sort p-values and track original indices
    sorted_indices = np.argsort(p_values)
    sorted_p = [p_values[i] for i in sorted_indices]

    # Apply Holm correction
    significant_sorted = []
    for i, p in enumerate(sorted_p):
        corrected_alpha = alpha / (n_tests - i)
        if p < corrected_alpha:
            significant_sorted.append(True)
        else:
            # Once we fail to reject, all subsequent are not significant
            significant_sorted.extend([False] * (n_tests - i))
            break

    # Restore original order
    significant = [False] * n_tests
    for orig_idx, sig in zip(sorted_indices, significant_sorted, strict=False):
        significant[orig_idx] = sig

    return significant
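
# Usage sketch (editor's illustration; not part of the released file): on the
# same p-values as the Bonferroni example above, Holm tests the smallest
# against alpha/3, the next against alpha/2, and the last against alpha, so it
# rejects all three where Bonferroni rejects only one.
#
#     >>> holm_correction([0.001, 0.02, 0.04], alpha=0.05)
#     [True, True, True]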


def pairwise_comparisons(
    results: "dict[str, AggregatedResults]",  # noqa: F821
    metric: str = "squared_error",
    test: str = "ttest",
    correction: str = "bonferroni",
    alpha: float = 0.05,
    window: int = 100,
) -> dict[tuple[str, str], SignificanceResult]:
    """Perform all pairwise comparisons between methods.

    Args:
        results: Dictionary mapping config name to AggregatedResults
        metric: Metric to compare
        test: Test to use ("ttest", "mann_whitney", or "wilcoxon")
        correction: Multiple comparison correction ("bonferroni" or "holm")
        alpha: Significance level
        window: Number of final steps to average

    Returns:
        Dictionary mapping (method_a, method_b) to SignificanceResult
    """
    from alberta_framework.utils.experiments import AggregatedResults

    names = list(results.keys())
    n = len(names)

    if n < 2:
        return {}

    # Extract final values for each method
    final_values: dict[str, NDArray[np.float64]] = {}
    for name, agg in results.items():
        if not isinstance(agg, AggregatedResults):
            raise TypeError(f"Expected AggregatedResults, got {type(agg)}")
        arr = agg.metric_arrays[metric]
        final_window = min(window, arr.shape[1])
        final_values[name] = np.mean(arr[:, -final_window:], axis=1)

    if test not in ("ttest", "mann_whitney", "wilcoxon"):
        raise ValueError(f"Unknown test: {test}")

    # Perform all pairwise comparisons
    comparisons: dict[tuple[str, str], SignificanceResult] = {}
    p_values: list[float] = []

    for i in range(n):
        for j in range(i + 1, n):
            name_a, name_b = names[i], names[j]
            values_a = final_values[name_a]
            values_b = final_values[name_b]

            if test == "ttest":
                result = ttest_comparison(
                    values_a, values_b, paired=True, alpha=alpha,
                    method_a=name_a, method_b=name_b,
                )
            elif test == "mann_whitney":
                result = mann_whitney_comparison(
                    values_a, values_b, alpha=alpha,
                    method_a=name_a, method_b=name_b,
                )
            else:  # wilcoxon
                result = wilcoxon_comparison(
                    values_a, values_b, alpha=alpha,
                    method_a=name_a, method_b=name_b,
                )

            comparisons[(name_a, name_b)] = result
            p_values.append(result.p_value)

    # Apply multiple comparison correction
    if correction == "bonferroni":
        significant_list, _ = bonferroni_correction(p_values, alpha)
    elif correction == "holm":
        significant_list = holm_correction(p_values, alpha)
    else:
        raise ValueError(f"Unknown correction: {correction}")

    # Update significance based on correction
    corrected_comparisons: dict[tuple[str, str], SignificanceResult] = {}
    for (key, result), sig in zip(comparisons.items(), significant_list, strict=False):
        corrected_comparisons[key] = SignificanceResult(
            test_name=f"{result.test_name} ({correction})",
            statistic=result.statistic,
            p_value=result.p_value,
            significant=sig,
            alpha=alpha,
            effect_size=result.effect_size,
            method_a=result.method_a,
            method_b=result.method_b,
        )

    return corrected_comparisons


def bootstrap_ci(
    values: NDArray[np.float64] | list[float],
    statistic: str = "mean",
    confidence_level: float = 0.95,
    n_bootstrap: int = 10000,
    seed: int = 42,
) -> tuple[float, float, float]:
    """Compute bootstrap confidence interval.

    Args:
        values: Array of values
        statistic: Statistic to bootstrap ("mean" or "median")
        confidence_level: Confidence level
        n_bootstrap: Number of bootstrap samples
        seed: Random seed

    Returns:
        Tuple of (point_estimate, ci_lower, ci_upper)
    """
    arr = np.asarray(values)
    rng = np.random.default_rng(seed)

    stat_func = np.mean if statistic == "mean" else np.median
    point_estimate = float(stat_func(arr))

    # Generate bootstrap samples
    bootstrap_stats_list: list[float] = []
    for _ in range(n_bootstrap):
        sample = rng.choice(arr, size=len(arr), replace=True)
        bootstrap_stats_list.append(float(stat_func(sample)))

    bootstrap_stats = np.array(bootstrap_stats_list)

    # Percentile method
    lower_percentile = (1 - confidence_level) / 2 * 100
    upper_percentile = (1 + confidence_level) / 2 * 100
    ci_lower = float(np.percentile(bootstrap_stats, lower_percentile))
    ci_upper = float(np.percentile(bootstrap_stats, upper_percentile))

    return point_estimate, ci_lower, ci_upper
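
# Usage sketch (editor's illustration; not part of the released file): a
# percentile-bootstrap interval for the median of hypothetical per-seed
# scores; the fixed default seed makes the interval reproducible.
#
#     >>> est, lo, hi = bootstrap_ci([0.9, 1.1, 1.0, 1.2, 0.8], statistic="median")
#     >>> est
#     1.0
#     >>> lo <= hi
#     True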

alberta_framework/utils/timing.py
@@ -0,0 +1,138 @@
"""Timing utilities for measuring and reporting experiment durations.

This module provides a simple Timer context manager for measuring execution time
and formatting durations in a human-readable format.

Example:
    >>> from alberta_framework.utils.timing import Timer
    >>>
    >>> with Timer("Training"):
    ...     # run training code
    ...     pass
    Training completed in 1.23s
    >>>
    >>> # Or capture the duration:
    >>> with Timer("Experiment") as t:
    ...     # run experiment
    ...     pass
    >>> print(f"Took {t.duration:.2f} seconds")
"""

import time
from collections.abc import Callable
from types import TracebackType


def format_duration(seconds: float) -> str:
    """Format a duration in seconds as a human-readable string.

    Args:
        seconds: Duration in seconds

    Returns:
        Formatted string like "1.23s", "2m 30.5s", or "1h 5m 30s"

    Examples:
        >>> format_duration(0.5)
        '0.50s'
        >>> format_duration(90.5)
        '1m 30.50s'
        >>> format_duration(3665)
        '1h 1m 5.00s'
    """
    if seconds < 60:
        return f"{seconds:.2f}s"
    elif seconds < 3600:
        minutes = int(seconds // 60)
        secs = seconds % 60
        return f"{minutes}m {secs:.2f}s"
    else:
        hours = int(seconds // 3600)
        remaining = seconds % 3600
        minutes = int(remaining // 60)
        secs = remaining % 60
        return f"{hours}h {minutes}m {secs:.2f}s"


class Timer:
    """Context manager for timing code execution.

    Measures wall-clock time for a block of code and optionally prints
    the duration when the block completes.

    Attributes:
        name: Description of what is being timed
        duration: Elapsed time in seconds (available after context exits)
        start_time: Timestamp when timing started
        end_time: Timestamp when timing ended

    Example:
        >>> with Timer("Training loop"):
        ...     for i in range(1000):
        ...         pass
        Training loop completed in 0.01s

        >>> # Silent timing (no print):
        >>> with Timer("Silent", verbose=False) as t:
        ...     time.sleep(0.1)
        >>> print(f"Elapsed: {t.duration:.2f}s")
        Elapsed: 0.10s

        >>> # Custom print function:
        >>> with Timer("Custom", print_fn=lambda msg: print(f">> {msg}")):
        ...     pass
        >> Custom completed in 0.00s
    """

    def __init__(
        self,
        name: str = "Operation",
        verbose: bool = True,
        print_fn: Callable[[str], None] | None = None,
    ):
        """Initialize the timer.

        Args:
            name: Description of the operation being timed
            verbose: Whether to print the duration when done
            print_fn: Custom print function (defaults to built-in print)
        """
        self.name = name
        self.verbose = verbose
        self.print_fn = print_fn or print
        self.start_time: float = 0.0
        self.end_time: float = 0.0
        self.duration: float = 0.0

    def __enter__(self) -> "Timer":
        """Start the timer."""
        self.start_time = time.perf_counter()
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Stop the timer and optionally print the duration."""
        self.end_time = time.perf_counter()
        self.duration = self.end_time - self.start_time

        if self.verbose:
            formatted = format_duration(self.duration)
            self.print_fn(f"{self.name} completed in {formatted}")

    def elapsed(self) -> float:
        """Get elapsed time since timer started (can be called during execution).

        Returns:
            Elapsed time in seconds
        """
        return time.perf_counter() - self.start_time

    def __repr__(self) -> str:
        """Return string representation."""
        if self.duration > 0:
            return f"Timer(name={self.name!r}, duration={self.duration:.2f}s)"
        return f"Timer(name={self.name!r})"
|