alberta-framework 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,527 @@
+ """Statistical analysis utilities for publication-quality experiments.
+
+ Provides functions for computing confidence intervals, significance tests,
+ effect sizes, and multiple comparison corrections.
+ """
+
+ from typing import TYPE_CHECKING, NamedTuple
+
+ import numpy as np
+ from numpy.typing import NDArray
+
+ if TYPE_CHECKING:
+     from alberta_framework.utils.experiments import AggregatedResults
+
+
+ class StatisticalSummary(NamedTuple):
+     """Summary statistics for a set of values.
+
+     Attributes:
+         mean: Arithmetic mean
+         std: Standard deviation
+         sem: Standard error of the mean
+         ci_lower: Lower bound of confidence interval
+         ci_upper: Upper bound of confidence interval
+         median: Median value
+         iqr: Interquartile range
+         n_seeds: Number of samples
+     """
+
+     mean: float
+     std: float
+     sem: float
+     ci_lower: float
+     ci_upper: float
+     median: float
+     iqr: float
+     n_seeds: int
+
+
+ class SignificanceResult(NamedTuple):
+     """Result of a statistical significance test.
+
+     Attributes:
+         test_name: Name of the test performed
+         statistic: Test statistic value
+         p_value: P-value of the test
+         significant: Whether the result is significant at the given alpha
+         alpha: Significance level used
+         effect_size: Effect size (e.g., Cohen's d)
+         method_a: Name of first method
+         method_b: Name of second method
+     """
+
+     test_name: str
+     statistic: float
+     p_value: float
+     significant: bool
+     alpha: float
+     effect_size: float
+     method_a: str
+     method_b: str
+
+
+ def compute_statistics(
+     values: NDArray[np.float64] | list[float],
+     confidence_level: float = 0.95,
+ ) -> StatisticalSummary:
+     """Compute comprehensive statistics for a set of values.
+
+     Args:
+         values: Array of values (e.g., final performance across seeds)
+         confidence_level: Confidence level for CI (default 0.95)
+
+     Returns:
+         StatisticalSummary with all statistics
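+
+     Example (illustrative, with made-up per-seed scores):
+         >>> s = compute_statistics([0.80, 0.82, 0.78, 0.81, 0.79])
+         >>> s.n_seeds
+         5
+         >>> round(s.mean, 2)
+         0.8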
+     """
+     arr = np.asarray(values)
+     n = len(arr)
+
+     mean = float(np.mean(arr))
+     std = float(np.std(arr, ddof=1)) if n > 1 else 0.0
+     sem = std / np.sqrt(n) if n > 1 else 0.0
+     median = float(np.median(arr))
+     q75, q25 = np.percentile(arr, [75, 25])
+     iqr = float(q75 - q25)
+
+     # Compute confidence interval
+     try:
+         from scipy import stats
+
+         if n > 1:
+             t_value = float(stats.t.ppf((1 + confidence_level) / 2, n - 1))
+             margin = t_value * sem
+             ci_lower = mean - margin
+             ci_upper = mean + margin
+         else:
+             ci_lower = ci_upper = mean
+     except ImportError:
+         # Fallback without scipy: normal approximation with a fixed z-value
+         # (1.96 for the 95% level; any other level falls back to the 99% value, 2.576)
+         z_value = 1.96 if confidence_level == 0.95 else 2.576
+         margin = z_value * sem
+         ci_lower = mean - margin
+         ci_upper = mean + margin
+
+     return StatisticalSummary(
+         mean=mean,
+         std=std,
+         sem=sem,
+         ci_lower=float(ci_lower),
+         ci_upper=float(ci_upper),
+         median=median,
+         iqr=iqr,
+         n_seeds=n,
+     )
+
+
+ def compute_timeseries_statistics(
+     metric_array: NDArray[np.float64],
+     confidence_level: float = 0.95,
+ ) -> tuple[NDArray[np.float64], NDArray[np.float64], NDArray[np.float64]]:
+     """Compute mean and confidence intervals for timeseries data.
+
+     Args:
+         metric_array: Array of shape (n_seeds, n_steps)
+         confidence_level: Confidence level for CI
+
+     Returns:
+         Tuple of (mean, ci_lower, ci_upper) arrays of shape (n_steps,)
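+
+     Example (illustrative; an all-zero 3-seed x 4-step array):
+         >>> arr = np.zeros((3, 4))
+         >>> mean, lo, hi = compute_timeseries_statistics(arr)
+         >>> mean.shape
+         (4,)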
+     """
+     n_seeds = metric_array.shape[0]
+     mean = np.mean(metric_array, axis=0)
+     std = np.std(metric_array, axis=0, ddof=1)
+     sem = std / np.sqrt(n_seeds)
+
+     try:
+         from scipy import stats
+
+         t_value = stats.t.ppf((1 + confidence_level) / 2, n_seeds - 1)
+     except ImportError:
+         # Normal-approximation fallback (95% or 99% z-values only)
+         t_value = 1.96 if confidence_level == 0.95 else 2.576
+
+     margin = t_value * sem
+     ci_lower = mean - margin
+     ci_upper = mean + margin
+
+     return mean, ci_lower, ci_upper
+
+
+ def cohens_d(
+     values_a: NDArray[np.float64] | list[float],
+     values_b: NDArray[np.float64] | list[float],
+ ) -> float:
+     """Compute Cohen's d effect size.
+
+     Args:
+         values_a: Values for first group
+         values_b: Values for second group
+
+     Returns:
+         Cohen's d (positive means a > b)
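+
+     Example (illustrative: groups offset by exactly one pooled standard
+     deviation, so d is 1):
+         >>> cohens_d([1.0, 2.0, 3.0], [0.0, 1.0, 2.0])
+         1.0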
+     """
+     a = np.asarray(values_a)
+     b = np.asarray(values_b)
+
+     mean_a = np.mean(a)
+     mean_b = np.mean(b)
+
+     n_a = len(a)
+     n_b = len(b)
+
+     # Pooled standard deviation
+     var_a = np.var(a, ddof=1) if n_a > 1 else 0.0
+     var_b = np.var(b, ddof=1) if n_b > 1 else 0.0
+
+     pooled_std = np.sqrt(((n_a - 1) * var_a + (n_b - 1) * var_b) / (n_a + n_b - 2))
+
+     if pooled_std == 0:
+         return 0.0
+
+     return float((mean_a - mean_b) / pooled_std)
+
+
+ def ttest_comparison(
+     values_a: NDArray[np.float64] | list[float],
+     values_b: NDArray[np.float64] | list[float],
+     paired: bool = True,
+     alpha: float = 0.05,
+     method_a: str = "A",
+     method_b: str = "B",
+ ) -> SignificanceResult:
+     """Perform t-test comparison between two methods.
+
+     Args:
+         values_a: Values for first method
+         values_b: Values for second method
+         paired: Whether to use paired t-test (default True for same seeds)
+         alpha: Significance level
+         method_a: Name of first method
+         method_b: Name of second method
+
+     Returns:
+         SignificanceResult with test results
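+
+     Example (illustrative; ``scores_a`` and ``scores_b`` stand in for
+     per-seed results of two methods run on the same seeds):
+
+         result = ttest_comparison(scores_a, scores_b, paired=True)
+         if result.significant:
+             print(f"p={result.p_value:.3g}, d={result.effect_size:.2f}")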
+     """
+     a = np.asarray(values_a)
+     b = np.asarray(values_b)
+
+     try:
+         from scipy import stats
+
+         if paired:
+             result = stats.ttest_rel(a, b)
+             test_name = "paired t-test"
+         else:
+             result = stats.ttest_ind(a, b)
+             test_name = "independent t-test"
+         # The scipy result is indexable as (statistic, pvalue)
+         stat_val = float(result[0])
+         p_val = float(result[1])
+     except ImportError:
+         raise ImportError(
+             "scipy is required for t-test. Install with: pip install scipy"
+         ) from None
+
+     effect = cohens_d(a, b)
+
+     return SignificanceResult(
+         test_name=test_name,
+         statistic=stat_val,
+         p_value=p_val,
+         significant=p_val < alpha,
+         alpha=alpha,
+         effect_size=effect,
+         method_a=method_a,
+         method_b=method_b,
+     )
+
+
+ def mann_whitney_comparison(
+     values_a: NDArray[np.float64] | list[float],
+     values_b: NDArray[np.float64] | list[float],
+     alpha: float = 0.05,
+     method_a: str = "A",
+     method_b: str = "B",
+ ) -> SignificanceResult:
+     """Perform Mann-Whitney U test (non-parametric).
+
+     Args:
+         values_a: Values for first method
+         values_b: Values for second method
+         alpha: Significance level
+         method_a: Name of first method
+         method_b: Name of second method
+
+     Returns:
+         SignificanceResult with test results
+     """
+     a = np.asarray(values_a)
+     b = np.asarray(values_b)
+
+     try:
+         from scipy import stats
+
+         result = stats.mannwhitneyu(a, b, alternative="two-sided")
+         # The scipy result is indexable as (statistic, pvalue)
+         stat_val = float(result[0])
+         p_val = float(result[1])
+     except ImportError:
+         raise ImportError(
+             "scipy is required for Mann-Whitney test. Install with: pip install scipy"
+         ) from None
+
+     # Rank-biserial correlation as effect size: r = 1 - 2U / (n_a * n_b)
+     n_a, n_b = len(a), len(b)
+     r = 1 - (2 * stat_val) / (n_a * n_b)
+
+     return SignificanceResult(
+         test_name="Mann-Whitney U",
+         statistic=stat_val,
+         p_value=p_val,
+         significant=p_val < alpha,
+         alpha=alpha,
+         effect_size=r,
+         method_a=method_a,
+         method_b=method_b,
+     )
+
+
+ def wilcoxon_comparison(
+     values_a: NDArray[np.float64] | list[float],
+     values_b: NDArray[np.float64] | list[float],
+     alpha: float = 0.05,
+     method_a: str = "A",
+     method_b: str = "B",
+ ) -> SignificanceResult:
+     """Perform Wilcoxon signed-rank test (paired non-parametric).
+
+     Args:
+         values_a: Values for first method
+         values_b: Values for second method
+         alpha: Significance level
+         method_a: Name of first method
+         method_b: Name of second method
+
+     Returns:
+         SignificanceResult with test results
+     """
+     a = np.asarray(values_a)
+     b = np.asarray(values_b)
+
+     try:
+         from scipy import stats
+
+         result = stats.wilcoxon(a, b, alternative="two-sided")
+         # The scipy result is indexable as (statistic, pvalue)
+         stat_val = float(result[0])
+         p_val = float(result[1])
+     except ImportError:
+         raise ImportError(
+             "scipy is required for Wilcoxon test. Install with: pip install scipy"
+         ) from None
+
+     effect = cohens_d(a, b)
+
+     return SignificanceResult(
+         test_name="Wilcoxon signed-rank",
+         statistic=stat_val,
+         p_value=p_val,
+         significant=p_val < alpha,
+         alpha=alpha,
+         effect_size=effect,
+         method_a=method_a,
+         method_b=method_b,
+     )
+
+
+ def bonferroni_correction(
+     p_values: list[float],
+     alpha: float = 0.05,
+ ) -> tuple[list[bool], float]:
+     """Apply Bonferroni correction for multiple comparisons.
+
+     Args:
+         p_values: List of p-values from multiple tests
+         alpha: Family-wise significance level
+
+     Returns:
+         Tuple of (list of significant booleans, corrected alpha)
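+
+     Example (illustrative p-values from three tests):
+         >>> sig, corrected = bonferroni_correction([0.01, 0.04, 0.30], alpha=0.05)
+         >>> round(corrected, 4)
+         0.0167
+         >>> sig
+         [True, False, False]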
+     """
+     n_tests = len(p_values)
+     corrected_alpha = alpha / n_tests
+     significant = [p < corrected_alpha for p in p_values]
+     return significant, corrected_alpha
+
+
+ def holm_correction(
+     p_values: list[float],
+     alpha: float = 0.05,
+ ) -> list[bool]:
+     """Apply Holm-Bonferroni step-down correction.
+
+     More powerful than Bonferroni while still controlling the family-wise
+     error rate (FWER).
+
+     Args:
+         p_values: List of p-values from multiple tests
+         alpha: Family-wise significance level
+
+     Returns:
+         List of significant booleans
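+
+     Example (illustrative: the smallest p-value is tested at alpha/3, the
+     next at alpha/2, the largest at alpha; plain Bonferroni would reject
+     only the smallest of these):
+         >>> holm_correction([0.02, 0.01, 0.30], alpha=0.05)
+         [True, True, False]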
+     """
+     n_tests = len(p_values)
+
+     # Sort p-values and track original indices
+     sorted_indices = np.argsort(p_values)
+     sorted_p = [p_values[i] for i in sorted_indices]
+
+     # Apply Holm correction
+     significant_sorted = []
+     for i, p in enumerate(sorted_p):
+         corrected_alpha = alpha / (n_tests - i)
+         if p < corrected_alpha:
+             significant_sorted.append(True)
+         else:
+             # Once we fail to reject, all subsequent are not significant
+             significant_sorted.extend([False] * (n_tests - i))
+             break
+
+     # Restore original order
+     significant = [False] * n_tests
+     for orig_idx, sig in zip(sorted_indices, significant_sorted, strict=False):
+         significant[orig_idx] = sig
+
+     return significant
+
+
+ def pairwise_comparisons(
+     results: "dict[str, AggregatedResults]",  # noqa: F821
+     metric: str = "squared_error",
+     test: str = "ttest",
+     correction: str = "bonferroni",
+     alpha: float = 0.05,
+     window: int = 100,
+ ) -> dict[tuple[str, str], SignificanceResult]:
+     """Perform all pairwise comparisons between methods.
+
+     Args:
+         results: Dictionary mapping config name to AggregatedResults
+         metric: Metric to compare
+         test: Test to use ("ttest", "mann_whitney", or "wilcoxon")
+         correction: Multiple comparison correction ("bonferroni" or "holm")
+         alpha: Significance level
+         window: Number of final steps to average
+
+     Returns:
+         Dictionary mapping (method_a, method_b) to SignificanceResult
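+
+     Example (illustrative; assumes ``results`` was built by the framework's
+     experiment aggregation utilities):
+
+         comparisons = pairwise_comparisons(results, test="wilcoxon", correction="holm")
+         for (a, b), res in comparisons.items():
+             flag = " *" if res.significant else ""
+             print(f"{a} vs {b}: p={res.p_value:.3g}{flag}")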
+     """
+     from alberta_framework.utils.experiments import AggregatedResults
+
+     names = list(results.keys())
+     n = len(names)
+
+     if n < 2:
+         return {}
+
+     # Extract final values for each method
+     final_values: dict[str, NDArray[np.float64]] = {}
+     for name, agg in results.items():
+         if not isinstance(agg, AggregatedResults):
+             raise TypeError(f"Expected AggregatedResults, got {type(agg)}")
+         arr = agg.metric_arrays[metric]
+         final_window = min(window, arr.shape[1])
+         final_values[name] = np.mean(arr[:, -final_window:], axis=1)
+
+     if test not in ("ttest", "mann_whitney", "wilcoxon"):
+         raise ValueError(f"Unknown test: {test}")
+
+     # Perform all pairwise comparisons
+     comparisons: dict[tuple[str, str], SignificanceResult] = {}
+     p_values: list[float] = []
+
+     for i in range(n):
+         for j in range(i + 1, n):
+             name_a, name_b = names[i], names[j]
+             values_a = final_values[name_a]
+             values_b = final_values[name_b]
+
+             if test == "ttest":
+                 result = ttest_comparison(
+                     values_a, values_b, paired=True, alpha=alpha,
+                     method_a=name_a, method_b=name_b,
+                 )
+             elif test == "mann_whitney":
+                 result = mann_whitney_comparison(
+                     values_a, values_b, alpha=alpha,
+                     method_a=name_a, method_b=name_b,
+                 )
+             else:  # wilcoxon
+                 result = wilcoxon_comparison(
+                     values_a, values_b, alpha=alpha,
+                     method_a=name_a, method_b=name_b,
+                 )
+
+             comparisons[(name_a, name_b)] = result
+             p_values.append(result.p_value)
+
+     # Apply multiple comparison correction
+     if correction == "bonferroni":
+         significant_list, _ = bonferroni_correction(p_values, alpha)
+     elif correction == "holm":
+         significant_list = holm_correction(p_values, alpha)
+     else:
+         raise ValueError(f"Unknown correction: {correction}")
+
+     # Update significance based on correction
+     corrected_comparisons: dict[tuple[str, str], SignificanceResult] = {}
+     for (key, result), sig in zip(comparisons.items(), significant_list, strict=False):
+         corrected_comparisons[key] = SignificanceResult(
+             test_name=f"{result.test_name} ({correction})",
+             statistic=result.statistic,
+             p_value=result.p_value,
+             significant=sig,
+             alpha=alpha,
+             effect_size=result.effect_size,
+             method_a=result.method_a,
+             method_b=result.method_b,
+         )
+
+     return corrected_comparisons
+
+
+ def bootstrap_ci(
+     values: NDArray[np.float64] | list[float],
+     statistic: str = "mean",
+     confidence_level: float = 0.95,
+     n_bootstrap: int = 10000,
+     seed: int = 42,
+ ) -> tuple[float, float, float]:
+     """Compute bootstrap confidence interval.
+
+     Args:
+         values: Array of values
+         statistic: Statistic to bootstrap ("mean" or "median")
+         confidence_level: Confidence level
+         n_bootstrap: Number of bootstrap samples
+         seed: Random seed
+
+     Returns:
+         Tuple of (point_estimate, ci_lower, ci_upper)
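+
+     Example (illustrative; exact bounds vary with the data and seed):
+
+         est, lo, hi = bootstrap_ci([0.78, 0.81, 0.79, 0.84, 0.80], statistic="median")
+         assert lo <= est <= hi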
+     """
+     arr = np.asarray(values)
+     rng = np.random.default_rng(seed)
+
+     stat_func = np.mean if statistic == "mean" else np.median
+     point_estimate = float(stat_func(arr))
+
+     # Generate bootstrap samples
+     bootstrap_stats_list: list[float] = []
+     for _ in range(n_bootstrap):
+         sample = rng.choice(arr, size=len(arr), replace=True)
+         bootstrap_stats_list.append(float(stat_func(sample)))
+
+     bootstrap_stats = np.array(bootstrap_stats_list)
+
+     # Percentile method
+     lower_percentile = (1 - confidence_level) / 2 * 100
+     upper_percentile = (1 + confidence_level) / 2 * 100
+     ci_lower = float(np.percentile(bootstrap_stats, lower_percentile))
+     ci_upper = float(np.percentile(bootstrap_stats, upper_percentile))
+
+     return point_estimate, ci_lower, ci_upper
@@ -0,0 +1,144 @@
+ """Timing utilities for measuring and reporting experiment durations.
+
+ This module provides a simple Timer context manager for measuring execution time
+ and formatting durations in a human-readable format.
+
+ Examples
+ --------
+ ```python
+ from alberta_framework.utils.timing import Timer
+
+ with Timer("Training"):
+     # run training code
+     pass
+ # Output: Training completed in 1.23s
+
+ # Or capture the duration:
+ with Timer("Experiment") as t:
+     # run experiment
+     pass
+ print(f"Took {t.duration:.2f} seconds")
+ ```
+ """
+
+ import time
+ from collections.abc import Callable
+ from types import TracebackType
+
+
+ def format_duration(seconds: float) -> str:
+     """Format a duration in seconds as a human-readable string.
+
+     Args:
+         seconds: Duration in seconds
+
+     Returns:
+         Formatted string like "1.23s", "2m 30.50s", or "1h 5m 30.00s"
+
+     Examples
+     --------
+     ```python
+     format_duration(0.5)   # Returns: '0.50s'
+     format_duration(90.5)  # Returns: '1m 30.50s'
+     format_duration(3665)  # Returns: '1h 1m 5.00s'
+     ```
+     """
+     if seconds < 60:
+         return f"{seconds:.2f}s"
+     elif seconds < 3600:
+         minutes = int(seconds // 60)
+         secs = seconds % 60
+         return f"{minutes}m {secs:.2f}s"
+     else:
+         hours = int(seconds // 3600)
+         remaining = seconds % 3600
+         minutes = int(remaining // 60)
+         secs = remaining % 60
+         return f"{hours}h {minutes}m {secs:.2f}s"
+
+
+ class Timer:
+     """Context manager for timing code execution.
+
+     Measures wall-clock time for a block of code and optionally prints
+     the duration when the block completes.
+
+     Attributes:
+         name: Description of what is being timed
+         duration: Elapsed time in seconds (available after context exits)
+         start_time: Timestamp when timing started
+         end_time: Timestamp when timing ended
+
+     Examples
+     --------
+     ```python
+     with Timer("Training loop"):
+         for i in range(1000):
+             pass
+     # Output: Training loop completed in 0.01s
+
+     # Silent timing (no print):
+     with Timer("Silent", verbose=False) as t:
+         time.sleep(0.1)
+     print(f"Elapsed: {t.duration:.2f}s")
+     # Output: Elapsed: 0.10s
+
+     # Custom print function:
+     with Timer("Custom", print_fn=lambda msg: print(f">> {msg}")):
+         pass
+     # Output: >> Custom completed in 0.00s
+     ```
+     """
+
+     def __init__(
+         self,
+         name: str = "Operation",
+         verbose: bool = True,
+         print_fn: Callable[[str], None] | None = None,
+     ):
+         """Initialize the timer.
+
+         Args:
+             name: Description of the operation being timed
+             verbose: Whether to print the duration when done
+             print_fn: Custom print function (defaults to built-in print)
+         """
+         self.name = name
+         self.verbose = verbose
+         self.print_fn = print_fn or print
+         self.start_time: float = 0.0
+         self.end_time: float = 0.0
+         self.duration: float = 0.0
+
+     def __enter__(self) -> "Timer":
+         """Start the timer."""
+         self.start_time = time.perf_counter()
+         return self
+
+     def __exit__(
+         self,
+         exc_type: type[BaseException] | None,
+         exc_val: BaseException | None,
+         exc_tb: TracebackType | None,
+     ) -> None:
+         """Stop the timer and optionally print the duration."""
+         self.end_time = time.perf_counter()
+         self.duration = self.end_time - self.start_time
+
+         if self.verbose:
+             formatted = format_duration(self.duration)
+             self.print_fn(f"{self.name} completed in {formatted}")
+
+     def elapsed(self) -> float:
+         """Get elapsed time since timer started (can be called during execution).
+
+         Returns:
+             Elapsed time in seconds
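+
+         Examples
+         --------
+         ```python
+         # Illustrative: report progress from inside the timed block
+         with Timer("Sweep", verbose=False) as t:
+             for step in range(3):
+                 print(f"step {step}: {t.elapsed():.2f}s elapsed")
+         ```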
+         """
+         return time.perf_counter() - self.start_time
+
+     def __repr__(self) -> str:
+         """Return string representation."""
+         if self.duration > 0:
+             return f"Timer(name={self.name!r}, duration={self.duration:.2f}s)"
+         return f"Timer(name={self.name!r})"