alberta-framework 0.2.2 (alberta_framework-0.2.2-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,113 @@
+"""Utility functions for the Alberta Framework."""
+
+# Experiment runner (no external deps)
+from alberta_framework.utils.experiments import (
+    AggregatedResults,
+    ExperimentConfig,
+    MetricSummary,
+    SingleRunResult,
+    aggregate_metrics,
+    get_final_performance,
+    get_metric_timeseries,
+    run_multi_seed_experiment,
+    run_single_experiment,
+)
+
+# Export utilities (no external deps for basic functionality)
+from alberta_framework.utils.export import (
+    export_to_csv,
+    export_to_json,
+    generate_latex_table,
+    generate_markdown_table,
+    save_experiment_report,
+)
+from alberta_framework.utils.metrics import (
+    compare_learners,
+    compute_cumulative_error,
+    compute_running_mean,
+    compute_tracking_error,
+    extract_metric,
+)
+
+__all__ = [
+    # Metrics
+    "compare_learners",
+    "compute_cumulative_error",
+    "compute_running_mean",
+    "compute_tracking_error",
+    "extract_metric",
+    # Experiments
+    "AggregatedResults",
+    "ExperimentConfig",
+    "MetricSummary",
+    "SingleRunResult",
+    "aggregate_metrics",
+    "get_final_performance",
+    "get_metric_timeseries",
+    "run_multi_seed_experiment",
+    "run_single_experiment",
+    # Export
+    "export_to_csv",
+    "export_to_json",
+    "generate_latex_table",
+    "generate_markdown_table",
+    "save_experiment_report",
+]
+
+# Optional: Statistics (requires scipy for full functionality)
+try:
+    from alberta_framework.utils.statistics import (
+        SignificanceResult,
+        StatisticalSummary,
+        bonferroni_correction,
+        bootstrap_ci,
+        cohens_d,
+        compute_statistics,
+        compute_timeseries_statistics,
+        holm_correction,
+        mann_whitney_comparison,
+        pairwise_comparisons,
+        ttest_comparison,
+        wilcoxon_comparison,
+    )
+
+    __all__ += [
+        "SignificanceResult",
+        "StatisticalSummary",
+        "bonferroni_correction",
+        "bootstrap_ci",
+        "cohens_d",
+        "compute_statistics",
+        "compute_timeseries_statistics",
+        "holm_correction",
+        "mann_whitney_comparison",
+        "pairwise_comparisons",
+        "ttest_comparison",
+        "wilcoxon_comparison",
+    ]
+except ImportError:
+    pass
+
+# Optional: Visualization (requires matplotlib)
+try:
+    from alberta_framework.utils.visualization import (
+        create_comparison_figure,
+        plot_final_performance_bars,
+        plot_hyperparameter_heatmap,
+        plot_learning_curves,
+        plot_step_size_evolution,
+        save_figure,
+        set_publication_style,
+    )
+
+    __all__ += [
+        "create_comparison_figure",
+        "plot_final_performance_bars",
+        "plot_hyperparameter_heatmap",
+        "plot_learning_curves",
+        "plot_step_size_evolution",
+        "save_figure",
+        "set_publication_style",
+    ]
+except ImportError:
+    pass
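
Note on the optional imports above: the statistics and visualization helpers are only re-exported when scipy or matplotlib import cleanly, so downstream code can feature-detect them rather than importing those libraries itself. A minimal sketch, assuming the wheel is installed; the hasattr-based check is an illustration, not part of the package:

import alberta_framework.utils as afu

# Optional names only exist on the module when the extra dependency imported cleanly.
has_stats = hasattr(afu, "compute_statistics")    # scipy-backed statistics helpers
has_plots = hasattr(afu, "plot_learning_curves")  # matplotlib-backed plotting helpers
print(f"statistics available: {has_stats}, visualization available: {has_plots}")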
@@ -0,0 +1,335 @@
+"""Multi-seed experiment runner for publication-quality analysis.
+
+Provides infrastructure for running experiments across multiple seeds
+with optional parallelization and aggregation of results.
+"""
+
+from collections.abc import Callable, Sequence
+from typing import Any, NamedTuple, cast
+
+import jax.random as jr
+import numpy as np
+from numpy.typing import NDArray
+
+from alberta_framework.core.learners import (
+    LinearLearner,
+    NormalizedLearnerState,
+    NormalizedLinearLearner,
+    metrics_to_dicts,
+    run_learning_loop,
+    run_normalized_learning_loop,
+)
+from alberta_framework.core.types import LearnerState
+from alberta_framework.streams.base import ScanStream
+
+
+class ExperimentConfig(NamedTuple):
+    """Configuration for a single experiment.
+
+    Attributes:
+        name: Human-readable name for this configuration
+        learner_factory: Callable that returns a fresh learner instance
+        stream_factory: Callable that returns a fresh stream instance
+        num_steps: Number of learning steps to run
+    """
+
+    name: str
+    learner_factory: Callable[[], LinearLearner | NormalizedLinearLearner]
+    stream_factory: Callable[[], ScanStream[Any]]
+    num_steps: int
+
+
+class SingleRunResult(NamedTuple):
+    """Result from a single experiment run.
+
+    Attributes:
+        config_name: Name of the configuration that was run
+        seed: Random seed used for this run
+        metrics_history: List of metric dictionaries from each step
+        final_state: Final learner state after training
+    """
+
+    config_name: str
+    seed: int
+    metrics_history: list[dict[str, float]]
+    final_state: LearnerState | NormalizedLearnerState
+
+
+class MetricSummary(NamedTuple):
+    """Summary statistics for a single metric.
+
+    Attributes:
+        mean: Mean across seeds
+        std: Standard deviation across seeds
+        min: Minimum value across seeds
+        max: Maximum value across seeds
+        n_seeds: Number of seeds
+        values: Raw values per seed
+    """
+
+    mean: float
+    std: float
+    min: float
+    max: float
+    n_seeds: int
+    values: NDArray[np.float64]
+
+
+class AggregatedResults(NamedTuple):
+    """Aggregated results across multiple seeds.
+
+    Attributes:
+        config_name: Name of the configuration
+        seeds: List of seeds used
+        metric_arrays: Dict mapping metric name to (n_seeds, n_steps) array
+        summary: Dict mapping metric name to MetricSummary (final values)
+    """
+
+    config_name: str
+    seeds: list[int]
+    metric_arrays: dict[str, NDArray[np.float64]]
+    summary: dict[str, MetricSummary]
+
+
+def run_single_experiment(
+    config: ExperimentConfig,
+    seed: int,
+) -> SingleRunResult:
+    """Run a single experiment with a given seed.
+
+    Args:
+        config: Experiment configuration
+        seed: Random seed for the stream
+
+    Returns:
+        SingleRunResult with metrics and final state
+    """
+    learner = config.learner_factory()
+    stream = config.stream_factory()
+    key = jr.key(seed)
+
+    final_state: LearnerState | NormalizedLearnerState
+    if isinstance(learner, NormalizedLinearLearner):
+        norm_result = run_normalized_learning_loop(
+            learner, stream, config.num_steps, key
+        )
+        final_state, metrics = cast(tuple[NormalizedLearnerState, Any], norm_result)
+        metrics_history = metrics_to_dicts(metrics, normalized=True)
+    else:
+        linear_result = run_learning_loop(learner, stream, config.num_steps, key)
+        final_state, metrics = cast(tuple[LearnerState, Any], linear_result)
+        metrics_history = metrics_to_dicts(metrics)
+
+    return SingleRunResult(
+        config_name=config.name,
+        seed=seed,
+        metrics_history=metrics_history,
+        final_state=final_state,
+    )
+
+
+def aggregate_metrics(results: list[SingleRunResult]) -> AggregatedResults:
+    """Aggregate results from multiple seeds into summary statistics.
+
+    Args:
+        results: List of SingleRunResult from multiple seeds
+
+    Returns:
+        AggregatedResults with aggregated metrics
+    """
+    if not results:
+        raise ValueError("Cannot aggregate empty results list")
+
+    config_name = results[0].config_name
+    seeds = [r.seed for r in results]
+
+    # Get all metric keys from first result
+    metric_keys = list(results[0].metrics_history[0].keys())
+
+    # Build metric arrays: (n_seeds, n_steps)
+    metric_arrays: dict[str, NDArray[np.float64]] = {}
+    for key in metric_keys:
+        arrays = []
+        for r in results:
+            values = np.array([m[key] for m in r.metrics_history])
+            arrays.append(values)
+        metric_arrays[key] = np.stack(arrays)
+
+    # Compute summary statistics for final values (mean of last 100 steps)
+    summary: dict[str, MetricSummary] = {}
+    n_seeds = len(results)
+    for key in metric_keys:
+        # Use mean of last 100 steps as the final value
+        window = min(100, metric_arrays[key].shape[1])
+        final_values = np.mean(metric_arrays[key][:, -window:], axis=1)
+        summary[key] = MetricSummary(
+            mean=float(np.mean(final_values)),
+            std=float(np.std(final_values)),
+            min=float(np.min(final_values)),
+            max=float(np.max(final_values)),
+            n_seeds=n_seeds,
+            values=final_values,
+        )
+
+    return AggregatedResults(
+        config_name=config_name,
+        seeds=seeds,
+        metric_arrays=metric_arrays,
+        summary=summary,
+    )
+
+
+def run_multi_seed_experiment(
+    configs: Sequence[ExperimentConfig],
+    seeds: int | Sequence[int] = 30,
+    parallel: bool = True,
+    n_jobs: int = -1,
+    show_progress: bool = True,
+) -> dict[str, AggregatedResults]:
+    """Run experiments across multiple seeds with optional parallelization.
+
+    Args:
+        configs: List of experiment configurations to run
+        seeds: Number of seeds (generates 0..n-1) or explicit list of seeds
+        parallel: Whether to use parallel execution (requires joblib)
+        n_jobs: Number of parallel jobs (-1 for all CPUs)
+        show_progress: Whether to show progress bar (requires tqdm)
+
+    Returns:
+        Dictionary mapping config name to AggregatedResults
+    """
+    # Convert seeds to list
+    if isinstance(seeds, int):
+        seed_list = list(range(seeds))
+    else:
+        seed_list = list(seeds)
+
+    # Build list of (config, seed) pairs
+    tasks: list[tuple[ExperimentConfig, int]] = []
+    for config in configs:
+        for seed in seed_list:
+            tasks.append((config, seed))
+
+    # Run experiments
+    if parallel:
+        try:
+            from joblib import Parallel, delayed
+
+            if show_progress:
+                try:
+                    from tqdm import tqdm
+
+                    results_list: list[SingleRunResult] = Parallel(n_jobs=n_jobs)(
+                        delayed(run_single_experiment)(config, seed)
+                        for config, seed in tqdm(tasks, desc="Running experiments")
+                    )
+                except ImportError:
+                    results_list = Parallel(n_jobs=n_jobs)(
+                        delayed(run_single_experiment)(config, seed) for config, seed in tasks
+                    )
+            else:
+                results_list = Parallel(n_jobs=n_jobs)(
+                    delayed(run_single_experiment)(config, seed) for config, seed in tasks
+                )
+        except ImportError:
+            # Fallback to sequential if joblib not available
+            results_list = _run_sequential(tasks, show_progress)
+    else:
+        results_list = _run_sequential(tasks, show_progress)
+
+    # Group results by config name
+    grouped: dict[str, list[SingleRunResult]] = {}
+    for result in results_list:
+        if result.config_name not in grouped:
+            grouped[result.config_name] = []
+        grouped[result.config_name].append(result)
+
+    # Aggregate each config
+    aggregated: dict[str, AggregatedResults] = {}
+    for config_name, group_results in grouped.items():
+        aggregated[config_name] = aggregate_metrics(group_results)
+
+    return aggregated
+
+
+def _run_sequential(
+    tasks: list[tuple[ExperimentConfig, int]],
+    show_progress: bool,
+) -> list[SingleRunResult]:
+    """Run experiments sequentially."""
+    if show_progress:
+        try:
+            from tqdm import tqdm
+
+            return [run_single_experiment(config, seed) for config, seed in tqdm(tasks)]
+        except ImportError:
+            pass
+    return [run_single_experiment(config, seed) for config, seed in tasks]
+
+
+def get_metric_timeseries(
+    results: AggregatedResults,
+    metric: str = "squared_error",
+) -> tuple[NDArray[np.float64], NDArray[np.float64], NDArray[np.float64]]:
+ """Get mean and standard deviation timeseries for a metric.
+
+    Args:
+        results: Aggregated results
+        metric: Name of the metric
+
+    Returns:
+        Tuple of (mean, lower_bound, upper_bound) arrays
+    """
+    arr = results.metric_arrays[metric]
+    mean = np.mean(arr, axis=0)
+    std = np.std(arr, axis=0)
+    return mean, mean - std, mean + std
+
+
+def get_final_performance(
+    results: dict[str, AggregatedResults],
+    metric: str = "squared_error",
+    window: int = 100,
+) -> dict[str, tuple[float, float]]:
+    """Get final performance (mean, std) for each config.
+
+    Args:
+        results: Dictionary of aggregated results
+        metric: Metric to evaluate
+        window: Number of final steps to average
+
+    Returns:
+        Dictionary mapping config name to (mean, std) tuple
+    """
+    performance: dict[str, tuple[float, float]] = {}
+    for name, agg in results.items():
+        arr = agg.metric_arrays[metric]
+        final_window = min(window, arr.shape[1])
+        final_means = np.mean(arr[:, -final_window:], axis=1)
+        performance[name] = (float(np.mean(final_means)), float(np.std(final_means)))
+    return performance
+
+
+def extract_hyperparameter_results(
+    results: dict[str, AggregatedResults],
+    metric: str = "squared_error",
+    param_extractor: Callable[[str], Any] | None = None,
+) -> dict[Any, tuple[float, float]]:
+    """Extract results indexed by hyperparameter value.
+
+    Useful for creating hyperparameter sensitivity plots.
+
+    Args:
+        results: Dictionary of aggregated results
+        metric: Metric to evaluate
+        param_extractor: Function to extract param value from config name
+
+    Returns:
+        Dictionary mapping param value to (mean, std) tuple
+    """
+    performance = get_final_performance(results, metric)
+
+    if param_extractor is None:
+        return {k: v for k, v in performance.items()}
+
+    return {param_extractor(name): perf for name, perf in performance.items()}
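
For context on how the pieces in experiments.py fit together, here is a small post-processing sketch that uses only names defined above (SingleRunResult, aggregate_metrics, get_metric_timeseries, all re-exported from alberta_framework.utils). It is an illustration, assuming the wheel and numpy are installed: the synthetic metrics and the final_state=None placeholder are not part of the package; in real use, run_multi_seed_experiment builds the same AggregatedResults directly from ExperimentConfig entries.

import numpy as np

from alberta_framework.utils import SingleRunResult, aggregate_metrics, get_metric_timeseries

# Fabricate five "seeds" worth of per-step metrics. final_state=None is a
# placeholder only; aggregate_metrics never reads final_state, even though the
# annotation expects a learner state.
rng = np.random.default_rng(0)
runs = [
    SingleRunResult(
        config_name="demo",
        seed=seed,
        metrics_history=[{"squared_error": float(e)} for e in rng.random(500)],
        final_state=None,
    )
    for seed in range(5)
]

agg = aggregate_metrics(runs)  # stacks per-seed metrics into (n_seeds, n_steps) arrays
mean, lower, upper = get_metric_timeseries(agg, "squared_error")  # mean with a +/- 1 std band
s = agg.summary["squared_error"]
print(f"final squared_error: {s.mean:.3f} +/- {s.std:.3f} over {s.n_seeds} seeds")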