alberta-framework 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,113 @@
+ """Utility functions for the Alberta Framework."""
+
+ # Experiment runner (no external deps)
+ from alberta_framework.utils.experiments import (
+     AggregatedResults,
+     ExperimentConfig,
+     MetricSummary,
+     SingleRunResult,
+     aggregate_metrics,
+     get_final_performance,
+     get_metric_timeseries,
+     run_multi_seed_experiment,
+     run_single_experiment,
+ )
+
+ # Export utilities (no external deps for basic functionality)
+ from alberta_framework.utils.export import (
+     export_to_csv,
+     export_to_json,
+     generate_latex_table,
+     generate_markdown_table,
+     save_experiment_report,
+ )
+ from alberta_framework.utils.metrics import (
+     compare_learners,
+     compute_cumulative_error,
+     compute_running_mean,
+     compute_tracking_error,
+     extract_metric,
+ )
+
+ __all__ = [
+     # Metrics
+     "compare_learners",
+     "compute_cumulative_error",
+     "compute_running_mean",
+     "compute_tracking_error",
+     "extract_metric",
+     # Experiments
+     "AggregatedResults",
+     "ExperimentConfig",
+     "MetricSummary",
+     "SingleRunResult",
+     "aggregate_metrics",
+     "get_final_performance",
+     "get_metric_timeseries",
+     "run_multi_seed_experiment",
+     "run_single_experiment",
+     # Export
+     "export_to_csv",
+     "export_to_json",
+     "generate_latex_table",
+     "generate_markdown_table",
+     "save_experiment_report",
+ ]
+
+ # Optional: Statistics (requires scipy for full functionality)
+ try:
+     from alberta_framework.utils.statistics import (
+         SignificanceResult,
+         StatisticalSummary,
+         bonferroni_correction,
+         bootstrap_ci,
+         cohens_d,
+         compute_statistics,
+         compute_timeseries_statistics,
+         holm_correction,
+         mann_whitney_comparison,
+         pairwise_comparisons,
+         ttest_comparison,
+         wilcoxon_comparison,
+     )
+
+     __all__ += [
+         "SignificanceResult",
+         "StatisticalSummary",
+         "bonferroni_correction",
+         "bootstrap_ci",
+         "cohens_d",
+         "compute_statistics",
+         "compute_timeseries_statistics",
+         "holm_correction",
+         "mann_whitney_comparison",
+         "pairwise_comparisons",
+         "ttest_comparison",
+         "wilcoxon_comparison",
+     ]
+ except ImportError:
+     pass
+
+ # Optional: Visualization (requires matplotlib)
+ try:
+     from alberta_framework.utils.visualization import (
+         create_comparison_figure,
+         plot_final_performance_bars,
+         plot_hyperparameter_heatmap,
+         plot_learning_curves,
+         plot_step_size_evolution,
+         save_figure,
+         set_publication_style,
+     )
+
+     __all__ += [
+         "create_comparison_figure",
+         "plot_final_performance_bars",
+         "plot_hyperparameter_heatmap",
+         "plot_learning_curves",
+         "plot_step_size_evolution",
+         "save_figure",
+         "set_publication_style",
+     ]
+ except ImportError:
+     pass
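The module above re-exports the core experiment, metrics, and export helpers unconditionally and appends the scipy-backed statistics and matplotlib-backed visualization names to __all__ only when those imports succeed. A minimal sketch of how that behaves at import time, assuming the wheel above is installed (the attribute checks are illustrative, not package API):

# Core helpers are always importable; the optional groups may be absent.
from alberta_framework import utils

print(utils.compute_running_mean)                     # always exported
has_stats = hasattr(utils, "bootstrap_ci")            # False without scipy
has_plots = hasattr(utils, "plot_learning_curves")    # False without matplotlib
print(f"statistics: {has_stats}, visualization: {has_plots}")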
@@ -0,0 +1,334 @@
+ """Multi-seed experiment runner for publication-quality analysis.
+
+ Provides infrastructure for running experiments across multiple seeds
+ with optional parallelization and aggregation of results.
+ """
+
+ from collections.abc import Callable, Sequence
+ from typing import Any, NamedTuple, cast
+
+ import jax.random as jr
+ import numpy as np
+ from numpy.typing import NDArray
+
+ from alberta_framework.core.learners import (
+     LinearLearner,
+     NormalizedLearnerState,
+     NormalizedLinearLearner,
+     metrics_to_dicts,
+     run_learning_loop,
+     run_normalized_learning_loop,
+ )
+ from alberta_framework.core.types import LearnerState
+ from alberta_framework.streams.base import ScanStream
+
+
+ class ExperimentConfig(NamedTuple):
+     """Configuration for a single experiment.
+
+     Attributes:
+         name: Human-readable name for this configuration
+         learner_factory: Callable that returns a fresh learner instance
+         stream_factory: Callable that returns a fresh stream instance
+         num_steps: Number of learning steps to run
+     """
+
+     name: str
+     learner_factory: Callable[[], LinearLearner | NormalizedLinearLearner]
+     stream_factory: Callable[[], ScanStream[Any]]
+     num_steps: int
+
+
+ class SingleRunResult(NamedTuple):
+     """Result from a single experiment run.
+
+     Attributes:
+         config_name: Name of the configuration that was run
+         seed: Random seed used for this run
+         metrics_history: List of metric dictionaries from each step
+         final_state: Final learner state after training
+     """
+
+     config_name: str
+     seed: int
+     metrics_history: list[dict[str, float]]
+     final_state: LearnerState | NormalizedLearnerState
+
+
+ class MetricSummary(NamedTuple):
+     """Summary statistics for a single metric.
+
+     Attributes:
+         mean: Mean across seeds
+         std: Standard deviation across seeds
+         min: Minimum value across seeds
+         max: Maximum value across seeds
+         n_seeds: Number of seeds
+         values: Raw values per seed
+     """
+
+     mean: float
+     std: float
+     min: float
+     max: float
+     n_seeds: int
+     values: NDArray[np.float64]
+
+
+ class AggregatedResults(NamedTuple):
+     """Aggregated results across multiple seeds.
+
+     Attributes:
+         config_name: Name of the configuration
+         seeds: List of seeds used
+         metric_arrays: Dict mapping metric name to (n_seeds, n_steps) array
+         summary: Dict mapping metric name to MetricSummary (final values)
+     """
+
+     config_name: str
+     seeds: list[int]
+     metric_arrays: dict[str, NDArray[np.float64]]
+     summary: dict[str, MetricSummary]
+
+
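Since ExperimentConfig is a plain NamedTuple, a configuration is just a name, two zero-argument factories, and a step count. A hedged sketch of constructing one follows: make_learner and make_stream are hypothetical user-supplied callables standing in for whatever builds a LinearLearner (or NormalizedLinearLearner) and a ScanStream; they are not part of this package.

from alberta_framework.utils.experiments import ExperimentConfig

config = ExperimentConfig(
    name="lms_baseline",
    learner_factory=lambda: make_learner(),  # hypothetical: returns a fresh learner
    stream_factory=lambda: make_stream(),    # hypothetical: returns a fresh stream
    num_steps=10_000,
)

Fresh factories matter because run_single_experiment below builds a new learner and stream for every (config, seed) pair.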
+ def run_single_experiment(
+     config: ExperimentConfig,
+     seed: int,
+ ) -> SingleRunResult:
+     """Run a single experiment with a given seed.
+
+     Args:
+         config: Experiment configuration
+         seed: Random seed for the stream
+
+     Returns:
+         SingleRunResult with metrics and final state
+     """
+     learner = config.learner_factory()
+     stream = config.stream_factory()
+     key = jr.key(seed)
+
+     final_state: LearnerState | NormalizedLearnerState
+     if isinstance(learner, NormalizedLinearLearner):
+         final_state, metrics = run_normalized_learning_loop(
+             learner, stream, config.num_steps, key
+         )
+         metrics_history = metrics_to_dicts(metrics, normalized=True)
+     else:
+         result = run_learning_loop(learner, stream, config.num_steps, key)
+         final_state, metrics = cast(tuple[LearnerState, Any], result)
+         metrics_history = metrics_to_dicts(metrics)
+
+     return SingleRunResult(
+         config_name=config.name,
+         seed=seed,
+         metrics_history=metrics_history,
+         final_state=final_state,
+     )
+
+
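A sketch of a single run, assuming a config like the one above with working factories; the seed is passed to jax.random.key, so repeating a (config, seed) pair reproduces the run.

from alberta_framework.utils.experiments import run_single_experiment

result = run_single_experiment(config, seed=0)
print(result.config_name, result.seed)   # "lms_baseline", 0
print(len(result.metrics_history))       # one dict of floats per step
print(result.metrics_history[-1])        # metrics from the final step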
+ def aggregate_metrics(results: list[SingleRunResult]) -> AggregatedResults:
+     """Aggregate results from multiple seeds into summary statistics.
+
+     Args:
+         results: List of SingleRunResult from multiple seeds
+
+     Returns:
+         AggregatedResults with aggregated metrics
+     """
+     if not results:
+         raise ValueError("Cannot aggregate empty results list")
+
+     config_name = results[0].config_name
+     seeds = [r.seed for r in results]
+
+     # Get all metric keys from first result
+     metric_keys = list(results[0].metrics_history[0].keys())
+
+     # Build metric arrays: (n_seeds, n_steps)
+     metric_arrays: dict[str, NDArray[np.float64]] = {}
+     for key in metric_keys:
+         arrays = []
+         for r in results:
+             values = np.array([m[key] for m in r.metrics_history])
+             arrays.append(values)
+         metric_arrays[key] = np.stack(arrays)
+
+     # Compute summary statistics for final values (mean of last 100 steps)
+     summary: dict[str, MetricSummary] = {}
+     n_seeds = len(results)
+     for key in metric_keys:
+         # Use mean of last 100 steps as the final value
+         window = min(100, metric_arrays[key].shape[1])
+         final_values = np.mean(metric_arrays[key][:, -window:], axis=1)
+         summary[key] = MetricSummary(
+             mean=float(np.mean(final_values)),
+             std=float(np.std(final_values)),
+             min=float(np.min(final_values)),
+             max=float(np.max(final_values)),
+             n_seeds=n_seeds,
+             values=final_values,
+         )
+
+     return AggregatedResults(
+         config_name=config_name,
+         seeds=seeds,
+         metric_arrays=metric_arrays,
+         summary=summary,
+     )
+
+
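A self-contained sketch of what aggregate_metrics returns, using two tiny synthetic runs; final_state plays no role in aggregation, so None stands in for a real learner state here.

from alberta_framework.utils.experiments import SingleRunResult, aggregate_metrics

runs = [
    SingleRunResult(
        config_name="demo",
        seed=s,
        metrics_history=[{"squared_error": float(s + t)} for t in range(5)],
        final_state=None,  # placeholder; aggregation never touches it
    )
    for s in (0, 1)
]
agg = aggregate_metrics(runs)
print(agg.metric_arrays["squared_error"].shape)  # (2, 5): (n_seeds, n_steps)
print(agg.summary["squared_error"].mean)         # mean of the per-seed final-window values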
+ def run_multi_seed_experiment(
+     configs: Sequence[ExperimentConfig],
+     seeds: int | Sequence[int] = 30,
+     parallel: bool = True,
+     n_jobs: int = -1,
+     show_progress: bool = True,
+ ) -> dict[str, AggregatedResults]:
+     """Run experiments across multiple seeds with optional parallelization.
+
+     Args:
+         configs: List of experiment configurations to run
+         seeds: Number of seeds (generates 0..n-1) or explicit list of seeds
+         parallel: Whether to use parallel execution (requires joblib)
+         n_jobs: Number of parallel jobs (-1 for all CPUs)
+         show_progress: Whether to show progress bar (requires tqdm)
+
+     Returns:
+         Dictionary mapping config name to AggregatedResults
+     """
+     # Convert seeds to list
+     if isinstance(seeds, int):
+         seed_list = list(range(seeds))
+     else:
+         seed_list = list(seeds)
+
+     # Build list of (config, seed) pairs
+     tasks: list[tuple[ExperimentConfig, int]] = []
+     for config in configs:
+         for seed in seed_list:
+             tasks.append((config, seed))
+
+     # Run experiments
+     if parallel:
+         try:
+             from joblib import Parallel, delayed
+
+             if show_progress:
+                 try:
+                     from tqdm import tqdm
+
+                     results_list: list[SingleRunResult] = Parallel(n_jobs=n_jobs)(
+                         delayed(run_single_experiment)(config, seed)
+                         for config, seed in tqdm(tasks, desc="Running experiments")
+                     )
+                 except ImportError:
+                     results_list = Parallel(n_jobs=n_jobs)(
+                         delayed(run_single_experiment)(config, seed) for config, seed in tasks
+                     )
+             else:
+                 results_list = Parallel(n_jobs=n_jobs)(
+                     delayed(run_single_experiment)(config, seed) for config, seed in tasks
+                 )
+         except ImportError:
+             # Fallback to sequential if joblib not available
+             results_list = _run_sequential(tasks, show_progress)
+     else:
+         results_list = _run_sequential(tasks, show_progress)
+
+     # Group results by config name
+     grouped: dict[str, list[SingleRunResult]] = {}
+     for result in results_list:
+         if result.config_name not in grouped:
+             grouped[result.config_name] = []
+         grouped[result.config_name].append(result)
+
+     # Aggregate each config
+     aggregated: dict[str, AggregatedResults] = {}
+     for config_name, group_results in grouped.items():
+         aggregated[config_name] = aggregate_metrics(group_results)
+
+     return aggregated
+
+
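A sketch of the top-level entry point, assuming one or more configs like the sketch above: an integer seeds argument expands to seeds 0..n-1, and parallel=True silently falls back to the sequential path when joblib is missing.

from alberta_framework.utils.experiments import run_multi_seed_experiment

results = run_multi_seed_experiment([config], seeds=10, parallel=False)
agg = results["lms_baseline"]   # keyed by ExperimentConfig.name
print(agg.seeds)                # [0, 1, ..., 9]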
+ def _run_sequential(
+     tasks: list[tuple[ExperimentConfig, int]],
+     show_progress: bool,
+ ) -> list[SingleRunResult]:
+     """Run experiments sequentially."""
+     if show_progress:
+         try:
+             from tqdm import tqdm
+
+             return [run_single_experiment(config, seed) for config, seed in tqdm(tasks)]
+         except ImportError:
+             pass
+     return [run_single_experiment(config, seed) for config, seed in tasks]
+
+
+ def get_metric_timeseries(
+     results: AggregatedResults,
+     metric: str = "squared_error",
+ ) -> tuple[NDArray[np.float64], NDArray[np.float64], NDArray[np.float64]]:
+     """Get the mean timeseries and a one-standard-deviation band for a metric.
+
+     Args:
+         results: Aggregated results
+         metric: Name of the metric
+
+     Returns:
+         Tuple of (mean, lower_bound, upper_bound) arrays, where the bounds are
+         the mean minus and plus one standard deviation across seeds
+     """
+     arr = results.metric_arrays[metric]
+     mean = np.mean(arr, axis=0)
+     std = np.std(arr, axis=0)
+     return mean, mean - std, mean + std
+
+
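get_metric_timeseries pairs naturally with a shaded-band learning-curve plot. A sketch, assuming agg is an AggregatedResults as in the earlier sketches; the plotting call is indicative only, since plotting helpers live in the optional visualization module.

import numpy as np
from alberta_framework.utils.experiments import get_metric_timeseries

mean, lower, upper = get_metric_timeseries(agg, metric="squared_error")
steps = np.arange(mean.shape[0])
# e.g. plt.plot(steps, mean); plt.fill_between(steps, lower, upper, alpha=0.3)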
+ def get_final_performance(
+     results: dict[str, AggregatedResults],
+     metric: str = "squared_error",
+     window: int = 100,
+ ) -> dict[str, tuple[float, float]]:
+     """Get final performance (mean, std) for each config.
+
+     Args:
+         results: Dictionary of aggregated results
+         metric: Metric to evaluate
+         window: Number of final steps to average
+
+     Returns:
+         Dictionary mapping config name to (mean, std) tuple
+     """
+     performance: dict[str, tuple[float, float]] = {}
+     for name, agg in results.items():
+         arr = agg.metric_arrays[metric]
+         final_window = min(window, arr.shape[1])
+         final_means = np.mean(arr[:, -final_window:], axis=1)
+         performance[name] = (float(np.mean(final_means)), float(np.std(final_means)))
+     return performance
+
+
+ def extract_hyperparameter_results(
+     results: dict[str, AggregatedResults],
+     metric: str = "squared_error",
+     param_extractor: Callable[[str], Any] | None = None,
+ ) -> dict[Any, tuple[float, float]]:
+     """Extract results indexed by hyperparameter value.
+
+     Useful for creating hyperparameter sensitivity plots.
+
+     Args:
+         results: Dictionary of aggregated results
+         metric: Metric to evaluate
+         param_extractor: Function to extract param value from config name
+
+     Returns:
+         Dictionary mapping param value to (mean, std) tuple
+     """
+     performance = get_final_performance(results, metric)
+
+     if param_extractor is None:
+         return {k: v for k, v in performance.items()}
+
+     return {param_extractor(name): perf for name, perf in performance.items()}
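Taken together, the reporting helpers reduce a sweep to (mean, std) pairs. A closing sketch, assuming results from the earlier sketch and a purely hypothetical "name=value" naming convention for config names such as "lms_alpha=0.01"; the parsing lambda is illustrative, not package API.

from alberta_framework.utils.experiments import (
    extract_hyperparameter_results,
    get_final_performance,
)

final = get_final_performance(results, metric="squared_error", window=100)
# {"lms_alpha=0.01": (mean, std), ...}

by_alpha = extract_hyperparameter_results(
    results,
    metric="squared_error",
    param_extractor=lambda name: float(name.split("=")[-1]),
)
# {0.01: (mean, std), ...}, ready for a hyperparameter sensitivity plot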