alberta_framework-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alberta_framework/__init__.py +196 -0
- alberta_framework/core/__init__.py +27 -0
- alberta_framework/core/learners.py +530 -0
- alberta_framework/core/normalizers.py +192 -0
- alberta_framework/core/optimizers.py +422 -0
- alberta_framework/core/types.py +198 -0
- alberta_framework/py.typed +0 -0
- alberta_framework/streams/__init__.py +83 -0
- alberta_framework/streams/base.py +70 -0
- alberta_framework/streams/gymnasium.py +655 -0
- alberta_framework/streams/synthetic.py +995 -0
- alberta_framework/utils/__init__.py +113 -0
- alberta_framework/utils/experiments.py +334 -0
- alberta_framework/utils/export.py +509 -0
- alberta_framework/utils/metrics.py +112 -0
- alberta_framework/utils/statistics.py +527 -0
- alberta_framework/utils/timing.py +138 -0
- alberta_framework/utils/visualization.py +571 -0
- alberta_framework-0.1.0.dist-info/METADATA +198 -0
- alberta_framework-0.1.0.dist-info/RECORD +22 -0
- alberta_framework-0.1.0.dist-info/WHEEL +4 -0
- alberta_framework-0.1.0.dist-info/licenses/LICENSE +190 -0
alberta_framework/utils/__init__.py
@@ -0,0 +1,113 @@
+"""Utility functions for the Alberta Framework."""
+
+# Experiment runner (no external deps)
+from alberta_framework.utils.experiments import (
+    AggregatedResults,
+    ExperimentConfig,
+    MetricSummary,
+    SingleRunResult,
+    aggregate_metrics,
+    get_final_performance,
+    get_metric_timeseries,
+    run_multi_seed_experiment,
+    run_single_experiment,
+)
+
+# Export utilities (no external deps for basic functionality)
+from alberta_framework.utils.export import (
+    export_to_csv,
+    export_to_json,
+    generate_latex_table,
+    generate_markdown_table,
+    save_experiment_report,
+)
+from alberta_framework.utils.metrics import (
+    compare_learners,
+    compute_cumulative_error,
+    compute_running_mean,
+    compute_tracking_error,
+    extract_metric,
+)
+
+__all__ = [
+    # Metrics
+    "compare_learners",
+    "compute_cumulative_error",
+    "compute_running_mean",
+    "compute_tracking_error",
+    "extract_metric",
+    # Experiments
+    "AggregatedResults",
+    "ExperimentConfig",
+    "MetricSummary",
+    "SingleRunResult",
+    "aggregate_metrics",
+    "get_final_performance",
+    "get_metric_timeseries",
+    "run_multi_seed_experiment",
+    "run_single_experiment",
+    # Export
+    "export_to_csv",
+    "export_to_json",
+    "generate_latex_table",
+    "generate_markdown_table",
+    "save_experiment_report",
+]
+
+# Optional: Statistics (requires scipy for full functionality)
+try:
+    from alberta_framework.utils.statistics import (
+        SignificanceResult,
+        StatisticalSummary,
+        bonferroni_correction,
+        bootstrap_ci,
+        cohens_d,
+        compute_statistics,
+        compute_timeseries_statistics,
+        holm_correction,
+        mann_whitney_comparison,
+        pairwise_comparisons,
+        ttest_comparison,
+        wilcoxon_comparison,
+    )
+
+    __all__ += [
+        "SignificanceResult",
+        "StatisticalSummary",
+        "bonferroni_correction",
+        "bootstrap_ci",
+        "cohens_d",
+        "compute_statistics",
+        "compute_timeseries_statistics",
+        "holm_correction",
+        "mann_whitney_comparison",
+        "pairwise_comparisons",
+        "ttest_comparison",
+        "wilcoxon_comparison",
+    ]
+except ImportError:
+    pass
+
+# Optional: Visualization (requires matplotlib)
+try:
+    from alberta_framework.utils.visualization import (
+        create_comparison_figure,
+        plot_final_performance_bars,
+        plot_hyperparameter_heatmap,
+        plot_learning_curves,
+        plot_step_size_evolution,
+        save_figure,
+        set_publication_style,
+    )
+
+    __all__ += [
+        "create_comparison_figure",
+        "plot_final_performance_bars",
+        "plot_hyperparameter_heatmap",
+        "plot_learning_curves",
+        "plot_step_size_evolution",
+        "save_figure",
+        "set_publication_style",
+    ]
+except ImportError:
+    pass
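The statistics and visualization helpers are exported lazily: if scipy or matplotlib is unavailable, the corresponding import fails and those names are simply absent from the package namespace. A minimal sketch (not part of the package, and assuming the statistics and visualization modules raise ImportError when their dependencies are missing, as the comments above indicate) shows how downstream code can feature-detect the optional exports:

import alberta_framework.utils as utils

# Present only when scipy was available at import time and the
# statistics names were re-exported by the try/except block above.
has_stats = hasattr(utils, "bootstrap_ci")

# Present only when matplotlib was available and the plotting helpers loaded.
has_plots = hasattr(utils, "plot_learning_curves")

if has_stats:
    print("statistics exported:", "ttest_comparison" in utils.__all__)
if has_plots:
    print("plotting exported:", "plot_learning_curves" in utils.__all__)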
alberta_framework/utils/experiments.py
@@ -0,0 +1,334 @@
+"""Multi-seed experiment runner for publication-quality analysis.
+
+Provides infrastructure for running experiments across multiple seeds
+with optional parallelization and aggregation of results.
+"""
+
+from collections.abc import Callable, Sequence
+from typing import Any, NamedTuple, cast
+
+import jax.random as jr
+import numpy as np
+from numpy.typing import NDArray
+
+from alberta_framework.core.learners import (
+    LinearLearner,
+    NormalizedLearnerState,
+    NormalizedLinearLearner,
+    metrics_to_dicts,
+    run_learning_loop,
+    run_normalized_learning_loop,
+)
+from alberta_framework.core.types import LearnerState
+from alberta_framework.streams.base import ScanStream
+
+
+class ExperimentConfig(NamedTuple):
+    """Configuration for a single experiment.
+
+    Attributes:
+        name: Human-readable name for this configuration
+        learner_factory: Callable that returns a fresh learner instance
+        stream_factory: Callable that returns a fresh stream instance
+        num_steps: Number of learning steps to run
+    """
+
+    name: str
+    learner_factory: Callable[[], LinearLearner | NormalizedLinearLearner]
+    stream_factory: Callable[[], ScanStream[Any]]
+    num_steps: int
+
+
+class SingleRunResult(NamedTuple):
+    """Result from a single experiment run.
+
+    Attributes:
+        config_name: Name of the configuration that was run
+        seed: Random seed used for this run
+        metrics_history: List of metric dictionaries from each step
+        final_state: Final learner state after training
+    """
+
+    config_name: str
+    seed: int
+    metrics_history: list[dict[str, float]]
+    final_state: LearnerState | NormalizedLearnerState
+
+
+class MetricSummary(NamedTuple):
+    """Summary statistics for a single metric.
+
+    Attributes:
+        mean: Mean across seeds
+        std: Standard deviation across seeds
+        min: Minimum value across seeds
+        max: Maximum value across seeds
+        n_seeds: Number of seeds
+        values: Raw values per seed
+    """
+
+    mean: float
+    std: float
+    min: float
+    max: float
+    n_seeds: int
+    values: NDArray[np.float64]
+
+
+class AggregatedResults(NamedTuple):
+    """Aggregated results across multiple seeds.
+
+    Attributes:
+        config_name: Name of the configuration
+        seeds: List of seeds used
+        metric_arrays: Dict mapping metric name to (n_seeds, n_steps) array
+        summary: Dict mapping metric name to MetricSummary (final values)
+    """
+
+    config_name: str
+    seeds: list[int]
+    metric_arrays: dict[str, NDArray[np.float64]]
+    summary: dict[str, MetricSummary]
+
+
+def run_single_experiment(
+    config: ExperimentConfig,
+    seed: int,
+) -> SingleRunResult:
+    """Run a single experiment with a given seed.
+
+    Args:
+        config: Experiment configuration
+        seed: Random seed for the stream
+
+    Returns:
+        SingleRunResult with metrics and final state
+    """
+    learner = config.learner_factory()
+    stream = config.stream_factory()
+    key = jr.key(seed)
+
+    final_state: LearnerState | NormalizedLearnerState
+    if isinstance(learner, NormalizedLinearLearner):
+        final_state, metrics = run_normalized_learning_loop(
+            learner, stream, config.num_steps, key
+        )
+        metrics_history = metrics_to_dicts(metrics, normalized=True)
+    else:
+        result = run_learning_loop(learner, stream, config.num_steps, key)
+        final_state, metrics = cast(tuple[LearnerState, Any], result)
+        metrics_history = metrics_to_dicts(metrics)
+
+    return SingleRunResult(
+        config_name=config.name,
+        seed=seed,
+        metrics_history=metrics_history,
+        final_state=final_state,
+    )
+
+
+def aggregate_metrics(results: list[SingleRunResult]) -> AggregatedResults:
+    """Aggregate results from multiple seeds into summary statistics.
+
+    Args:
+        results: List of SingleRunResult from multiple seeds
+
+    Returns:
+        AggregatedResults with aggregated metrics
+    """
+    if not results:
+        raise ValueError("Cannot aggregate empty results list")
+
+    config_name = results[0].config_name
+    seeds = [r.seed for r in results]
+
+    # Get all metric keys from first result
+    metric_keys = list(results[0].metrics_history[0].keys())
+
+    # Build metric arrays: (n_seeds, n_steps)
+    metric_arrays: dict[str, NDArray[np.float64]] = {}
+    for key in metric_keys:
+        arrays = []
+        for r in results:
+            values = np.array([m[key] for m in r.metrics_history])
+            arrays.append(values)
+        metric_arrays[key] = np.stack(arrays)
+
+    # Compute summary statistics for final values (mean of last 100 steps)
+    summary: dict[str, MetricSummary] = {}
+    n_seeds = len(results)
+    for key in metric_keys:
+        # Use mean of last 100 steps as the final value
+        window = min(100, metric_arrays[key].shape[1])
+        final_values = np.mean(metric_arrays[key][:, -window:], axis=1)
+        summary[key] = MetricSummary(
+            mean=float(np.mean(final_values)),
+            std=float(np.std(final_values)),
+            min=float(np.min(final_values)),
+            max=float(np.max(final_values)),
+            n_seeds=n_seeds,
+            values=final_values,
+        )
+
+    return AggregatedResults(
+        config_name=config_name,
+        seeds=seeds,
+        metric_arrays=metric_arrays,
+        summary=summary,
+    )
+
+
+def run_multi_seed_experiment(
+    configs: Sequence[ExperimentConfig],
+    seeds: int | Sequence[int] = 30,
+    parallel: bool = True,
+    n_jobs: int = -1,
+    show_progress: bool = True,
+) -> dict[str, AggregatedResults]:
+    """Run experiments across multiple seeds with optional parallelization.
+
+    Args:
+        configs: List of experiment configurations to run
+        seeds: Number of seeds (generates 0..n-1) or explicit list of seeds
+        parallel: Whether to use parallel execution (requires joblib)
+        n_jobs: Number of parallel jobs (-1 for all CPUs)
+        show_progress: Whether to show progress bar (requires tqdm)
+
+    Returns:
+        Dictionary mapping config name to AggregatedResults
+    """
+    # Convert seeds to list
+    if isinstance(seeds, int):
+        seed_list = list(range(seeds))
+    else:
+        seed_list = list(seeds)
+
+    # Build list of (config, seed) pairs
+    tasks: list[tuple[ExperimentConfig, int]] = []
+    for config in configs:
+        for seed in seed_list:
+            tasks.append((config, seed))
+
+    # Run experiments
+    if parallel:
+        try:
+            from joblib import Parallel, delayed
+
+            if show_progress:
+                try:
+                    from tqdm import tqdm
+
+                    results_list: list[SingleRunResult] = Parallel(n_jobs=n_jobs)(
+                        delayed(run_single_experiment)(config, seed)
+                        for config, seed in tqdm(tasks, desc="Running experiments")
+                    )
+                except ImportError:
+                    results_list = Parallel(n_jobs=n_jobs)(
+                        delayed(run_single_experiment)(config, seed) for config, seed in tasks
+                    )
+            else:
+                results_list = Parallel(n_jobs=n_jobs)(
+                    delayed(run_single_experiment)(config, seed) for config, seed in tasks
+                )
+        except ImportError:
+            # Fallback to sequential if joblib not available
+            results_list = _run_sequential(tasks, show_progress)
+    else:
+        results_list = _run_sequential(tasks, show_progress)
+
+    # Group results by config name
+    grouped: dict[str, list[SingleRunResult]] = {}
+    for result in results_list:
+        if result.config_name not in grouped:
+            grouped[result.config_name] = []
+        grouped[result.config_name].append(result)
+
+    # Aggregate each config
+    aggregated: dict[str, AggregatedResults] = {}
+    for config_name, group_results in grouped.items():
+        aggregated[config_name] = aggregate_metrics(group_results)
+
+    return aggregated
+
+
+def _run_sequential(
+    tasks: list[tuple[ExperimentConfig, int]],
+    show_progress: bool,
+) -> list[SingleRunResult]:
+    """Run experiments sequentially."""
+    if show_progress:
+        try:
+            from tqdm import tqdm
+
+            return [run_single_experiment(config, seed) for config, seed in tqdm(tasks)]
+        except ImportError:
+            pass
+    return [run_single_experiment(config, seed) for config, seed in tasks]
+
+
+def get_metric_timeseries(
+    results: AggregatedResults,
+    metric: str = "squared_error",
+) -> tuple[NDArray[np.float64], NDArray[np.float64], NDArray[np.float64]]:
+    """Get mean and standard deviation timeseries for a metric.
+
+    Args:
+        results: Aggregated results
+        metric: Name of the metric
+
+    Returns:
+        Tuple of (mean, lower_bound, upper_bound) arrays
+    """
+    arr = results.metric_arrays[metric]
+    mean = np.mean(arr, axis=0)
+    std = np.std(arr, axis=0)
+    return mean, mean - std, mean + std
+
+
+def get_final_performance(
+    results: dict[str, AggregatedResults],
+    metric: str = "squared_error",
+    window: int = 100,
+) -> dict[str, tuple[float, float]]:
+    """Get final performance (mean, std) for each config.
+
+    Args:
+        results: Dictionary of aggregated results
+        metric: Metric to evaluate
+        window: Number of final steps to average
+
+    Returns:
+        Dictionary mapping config name to (mean, std) tuple
+    """
+    performance: dict[str, tuple[float, float]] = {}
+    for name, agg in results.items():
+        arr = agg.metric_arrays[metric]
+        final_window = min(window, arr.shape[1])
+        final_means = np.mean(arr[:, -final_window:], axis=1)
+        performance[name] = (float(np.mean(final_means)), float(np.std(final_means)))
+    return performance
+
+
+def extract_hyperparameter_results(
+    results: dict[str, AggregatedResults],
+    metric: str = "squared_error",
+    param_extractor: Callable[[str], Any] | None = None,
+) -> dict[Any, tuple[float, float]]:
+    """Extract results indexed by hyperparameter value.
+
+    Useful for creating hyperparameter sensitivity plots.
+
+    Args:
+        results: Dictionary of aggregated results
+        metric: Metric to evaluate
+        param_extractor: Function to extract param value from config name
+
+    Returns:
+        Dictionary mapping param value to (mean, std) tuple
+    """
+    performance = get_final_performance(results, metric)
+
+    if param_extractor is None:
+        return {k: v for k, v in performance.items()}
+
+    return {param_extractor(name): perf for name, perf in performance.items()}
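Taken together, the module defines a small pipeline: build ExperimentConfig entries, fan them out over seeds with run_multi_seed_experiment, then read learning curves and final scores out of the returned AggregatedResults. The following is a hypothetical usage sketch, not code from the package: make_learner and make_stream stand in for user-defined factories returning a LinearLearner/NormalizedLinearLearner and a ScanStream (their constructors are not shown in this diff), the config name is invented, and it assumes the learners report a "squared_error" metric, which is the default metric name in the accessors above.

from alberta_framework.utils import (
    ExperimentConfig,
    get_final_performance,
    get_metric_timeseries,
    run_multi_seed_experiment,
)

# make_learner and make_stream are placeholders assumed to be defined elsewhere.
configs = [
    ExperimentConfig(
        name="lms_baseline",           # illustrative name only
        learner_factory=make_learner,  # () -> LinearLearner | NormalizedLinearLearner
        stream_factory=make_stream,    # () -> ScanStream
        num_steps=10_000,
    ),
]

# 30 seeds (0..29); parallelized via joblib when installed, otherwise
# falls back to the sequential path shown above.
results = run_multi_seed_experiment(configs, seeds=30, parallel=True)

# Final performance: mean of the last 100 steps, aggregated across seeds.
final = get_final_performance(results, metric="squared_error", window=100)
print(final["lms_baseline"])  # (mean, std) across seeds

# Mean learning curve with a +/- one-standard-deviation band for plotting.
mean, lower, upper = get_metric_timeseries(results["lms_baseline"], metric="squared_error")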