samlb 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- samlb-0.1.0/.gitignore +37 -0
- samlb-0.1.0/CMakeLists.txt +40 -0
- samlb-0.1.0/LICENSE +21 -0
- samlb-0.1.0/PKG-INFO +460 -0
- samlb-0.1.0/README.md +437 -0
- samlb-0.1.0/_cpp/bindings/pybind_module.cpp +192 -0
- samlb-0.1.0/_cpp/classification/efdt.cpp +26 -0
- samlb-0.1.0/_cpp/classification/efdt.h +19 -0
- samlb-0.1.0/_cpp/classification/hoeffding_tree.cpp +454 -0
- samlb-0.1.0/_cpp/classification/hoeffding_tree.h +63 -0
- samlb-0.1.0/_cpp/classification/knn_cls.cpp +74 -0
- samlb-0.1.0/_cpp/classification/knn_cls.h +19 -0
- samlb-0.1.0/_cpp/classification/logistic_regression.cpp +69 -0
- samlb-0.1.0/_cpp/classification/logistic_regression.h +27 -0
- samlb-0.1.0/_cpp/classification/naive_bayes.cpp +62 -0
- samlb-0.1.0/_cpp/classification/naive_bayes.h +21 -0
- samlb-0.1.0/_cpp/classification/passive_aggressive_cls.cpp +89 -0
- samlb-0.1.0/_cpp/classification/passive_aggressive_cls.h +26 -0
- samlb-0.1.0/_cpp/classification/perceptron.cpp +58 -0
- samlb-0.1.0/_cpp/classification/perceptron.h +23 -0
- samlb-0.1.0/_cpp/classification/sgt.cpp +268 -0
- samlb-0.1.0/_cpp/classification/sgt.h +59 -0
- samlb-0.1.0/_cpp/classification/softmax.cpp +63 -0
- samlb-0.1.0/_cpp/classification/softmax.h +25 -0
- samlb-0.1.0/_cpp/core/gaussian_estimator.h +39 -0
- samlb-0.1.0/_cpp/core/hoeffding_bound.h +13 -0
- samlb-0.1.0/_cpp/core/sliding_window.h +88 -0
- samlb-0.1.0/_cpp/regression/bayesian_linear_reg.cpp +48 -0
- samlb-0.1.0/_cpp/regression/bayesian_linear_reg.h +24 -0
- samlb-0.1.0/_cpp/regression/hoeffding_tree_reg.cpp +308 -0
- samlb-0.1.0/_cpp/regression/hoeffding_tree_reg.h +58 -0
- samlb-0.1.0/_cpp/regression/knn_reg.cpp +49 -0
- samlb-0.1.0/_cpp/regression/knn_reg.h +17 -0
- samlb-0.1.0/_cpp/regression/linear_regression.cpp +35 -0
- samlb-0.1.0/_cpp/regression/linear_regression.h +18 -0
- samlb-0.1.0/_cpp/regression/passive_aggressive_reg.cpp +35 -0
- samlb-0.1.0/_cpp/regression/passive_aggressive_reg.h +18 -0
- samlb-0.1.0/assets/samlb_logo.png +0 -0
- samlb-0.1.0/examples/run_benchmark.py +277 -0
- samlb-0.1.0/examples/run_regression.py +288 -0
- samlb-0.1.0/pyproject.toml +61 -0
- samlb-0.1.0/samlb/__init__.py +39 -0
- samlb-0.1.0/samlb/algorithms/__init__.py +8 -0
- samlb-0.1.0/samlb/algorithms/classification/__init__.py +29 -0
- samlb-0.1.0/samlb/algorithms/regression/__init__.py +21 -0
- samlb-0.1.0/samlb/benchmark/__init__.py +39 -0
- samlb-0.1.0/samlb/benchmark/suite.py +498 -0
- samlb-0.1.0/samlb/datasets/__init__.py +228 -0
- samlb-0.1.0/samlb/evaluation/__init__.py +29 -0
- samlb-0.1.0/samlb/evaluation/evaluator.py +256 -0
- samlb-0.1.0/samlb/evaluation/metrics.py +47 -0
- samlb-0.1.0/samlb/evaluation/results.py +189 -0
- samlb-0.1.0/samlb/framework/__init__.py +44 -0
- samlb-0.1.0/samlb/framework/base/__init__.py +28 -0
- samlb-0.1.0/samlb/framework/base/_cpp_wrappers.py +350 -0
- samlb-0.1.0/samlb/framework/base/_framework.py +55 -0
- samlb-0.1.0/samlb/framework/classification/__init__.py +12 -0
- samlb-0.1.0/samlb/framework/classification/asml/__init__.py +20 -0
- samlb-0.1.0/samlb/framework/classification/asml/config.py +43 -0
- samlb-0.1.0/samlb/framework/classification/asml/helper.py +51 -0
- samlb-0.1.0/samlb/framework/classification/asml/model.py +243 -0
- samlb-0.1.0/samlb/framework/classification/asml/search.py +184 -0
- samlb-0.1.0/samlb/framework/classification/autoclass/__init__.py +20 -0
- samlb-0.1.0/samlb/framework/classification/autoclass/config.py +16 -0
- samlb-0.1.0/samlb/framework/classification/autoclass/model.py +201 -0
- samlb-0.1.0/samlb/framework/classification/eaml/__init__.py +20 -0
- samlb-0.1.0/samlb/framework/classification/eaml/config.py +13 -0
- samlb-0.1.0/samlb/framework/classification/eaml/model.py +182 -0
- samlb-0.1.0/samlb/framework/classification/oaml/__init__.py +21 -0
- samlb-0.1.0/samlb/framework/classification/oaml/config.py +11 -0
- samlb-0.1.0/samlb/framework/classification/oaml/model.py +208 -0
- samlb-0.1.0/samlb/framework/classification/shared_config.py +254 -0
- samlb-0.1.0/samlb/framework/regression/__init__.py +6 -0
- samlb-0.1.0/samlb/framework/regression/asml/__init__.py +20 -0
- samlb-0.1.0/samlb/framework/regression/asml/config.py +94 -0
- samlb-0.1.0/samlb/framework/regression/asml/helper.py +50 -0
- samlb-0.1.0/samlb/framework/regression/asml/model.py +313 -0
- samlb-0.1.0/samlb/framework/regression/asml/search.py +162 -0
- samlb-0.1.0/samlb/framework/regression/chacha/__init__.py +16 -0
- samlb-0.1.0/samlb/framework/regression/chacha/model.py +140 -0
- samlb-0.1.0/samlb/framework/regression/eaml/__init__.py +20 -0
- samlb-0.1.0/samlb/framework/regression/eaml/config.py +53 -0
- samlb-0.1.0/samlb/framework/regression/eaml/model.py +157 -0
- samlb-0.1.0/tests/__init__.py +0 -0
- samlb-0.1.0/tests/conftest.py +137 -0
- samlb-0.1.0/tests/test_benchmark_core_fixes.py +101 -0
- samlb-0.1.0/tests/test_benchmark_progress.py +63 -0
- samlb-0.1.0/tests/test_benchmark_suite_results_api.py +65 -0
- samlb-0.1.0/tests/test_classification.py +324 -0
- samlb-0.1.0/tests/test_regression.py +253 -0
- samlb-0.1.0/tests/test_regression_chacha.py +70 -0
- samlb-0.1.0/tests/test_run_benchmark_parallel_utils.py +97 -0
- samlb-0.1.0/uv.lock +2275 -0
samlb-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.so
|
|
5
|
+
*.egg-info/
|
|
6
|
+
*.egg
|
|
7
|
+
dist/
|
|
8
|
+
build/
|
|
9
|
+
*.whl
|
|
10
|
+
|
|
11
|
+
# Virtual environments
|
|
12
|
+
.venv/
|
|
13
|
+
venv/
|
|
14
|
+
env/
|
|
15
|
+
|
|
16
|
+
# IDE
|
|
17
|
+
.vscode/
|
|
18
|
+
.idea/
|
|
19
|
+
*.swp
|
|
20
|
+
*.swo
|
|
21
|
+
|
|
22
|
+
# OS
|
|
23
|
+
.DS_Store
|
|
24
|
+
Thumbs.db
|
|
25
|
+
|
|
26
|
+
# Testing
|
|
27
|
+
.pytest_cache/
|
|
28
|
+
htmlcov/
|
|
29
|
+
.coverage
|
|
30
|
+
|
|
31
|
+
# Build artifacts
|
|
32
|
+
*.o
|
|
33
|
+
*.a
|
|
34
|
+
CMakeCache.txt
|
|
35
|
+
CMakeFiles/
|
|
36
|
+
cmake_install.cmake
|
|
37
|
+
Makefile
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
cmake_minimum_required(VERSION 3.17)
|
|
2
|
+
project(samlb_cpp LANGUAGES CXX)
|
|
3
|
+
|
|
4
|
+
set(CMAKE_CXX_STANDARD 17)
|
|
5
|
+
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
|
6
|
+
|
|
7
|
+
# Release optimisations
|
|
8
|
+
if(NOT CMAKE_BUILD_TYPE)
|
|
9
|
+
set(CMAKE_BUILD_TYPE Release)
|
|
10
|
+
endif()
|
|
11
|
+
add_compile_options(-O3 -march=native -ffast-math)
|
|
12
|
+
|
|
13
|
+
# pybind11
|
|
14
|
+
find_package(pybind11 CONFIG REQUIRED)
|
|
15
|
+
|
|
16
|
+
# Collect all C++ sources
|
|
17
|
+
set(CPP_SOURCES
|
|
18
|
+
_cpp/classification/naive_bayes.cpp
|
|
19
|
+
_cpp/classification/perceptron.cpp
|
|
20
|
+
_cpp/classification/logistic_regression.cpp
|
|
21
|
+
_cpp/classification/passive_aggressive_cls.cpp
|
|
22
|
+
_cpp/classification/softmax.cpp
|
|
23
|
+
_cpp/classification/knn_cls.cpp
|
|
24
|
+
_cpp/classification/hoeffding_tree.cpp
|
|
25
|
+
_cpp/classification/efdt.cpp
|
|
26
|
+
_cpp/classification/sgt.cpp
|
|
27
|
+
_cpp/regression/linear_regression.cpp
|
|
28
|
+
_cpp/regression/bayesian_linear_reg.cpp
|
|
29
|
+
_cpp/regression/passive_aggressive_reg.cpp
|
|
30
|
+
_cpp/regression/knn_reg.cpp
|
|
31
|
+
_cpp/regression/hoeffding_tree_reg.cpp
|
|
32
|
+
_cpp/bindings/pybind_module.cpp
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
pybind11_add_module(_samlb_core ${CPP_SOURCES})
|
|
36
|
+
|
|
37
|
+
target_include_directories(_samlb_core PRIVATE _cpp/core _cpp)
|
|
38
|
+
|
|
39
|
+
# Place .so next to samlb package so `import samlb` finds it
|
|
40
|
+
install(TARGETS _samlb_core DESTINATION samlb)
|
samlb-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 SAMLB Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
samlb-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,460 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: samlb
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Streaming AutoML Benchmark — fast C++ base algorithms + unified AutoML frameworks
|
|
5
|
+
Keywords: streaming,automl,benchmark,machine-learning,data-streams
|
|
6
|
+
Author: SAMLB Contributors
|
|
7
|
+
License: MIT
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: C++
|
|
10
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
11
|
+
Requires-Python: >=3.9
|
|
12
|
+
Requires-Dist: river>=0.21
|
|
13
|
+
Requires-Dist: scikit-learn>=1.3
|
|
14
|
+
Requires-Dist: numpy>=1.24
|
|
15
|
+
Requires-Dist: openml>=0.15.1
|
|
16
|
+
Provides-Extra: vw
|
|
17
|
+
Requires-Dist: flaml[vw]>=2.3; extra == "vw"
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
20
|
+
Requires-Dist: pytest-benchmark; extra == "dev"
|
|
21
|
+
Requires-Dist: ruff; extra == "dev"
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
<p align="center">
|
|
25
|
+
<img src="https://raw.githubusercontent.com/TechyNilesh/samlb/main/assets/samlb_logo.png" alt="SAMLB Logo" width="400">
|
|
26
|
+
</p>
|
|
27
|
+
<p align="center">A unified benchmark framework for evaluating AutoML systems on data streams with fast C++ base algorithms and rigorous prequential evaluation.</p>
|
|
28
|
+
|
|
29
|
+
<p align="center">
|
|
30
|
+
<img src="https://img.shields.io/badge/Python-3.9%2B-blue.svg" alt="Python">
|
|
31
|
+
<img src="https://img.shields.io/pypi/v/samlb.svg" alt="PyPI">
|
|
32
|
+
<img src="https://img.shields.io/pypi/dm/samlb?color=green&label=Downloads" alt="Downloads">
|
|
33
|
+
<img src="https://img.shields.io/badge/License-MIT-green.svg" alt="License">
|
|
34
|
+
</p>
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## Why SAMLB?
|
|
39
|
+
|
|
40
|
+
Streaming AutoML methods are hard to compare fairly. Different papers use different datasets, evaluation protocols, and algorithm pools. **SAMLB** solves this by providing:
|
|
41
|
+
|
|
42
|
+
- **Fast C++ base algorithms** with River-compatible Python interfaces (Naive Bayes, Hoeffding Trees, KNN, Perceptron, Logistic Regression, and more)
|
|
43
|
+
- **Framework-agnostic benchmarking** -- plug in any streaming AutoML method with just 3 methods
|
|
44
|
+
- **Standardized prequential evaluation** (test-then-train) with windowed metric snapshots for learning curves
|
|
45
|
+
- **30 curated datasets** (15 classification + 15 regression) spanning real-world and synthetic drift scenarios
|
|
46
|
+
- **Parallel execution** for large-scale experiments across multiple seeds
|
|
47
|
+
|
|
48
|
+
## Installation
|
|
49
|
+
|
|
50
|
+
### From PyPI
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pip install samlb
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### From source
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
git clone https://github.com/TechyNilesh/samlb.git
|
|
60
|
+
cd samlb
|
|
61
|
+
pip install -e ".[dev]"
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Optional: Vowpal Wabbit support (for ChaCha regressor)
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
pip install "samlb[vw]"
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
> **Requirements:** Python >= 3.9, a C++ compiler (for the native extension), CMake
|
|
71
|
+
|
|
72
|
+
## Quick Start
|
|
73
|
+
|
|
74
|
+
### Python API
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from samlb.benchmark import BenchmarkSuite
|
|
78
|
+
from samlb.framework.classification.asml import AutoStreamClassifier
|
|
79
|
+
from samlb.framework.classification.eaml import EvolutionaryBaggingClassifier
|
|
80
|
+
|
|
81
|
+
suite = BenchmarkSuite(
|
|
82
|
+
models={
|
|
83
|
+
"ASML": AutoStreamClassifier(seed=42),
|
|
84
|
+
"EvoAutoML": EvolutionaryBaggingClassifier(seed=42),
|
|
85
|
+
},
|
|
86
|
+
datasets=["electricity", "covertype"],
|
|
87
|
+
task="classification",
|
|
88
|
+
n_runs=10,
|
|
89
|
+
window_size=1000,
|
|
90
|
+
)
|
|
91
|
+
suite.run()
|
|
92
|
+
suite.print_table()
|
|
93
|
+
suite.to_csv("results/classification.csv")
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Dataset Streaming
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from samlb.datasets import stream, list_datasets
|
|
100
|
+
|
|
101
|
+
# See all available datasets
|
|
102
|
+
print(list_datasets("classification"))
|
|
103
|
+
print(list_datasets("regression"))
|
|
104
|
+
|
|
105
|
+
# Stream instance by instance
|
|
106
|
+
for x, y in stream("electricity", task="classification"):
|
|
107
|
+
pred = model.predict_one(x)
|
|
108
|
+
model.learn_one(x, y)
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### CLI
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
# Full classification benchmark (4 frameworks x 15 datasets x 10 runs)
|
|
115
|
+
python examples/run_benchmark.py
|
|
116
|
+
|
|
117
|
+
# Custom subset
|
|
118
|
+
python examples/run_benchmark.py --n_runs 5 --max_samples 50000 --datasets electricity covertype
|
|
119
|
+
|
|
120
|
+
# Parallel execution across CPU cores
|
|
121
|
+
python examples/run_benchmark.py --n_runs 100 --parallel --cpu_utilization 0.8
|
|
122
|
+
|
|
123
|
+
# Regression benchmark
|
|
124
|
+
python examples/run_regression.py
|
|
125
|
+
python examples/run_regression.py --n_runs 5 --datasets bike california_housing
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## Included Frameworks
|
|
129
|
+
|
|
130
|
+
### Classification
|
|
131
|
+
|
|
132
|
+
| Framework | Strategy | Key Features |
|
|
133
|
+
|-----------|----------|--------------|
|
|
134
|
+
| **ASML** | Adaptive Random Drift Nearby Search | ADWIN drift detection, recency-weighted ensemble, adaptive budget |
|
|
135
|
+
| **AutoClass** | Genetic Algorithm + Meta-Regressor | Fitness-proportionate selection, ARF surrogate for HP mutation |
|
|
136
|
+
| **EvoAutoML** | Evolutionary Bagging | Population-based, tournament selection, Poisson(6) sampling |
|
|
137
|
+
| **OAML** | Drift-triggered Random Search | EDDM drift detector, warm-up phase, random search |
|
|
138
|
+
|
|
139
|
+
### Regression
|
|
140
|
+
|
|
141
|
+
| Framework | Strategy | Key Features |
|
|
142
|
+
|-----------|----------|--------------|
|
|
143
|
+
| **ASML** | Adaptive Random Drift Nearby Search | Online target normalization (Welford), prediction clipping |
|
|
144
|
+
| **ChaCha** | FLAML AutoVW | Vowpal Wabbit online HPO, progressive validation loss |
|
|
145
|
+
| **EvoAutoML** | Evolutionary Bagging | Population-based ensemble, mutation-driven search |
|
|
146
|
+
|
|
147
|
+
## C++ Base Algorithms
|
|
148
|
+
|
|
149
|
+
All base learners are implemented in C++ for speed and wrapped with River-compatible interfaces:
|
|
150
|
+
|
|
151
|
+
**Classification:** Naive Bayes, Perceptron, Logistic Regression, Passive Aggressive, Softmax Regression, KNN, Hoeffding Tree, EFDT, SGT
|
|
152
|
+
|
|
153
|
+
**Regression:** Linear Regression, Bayesian Linear Regression, Passive Aggressive, Hoeffding Tree, KNN
|
|
154
|
+
|
|
155
|
+
**Preprocessing (via River):** MinMaxScaler, StandardScaler, MaxAbsScaler, VarianceThreshold, SelectKBest
|
|
156
|
+
|
|
157
|
+
## Evaluation Methodology
|
|
158
|
+
|
|
159
|
+
SAMLB uses **prequential evaluation** (test-then-train):
|
|
160
|
+
|
|
161
|
+
1. For each instance in the stream:
|
|
162
|
+
- **Predict** -- get the model's prediction *before* seeing the label
|
|
163
|
+
- **Evaluate** -- score the prediction against the true label
|
|
164
|
+
- **Learn** -- update the model with the labelled instance
|
|
165
|
+
2. Metrics are captured at configurable window intervals for learning curve analysis
|
|
166
|
+
3. Runtime is sampled per-instance for performance profiling
|
|
167
|
+
|
|
168
|
+
**Classification metrics:** Accuracy, Macro-F1, Macro-Precision, Macro-Recall
|
|
169
|
+
|
|
170
|
+
**Regression metrics:** MAE, RMSE, R^2
|
|
171
|
+
|
|
172
|
+
## Datasets
|
|
173
|
+
|
|
174
|
+
### Classification (15 datasets -- 2.5M+ total instances)
|
|
175
|
+
|
|
176
|
+
| Dataset | Samples | Features | Classes | Type | Description |
|
|
177
|
+
|---------|--------:|---------:|--------:|------|-------------|
|
|
178
|
+
| `adult` | 48,842 | 14 | 4 | Real | Income prediction (Census) |
|
|
179
|
+
| `covertype` | 100,000 | 54 | 7 | Real | Forest cover type (cartographic) |
|
|
180
|
+
| `credit_card` | 284,807 | 30 | 2 | Real | Credit card fraud detection |
|
|
181
|
+
| `electricity` | 45,312 | 8 | 2 | Real | Electricity price direction (NSW, Australia) |
|
|
182
|
+
| `insects` | 52,848 | 33 | 6 | Real | Insect species with concept drift |
|
|
183
|
+
| `new_airlines` | 539,383 | 7 | 2 | Real | Flight delay prediction |
|
|
184
|
+
| `nomao` | 34,465 | 118 | 2 | Real | Nomao place deduplication |
|
|
185
|
+
| `poker_hand` | 1,025,009 | 10 | 10 | Real | Poker hand classification |
|
|
186
|
+
| `shuttle` | 58,000 | 9 | 7 | Real | NASA Space Shuttle radiator |
|
|
187
|
+
| `vehicle_sensIT` | 98,528 | 100 | 3 | Real | Vehicle type from seismic sensors |
|
|
188
|
+
| `movingRBF` | 200,000 | 10 | 5 | Synthetic | Moving radial basis functions |
|
|
189
|
+
| `moving_squares` | 200,000 | 2 | 4 | Synthetic | Moving class boundaries |
|
|
190
|
+
| `sea_high_abrupt_drift` | 500,000 | 3 | 2 | Synthetic | SEA generator with abrupt drift |
|
|
191
|
+
| `synth_RandomRBFDrift` | 100,000 | 4 | 4 | Synthetic | RBF generator with gradual drift |
|
|
192
|
+
| `synth_agrawal` | 100,000 | 9 | 2 | Synthetic | Agrawal generator |
|
|
193
|
+
|
|
194
|
+
### Regression (15 datasets -- 1M+ total instances)
|
|
195
|
+
|
|
196
|
+
| Dataset | Samples | Features | Type | Description |
|
|
197
|
+
|---------|--------:|---------:|------|-------------|
|
|
198
|
+
| `ailerons` | 13,750 | 40 | Real | Aircraft control surface deflection |
|
|
199
|
+
| `bike` | 17,379 | 12 | Real | Bike sharing hourly demand |
|
|
200
|
+
| `california_housing` | 20,640 | 8 | Real | California median house values |
|
|
201
|
+
| `cps88wages` | 28,155 | 6 | Real | Wage prediction (CPS 1988) |
|
|
202
|
+
| `diamonds` | 53,940 | 9 | Real | Diamond price prediction |
|
|
203
|
+
| `elevators` | 16,599 | 18 | Real | Aircraft elevator control |
|
|
204
|
+
| `fifa` | 19,178 | 28 | Real | FIFA player overall rating |
|
|
205
|
+
| `House8L` | 22,784 | 8 | Real | House price (8-feature variant) |
|
|
206
|
+
| `kings_county` | 21,613 | 21 | Real | King County house sales price |
|
|
207
|
+
| `MetroTraffic` | 48,204 | 7 | Real | Interstate traffic volume (Minneapolis) |
|
|
208
|
+
| `superconductivity` | 21,263 | 81 | Real | Superconductor critical temperature |
|
|
209
|
+
| `wave_energy` | 72,000 | 48 | Real | Wave energy converter power output |
|
|
210
|
+
| `fried` | 40,768 | 10 | Synthetic | Friedman function |
|
|
211
|
+
| `FriedmanGra` | 100,000 | 10 | Synthetic | Friedman with gradual drift |
|
|
212
|
+
| `hyperA` | 500,000 | 10 | Synthetic | Hyperplane with drift |
|
|
213
|
+
|
|
214
|
+
## Output Formats
|
|
215
|
+
|
|
216
|
+
```
|
|
217
|
+
results/
|
|
218
|
+
classification_10runs.csv # Flat CSV: one row per (framework x dataset x run)
|
|
219
|
+
aggregate.json # Aggregated mean +/- std across runs
|
|
220
|
+
ASML_electricity_seed0.json # Per-run JSON with full learning curves
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
## Project Structure
|
|
224
|
+
|
|
225
|
+
```
|
|
226
|
+
.
|
|
227
|
+
├── pyproject.toml # Package metadata & build config
|
|
228
|
+
├── CMakeLists.txt # C++ build configuration
|
|
229
|
+
├── LICENSE # MIT License
|
|
230
|
+
├── README.md # This file
|
|
231
|
+
├── _cpp/ # C++ source (9 classifiers, 5 regressors)
|
|
232
|
+
│ ├── classification/
|
|
233
|
+
│ ├── regression/
|
|
234
|
+
│ ├── core/ # Shared headers
|
|
235
|
+
│ └── bindings/ # PyBind11 module
|
|
236
|
+
├── samlb/ # Python package
|
|
237
|
+
│ ├── __init__.py # Version: 0.1.0
|
|
238
|
+
│ ├── algorithms/ # C++ algorithm Python bindings
|
|
239
|
+
│ ├── benchmark/ # BenchmarkSuite orchestrator
|
|
240
|
+
│ ├── evaluation/ # PrequentialEvaluator, metrics, results
|
|
241
|
+
│ ├── datasets/ # 30 datasets (15 clf + 15 reg NPZ files)
|
|
242
|
+
│ └── framework/ # AutoML framework implementations
|
|
243
|
+
│ ├── base/ # BaseStreamFramework + C++ wrappers
|
|
244
|
+
│ ├── classification/ # ASML, AutoClass, EvoAutoML, OAML
|
|
245
|
+
│ └── regression/ # ASML, ChaCha, EvoAutoML
|
|
246
|
+
├── tests/ # Test suite
|
|
247
|
+
└── examples/ # Benchmark runner scripts
|
|
248
|
+
├── run_benchmark.py # Classification benchmark CLI
|
|
249
|
+
└── run_regression.py # Regression benchmark CLI
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
---
|
|
253
|
+
|
|
254
|
+
## Contributing
|
|
255
|
+
|
|
256
|
+
We welcome contributions — whether you are adding a new AutoML framework, contributing new datasets, or fixing bugs.
|
|
257
|
+
|
|
258
|
+
### Development Setup
|
|
259
|
+
|
|
260
|
+
```bash
|
|
261
|
+
git clone https://github.com/TechyNilesh/samlb.git
|
|
262
|
+
cd samlb
|
|
263
|
+
pip install -e ".[dev]"
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
### Running Tests
|
|
267
|
+
|
|
268
|
+
```bash
|
|
269
|
+
pytest tests/
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
### Code Style
|
|
273
|
+
|
|
274
|
+
```bash
|
|
275
|
+
ruff check samlb/
|
|
276
|
+
ruff format samlb/
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
---
|
|
280
|
+
|
|
281
|
+
### Adding a New Streaming AutoML Framework
|
|
282
|
+
|
|
283
|
+
This is the primary way to contribute. Every framework in SAMLB implements the same 3-method interface, making it easy to add your own.
|
|
284
|
+
|
|
285
|
+
#### Step 1 -- Create your framework directory
|
|
286
|
+
|
|
287
|
+
```
|
|
288
|
+
samlb/framework/classification/my_method/ # (or regression/)
|
|
289
|
+
__init__.py
|
|
290
|
+
model.py
|
|
291
|
+
config.py # optional: search space / hyperparameter config
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
#### Step 2 -- Implement `BaseStreamFramework`
|
|
295
|
+
|
|
296
|
+
```python
|
|
297
|
+
# samlb/framework/classification/my_method/model.py
|
|
298
|
+
|
|
299
|
+
from __future__ import annotations
|
|
300
|
+
from typing import Any, Dict
|
|
301
|
+
from samlb.framework.base import BaseStreamFramework
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
class MyStreamingAutoML(BaseStreamFramework):
|
|
305
|
+
"""My new streaming AutoML method."""
|
|
306
|
+
|
|
307
|
+
def __init__(self, seed: int = 42, exploration_window: int = 1000, budget: int = 10):
|
|
308
|
+
self.seed = seed
|
|
309
|
+
self.exploration_window = exploration_window
|
|
310
|
+
self.budget = budget
|
|
311
|
+
self._init_state()
|
|
312
|
+
|
|
313
|
+
def predict_one(self, x: Dict[str, float]) -> Any:
|
|
314
|
+
"""
|
|
315
|
+
Return prediction for one instance BEFORE learning.
|
|
316
|
+
|
|
317
|
+
x : dict mapping feature_name -> float value
|
|
318
|
+
Returns: class label (int) for classification, value (float) for regression
|
|
319
|
+
"""
|
|
320
|
+
return self._current_model_predict(x)
|
|
321
|
+
|
|
322
|
+
def learn_one(self, x: Dict[str, float], y: Any) -> None:
|
|
323
|
+
"""
|
|
324
|
+
Update the model with one labelled instance.
|
|
325
|
+
|
|
326
|
+
This is where your AutoML logic lives:
|
|
327
|
+
- Update base learners
|
|
328
|
+
- Evaluate pipeline candidates
|
|
329
|
+
- Detect drift and adapt
|
|
330
|
+
- Explore new configurations
|
|
331
|
+
"""
|
|
332
|
+
self._update(x, y)
|
|
333
|
+
|
|
334
|
+
def reset(self) -> None:
|
|
335
|
+
"""Reset to initial untrained state (called before each run)."""
|
|
336
|
+
self._init_state()
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
#### Step 3 -- Register in `__init__.py`
|
|
340
|
+
|
|
341
|
+
```python
|
|
342
|
+
# samlb/framework/classification/__init__.py
|
|
343
|
+
|
|
344
|
+
from .my_method.model import MyStreamingAutoML
|
|
345
|
+
|
|
346
|
+
__all__ = [
|
|
347
|
+
"AutoStreamClassifier",
|
|
348
|
+
"AutoClass",
|
|
349
|
+
"EvolutionaryBaggingClassifier",
|
|
350
|
+
"OAMLClassifier",
|
|
351
|
+
"MyStreamingAutoML", # <-- add here
|
|
352
|
+
]
|
|
353
|
+
```
|
|
354
|
+
|
|
355
|
+
#### Step 4 -- Use available building blocks
|
|
356
|
+
|
|
357
|
+
SAMLB provides fast C++ algorithms and River's full ecosystem as building blocks:
|
|
358
|
+
|
|
359
|
+
```python
|
|
360
|
+
# C++ algorithms (fast, River-compatible)
|
|
361
|
+
from samlb.framework.base import (
|
|
362
|
+
CppNaiveBayes,
|
|
363
|
+
CppPerceptron,
|
|
364
|
+
CppLogisticRegression,
|
|
365
|
+
CppHoeffdingTreeClassifier,
|
|
366
|
+
CppKNNClassifier,
|
|
367
|
+
CppSGTClassifier,
|
|
368
|
+
)
|
|
369
|
+
|
|
370
|
+
# River preprocessing & drift detection
|
|
371
|
+
from river.preprocessing import MinMaxScaler, StandardScaler
|
|
372
|
+
from river.feature_selection import VarianceThreshold
|
|
373
|
+
from river.drift import ADWIN
|
|
374
|
+
|
|
375
|
+
# Compose a pipeline using River's | operator
|
|
376
|
+
pipeline = MinMaxScaler() | CppHoeffdingTreeClassifier(grace_period=200)
|
|
377
|
+
pipeline.predict_one(x)
|
|
378
|
+
pipeline.learn_one(x, y)
|
|
379
|
+
```
|
|
380
|
+
|
|
381
|
+
#### Step 5 -- Run it in the benchmark
|
|
382
|
+
|
|
383
|
+
```python
|
|
384
|
+
from samlb.benchmark import BenchmarkSuite
|
|
385
|
+
from samlb.framework.classification.my_method import MyStreamingAutoML
|
|
386
|
+
|
|
387
|
+
suite = BenchmarkSuite(
|
|
388
|
+
models={
|
|
389
|
+
"MyMethod": MyStreamingAutoML(seed=42),
|
|
390
|
+
},
|
|
391
|
+
datasets=["electricity", "covertype", "insects"],
|
|
392
|
+
task="classification",
|
|
393
|
+
n_runs=10,
|
|
394
|
+
)
|
|
395
|
+
suite.run()
|
|
396
|
+
suite.print_table()
|
|
397
|
+
```
|
|
398
|
+
|
|
399
|
+
#### Step 6 -- Add tests
|
|
400
|
+
|
|
401
|
+
```python
|
|
402
|
+
# tests/test_my_method.py
|
|
403
|
+
|
|
404
|
+
from samlb.framework.classification.my_method import MyStreamingAutoML
|
|
405
|
+
from samlb.datasets import stream
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
def test_predict_and_learn():
|
|
409
|
+
model = MyStreamingAutoML(seed=42)
|
|
410
|
+
for x, y in stream("electricity", task="classification", max_samples=500):
|
|
411
|
+
pred = model.predict_one(x)
|
|
412
|
+
model.learn_one(x, y)
|
|
413
|
+
assert pred is not None
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def test_reset():
|
|
417
|
+
model = MyStreamingAutoML(seed=42)
|
|
418
|
+
for x, y in stream("electricity", task="classification", max_samples=100):
|
|
419
|
+
model.learn_one(x, y)
|
|
420
|
+
model.reset()
|
|
421
|
+
# Should be back to untrained state
|
|
422
|
+
```
|
|
423
|
+
|
|
424
|
+
### Adding a New Dataset
|
|
425
|
+
|
|
426
|
+
1. Prepare your data as a NumPy NPZ file with this schema:
|
|
427
|
+
- `X` -- `float32` array of shape `(n_samples, n_features)`
|
|
428
|
+
- `y` -- `int32` (classification) or `float32` (regression) array of shape `(n_samples,)`
|
|
429
|
+
- `feature_names` -- string array of shape `(n_features,)`
|
|
430
|
+
- `target_name` -- string scalar
|
|
431
|
+
2. Place the `.npz` file in `samlb/datasets/classification/` or `samlb/datasets/regression/`
|
|
432
|
+
3. It will be automatically discovered by `list_datasets()` and `load()`
|
|
433
|
+
|
|
434
|
+
### PR Checklist
|
|
435
|
+
|
|
436
|
+
- [ ] Code passes `ruff check samlb/`
|
|
437
|
+
- [ ] Tests pass with `pytest tests/`
|
|
438
|
+
- [ ] New framework implements all 3 methods of `BaseStreamFramework`
|
|
439
|
+
- [ ] Include a brief description of the AutoML strategy
|
|
440
|
+
- [ ] Reference any papers if applicable
|
|
441
|
+
- [ ] Include benchmark results on at least 3 datasets
|
|
442
|
+
|
|
443
|
+
---
|
|
444
|
+
|
|
445
|
+
## Citation
|
|
446
|
+
|
|
447
|
+
If you use SAMLB in your research, please cite:
|
|
448
|
+
|
|
449
|
+
```bibtex
|
|
450
|
+
@software{samlb2024,
|
|
451
|
+
title = {SAMLB: Streaming AutoML Benchmark},
|
|
452
|
+
author = {Verma, Nilesh and Bifet, Albert and Pfahringer, Bernhard and Bahri, Maroua},
|
|
453
|
+
year = {2024},
|
|
454
|
+
url = {https://github.com/TechyNilesh/samlb}
|
|
455
|
+
}
|
|
456
|
+
```
|
|
457
|
+
|
|
458
|
+
## License
|
|
459
|
+
|
|
460
|
+
MIT License. See [LICENSE](LICENSE) for details.
|