samlb 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. samlb-0.1.0/.gitignore +37 -0
  2. samlb-0.1.0/CMakeLists.txt +40 -0
  3. samlb-0.1.0/LICENSE +21 -0
  4. samlb-0.1.0/PKG-INFO +460 -0
  5. samlb-0.1.0/README.md +437 -0
  6. samlb-0.1.0/_cpp/bindings/pybind_module.cpp +192 -0
  7. samlb-0.1.0/_cpp/classification/efdt.cpp +26 -0
  8. samlb-0.1.0/_cpp/classification/efdt.h +19 -0
  9. samlb-0.1.0/_cpp/classification/hoeffding_tree.cpp +454 -0
  10. samlb-0.1.0/_cpp/classification/hoeffding_tree.h +63 -0
  11. samlb-0.1.0/_cpp/classification/knn_cls.cpp +74 -0
  12. samlb-0.1.0/_cpp/classification/knn_cls.h +19 -0
  13. samlb-0.1.0/_cpp/classification/logistic_regression.cpp +69 -0
  14. samlb-0.1.0/_cpp/classification/logistic_regression.h +27 -0
  15. samlb-0.1.0/_cpp/classification/naive_bayes.cpp +62 -0
  16. samlb-0.1.0/_cpp/classification/naive_bayes.h +21 -0
  17. samlb-0.1.0/_cpp/classification/passive_aggressive_cls.cpp +89 -0
  18. samlb-0.1.0/_cpp/classification/passive_aggressive_cls.h +26 -0
  19. samlb-0.1.0/_cpp/classification/perceptron.cpp +58 -0
  20. samlb-0.1.0/_cpp/classification/perceptron.h +23 -0
  21. samlb-0.1.0/_cpp/classification/sgt.cpp +268 -0
  22. samlb-0.1.0/_cpp/classification/sgt.h +59 -0
  23. samlb-0.1.0/_cpp/classification/softmax.cpp +63 -0
  24. samlb-0.1.0/_cpp/classification/softmax.h +25 -0
  25. samlb-0.1.0/_cpp/core/gaussian_estimator.h +39 -0
  26. samlb-0.1.0/_cpp/core/hoeffding_bound.h +13 -0
  27. samlb-0.1.0/_cpp/core/sliding_window.h +88 -0
  28. samlb-0.1.0/_cpp/regression/bayesian_linear_reg.cpp +48 -0
  29. samlb-0.1.0/_cpp/regression/bayesian_linear_reg.h +24 -0
  30. samlb-0.1.0/_cpp/regression/hoeffding_tree_reg.cpp +308 -0
  31. samlb-0.1.0/_cpp/regression/hoeffding_tree_reg.h +58 -0
  32. samlb-0.1.0/_cpp/regression/knn_reg.cpp +49 -0
  33. samlb-0.1.0/_cpp/regression/knn_reg.h +17 -0
  34. samlb-0.1.0/_cpp/regression/linear_regression.cpp +35 -0
  35. samlb-0.1.0/_cpp/regression/linear_regression.h +18 -0
  36. samlb-0.1.0/_cpp/regression/passive_aggressive_reg.cpp +35 -0
  37. samlb-0.1.0/_cpp/regression/passive_aggressive_reg.h +18 -0
  38. samlb-0.1.0/assets/samlb_logo.png +0 -0
  39. samlb-0.1.0/examples/run_benchmark.py +277 -0
  40. samlb-0.1.0/examples/run_regression.py +288 -0
  41. samlb-0.1.0/pyproject.toml +61 -0
  42. samlb-0.1.0/samlb/__init__.py +39 -0
  43. samlb-0.1.0/samlb/algorithms/__init__.py +8 -0
  44. samlb-0.1.0/samlb/algorithms/classification/__init__.py +29 -0
  45. samlb-0.1.0/samlb/algorithms/regression/__init__.py +21 -0
  46. samlb-0.1.0/samlb/benchmark/__init__.py +39 -0
  47. samlb-0.1.0/samlb/benchmark/suite.py +498 -0
  48. samlb-0.1.0/samlb/datasets/__init__.py +228 -0
  49. samlb-0.1.0/samlb/evaluation/__init__.py +29 -0
  50. samlb-0.1.0/samlb/evaluation/evaluator.py +256 -0
  51. samlb-0.1.0/samlb/evaluation/metrics.py +47 -0
  52. samlb-0.1.0/samlb/evaluation/results.py +189 -0
  53. samlb-0.1.0/samlb/framework/__init__.py +44 -0
  54. samlb-0.1.0/samlb/framework/base/__init__.py +28 -0
  55. samlb-0.1.0/samlb/framework/base/_cpp_wrappers.py +350 -0
  56. samlb-0.1.0/samlb/framework/base/_framework.py +55 -0
  57. samlb-0.1.0/samlb/framework/classification/__init__.py +12 -0
  58. samlb-0.1.0/samlb/framework/classification/asml/__init__.py +20 -0
  59. samlb-0.1.0/samlb/framework/classification/asml/config.py +43 -0
  60. samlb-0.1.0/samlb/framework/classification/asml/helper.py +51 -0
  61. samlb-0.1.0/samlb/framework/classification/asml/model.py +243 -0
  62. samlb-0.1.0/samlb/framework/classification/asml/search.py +184 -0
  63. samlb-0.1.0/samlb/framework/classification/autoclass/__init__.py +20 -0
  64. samlb-0.1.0/samlb/framework/classification/autoclass/config.py +16 -0
  65. samlb-0.1.0/samlb/framework/classification/autoclass/model.py +201 -0
  66. samlb-0.1.0/samlb/framework/classification/eaml/__init__.py +20 -0
  67. samlb-0.1.0/samlb/framework/classification/eaml/config.py +13 -0
  68. samlb-0.1.0/samlb/framework/classification/eaml/model.py +182 -0
  69. samlb-0.1.0/samlb/framework/classification/oaml/__init__.py +21 -0
  70. samlb-0.1.0/samlb/framework/classification/oaml/config.py +11 -0
  71. samlb-0.1.0/samlb/framework/classification/oaml/model.py +208 -0
  72. samlb-0.1.0/samlb/framework/classification/shared_config.py +254 -0
  73. samlb-0.1.0/samlb/framework/regression/__init__.py +6 -0
  74. samlb-0.1.0/samlb/framework/regression/asml/__init__.py +20 -0
  75. samlb-0.1.0/samlb/framework/regression/asml/config.py +94 -0
  76. samlb-0.1.0/samlb/framework/regression/asml/helper.py +50 -0
  77. samlb-0.1.0/samlb/framework/regression/asml/model.py +313 -0
  78. samlb-0.1.0/samlb/framework/regression/asml/search.py +162 -0
  79. samlb-0.1.0/samlb/framework/regression/chacha/__init__.py +16 -0
  80. samlb-0.1.0/samlb/framework/regression/chacha/model.py +140 -0
  81. samlb-0.1.0/samlb/framework/regression/eaml/__init__.py +20 -0
  82. samlb-0.1.0/samlb/framework/regression/eaml/config.py +53 -0
  83. samlb-0.1.0/samlb/framework/regression/eaml/model.py +157 -0
  84. samlb-0.1.0/tests/__init__.py +0 -0
  85. samlb-0.1.0/tests/conftest.py +137 -0
  86. samlb-0.1.0/tests/test_benchmark_core_fixes.py +101 -0
  87. samlb-0.1.0/tests/test_benchmark_progress.py +63 -0
  88. samlb-0.1.0/tests/test_benchmark_suite_results_api.py +65 -0
  89. samlb-0.1.0/tests/test_classification.py +324 -0
  90. samlb-0.1.0/tests/test_regression.py +253 -0
  91. samlb-0.1.0/tests/test_regression_chacha.py +70 -0
  92. samlb-0.1.0/tests/test_run_benchmark_parallel_utils.py +97 -0
  93. samlb-0.1.0/uv.lock +2275 -0
samlb-0.1.0/.gitignore ADDED
@@ -0,0 +1,37 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.so
5
+ *.egg-info/
6
+ *.egg
7
+ dist/
8
+ build/
9
+ *.whl
10
+
11
+ # Virtual environments
12
+ .venv/
13
+ venv/
14
+ env/
15
+
16
+ # IDE
17
+ .vscode/
18
+ .idea/
19
+ *.swp
20
+ *.swo
21
+
22
+ # OS
23
+ .DS_Store
24
+ Thumbs.db
25
+
26
+ # Testing
27
+ .pytest_cache/
28
+ htmlcov/
29
+ .coverage
30
+
31
+ # Build artifacts
32
+ *.o
33
+ *.a
34
+ CMakeCache.txt
35
+ CMakeFiles/
36
+ cmake_install.cmake
37
+ Makefile
samlb-0.1.0/CMakeLists.txt ADDED
@@ -0,0 +1,40 @@
1
+ cmake_minimum_required(VERSION 3.17)
2
+ project(samlb_cpp LANGUAGES CXX)
3
+
4
+ set(CMAKE_CXX_STANDARD 17)
5
+ set(CMAKE_CXX_STANDARD_REQUIRED ON)
6
+
7
+ # Release optimisations
8
+ if(NOT CMAKE_BUILD_TYPE)
9
+ set(CMAKE_BUILD_TYPE Release)
10
+ endif()
11
+ add_compile_options(-O3 -march=native -ffast-math)
12
+
13
+ # pybind11
14
+ find_package(pybind11 CONFIG REQUIRED)
15
+
16
+ # Collect all C++ sources
17
+ set(CPP_SOURCES
18
+ _cpp/classification/naive_bayes.cpp
19
+ _cpp/classification/perceptron.cpp
20
+ _cpp/classification/logistic_regression.cpp
21
+ _cpp/classification/passive_aggressive_cls.cpp
22
+ _cpp/classification/softmax.cpp
23
+ _cpp/classification/knn_cls.cpp
24
+ _cpp/classification/hoeffding_tree.cpp
25
+ _cpp/classification/efdt.cpp
26
+ _cpp/classification/sgt.cpp
27
+ _cpp/regression/linear_regression.cpp
28
+ _cpp/regression/bayesian_linear_reg.cpp
29
+ _cpp/regression/passive_aggressive_reg.cpp
30
+ _cpp/regression/knn_reg.cpp
31
+ _cpp/regression/hoeffding_tree_reg.cpp
32
+ _cpp/bindings/pybind_module.cpp
33
+ )
34
+
35
+ pybind11_add_module(_samlb_core ${CPP_SOURCES})
36
+
37
+ target_include_directories(_samlb_core PRIVATE _cpp/core _cpp)
38
+
39
+ # Place .so next to samlb package so `import samlb` finds it
40
+ install(TARGETS _samlb_core DESTINATION samlb)
samlb-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 SAMLB Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
samlb-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,460 @@
1
+ Metadata-Version: 2.2
2
+ Name: samlb
3
+ Version: 0.1.0
4
+ Summary: Streaming AutoML Benchmark — fast C++ base algorithms + unified AutoML frameworks
5
+ Keywords: streaming,automl,benchmark,machine-learning,data-streams
6
+ Author: SAMLB Contributors
7
+ License: MIT
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: C++
10
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
11
+ Requires-Python: >=3.9
12
+ Requires-Dist: river>=0.21
13
+ Requires-Dist: scikit-learn>=1.3
14
+ Requires-Dist: numpy>=1.24
15
+ Requires-Dist: openml>=0.15.1
16
+ Provides-Extra: vw
17
+ Requires-Dist: flaml[vw]>=2.3; extra == "vw"
18
+ Provides-Extra: dev
19
+ Requires-Dist: pytest>=7.0; extra == "dev"
20
+ Requires-Dist: pytest-benchmark; extra == "dev"
21
+ Requires-Dist: ruff; extra == "dev"
22
+ Description-Content-Type: text/markdown
23
+
24
+ <p align="center">
25
+ <img src="https://raw.githubusercontent.com/TechyNilesh/samlb/main/assets/samlb_logo.png" alt="SAMLB Logo" width="400">
26
+ </p>
27
+ <p align="center">A unified benchmark framework for evaluating AutoML systems on data streams with fast C++ base algorithms and rigorous prequential evaluation.</p>
28
+
29
+ <p align="center">
30
+ <img src="https://img.shields.io/badge/Python-3.9%2B-blue.svg" alt="Python">
31
+ <img src="https://img.shields.io/pypi/v/samlb.svg" alt="PyPI">
32
+ <img src="https://img.shields.io/pypi/dm/samlb?color=green&label=Downloads" alt="Downloads">
33
+ <img src="https://img.shields.io/badge/License-MIT-green.svg" alt="License">
34
+ </p>
35
+
36
+ ---
37
+
38
+ ## Why SAMLB?
39
+
40
+ Streaming AutoML methods are hard to compare fairly. Different papers use different datasets, evaluation protocols, and algorithm pools. **SAMLB** solves this by providing:
41
+
42
+ - **Fast C++ base algorithms** with River-compatible Python interfaces (Naive Bayes, Hoeffding Trees, KNN, Perceptron, Logistic Regression, and more)
43
+ - **Framework-agnostic benchmarking** -- plug in any streaming AutoML approach by implementing just 3 methods (`predict_one`, `learn_one`, `reset`)
44
+ - **Standardized prequential evaluation** (test-then-train) with windowed metric snapshots for learning curves
45
+ - **30 curated datasets** (15 classification + 15 regression) spanning real-world and synthetic drift scenarios
46
+ - **Parallel execution** for large-scale experiments across multiple seeds
47
+
48
+ ## Installation
49
+
50
+ ### From PyPI
51
+
52
+ ```bash
53
+ pip install samlb
54
+ ```
55
+
56
+ ### From source
57
+
58
+ ```bash
59
+ git clone https://github.com/TechyNilesh/samlb.git
60
+ cd samlb
61
+ pip install -e ".[dev]"
62
+ ```
63
+
64
+ ### Optional: Vowpal Wabbit support (for ChaCha regressor)
65
+
66
+ ```bash
67
+ pip install "samlb[vw]"
68
+ ```
69
+
70
+ > **Requirements:** Python >= 3.9, a C++ compiler (for the native extension), CMake
71
+
72
+ ## Quick Start
73
+
74
+ ### Python API
75
+
76
+ ```python
77
+ from samlb.benchmark import BenchmarkSuite
78
+ from samlb.framework.classification.asml import AutoStreamClassifier
79
+ from samlb.framework.classification.eaml import EvolutionaryBaggingClassifier
80
+
81
+ suite = BenchmarkSuite(
82
+ models={
83
+ "ASML": AutoStreamClassifier(seed=42),
84
+ "EvoAutoML": EvolutionaryBaggingClassifier(seed=42),
85
+ },
86
+ datasets=["electricity", "covertype"],
87
+ task="classification",
88
+ n_runs=10,
89
+ window_size=1000,
90
+ )
91
+ suite.run()
92
+ suite.print_table()
93
+ suite.to_csv("results/classification.csv")
94
+ ```
95
+
96
+ ### Dataset Streaming
97
+
98
+ ```python
99
+ from samlb.datasets import stream, list_datasets
100
+
101
+ # See all available datasets
102
+ print(list_datasets("classification"))
103
+ print(list_datasets("regression"))
104
+
105
+ # Stream instance by instance
106
+ for x, y in stream("electricity", task="classification"):
107
+ pred = model.predict_one(x)
108
+ model.learn_one(x, y)
109
+ ```
110
+
111
+ ### CLI
112
+
113
+ ```bash
114
+ # Full classification benchmark (4 frameworks x 15 datasets x 10 runs)
115
+ python examples/run_benchmark.py
116
+
117
+ # Custom subset
118
+ python examples/run_benchmark.py --n_runs 5 --max_samples 50000 --datasets electricity covertype
119
+
120
+ # Parallel execution across CPU cores
121
+ python examples/run_benchmark.py --n_runs 100 --parallel --cpu_utilization 0.8
122
+
123
+ # Regression benchmark
124
+ python examples/run_regression.py
125
+ python examples/run_regression.py --n_runs 5 --datasets bike california_housing
126
+ ```
127
+
128
+ ## Included Frameworks
129
+
130
+ ### Classification
131
+
132
+ | Framework | Strategy | Key Features |
133
+ |-----------|----------|--------------|
134
+ | **ASML** | Adaptive Random Drift Nearby Search | ADWIN drift detection, recency-weighted ensemble, adaptive budget |
135
+ | **AutoClass** | Genetic Algorithm + Meta-Regressor | Fitness-proportionate selection, ARF surrogate for HP mutation |
136
+ | **EvoAutoML** | Evolutionary Bagging | Population-based, tournament selection, Poisson(6) sampling |
137
+ | **OAML** | Drift-triggered Random Search | EDDM drift detector, warm-up phase, random search |
138
+
139
+ ### Regression
140
+
141
+ | Framework | Strategy | Key Features |
142
+ |-----------|----------|--------------|
143
+ | **ASML** | Adaptive Random Drift Nearby Search | Online target normalization (Welford), prediction clipping |
144
+ | **ChaCha** | FLAML AutoVW | Vowpal Wabbit online HPO, progressive validation loss |
145
+ | **EvoAutoML** | Evolutionary Bagging | Population-based ensemble, mutation-driven search |
146
+
147
+ ## C++ Base Algorithms
148
+
149
+ All base learners are implemented in C++ for speed and wrapped with River-compatible interfaces:
150
+
151
+ **Classification:** Naive Bayes, Perceptron, Logistic Regression, Passive Aggressive, Softmax Regression, KNN, Hoeffding Tree, EFDT, SGT
152
+
153
+ **Regression:** Linear Regression, Bayesian Linear Regression, Passive Aggressive, Hoeffding Tree, KNN
154
+
155
+ **Preprocessing (via River):** MinMaxScaler, StandardScaler, MaxAbsScaler, VarianceThreshold, SelectKBest
156
+
157
+ ## Evaluation Methodology
158
+
159
+ SAMLB uses **prequential evaluation** (test-then-train):
160
+
161
+ 1. For each instance in the stream:
162
+ - **Predict** -- get the model's prediction *before* seeing the label
163
+ - **Evaluate** -- score the prediction against the true label
164
+ - **Learn** -- update the model with the labelled instance
165
+ 2. Metrics are captured at configurable window intervals for learning curve analysis
166
+ 3. Runtime is sampled per-instance for performance profiling
167
+
168
+ **Classification metrics:** Accuracy, Macro-F1, Macro-Precision, Macro-Recall
169
+
170
+ **Regression metrics:** MAE, RMSE, R^2
171
+
172
+ ## Datasets
173
+
174
+ ### Classification (15 datasets -- 2.5M+ total instances)
175
+
176
+ | Dataset | Samples | Features | Classes | Type | Description |
177
+ |---------|--------:|---------:|--------:|------|-------------|
178
+ | `adult` | 48,842 | 14 | 4 | Real | Income prediction (Census) |
179
+ | `covertype` | 100,000 | 54 | 7 | Real | Forest cover type (cartographic) |
180
+ | `credit_card` | 284,807 | 30 | 2 | Real | Credit card fraud detection |
181
+ | `electricity` | 45,312 | 8 | 2 | Real | Electricity price direction (NSW, Australia) |
182
+ | `insects` | 52,848 | 33 | 6 | Real | Insect species with concept drift |
183
+ | `new_airlines` | 539,383 | 7 | 2 | Real | Flight delay prediction |
184
+ | `nomao` | 34,465 | 118 | 2 | Real | Nomao place deduplication |
185
+ | `poker_hand` | 1,025,009 | 10 | 10 | Real | Poker hand classification |
186
+ | `shuttle` | 58,000 | 9 | 7 | Real | NASA Space Shuttle radiator |
187
+ | `vehicle_sensIT` | 98,528 | 100 | 3 | Real | Vehicle type from seismic sensors |
188
+ | `movingRBF` | 200,000 | 10 | 5 | Synthetic | Moving radial basis functions |
189
+ | `moving_squares` | 200,000 | 2 | 4 | Synthetic | Moving class boundaries |
190
+ | `sea_high_abrupt_drift` | 500,000 | 3 | 2 | Synthetic | SEA generator with abrupt drift |
191
+ | `synth_RandomRBFDrift` | 100,000 | 4 | 4 | Synthetic | RBF generator with gradual drift |
192
+ | `synth_agrawal` | 100,000 | 9 | 2 | Synthetic | Agrawal generator |
193
+
194
+ ### Regression (15 datasets -- ~1M total instances)
195
+
196
+ | Dataset | Samples | Features | Type | Description |
197
+ |---------|--------:|---------:|------|-------------|
198
+ | `ailerons` | 13,750 | 40 | Real | Aircraft control surface deflection |
199
+ | `bike` | 17,379 | 12 | Real | Bike sharing hourly demand |
200
+ | `california_housing` | 20,640 | 8 | Real | California median house values |
201
+ | `cps88wages` | 28,155 | 6 | Real | Wage prediction (CPS 1988) |
202
+ | `diamonds` | 53,940 | 9 | Real | Diamond price prediction |
203
+ | `elevators` | 16,599 | 18 | Real | Aircraft elevator control |
204
+ | `fifa` | 19,178 | 28 | Real | FIFA player overall rating |
205
+ | `House8L` | 22,784 | 8 | Real | House price (8-feature variant) |
206
+ | `kings_county` | 21,613 | 21 | Real | King County house sales price |
207
+ | `MetroTraffic` | 48,204 | 7 | Real | Interstate traffic volume (Minneapolis) |
208
+ | `superconductivity` | 21,263 | 81 | Real | Superconductor critical temperature |
209
+ | `wave_energy` | 72,000 | 48 | Real | Wave energy converter power output |
210
+ | `fried` | 40,768 | 10 | Synthetic | Friedman function |
211
+ | `FriedmanGra` | 100,000 | 10 | Synthetic | Friedman with gradual drift |
212
+ | `hyperA` | 500,000 | 10 | Synthetic | Hyperplane with drift |
213
+
214
+ ## Output Formats
215
+
216
+ ```
217
+ results/
218
+ classification_10runs.csv # Flat CSV: one row per (framework x dataset x run)
219
+ aggregate.json # Aggregated mean +/- std across runs
220
+ ASML_electricity_seed0.json # Per-run JSON with full learning curves
221
+ ```
222
+
223
+ ## Project Structure
224
+
225
+ ```
226
+ .
227
+ ├── pyproject.toml # Package metadata & build config
228
+ ├── CMakeLists.txt # C++ build configuration
229
+ ├── LICENSE # MIT License
230
+ ├── README.md # This file
231
+ ├── _cpp/ # C++ source (9 classifiers, 5 regressors)
232
+ │ ├── classification/
233
+ │ ├── regression/
234
+ │ ├── core/ # Shared headers
235
+ │ └── bindings/ # PyBind11 module
236
+ ├── samlb/ # Python package
237
+ │ ├── __init__.py # Version: 0.1.0
238
+ │ ├── algorithms/ # C++ algorithm Python bindings
239
+ │ ├── benchmark/ # BenchmarkSuite orchestrator
240
+ │ ├── evaluation/ # PrequentialEvaluator, metrics, results
241
+ │ ├── datasets/ # 30 datasets (15 clf + 15 reg NPZ files)
242
+ │ └── framework/ # AutoML framework implementations
243
+ │ ├── base/ # BaseStreamFramework + C++ wrappers
244
+ │ ├── classification/ # ASML, AutoClass, EvoAutoML, OAML
245
+ │ └── regression/ # ASML, ChaCha, EvoAutoML
246
+ ├── tests/ # Test suite
247
+ └── examples/ # Benchmark runner scripts
248
+ ├── run_benchmark.py # Classification benchmark CLI
249
+ └── run_regression.py # Regression benchmark CLI
250
+ ```
251
+
252
+ ---
253
+
254
+ ## Contributing
255
+
256
+ We welcome contributions, whether you are adding a new AutoML framework, contributing new datasets, or fixing bugs.
257
+
258
+ ### Development Setup
259
+
260
+ ```bash
261
+ git clone https://github.com/TechyNilesh/samlb.git
262
+ cd samlb
263
+ pip install -e ".[dev]"
264
+ ```
265
+
266
+ ### Running Tests
267
+
268
+ ```bash
269
+ pytest tests/
270
+ ```
271
+
272
+ ### Code Style
273
+
274
+ ```bash
275
+ ruff check samlb/
276
+ ruff format samlb/
277
+ ```
278
+
279
+ ---
280
+
281
+ ### Adding a New Streaming AutoML Framework
282
+
283
+ This is the primary way to contribute. Every framework in SAMLB implements the same 3-method interface, making it easy to add your own.
284
+
285
+ #### Step 1 -- Create your framework directory
286
+
287
+ ```
288
+ samlb/framework/classification/my_method/ # (or regression/)
289
+ __init__.py
290
+ model.py
291
+ config.py # optional: search space / hyperparameter config
292
+ ```
293
+
294
+ #### Step 2 -- Implement `BaseStreamFramework`
295
+
296
+ ```python
297
+ # samlb/framework/classification/my_method/model.py
298
+
299
+ from __future__ import annotations
300
+ from typing import Any, Dict
301
+ from samlb.framework.base import BaseStreamFramework
302
+
303
+
304
+ class MyStreamingAutoML(BaseStreamFramework):
305
+ """My new streaming AutoML method."""
306
+
307
+ def __init__(self, seed: int = 42, exploration_window: int = 1000, budget: int = 10):
308
+ self.seed = seed
309
+ self.exploration_window = exploration_window
310
+ self.budget = budget
311
+ self._init_state()
312
+
313
+ def predict_one(self, x: Dict[str, float]) -> Any:
314
+ """
315
+ Return prediction for one instance BEFORE learning.
316
+
317
+ x : dict mapping feature_name -> float value
318
+ Returns: class label (int) for classification, value (float) for regression
319
+ """
320
+ return self._current_model_predict(x)
321
+
322
+ def learn_one(self, x: Dict[str, float], y: Any) -> None:
323
+ """
324
+ Update the model with one labelled instance.
325
+
326
+ This is where your AutoML logic lives:
327
+ - Update base learners
328
+ - Evaluate pipeline candidates
329
+ - Detect drift and adapt
330
+ - Explore new configurations
331
+ """
332
+ self._update(x, y)
333
+
334
+ def reset(self) -> None:
335
+ """Reset to initial untrained state (called before each run)."""
336
+ self._init_state()
337
+ ```
338
+
339
+ #### Step 3 -- Register in `__init__.py`
340
+
341
+ ```python
342
+ # samlb/framework/classification/__init__.py
343
+
344
+ from .my_method.model import MyStreamingAutoML
345
+
346
+ __all__ = [
347
+ "AutoStreamClassifier",
348
+ "AutoClass",
349
+ "EvolutionaryBaggingClassifier",
350
+ "OAMLClassifier",
351
+ "MyStreamingAutoML", # <-- add here
352
+ ]
353
+ ```
354
+
355
+ #### Step 4 -- Use available building blocks
356
+
357
+ SAMLB provides fast C++ algorithms and River's full ecosystem as building blocks:
358
+
359
+ ```python
360
+ # C++ algorithms (fast, River-compatible)
361
+ from samlb.framework.base import (
362
+ CppNaiveBayes,
363
+ CppPerceptron,
364
+ CppLogisticRegression,
365
+ CppHoeffdingTreeClassifier,
366
+ CppKNNClassifier,
367
+ CppSGTClassifier,
368
+ )
369
+
370
+ # River preprocessing & drift detection
371
+ from river.preprocessing import MinMaxScaler, StandardScaler
372
+ from river.feature_selection import VarianceThreshold
373
+ from river.drift import ADWIN
374
+
375
+ # Compose a pipeline using River's | operator
376
+ pipeline = MinMaxScaler() | CppHoeffdingTreeClassifier(grace_period=200)
377
+ pipeline.predict_one(x)
378
+ pipeline.learn_one(x, y)
379
+ ```
380
+
381
+ #### Step 5 -- Run it in the benchmark
382
+
383
+ ```python
384
+ from samlb.benchmark import BenchmarkSuite
385
+ from samlb.framework.classification.my_method import MyStreamingAutoML
386
+
387
+ suite = BenchmarkSuite(
388
+ models={
389
+ "MyMethod": MyStreamingAutoML(seed=42),
390
+ },
391
+ datasets=["electricity", "covertype", "insects"],
392
+ task="classification",
393
+ n_runs=10,
394
+ )
395
+ suite.run()
396
+ suite.print_table()
397
+ ```
398
+
399
+ #### Step 6 -- Add tests
400
+
401
+ ```python
402
+ # tests/test_my_method.py
403
+
404
+ from samlb.framework.classification.my_method import MyStreamingAutoML
405
+ from samlb.datasets import stream
406
+
407
+
408
+ def test_predict_and_learn():
409
+ model = MyStreamingAutoML(seed=42)
410
+ for x, y in stream("electricity", task="classification", max_samples=500):
411
+ pred = model.predict_one(x)
412
+ model.learn_one(x, y)
413
+ assert pred is not None
414
+
415
+
416
+ def test_reset():
417
+ model = MyStreamingAutoML(seed=42)
418
+ for x, y in stream("electricity", task="classification", max_samples=100):
419
+ model.learn_one(x, y)
420
+ model.reset()
421
+ # Should be back to untrained state
422
+ ```
423
+
424
+ ### Adding a New Dataset
425
+
426
+ 1. Prepare your data as a NumPy NPZ file with this schema:
427
+ - `X` -- `float32` array of shape `(n_samples, n_features)`
428
+ - `y` -- `int32` (classification) or `float32` (regression) array of shape `(n_samples,)`
429
+ - `feature_names` -- string array of shape `(n_features,)`
430
+ - `target_name` -- string scalar
431
+ 2. Place the `.npz` file in `samlb/datasets/classification/` or `samlb/datasets/regression/`
432
+ 3. It will be automatically discovered by `list_datasets()` and `load()`
433
+
434
+ ### PR Checklist
435
+
436
+ - [ ] Code passes `ruff check samlb/`
437
+ - [ ] Tests pass with `pytest tests/`
438
+ - [ ] New framework implements all 3 methods of `BaseStreamFramework`
439
+ - [ ] Include a brief description of the AutoML strategy
440
+ - [ ] Reference any papers if applicable
441
+ - [ ] Include benchmark results on at least 3 datasets
442
+
443
+ ---
444
+
445
+ ## Citation
446
+
447
+ If you use SAMLB in your research, please cite:
448
+
449
+ ```bibtex
450
+ @software{samlb2026,
451
+ title = {SAMLB: Streaming AutoML Benchmark},
452
+ author = {Verma, Nilesh and Bifet, Albert and Pfahringer, Bernhard and Bahri, Maroua},
453
+ year = {2026},
454
+ url = {https://github.com/TechyNilesh/samlb}
455
+ }
456
+ ```
457
+
458
+ ## License
459
+
460
+ MIT License. See [LICENSE](LICENSE) for details.