bayesian-gp-cvloss 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bayesian_gp_cvloss-0.1.1/LICENSE +21 -0
- bayesian_gp_cvloss-0.1.1/PKG-INFO +234 -0
- bayesian_gp_cvloss-0.1.1/README.md +197 -0
- bayesian_gp_cvloss-0.1.1/bayesian_gp_cvloss/__init__.py +6 -0
- bayesian_gp_cvloss-0.1.1/bayesian_gp_cvloss/optimizer.py +287 -0
- bayesian_gp_cvloss-0.1.1/bayesian_gp_cvloss.egg-info/PKG-INFO +234 -0
- bayesian_gp_cvloss-0.1.1/bayesian_gp_cvloss.egg-info/SOURCES.txt +10 -0
- bayesian_gp_cvloss-0.1.1/bayesian_gp_cvloss.egg-info/dependency_links.txt +1 -0
- bayesian_gp_cvloss-0.1.1/bayesian_gp_cvloss.egg-info/requires.txt +5 -0
- bayesian_gp_cvloss-0.1.1/bayesian_gp_cvloss.egg-info/top_level.txt +1 -0
- bayesian_gp_cvloss-0.1.1/setup.cfg +4 -0
- bayesian_gp_cvloss-0.1.1/setup.py +34 -0
@@ -0,0 +1,21 @@ bayesian_gp_cvloss-0.1.1/LICENSE
MIT License

Copyright (c) 2024 Shifa Zhong

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -0,0 +1,234 @@ bayesian_gp_cvloss-0.1.1/PKG-INFO
Metadata-Version: 2.4
Name: bayesian-gp-cvloss
Version: 0.1.1
Summary: A Python package for Gaussian Process Regression with hyperparameter optimization using Hyperopt and cross-validation, focusing on optimizing cross-validated loss.
Home-page: https://github.com/Shifa-Zhong/bayesian-gp-cvloss
Author: Shifa Zhong
Author-email: sfzhong@tongji.edu.cn
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Science/Research
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Operating System :: OS Independent
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: gpflow>=2.0.0
Requires-Dist: hyperopt>=0.2.0
Requires-Dist: scikit-learn>=0.23.0
Requires-Dist: pandas>=1.0.0
Requires-Dist: numpy>=1.18.0
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: license-file
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary

(The long description that follows the metadata is the package README, verbatim identical to README.md; see the README.md hunk below.)
@@ -0,0 +1,197 @@ bayesian_gp_cvloss-0.1.1/README.md
# Bayesian GP CVLoss: Gaussian Process Regression with Cross-Validated Hyperparameter Optimization

[![PyPI version](https://badge.fury.io/py/bayesian-gp-cvloss.svg)](https://badge.fury.io/py/bayesian-gp-cvloss) <!-- Placeholder for PyPI badge -->

`bayesian_gp_cvloss` is a Python package designed to simplify the training of Gaussian Process (GP) models by finding optimal hyperparameters through Bayesian optimization (using Hyperopt) with k-fold cross-validation. The key feature of this package is its direct optimization of the cross-validated Root Mean Squared Error (RMSE), aligning the hyperparameter tuning process closely with the model's predictive performance.

This package is particularly useful for researchers and practitioners who want to apply GP models without manually tuning hyperparameters or relying solely on maximizing the marginal likelihood, offering a more direct route to good generalization on unseen data.

## Core Idea

The traditional approach to training GP models is to maximize the log marginal likelihood with respect to the model's hyperparameters. While effective, this does not always translate into the best predictive performance on unseen data, especially when the model assumptions are not perfectly met or when working with smaller datasets.

This library implements an alternative strategy (a code sketch follows the list):

1. **Define a search space** for the GP kernel parameters (e.g., lengthscales, kernel variance) and likelihood parameters (e.g., noise variance).
2. Use **Bayesian optimization (Hyperopt)** to intelligently search this space.
3. For each set of hyperparameters evaluated by Hyperopt, perform **k-fold cross-validation** on the training data.
4. The **objective function** for Hyperopt is the mean RMSE across these k folds.
5. The set of hyperparameters yielding the **minimum average cross-validated RMSE** is selected as optimal.
6. A final GP model is then **refitted on the entire training dataset** using these best-found hyperparameters.

This method directly targets the minimization of prediction error, which can be a more robust approach for many real-world regression tasks.
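The loop below is a minimal sketch of this strategy, assuming a hypothetical `fit_gp(params, X, y)` helper that builds a GP with the trial's hyperparameters held fixed; the helper and the `space` dictionary are illustrative, not part of this package's API:

```python
# Minimal sketch of a cross-validated RMSE objective for Hyperopt.
import numpy as np
from hyperopt import fmin, tpe, Trials, STATUS_OK
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

def cv_objective(params, X, y, n_splits=5):
    rmses = []
    for tr, va in KFold(n_splits=n_splits, shuffle=True, random_state=0).split(X):
        model = fit_gp(params, X[tr], y[tr])  # hypothetical helper: GP with fixed hyperparameters
        y_hat = model.predict(X[va])
        rmses.append(np.sqrt(mean_squared_error(y[va], y_hat)))
    # Hyperopt minimizes 'loss', i.e. the mean RMSE over the k validation folds
    return {'loss': np.mean(rmses), 'status': STATUS_OK}

# Hyperopt then searches the space for the lowest mean CV RMSE:
# best = fmin(fn=lambda p: cv_objective(p, X, y), space=space,
#             algo=tpe.suggest, max_evals=100, trials=Trials())
```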
## Features

* Automated hyperparameter optimization for GP models using Hyperopt.
* K-fold cross-validation integrated into the optimization loop, so the selected parameters generalize well.
* Directly optimizes the mean cross-validated RMSE.
* Supports several GPflow kernels out of the box (RBF, Matern32, Matern52, RationalQuadratic) and is easily extensible.
* Generates a data-dependent default hyperparameter search space from the target variable's statistics.
* Mean-centers the target variable internally for improved numerical stability.
* Simple API: provide your preprocessed numerical `X_train` and `y_train` data.

## Installation

```bash
# Coming soon to PyPI!
# pip install bayesian-gp-cvloss

# For now, you can install from source if you have the code:
# pip install .
```

## Dependencies

* gpflow >= 2.0.0
* hyperopt >= 0.2.0
* scikit-learn >= 0.23.0
* pandas >= 1.0.0
* numpy >= 1.18.0

Users are responsible for their own data preprocessing (e.g., encoding categorical features, feature scaling) before using this library. The optimizer expects purely numerical `X_train` and `y_train` inputs.
## Quick Start

```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from bayesian_gp_cvloss.optimizer import GPCrossValidatedOptimizer

# 0. (User Responsibility) Load and preprocess data.
# Example: assume you have X (features) and y (target) as pandas DataFrames/Series.
# Ensure X is purely numerical; all encoding and scaling is up to the user.

# Create some synthetic data for demonstration
np.random.seed(42)
N_train = 100
N_features = 3
X_synth = np.random.rand(N_train, N_features)
# A simple function for y with some noise
y_synth = np.sin(X_synth[:, 0] * 2 * np.pi) + X_synth[:, 1]**2 + np.random.randn(N_train) * 0.1

X_df = pd.DataFrame(X_synth, columns=[f'feature_{i}' for i in range(N_features)])
y_series = pd.Series(y_synth, name='target')

# Split data (optional, but good practice to keep a final test set).
# The optimizer runs its own cross-validation on the training portion.
X_train_data, X_test_data, y_train_data, y_test_data = train_test_split(
    X_df, y_series, test_size=0.2, random_state=42
)

# Scale features (example - choose scaling appropriate to your data)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_data)
X_test_scaled = scaler.transform(X_test_data)

y_train_np = y_train_data.values
y_test_np = y_test_data.values

# 1. Initialize the optimizer with the training data.
# A custom hyperopt_space can also be passed (see Customization below).
optimizer = GPCrossValidatedOptimizer(
    X_train_scaled, y_train_np,
    n_splits=5,
    random_state=42
)

# 2. Run the optimization. This searches for the hyperparameters with the
# lowest cross-validated RMSE and then refits a model on the full training set.
best_params = optimizer.optimize(max_evals=50)  # increase max_evals for better results

print(f"Best hyperparameters found: {optimizer.best_params}")
print(f"Best CV validation RMSE: {optimizer.trials.best_trial['result']['loss']}")
print(f"Best CV train RMSE: {optimizer.trials.best_trial['result']['train_loss']}")

# 3. Inspect the refitted model (optional). optimize() refits a GPR model on
# the full training data with the best hyperparameters and stores it as
# optimizer.best_model_.
# from gpflow.utilities import print_summary
# print_summary(optimizer.best_model_)

# 4. Make predictions. The input must be preprocessed the same way as the
# training features.
y_pred_test, y_pred_var_test = optimizer.predict(X_test_scaled)

# Evaluate (example)
rmse_test = np.sqrt(mean_squared_error(y_test_np, y_pred_test))
print(f"Test RMSE: {rmse_test}")

# Plot results (example)
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 6))
plt.scatter(y_test_np, y_pred_test, alpha=0.7, label='Test Predictions')
plt.plot([y_test_np.min(), y_test_np.max()], [y_test_np.min(), y_test_np.max()], 'r--', label='Ideal')
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title("GPR Predictions vs True Values on Test Set")
plt.legend()
plt.grid(True)
plt.show()
```
## How it Works Internally

1. **`__init__(X_train, y_train, hyperopt_space=None, n_splits=5, random_state=None)`**: Validates and stores the preprocessed training data, records `self.y_train_mean_` (the mean of `y_train`, used later for centering), and builds the Hyperopt search space. If `hyperopt_space` is not supplied, `_get_default_data_dependent_space()` generates one whose ranges are derived from the variance and standard deviation of `y_train`, giving sensible defaults for the kernel variance and likelihood noise.
2. **`optimize(max_evals=100, ...)`**:
    * Runs `hyperopt.fmin()` on the `_objective` function with the configured space, the `tpe.suggest` algorithm, `max_evals` trials, and a `hyperopt.Trials()` object (stored as `self.trials`).
    * Stores the best full parameter set as `self.best_params`; the corresponding cross-validated validation and training RMSEs are available via `self.trials.best_trial['result']['loss']` and `['train_loss']`.
    * Calls `refit_best_model()` to train a final GPR model on the full training data using `self.best_params`, then returns `self.best_params`.
3. **`_objective(params)`**:
    * This is the function minimized by Hyperopt; it receives a dictionary of hyperparameters for a single trial.
    * It performs k-fold cross-validation:
        * For each fold, it splits `X_train`, `y_train` into `X_train_fold`, `y_train_fold` and `X_val_fold`, `y_val_fold`.
        * **Important**: `y_train_fold` and `y_val_fold` are centered by subtracting the mean of the *current* `y_train_fold`. Because the same constant shifts both the targets and the model's predictions, the RMSE on the centered scale equals the RMSE on the original scale.
        * It constructs a GPflow GPR model from the hyperparameters in `params` and the current fold's training data, with all parameters held fixed (no gradient training).
        * It predicts on `X_val_fold` and computes the RMSE.
    * It averages the RMSEs across the validation folds.
    * It returns a dictionary of the form `{'loss': avg_cv_rmse, 'status': STATUS_OK, 'params': params, 'iteration': ..., 'train_loss': avg_train_rmse}`.
4. **`_get_default_data_dependent_space()`**: Defines the Hyperopt search space for each hyperparameter (a sketch of a generated space is shown after this list):
    * `lengthscales_<i>`: `hp.quniform` between 0.1 and 100 (step 0.01) for each input dimension.
    * `kernel_variance`: `hp.uniform` between 0 and `y_train.var()`.
    * `likelihood_noise_variance`: `hp.loguniform` between `(y_train.std()/100)**2` and `(y_train.std()/2)**2` (with safety checks for small or zero standard deviations).
    * `kernel_name`: `hp.choice` among the kernel names in the module-level `DEFAULT_KERNELS` dictionary.
5. **`refit_best_model()`**:
    * Trains a new GPflow GPR model using `self.best_params` on the *entire* training set, with the targets centered by `self.y_train_mean_`.
    * Stores this model as `self.best_model_`.
6. **`predict(X_new_processed)`**:
    * Takes new, preprocessed data with the same number of features as the training set.
    * Uses `self.best_model_` to predict the mean and variance.
    * Adds `self.y_train_mean_` back to the predicted mean, returning predictions on the original scale.
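For concreteness, here is a sketch of the default space generated for a two-feature dataset; the numeric values of `y_var` and `y_std` are illustrative stand-ins for the data-dependent quantities computed from `y_train`:

```python
import numpy as np
from hyperopt import hp

y_var, y_std = 4.0, 2.0  # illustrative values of np.var(y_train), np.std(y_train)
space = {
    # one quniform lengthscale per input dimension, 0.1 to 100 in steps of 0.01
    'lengthscales_0': hp.quniform('lengthscales_0', 0.1, 100, 0.01),
    'lengthscales_1': hp.quniform('lengthscales_1', 0.1, 100, 0.01),
    # kernel variance bounded by the target variance
    'kernel_variance': hp.uniform('kernel_variance', 0, y_var),
    # log-uniform noise variance between (std/100)^2 and (std/2)^2
    'likelihood_noise_variance': hp.loguniform(
        'likelihood_noise_variance',
        np.log((y_std / 100.0) ** 2),  # lower bound, clipped to >= 1e-9 in the code
        np.log((y_std / 2.0) ** 2),
    ),
    # kernel chosen by name from DEFAULT_KERNELS
    'kernel_name': hp.choice('kernel_name',
                             ['Matern32', 'Matern52', 'RBF', 'RationalQuadratic']),
}
```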
## Customization

* **Kernels**: The candidate kernels live in the module-level `DEFAULT_KERNELS` dictionary in `bayesian_gp_cvloss/optimizer.py`; the optimizer selects among its entries by name. Additional GPflow kernels can be added there (Matern12 and Exponential are present but commented out).
* **Hyperparameter Space**: While a data-dependent default space is provided, you can supply your own `hyperopt_space` dictionary to the constructor if you need finer control or different distributions (see the sketch below).
* **Cross-Validation**: Change `n_splits` and `random_state` in the constructor.
* **Hyperopt**: Adjust `max_evals`, `tpe_algo`, `early_stop_fn`, and `rstate_seed` in `optimize()`.
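As a sketch of a custom space (the distributions here are illustrative; the key names are the contract that `_objective` actually reads - `lengthscales_<i>`, `kernel_variance`, `likelihood_noise_variance`, and `kernel_name`, the last of which must resolve to an entry of `DEFAULT_KERNELS`):

```python
import numpy as np
from hyperopt import hp
from bayesian_gp_cvloss.optimizer import GPCrossValidatedOptimizer

# Custom space for a 3-feature problem with log-uniform lengthscales.
custom_space = {
    **{f'lengthscales_{i}': hp.loguniform(f'lengthscales_{i}',
                                          np.log(0.1), np.log(10.0))
       for i in range(3)},
    'kernel_variance': hp.loguniform('kernel_variance',
                                     np.log(1e-2), np.log(10.0)),
    'likelihood_noise_variance': hp.loguniform('likelihood_noise_variance',
                                               np.log(1e-6), np.log(1e-1)),
    'kernel_name': hp.choice('kernel_name', ['RBF', 'Matern52']),
}

# optimizer = GPCrossValidatedOptimizer(X_train_np, y_train_np,
#                                       hyperopt_space=custom_space,
#                                       n_splits=5, random_state=42)
```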
## Contributing

Contributions are welcome! If you have suggestions for improvements or find any issues, please open an issue or submit a pull request on the GitHub repository: https://github.com/Shifa-Zhong/bayesian-gp-cvloss

## License

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

## Author

Shifa Zhong (sfzhong@tongji.edu.cn)
GitHub: [Shifa-Zhong](https://github.com/Shifa-Zhong)
@@ -0,0 +1,287 @@ bayesian_gp_cvloss-0.1.1/bayesian_gp_cvloss/optimizer.py
import pandas as pd
import numpy as np
import gpflow
from gpflow.utilities import set_trainable
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
# StandardScaler and JamesSteinEncoder are no longer direct dependencies for the class
# but might be used by the user or the optional preprocessing utility.
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
import tensorflow as tf
import logging

# Configure basic logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Default GP kernels from GPflow - the search space selects among these by name
DEFAULT_KERNELS = {
    #"Matern12": gpflow.kernels.Matern12,
    "Matern32": gpflow.kernels.Matern32,
    "Matern52": gpflow.kernels.Matern52,
    "RBF": gpflow.kernels.RBF,
    "RationalQuadratic": gpflow.kernels.RationalQuadratic,
    #"Exponential": gpflow.kernels.Exponential
}

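# Illustrative note (not part of the original file): _objective() and
# refit_best_model() look kernels up in this dict by name, so additional
# GPflow kernels can be made searchable simply by adding entries, e.g.
#   DEFAULT_KERNELS["Matern12"] = gpflow.kernels.Matern12
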
class GPCrossValidatedOptimizer:
    """
    Optimizes hyperparameters for a Gaussian Process Regressor using Hyperopt
    with k-fold cross-validation, minimizing RMSE.
    Assumes input data X_train and y_train are already preprocessed (numerical and scaled).
    Generates a data-dependent default hyperparameter space if none is provided.
    """
    def __init__(self, X_train, y_train,
                 hyperopt_space=None, n_splits=5, random_state=None):
        """
        Args:
            X_train (pd.DataFrame or np.ndarray): The preprocessed training feature dataset.
            y_train (pd.Series or np.ndarray): The preprocessed training target variable.
            hyperopt_space (dict, optional): Hyperopt search space. If None, a data-dependent default space is generated.
            n_splits (int): Number of folds for KFold cross-validation.
            random_state (int, optional): Random seed for KFold and Hyperopt for reproducibility.
        """
        if not isinstance(X_train, (pd.DataFrame, np.ndarray)):
            raise ValueError("X_train must be a pandas DataFrame or NumPy ndarray.")
        if not isinstance(y_train, (pd.Series, np.ndarray)):
            raise ValueError("y_train must be a pandas Series or NumPy ndarray.")

        if isinstance(X_train, np.ndarray) and len(X_train.shape) != 2:
            raise ValueError("X_train as NumPy array must be 2D.")

        _y_data_internal = y_train.values if isinstance(y_train, pd.Series) else np.asarray(y_train)
        if len(_y_data_internal.shape) != 1 and not (len(_y_data_internal.shape) == 2 and _y_data_internal.shape[1] == 1):
            raise ValueError("y_train must be 1D or 2D with one column.")
        if len(_y_data_internal.shape) == 2:
            _y_data_internal = _y_data_internal.flatten()

        if X_train.shape[0] != _y_data_internal.shape[0]:
            raise ValueError("X_train and y_train must have the same number of samples.")

        self.X_train = X_train
        self.y_train_1d = _y_data_internal  # Stored as 1D for var/std calculations and as the mean-subtraction reference
        self.y_train_mean_ = np.mean(self.y_train_1d)  # Mean of the original y_train; added back in predict()
        self.n_splits = n_splits
        self.random_state = random_state

        self.num_features = X_train.shape[1]
        self.hyperopt_space = hyperopt_space if hyperopt_space is not None else self._get_default_data_dependent_space()
        self.trials = Trials()
        self.best_params = None
        self.best_model_ = None
        self._iteration_count = 0

    def _get_default_data_dependent_space(self):
        """Generates a data-dependent default hyperparameter search space."""
        logger.info("Generating data-dependent default hyperparameter space.")
        # Uses y_train_1d, i.e. y_train before any internal mean centering for the GP models
        space = {f'lengthscales_{i}': hp.quniform(f'lengthscales_{i}', 0.1, 100, 0.01) for i in range(self.num_features)}

        y_var = np.var(self.y_train_1d)
        y_std = np.std(self.y_train_1d)

        kernel_var_upper = float(y_var)
        kernel_var_lower = 0
        logger.info(f"Default kernel_variance range: ({kernel_var_lower:.2e}, {kernel_var_upper:.2e})")
        space['kernel_variance'] = hp.uniform('kernel_variance', kernel_var_lower, kernel_var_upper)

        if y_std > 1e-9:
            noise_var_lower_bound = (y_std / 100.0)**2
            noise_var_upper_bound = (y_std / 2.0)**2
        else:
            noise_var_lower_bound = 1e-9
            noise_var_upper_bound = 1e-2

        noise_var_lower_bound = max(1e-9, float(noise_var_lower_bound))
        noise_var_upper_bound = max(noise_var_lower_bound * 1.1 + 1e-9, float(noise_var_upper_bound))

        logger.info(f"Default likelihood_noise_variance loguniform range (effective): ({noise_var_lower_bound:.2e}, {noise_var_upper_bound:.2e})")
        space['likelihood_noise_variance'] = hp.loguniform(
            'likelihood_noise_variance',
            np.log(noise_var_lower_bound),
            np.log(noise_var_upper_bound)
        )

        space['kernel_name'] = hp.choice('kernel_name', list(DEFAULT_KERNELS.keys()))
        return space

    def _objective(self, params):
        self._iteration_count += 1
        iteration_num = self._iteration_count

        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
        fold_rmses = []
        fold_train_rmses = []

        kernel_name = params['kernel_name']
        selected_kernel_class = DEFAULT_KERNELS[kernel_name]

        kernel_hparams = {}
        try:
            lengthscales = np.array([params[f'lengthscales_{i}'] for i in range(self.num_features)], dtype=float)
        except KeyError as e:
            logger.error(f"Missing lengthscale param: {e}. Params: {params}")
            return {'loss': np.inf, 'status': STATUS_OK, 'params': params, 'iteration': iteration_num}

        if not lengthscales.shape[0] == self.num_features:
            logger.error(f"Lengthscale dimension mismatch. Expected {self.num_features}, got {lengthscales.shape[0]}")
            return {'loss': np.inf, 'status': STATUS_OK, 'params': params, 'iteration': iteration_num}

        kernel_hparams['lengthscales'] = lengthscales
        kernel_hparams['variance'] = float(params['kernel_variance'])
        current_noise_variance = float(params['likelihood_noise_variance'])

        X_data_for_cv = self.X_train.values if isinstance(self.X_train, pd.DataFrame) else self.X_train
        y_data_1d_for_cv = self.y_train_1d

        for fold_idx, (train_index, val_index) in enumerate(kf.split(X_data_for_cv)):
            X_train_fold, X_val_fold = X_data_for_cv[train_index], X_data_for_cv[val_index]
            y_train_fold_1d, y_val_fold_1d = y_data_1d_for_cv[train_index], y_data_1d_for_cv[val_index]

            # Mean centering for this fold, based on this fold's training targets
            current_fold_y_train_mean = np.mean(y_train_fold_1d)
            y_train_fold_centered = y_train_fold_1d - current_fold_y_train_mean
            y_val_fold_centered = y_val_fold_1d - current_fold_y_train_mean

            y_train_fold_2d = y_train_fold_centered.reshape(-1, 1)  # GPflow expects 2D targets

            try:
                fold_kernel = selected_kernel_class(**kernel_hparams)
                model = gpflow.models.GPR(data=(X_train_fold, y_train_fold_2d),
                                          kernel=fold_kernel,
                                          noise_variance=current_noise_variance)

                # Hyperopt supplies the hyperparameters, so nothing is trained by gradients
                set_trainable(model.kernel.variance, False)
                set_trainable(model.kernel.lengthscales, False)
                set_trainable(model.likelihood.variance, False)

                # Predictions are on the centered scale
                y_pred_val_centered, _ = model.predict_y(X_val_fold)
                y_pred_train_centered, _ = model.predict_y(X_train_fold)

                # RMSE is computed between the centered targets and the centered
                # predictions; since both are shifted by the same fold-train mean,
                # this equals the RMSE on the original scale.
                fold_rmse = np.sqrt(mean_squared_error(y_val_fold_centered, y_pred_val_centered.numpy()))
                fold_train_rmse = np.sqrt(mean_squared_error(y_train_fold_centered, y_pred_train_centered.numpy()))
                fold_rmses.append(fold_rmse)
                fold_train_rmses.append(fold_train_rmse)
            except Exception as e:
                logger.warning(f"Fold {fold_idx+1} error for params {params}: {e}. Assigning high loss.")
                fold_rmses.append(np.inf)
                fold_train_rmses.append(np.inf)
                break

        avg_cv_rmse = np.mean(fold_rmses) if fold_rmses and np.all(np.isfinite(fold_rmses)) else np.inf
        avg_train_rmse = np.mean(fold_train_rmses) if fold_train_rmses and np.all(np.isfinite(fold_train_rmses)) else np.inf

        ls_rounded = np.round(lengthscales, 2)
        logger.info(f"Iter: {iteration_num:>3} | CV RMSE: {avg_cv_rmse:<8.4f} | Train RMSE: {avg_train_rmse:<8.4f} | Kernel: {kernel_name} | Var: {kernel_hparams['variance']:.4f} | Noise: {current_noise_variance:.6f} | LS: {ls_rounded}")

        return {
            'loss': avg_cv_rmse,
            'status': STATUS_OK,
            'params': params,
            'iteration': iteration_num,
            'train_loss': avg_train_rmse
        }

    def optimize(self, max_evals=100, tpe_algo=tpe.suggest, early_stop_fn=None, rstate_seed=None):
        self._iteration_count = 0
        if rstate_seed is None and self.random_state is not None:
            rstate_seed = self.random_state

        rstate = np.random.default_rng(rstate_seed) if rstate_seed is not None else None

        self.best_params_raw_ = fmin(
            fn=self._objective,
            space=self.hyperopt_space,
            algo=tpe_algo,
            max_evals=max_evals,
            trials=self.trials,
            early_stop_fn=early_stop_fn,
            rstate=rstate
        )

        logger.info(f"Optimization finished. Best raw params from fmin: {self.best_params_raw_}")

        if self.trials.best_trial and 'result' in self.trials.best_trial and self.trials.best_trial['result']['status'] == STATUS_OK:
            self.best_params = self.trials.best_trial['result']['params']
            logger.info(f"Best full params from trials: {self.best_params}")
            logger.info(f"Best CV RMSE from trials: {self.trials.best_trial['result']['loss']}")
            logger.info(f"Best CV Train RMSE from trials: {self.trials.best_trial['result']['train_loss']}")
            self.refit_best_model()
        else:
            self.best_params = None
            logger.warning("Optimization did not yield a valid best trial. Model not refitted.")

        return self.best_params

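    # Usage note (illustrative, not part of the original file): optimize()
    # forwards early_stop_fn straight to hyperopt.fmin, so hyperopt's built-in
    # helper (available in hyperopt >= 0.2.4) works directly, e.g.
    #   from hyperopt.early_stop import no_progress_loss
    #   opt.optimize(max_evals=200, early_stop_fn=no_progress_loss(20))
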
    def refit_best_model(self):
        if not self.best_params:
            logger.warning("No valid best parameters. Cannot refit model.")
            self.best_model_ = None
            return None

        params_for_refit = self.best_params
        kernel_name = params_for_refit['kernel_name']
        selected_kernel_class = DEFAULT_KERNELS.get(kernel_name)

        if selected_kernel_class is None:
            logger.error(f"Kernel '{kernel_name}' not found. Cannot refit.")
            self.best_model_ = None
            return None

        kernel_hparams = {}
        lengthscales = np.array([params_for_refit[f'lengthscales_{i}'] for i in range(self.num_features)], dtype=float)
        kernel_hparams['lengthscales'] = lengthscales
        kernel_hparams['variance'] = float(params_for_refit['kernel_variance'])
        noise_var_refit = float(params_for_refit['likelihood_noise_variance'])

        X_data_refit = self.X_train.values if isinstance(self.X_train, pd.DataFrame) else self.X_train
        # For refitting, use the original y_train_1d and subtract its overall mean (self.y_train_mean_)
        y_train_centered_for_refit = (self.y_train_1d - self.y_train_mean_).reshape(-1, 1)

        try:
            best_kernel = selected_kernel_class(**kernel_hparams)
            self.best_model_ = gpflow.models.GPR(
                data=(X_data_refit, y_train_centered_for_refit),
                kernel=best_kernel,
                noise_variance=noise_var_refit
            )
            set_trainable(self.best_model_.kernel.variance, False)
            set_trainable(self.best_model_.kernel.lengthscales, False)
            set_trainable(self.best_model_.likelihood.variance, False)
            logger.info(f"Successfully refitted GPR model with params: {params_for_refit}")
        except Exception as e:
            logger.error(f"Error refitting model with params {params_for_refit}: {e}")
            self.best_model_ = None
        return self.best_model_

    def predict(self, X_new_processed):
        if self.best_model_ is None:
            logger.error("No best model. Run optimize() and ensure the refit was successful.")
            return None, None
        if not isinstance(X_new_processed, (pd.DataFrame, np.ndarray)):
            raise ValueError("X_new_processed must be a pd.DataFrame or np.ndarray.")

        if X_new_processed.shape[1] != self.num_features:
            raise ValueError(f"X_new has {X_new_processed.shape[1]} features, model expects {self.num_features}.")

        X_new_values = X_new_processed.values if isinstance(X_new_processed, pd.DataFrame) else X_new_processed

        try:
            pred_mean_centered, pred_var = self.best_model_.predict_y(X_new_values)
            # Add back the overall mean of the original y_train used for refitting
            pred_mean_original_scale = pred_mean_centered.numpy() + self.y_train_mean_
            return pred_mean_original_scale, pred_var.numpy()
        except Exception as e:
            logger.error(f"Error during prediction: {e}")
            return None, None

    def get_optimization_results(self):
        return self.trials

# End of GPCrossValidatedOptimizer class
@@ -0,0 +1,234 @@ bayesian_gp_cvloss-0.1.1/bayesian_gp_cvloss.egg-info/PKG-INFO
(Identical to the top-level PKG-INFO above: the same metadata followed by the README as the long description.)
@@ -0,0 +1,10 @@ bayesian_gp_cvloss-0.1.1/bayesian_gp_cvloss.egg-info/SOURCES.txt
LICENSE
README.md
setup.py
bayesian_gp_cvloss/__init__.py
bayesian_gp_cvloss/optimizer.py
bayesian_gp_cvloss.egg-info/PKG-INFO
bayesian_gp_cvloss.egg-info/SOURCES.txt
bayesian_gp_cvloss.egg-info/dependency_links.txt
bayesian_gp_cvloss.egg-info/requires.txt
bayesian_gp_cvloss.egg-info/top_level.txt
@@ -0,0 +1 @@ bayesian_gp_cvloss-0.1.1/bayesian_gp_cvloss.egg-info/dependency_links.txt
(a single blank line)
@@ -0,0 +1 @@ bayesian_gp_cvloss-0.1.1/bayesian_gp_cvloss.egg-info/top_level.txt
bayesian_gp_cvloss
@@ -0,0 +1,34 @@ bayesian_gp_cvloss-0.1.1/setup.py
from setuptools import setup, find_packages

setup(
    name='bayesian-gp-cvloss',
    version='0.1.1',
    author='Shifa Zhong',
    author_email='sfzhong@tongji.edu.cn',
    description='A Python package for Gaussian Process Regression with hyperparameter optimization using Hyperopt and cross-validation, focusing on optimizing cross-validated loss.',
    long_description=open('README.md', encoding='utf-8').read(),
    long_description_content_type='text/markdown',
    url='https://github.com/Shifa-Zhong/bayesian-gp-cvloss',
    packages=find_packages(),
    install_requires=[
        'gpflow>=2.0.0',
        'hyperopt>=0.2.0',
        'scikit-learn>=0.23.0',
        'pandas>=1.0.0',
        'numpy>=1.18.0',
    ],
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Science/Research',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Operating System :: OS Independent',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'Topic :: Software Development :: Libraries :: Python Modules',
    ],
    python_requires='>=3.8',
)