sciml 0.0.7__py3-none-any.whl → 0.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sciml/__init__.py CHANGED
@@ -1,2 +1,2 @@
1
- # coding: utf-8
2
- __all__ = ["utils", "pipelines"]
1
+ # coding: utf-8
2
+ __all__ = ["utils", "pipelines", "models"]
sciml/models.py ADDED
@@ -0,0 +1,277 @@
1
+ import numpy as np
2
+ import copy
3
+ import itertools
4
+ import warnings
5
+ from xgboost import XGBRegressor
6
+ from sklearn.metrics import mean_squared_error
7
+ from sklearn.model_selection import train_test_split
8
+
9
+ class SmartForest:
10
+ """
11
+ SmartForest: A deep, intelligent decision forest model for complex sequential and tabular data.
12
+
13
+ SmartForest blends ideas from deep forests (cascade forest structures), LSTM-style forget gates,
14
+ and ensemble learning using XGBoost. It is especially suited for time series or structured tabular data
15
+ where layer-wise feature expansion and memory-inspired filtering can enhance performance.
16
+
17
+ Key Features:
18
+ -------------
19
+ - Deep cascade of XGBoost regressors
20
+ - Optional Multi-Grained Scanning (MGS) for local feature extraction
21
+ - Forget-gate-inspired mechanism to regulate information flow across layers
22
+ - Early stopping to prevent overfitting
23
+ - Full retention of best-performing model (lowest validation RMSE)
24
+
25
+ Parameters:
26
+ -----------
27
+ n_estimators_per_layer : int
28
+ Number of XGBoost regressors per layer.
29
+
30
+ max_layers : int
31
+ Maximum number of layers (depth) in the model.
32
+
33
+ early_stopping_rounds : int
34
+ Number of layers with no improvement before early stopping is triggered.
35
+
36
+ param_grid : dict
37
+ Grid of XGBoost hyperparameters to search over.
38
+
39
+ use_gpu : bool
40
+ If True, use GPU-accelerated training (CUDA required).
41
+
42
+ gpu_id : int
43
+ ID of GPU to use (if use_gpu=True).
44
+
45
+ window_sizes : list of int
46
+ Enables Multi-Grained Scanning if non-empty, with specified sliding window sizes.
47
+
48
+ forget_factor : float in [0, 1]
49
+ Simulates LSTM-style forget gate; higher values forget more past information.
50
+
51
+ verbose : int
52
+ Verbosity level (0 = silent, 1 = progress updates).
53
+
54
+ Methods:
55
+ --------
56
+ fit(X, y, X_val=None, y_val=None):
57
+ Train the SmartForest model layer by layer, using optional validation for early stopping.
58
+
59
+ predict(X):
60
+ Make predictions on new data using the trained cascade structure.
61
+
62
+ get_best_model():
63
+ Returns a copy of the best model and the corresponding RMSE from validation.
64
+
65
+ Example:
66
+ --------
67
+ >>> model = SmartForest(n_estimators_per_layer=5, max_layers=10, window_sizes=[2, 3], forget_factor=0.2)
68
+ >>> model.fit(X_train, y_train, X_val, y_val)
69
+ >>> y_pred = model.predict(X_val)
70
+ >>> best_model, best_rmse = model.get_best_model()
71
+ """
72
+ def __init__(self, n_estimators_per_layer = 5, max_layers = 10, early_stopping_rounds = 3, param_grid = None,
73
+ use_gpu = False, gpu_id = 0, window_sizes = None, forget_factor = 0, verbose = 1):
74
+ self.n_estimators_per_layer = n_estimators_per_layer
75
+ self.max_layers = max_layers
76
+ self.early_stopping_rounds = early_stopping_rounds
77
+ self.param_grid = param_grid or {
78
+ "objective": ["reg:squarederror"],
79
+ "random_state": [42],
80
+ 'seed': [0],
81
+ 'n_estimators': [100],
82
+ 'max_depth': [6],
83
+ 'min_child_weight': [4],
84
+ 'subsample': [0.8],
85
+ 'colsample_bytree': [0.8],
86
+ 'gamma': [0],
87
+ 'reg_alpha': [0],
88
+ 'reg_lambda': [1],
89
+ 'learning_rate': [0.05],
90
+ }
91
+ self.use_gpu = use_gpu
92
+ self.gpu_id = gpu_id
93
+ self.window_sizes = window_sizes if window_sizes is not None else [] # avoid a shared mutable default argument
94
+ self.forget_factor = forget_factor
95
+ self.layers = []
96
+ self.best_model = None
97
+ self.best_rmse = float("inf")
98
+ self.verbose = verbose
99
+
100
+ def _get_param_combinations(self):
101
+ keys, values = zip(*self.param_grid.items())
102
+ return [dict(zip(keys, v)) for v in itertools.product(*values)]
103
+
104
+ def _multi_grained_scanning(self, X, y):
105
+ new_features = []
106
+ for window_size in self.window_sizes:
107
+ if X.shape[1] < window_size:
108
+ continue
109
+ for start in range(X.shape[1] - window_size + 1):
110
+ window = X[:, start:start + window_size]
111
+ if y is None:
112
+ new_features.append(window)
113
+ continue
114
+
115
+ param_combos = self._get_param_combinations()
116
+ for params in param_combos:
117
+ if self.use_gpu:
118
+ params['tree_method'] = 'hist'
119
+ params['device'] = 'cuda'
120
+ model = XGBRegressor(**params)
121
+ model.fit(window, y)
122
+ preds = model.predict(window).reshape(-1, 1)
123
+ new_features.append(preds)
124
+ return np.hstack(new_features) if new_features else X
125
+
126
+ def _apply_forget_gate(self, X, layer_index):
127
+ forget_weights = np.random.rand(X.shape[1]) * self.forget_factor # weights are redrawn on every call, so the damping is stochastic
128
+ return X * (1 - forget_weights)
129
+
130
+ def _fit_layer(self, X, y, X_val=None, y_val=None, layer_index=0):
131
+ layer = []
132
+ layer_outputs = []
133
+ param_combos = self._get_param_combinations()
134
+ X = self._apply_forget_gate(X, layer_index)
135
+
136
+ for i in range(self.n_estimators_per_layer):
137
+ best_rmse = float('inf')
138
+ best_model = None
139
+
140
+ for params in param_combos:
141
+ params = params.copy() # Copy first so the per-model tweaks below never mutate the shared grid dict
142
+ if self.use_gpu:
143
+ params['tree_method'] = 'hist'
144
+ params['device'] = 'cuda'
145
+
146
+ params['random_state'] = i # Use a different random seed for each model to enhance diversity
147
+
148
+ model = XGBRegressor(**params)
149
+ model.fit(X, y)
150
+
151
+ if X_val is not None:
152
+ preds_val = model.predict(X_val)
153
+ rmse = np.sqrt(mean_squared_error(y_val, preds_val))
154
+ if rmse < best_rmse:
155
+ best_rmse = rmse
156
+ best_model = model
157
+ else:
158
+ best_model = model
159
+
160
+ preds = best_model.predict(X).reshape(-1, 1)
161
+ layer.append(best_model)
162
+ layer_outputs.append(preds)
163
+
164
+ output = np.hstack(layer_outputs)
165
+ return layer, output
166
+
167
+ def fit(self, X, y, X_val=None, y_val=None):
168
+ X_current = self._multi_grained_scanning(X, y)
169
+ X_val_current = self._multi_grained_scanning(X_val, y_val) if X_val is not None else None
170
+ no_improve_rounds = 0
171
+
172
+ for layer_index in range(self.max_layers):
173
+ if self.verbose: print(f"Training Layer {layer_index + 1}")
174
+ layer, output = self._fit_layer(X_current, y, X_val_current, y_val, layer_index)
175
+ self.layers.append(layer)
176
+ X_current = np.hstack([X_current, output])
177
+
178
+ if X_val is not None:
179
+ val_outputs = []
180
+ for reg in layer:
181
+ n_features = reg.n_features_in_
182
+ preds = reg.predict(X_val_current[:, :n_features]).reshape(-1, 1)
183
+ val_outputs.append(preds)
184
+ val_output = np.hstack(val_outputs)
185
+ X_val_current = np.hstack([X_val_current, val_output])
186
+
187
+ y_pred = self.predict(X_val)
188
+ rmse = np.sqrt(mean_squared_error(y_val, y_pred))
189
+ if self.verbose: print(f"Validation RMSE: {rmse:.4f}")
190
+
191
+ if rmse < self.best_rmse:
192
+ self.best_rmse = rmse
193
+ self.best_model = copy.deepcopy(self.layers)
194
+ no_improve_rounds = 0
195
+ if self.verbose: print(f"✅ New best RMSE: {self.best_rmse:.4f}")
196
+ else:
197
+ no_improve_rounds += 1
198
+ if no_improve_rounds >= self.early_stopping_rounds:
199
+ if self.verbose: print("Early stopping triggered.")
200
+ break
201
+
202
+ def predict(self, X):
203
+ X_current = self._multi_grained_scanning(X, None)
204
+ X_current = self._apply_forget_gate(X_current, layer_index=-1)
205
+
206
+ for layer in self.layers:
207
+ layer_outputs = []
208
+ for reg in layer:
209
+ n_features = reg.n_features_in_
210
+ preds = reg.predict(X_current[:, :n_features]).reshape(-1, 1)
211
+ layer_outputs.append(preds)
212
+ output = np.hstack(layer_outputs)
213
+ X_current = np.hstack([X_current, output])
214
+
215
+ final_outputs = []
216
+ for reg in self.layers[-1]:
217
+ n_features = reg.n_features_in_
218
+ final_outputs.append(reg.predict(X_current[:, :n_features]).reshape(-1, 1))
219
+ return np.mean(np.hstack(final_outputs), axis=1)
220
+
221
+ def get_best_model(self):
222
+ return self.best_model, self.best_rmse
223
+
224
+ """
225
+ # ============================== Test Example ==============================
226
+ from sklearn.datasets import load_diabetes
227
+ from sklearn.datasets import fetch_california_housing
228
+ from sklearn.model_selection import train_test_split
229
+
230
+
231
+
232
+ warnings.simplefilter('ignore')
233
+
234
+ # X, y = load_diabetes(return_X_y=True) # Using diabetes dataset
235
+ X, y = fetch_california_housing(return_X_y=True) # Using house price dataset
236
+ X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
237
+
238
+ # Hyperparameter grid
239
+ param_grid = {
240
+ "objective": ["reg:squarederror"],
241
+ "random_state": [42],
242
+ 'seed': [0],
243
+ 'n_estimators': [100],
244
+ 'max_depth': [6],
245
+ 'min_child_weight': [4],
246
+ 'subsample': [0.8],
247
+ 'colsample_bytree': [0.8],
248
+ 'gamma': [0],
249
+ 'reg_alpha': [0],
250
+ 'reg_lambda': [1],
251
+ 'learning_rate': [0.05],
252
+ }
253
+
254
+ # Create the model with Multi-Grained Scanning enabled (with window sizes 2 and 3)
255
+ regr = SmartForest(
256
+ n_estimators_per_layer = 5,
257
+ max_layers = 10,
258
+ early_stopping_rounds = 5,
259
+ param_grid = param_grid,
260
+ use_gpu = False,
261
+ gpu_id = 0,
262
+ window_sizes = [], # e.g., [2, 3] enables Multi-Grained Scanning; an empty list disables it
263
+ forget_factor = 0., # values > 0 simulate LSTM-style forget-gate damping
264
+ verbose = 1
265
+ )
266
+
267
+ regr.fit(X_train, y_train, X_val, y_val)
268
+
269
+ # Predict on validation set and evaluate
270
+ y_pred = regr.predict(X_val)
271
+ rmse = np.sqrt(mean_squared_error(y_val, y_pred))
272
+ print("\nFinal RMSE:", rmse)
273
+
274
+ # Output best model and RMSE
275
+ best_model, best_rmse = regr.get_best_model()
276
+ print("\nBest validation RMSE:", best_rmse)
277
+ """
sciml/pipelines.py CHANGED
@@ -1,174 +1,435 @@
1
- import numpy as np
2
- import pandas as pd
3
- from scipy import stats
4
- from copy import deepcopy
5
- from tqdm import tqdm
6
- from sklearn.metrics import mean_squared_error
7
- from xgboost import XGBRegressor
8
-
9
- def get_metrics(df, truth = 'truth', pred = 'pred', return_dict = False):
10
- '''
11
- Calculate statistical measures between validation and prediction sequences
12
- '''
13
- df = df[[truth, pred]].copy().dropna()
14
- slope, intercept, r_value, p_value, std_err = stats.linregress(df.dropna()[truth], df.dropna()[pred])
15
- r2 = r_value**2
16
- mse = mean_squared_error(df.dropna()[truth], df.dropna()[pred])
17
- rmse = np.sqrt(mse)
18
- mbe = np.mean(df.dropna()[pred] - df.dropna()[truth])
19
- mae = (df.dropna()[pred] - df.dropna()[truth]).abs().mean()
20
- if return_dict:
21
- return pd.DataFrame.from_dict([{
22
- 'r2': r2,
23
- 'Slope': slope,
24
- 'RMSE': rmse,
25
- 'MBE': mbe,
26
- 'MAE': mae,
27
- 'Intercept': intercept,
28
- 'p-value': p_value,
29
- 'std_err': std_err
30
- }])
31
- else:
32
- return r2, slope, rmse, mbe, mae, intercept, p_value, std_err
33
-
34
- # ===============================================================================================================================
35
- # Machine learning algorithms
36
- def train_ml(
37
- X_train, y_train, model_name = 'XGB',
38
- xgb_params_user = None, rfr_params_user = None,
39
- mlp_params_user = None, svr_params_user = None,
40
- df21_params_user = None,
41
- gpu = False, partial_mode = False
42
- ):
43
- # -------------------------------------------------------------------------
44
- # Setup parameters:
45
- if xgb_params_user:
46
- xgb_params = xgb_params_user
47
- else:
48
- xgb_params = {
49
- "objective": "reg:squarederror",
50
- "random_state": 0,
51
- 'seed': 0,
52
- 'n_estimators': 100,
53
- 'max_depth': 6,
54
- 'min_child_weight': 4,
55
- 'subsample': 0.8,
56
- 'colsample_bytree': 0.8,
57
- 'gamma': 0,
58
- 'reg_alpha': 0,
59
- 'reg_lambda': 1,
60
- 'learning_rate': 0.05,
61
- }
62
-
63
- xgb_gpu_params = {
64
- 'tree_method': 'gpu_hist',
65
- 'gpu_id': 0,
66
- # "n_gpus": 2,
67
- }
68
-
69
- if gpu: xgb_params.update(xgb_gpu_params)
70
-
71
- if rfr_params_user:
72
- rfr_params = rfr_params_user
73
- else:
74
- rfr_params = {
75
- 'max_depth': 20,
76
- 'min_samples_leaf': 3,
77
- 'min_samples_split': 12,
78
- 'n_estimators': 100,
79
- 'n_jobs': -1
80
- }
81
-
82
- if df21_params_user:
83
- df21_params = df21_params_user
84
- else:
85
- df21_params = {
86
- 'random_state': 1,
87
- 'verbose' : 0,
88
- 'predictor': "xgboost",
89
- 'n_jobs' : -1,
90
- 'predictor_kwargs' : xgb_params,
91
- 'partial_mode' : partial_mode
92
- }
93
- # -------------------------------------------------------------------------
94
- # Run:
95
- if model_name == "XGB":
96
- from xgboost import XGBRegressor
97
- regr = XGBRegressor(**xgb_params)
98
- elif model_name == "MLP":
99
- from sklearn.neural_network import MLPRegressor
100
- regr = MLPRegressor(**mlp_params_user)
101
- elif model_name == "RFR":
102
- from sklearn.ensemble import RandomForestRegressor
103
- regr = RandomForestRegressor(**rfr_params)
104
- elif model_name == "SVR":
105
- from sklearn.svm import SVR
106
- regr = SVR(**svr_params_user)
107
- elif model_name == "DF21":
108
- from deepforest import CascadeForestRegressor
109
- # https://deep-forest.readthedocs.io/en/latest/api_reference.html?highlight=CascadeForestRegressor#cascadeforestregressor
110
- # predictor: {"forest", "xgboost", "lightgbm"}
111
- # regr = CascadeForestRegressor(random_state = 1, verbose = 0, predictor = "xgboost", n_jobs = -1, predictor_kwargs = xgb_params, partial_mode = partial_mode)
112
- regr = CascadeForestRegressor(**df21_params)
113
- regr.fit(X_train, y_train)
114
- return regr
115
-
116
- def test_ml(X_test, y_test, regr):
117
- res = y_test.copy() # y_test is a 2D pandas DataFrame
118
- res.columns = ['truth']
119
- res['pred'] = regr.predict(X_test)
120
- return res
121
-
122
- def run_ensemble(X_train, y_train, n_models = 10, frac_sample = 0.8):
123
- base_params_xgb = {
124
- "objective": "reg:squarederror",
125
- 'seed': 0,
126
- "random_state": 0,
127
- }
128
- params_xgb = deepcopy(base_params_xgb)
129
- # dropout-like regularization
130
- params_xgb.update({
131
- "subsample": 0.8, # Use 80% of the data for each tree
132
- "colsample_bytree": 0.8, # Use 80% of the features for each tree
133
- })
134
-
135
- models = []
136
- for i in tqdm(range(n_models)):
137
- # Create a bootstrapped dataset
138
- y_resampled = y_train.copy().sample(frac = frac_sample, random_state = i)
139
- X_resampled = X_train.copy().loc[y_resampled.index]
140
- # print(y_resampled.sort_index().index[0], y_resampled.sort_index().index[-1])
141
-
142
- # Train the XGBoost model
143
- params_xgb.update({'random_state': i})
144
- model = XGBRegressor(**params_xgb)
145
- model.fit(X_resampled, y_resampled)
146
- models.append(model)
147
- return models
148
-
149
- # ===============================================================================================================================
150
- # Deep learning neural networks
151
-
152
- try:
153
- from tensorflow import keras
154
- from tensorflow.keras import layers
155
- from tensorflow.keras import models
156
- # from keras.layers import Dropout
157
- from keras.callbacks import EarlyStopping
158
- from scitbx.stutils import *
159
- except Exception as e:
160
- print(e)
161
-
162
- def train_lstm(X_train, y_train, nfeature, ntime, verbose = 2, epochs = 200, batch_size = 64):
163
- # create and fit the LSTM network
164
- model = models.Sequential()
165
- model.add(layers.LSTM(64, input_shape=(nfeature, ntime)))
166
- model.add(layers.Dropout(0.2))
167
- model.add(layers.Dense(16, activation='relu'))
168
- model.add(layers.Dropout(0.2))
169
- model.add(layers.Dense(1, activation='relu'))
170
- model.compile(loss='mean_squared_error', optimizer='adam')
171
- # es = EarlyStopping(monitor='loss', mode='min', verbose=1)
172
- # model.fit(X_train.reshape(-1, nsites, nfeats), y_train, epochs=100, batch_size=256, verbose=2, callbacks=[es])
173
- model.fit(X_train, y_train, epochs = epochs, batch_size = batch_size, verbose=verbose)
174
- return model
1
+ import numpy as np
2
+ import pandas as pd
3
+ from scipy import stats
4
+ from copy import deepcopy
5
+ from tqdm import tqdm
6
+ from sklearn.metrics import mean_squared_error
7
+ from xgboost import XGBRegressor
8
+
9
+ def get_metrics(df, truth = 'truth', pred = 'pred', return_dict = False):
10
+ '''
11
+ Calculate statistical measures between validation and prediction sequences
12
+ '''
13
+ df = df[[truth, pred]].copy().dropna()
14
+ slope, intercept, r_value, p_value, std_err = stats.linregress(df.dropna()[truth], df.dropna()[pred])
15
+ r2 = r_value**2
16
+ mse = mean_squared_error(df.dropna()[truth], df.dropna()[pred])
17
+ rmse = np.sqrt(mse)
18
+ mbe = np.mean(df.dropna()[pred] - df.dropna()[truth])
19
+ mae = (df.dropna()[pred] - df.dropna()[truth]).abs().mean()
20
+ if return_dict:
21
+ return pd.DataFrame.from_dict([{
22
+ 'r2': r2,
23
+ 'Slope': slope,
24
+ 'RMSE': rmse,
25
+ 'MBE': mbe,
26
+ 'MAE': mae,
27
+ 'Intercept': intercept,
28
+ 'p-value': p_value,
29
+ 'std_err': std_err
30
+ }])
31
+ else:
32
+ return r2, slope, rmse, mbe, mae, intercept, p_value, std_err
33
+
34
+ # ===============================================================================================================================
35
+ # Machine learning algorithms
36
+ def train_ml(
37
+ X_train, y_train, model_name = 'XGB',
38
+ xgb_params_user = None, rfr_params_user = None,
39
+ mlp_params_user = None, svr_params_user = None,
40
+ df21_params_user = None,
41
+ gpu = False, partial_mode = False
42
+ ):
43
+ # -------------------------------------------------------------------------
44
+ # Setup parameters:
45
+ if xgb_params_user:
46
+ xgb_params = xgb_params_user
47
+ else:
48
+ xgb_params = {
49
+ "objective": "reg:squarederror",
50
+ "random_state": 0,
51
+ 'seed': 0,
52
+ 'n_estimators': 100,
53
+ 'max_depth': 6,
54
+ 'min_child_weight': 4,
55
+ 'subsample': 0.8,
56
+ 'colsample_bytree': 0.8,
57
+ 'gamma': 0,
58
+ 'reg_alpha': 0,
59
+ 'reg_lambda': 1,
60
+ 'learning_rate': 0.05,
61
+ }
62
+
63
+ xgb_gpu_params = {
64
+ 'tree_method': 'gpu_hist', # pre-2.0 XGBoost GPU API; XGBoost >= 2.0 prefers tree_method='hist' with device='cuda'
65
+ 'gpu_id': 0,
66
+ # "n_gpus": 2,
67
+ }
68
+
69
+ if gpu: xgb_params.update(xgb_gpu_params)
70
+
71
+ if rfr_params_user:
72
+ rfr_params = rfr_params_user
73
+ else:
74
+ rfr_params = {
75
+ 'max_depth': 20,
76
+ 'min_samples_leaf': 3,
77
+ 'min_samples_split': 12,
78
+ 'n_estimators': 100,
79
+ 'n_jobs': -1
80
+ }
81
+
82
+ if df21_params_user:
83
+ df21_params = df21_params_user
84
+ else:
85
+ df21_params = {
86
+ 'random_state': 1,
87
+ 'verbose' : 0,
88
+ 'predictor': "xgboost",
89
+ 'n_jobs' : -1,
90
+ 'predictor_kwargs' : xgb_params,
91
+ 'partial_mode' : partial_mode
92
+ }
93
+ # -------------------------------------------------------------------------
94
+ # Run:
95
+ if model_name == "XGB":
96
+ from xgboost import XGBRegressor
97
+ regr = XGBRegressor(**xgb_params)
98
+ elif model_name == "MLP":
99
+ from sklearn.neural_network import MLPRegressor
100
+ regr = MLPRegressor(**(mlp_params_user or {})) # fall back to sklearn defaults when no params are given
101
+ elif model_name == "RFR":
102
+ from sklearn.ensemble import RandomForestRegressor
103
+ regr = RandomForestRegressor(**rfr_params)
104
+ elif model_name == "SVR":
105
+ from sklearn.svm import SVR
106
+ regr = SVR(**(svr_params_user or {}))
107
+ elif model_name == "DF21":
108
+ from deepforest import CascadeForestRegressor
109
+ # https://deep-forest.readthedocs.io/en/latest/api_reference.html?highlight=CascadeForestRegressor#cascadeforestregressor
110
+ # predictor: {"forest", "xgboost", "lightgbm"}
111
+ # regr = CascadeForestRegressor(random_state = 1, verbose = 0, predictor = "xgboost", n_jobs = -1, predictor_kwargs = xgb_params, partial_mode = partial_mode)
112
+ regr = CascadeForestRegressor(**df21_params)
113
+ regr.fit(X_train, y_train)
114
+ return regr
115
+
116
+ def test_ml(X_test, y_test, regr):
117
+ res = y_test.copy() # y_test is a 2D pandas DataFrame
118
+ res.columns = ['truth']
119
+ res['pred'] = regr.predict(X_test)
120
+ return res
121
+
122
+ def run_ensemble(X_train, y_train, n_models = 10, frac_sample = 0.8):
123
+ base_params_xgb = {
124
+ "objective": "reg:squarederror",
125
+ 'seed': 0,
126
+ "random_state": 0,
127
+ }
128
+ params_xgb = deepcopy(base_params_xgb)
129
+ # dropout-like regularization
130
+ params_xgb.update({
131
+ "subsample": 0.8, # Use 80% of the data for each tree
132
+ "colsample_bytree": 0.8, # Use 80% of the features for each tree
133
+ })
134
+
135
+ models = []
136
+ for i in tqdm(range(n_models)):
137
+ # Create a bootstrapped dataset
138
+ y_resampled = y_train.copy().sample(frac = frac_sample, random_state = i)
139
+ X_resampled = X_train.copy().loc[y_resampled.index]
140
+ # print(y_resampled.sort_index().index[0], y_resampled.sort_index().index[-1])
141
+
142
+ # Train the XGBoost model
143
+ params_xgb.update({'random_state': i})
144
+ model = XGBRegressor(**params_xgb)
145
+ model.fit(X_resampled, y_resampled)
146
+ models.append(model)
147
+ return models
148
+
149
+ # ===============================================================================================================================
150
+ # Deep learning neural networks
151
+
152
+ try:
153
+ from tensorflow import keras
154
+ from tensorflow.keras import layers
155
+ from tensorflow.keras import models
156
+ # from keras.layers import Dropout
157
+ from keras.callbacks import EarlyStopping
158
+ from scitbx.stutils import *
159
+ except Exception as e:
160
+ print(e)
161
+
162
+ def train_lstm(X_train, y_train, nfeature, ntime, verbose = 2, epochs = 200, batch_size = 64):
163
+ # create and fit the LSTM network
164
+ model = models.Sequential()
165
+ model.add(layers.LSTM(64, input_shape=(nfeature, ntime)))
166
+ model.add(layers.Dropout(0.2))
167
+ model.add(layers.Dense(16, activation='relu'))
168
+ model.add(layers.Dropout(0.2))
169
+ model.add(layers.Dense(1, activation='relu'))
170
+ model.compile(loss='mean_squared_error', optimizer='adam')
171
+ # es = EarlyStopping(monitor='loss', mode='min', verbose=1)
172
+ # model.fit(X_train.reshape(-1, nsites, nfeats), y_train, epochs=100, batch_size=256, verbose=2, callbacks=[es])
173
+ model.fit(X_train, y_train, epochs = epochs, batch_size = batch_size, verbose=verbose)
174
+ return model
175
+
176
+
177
+ '''
178
+ # ========================================================================================================
179
+ import numpy as np
180
+ from xgboost import XGBRegressor
181
+ from sklearn.metrics import mean_squared_error
182
+
183
+ class XGBoostDeepForestRegressor:
184
+ def __init__(self, n_estimators_per_layer=2, max_layers=20, early_stopping_rounds=2):
185
+ self.n_estimators_per_layer = n_estimators_per_layer
186
+ self.max_layers = max_layers
187
+ self.early_stopping_rounds = early_stopping_rounds
188
+ self.layers = []
189
+
190
+ def _fit_layer(self, X, y):
191
+ layer = []
192
+ layer_outputs = []
193
+ for _ in range(self.n_estimators_per_layer):
194
+ reg = XGBRegressor()
195
+ reg.fit(X, y)
196
+ preds = reg.predict(X).reshape(-1, 1)
197
+ layer.append(reg)
198
+ layer_outputs.append(preds)
199
+ output = np.hstack(layer_outputs)
200
+ return layer, output
201
+
202
+ def fit(self, X, y, X_val=None, y_val=None):
203
+ X_current = X.copy()
204
+ best_rmse = float("inf")
205
+ no_improve_rounds = 0
206
+
207
+ for layer_index in range(self.max_layers):
208
+ print(f"Training Layer {layer_index + 1}")
209
+ layer, output = self._fit_layer(X_current, y)
210
+ self.layers.append(layer)
211
+ X_current = np.hstack([X_current, output])
212
+
213
+ if X_val is not None:
214
+ y_pred = self.predict(X_val)
215
+ # rmse = mean_squared_error(y_val, y_pred, squared=False)
216
+ rmse = np.sqrt(mean_squared_error(y_val, y_pred))
217
+ print(f"Validation RMSE: {rmse:.4f}")
218
+
219
+ if rmse < best_rmse:
220
+ best_rmse = rmse
221
+ no_improve_rounds = 0
222
+ else:
223
+ no_improve_rounds += 1
224
+ if no_improve_rounds >= self.early_stopping_rounds:
225
+ print("Early stopping triggered.")
226
+ break
227
+
228
+ def predict(self, X):
229
+ X_current = X.copy()
230
+ for layer in self.layers:
231
+ layer_outputs = []
232
+ for reg in layer:
233
+ n_features = reg.n_features_in_
234
+ preds = reg.predict(X_current[:, :n_features]).reshape(-1, 1)
235
+ layer_outputs.append(preds)
236
+ output = np.hstack(layer_outputs)
237
+ X_current = np.hstack([X_current, output])
238
+
239
+ # Final prediction = average of last layer regressors
240
+ final_outputs = []
241
+ for reg in self.layers[-1]:
242
+ n_features = reg.n_features_in_
243
+ final_outputs.append(reg.predict(X_current[:, :n_features]).reshape(-1, 1))
244
+ return np.mean(np.hstack(final_outputs), axis=1)
245
+
246
+
247
+ from sklearn.datasets import load_diabetes
248
+ from sklearn.model_selection import train_test_split
249
+ from sklearn.metrics import mean_squared_error
250
+
251
+ X, y = load_diabetes(return_X_y=True)
252
+ X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
253
+
254
+ df_reg = XGBoostDeepForestRegressor(n_estimators_per_layer=2, max_layers=5)
255
+ df_reg.fit(X_train, y_train, X_val, y_val)
256
+
257
+ y_pred = df_reg.predict(X_val)
258
+ # rmse = mean_squared_error(y_val, y_pred, squared=False)
259
+ rmse = np.sqrt(mean_squared_error(y_val, y_pred))
260
+ print("Final RMSE:", rmse)
261
+
262
+ # ----------------------------------------------------------------------------------------------------
263
+
264
+ import numpy as np
265
+ from xgboost import XGBRegressor
266
+ from sklearn.metrics import mean_squared_error
267
+ import itertools
268
+
269
+ class XGBoostDeepForestRegressor:
270
+ def __init__(self, n_estimators_per_layer=2, max_layers=20, early_stopping_rounds=2, param_grid=None, use_gpu=True, gpu_id=0):
271
+ self.n_estimators_per_layer = n_estimators_per_layer
272
+ self.max_layers = max_layers
273
+ self.early_stopping_rounds = early_stopping_rounds
274
+ self.param_grid = param_grid or {
275
+ 'max_depth': [3],
276
+ 'learning_rate': [0.1],
277
+ 'n_estimators': [100]
278
+ }
279
+ self.use_gpu = use_gpu
280
+ self.gpu_id = gpu_id
281
+ self.layers = []
282
+
283
+ def _get_param_combinations(self):
284
+ keys, values = zip(*self.param_grid.items())
285
+ return [dict(zip(keys, v)) for v in itertools.product(*values)]
286
+
287
+ def _fit_layer(self, X, y, X_val=None, y_val=None):
288
+ layer = []
289
+ layer_outputs = []
290
+ param_combos = self._get_param_combinations()
291
+
292
+ for i in range(self.n_estimators_per_layer):
293
+ best_rmse = float('inf')
294
+ best_model = None
295
+
296
+ for params in param_combos:
297
+ # Set GPU support parameters in XGBRegressor
298
+ if self.use_gpu:
299
+ params['tree_method'] = 'hist' # Use hist method
300
+ params['device'] = 'cuda' # Enable CUDA for GPU
301
+
302
+ model = XGBRegressor(**params)
303
+ model.fit(X, y)
304
+
305
+ if X_val is not None:
306
+ preds_val = model.predict(X_val)
307
+ rmse = np.sqrt(mean_squared_error(y_val, preds_val))
308
+ if rmse < best_rmse:
309
+ best_rmse = rmse
310
+ best_model = model
311
+ else:
312
+ best_model = model
313
+
314
+ final_model = best_model
315
+ preds = final_model.predict(X).reshape(-1, 1)
316
+ layer.append(final_model)
317
+ layer_outputs.append(preds)
318
+
319
+ output = np.hstack(layer_outputs)
320
+ return layer, output
321
+
322
+ def fit(self, X, y, X_val=None, y_val=None):
323
+ X_current = X.copy()
324
+ X_val_current = X_val.copy() if X_val is not None else None
325
+
326
+ best_rmse = float("inf")
327
+ no_improve_rounds = 0
328
+
329
+ for layer_index in range(self.max_layers):
330
+ print(f"Training Layer {layer_index + 1}")
331
+ layer, output = self._fit_layer(X_current, y, X_val_current, y_val)
332
+ self.layers.append(layer)
333
+ X_current = np.hstack([X_current, output])
334
+
335
+ if X_val is not None:
336
+ val_outputs = []
337
+ for reg in layer:
338
+ n_features = reg.n_features_in_
339
+ preds = reg.predict(X_val_current[:, :n_features]).reshape(-1, 1)
340
+ val_outputs.append(preds)
341
+ val_output = np.hstack(val_outputs)
342
+ X_val_current = np.hstack([X_val_current, val_output])
343
+
344
+ y_pred = self.predict(X_val)
345
+ rmse = np.sqrt(mean_squared_error(y_val, y_pred))
346
+ print(f"Validation RMSE: {rmse:.4f}")
347
+
348
+ if rmse < best_rmse:
349
+ best_rmse = rmse
350
+ no_improve_rounds = 0
351
+ else:
352
+ no_improve_rounds += 1
353
+ if no_improve_rounds >= self.early_stopping_rounds:
354
+ print("Early stopping triggered.")
355
+ break
356
+
357
+ def predict(self, X):
358
+ X_current = X.copy()
359
+ for layer in self.layers:
360
+ layer_outputs = []
361
+ for reg in layer:
362
+ n_features = reg.n_features_in_
363
+ preds = reg.predict(X_current[:, :n_features]).reshape(-1, 1)
364
+ layer_outputs.append(preds)
365
+ output = np.hstack(layer_outputs)
366
+ X_current = np.hstack([X_current, output])
367
+
368
+ final_outputs = []
369
+ for reg in self.layers[-1]:
370
+ n_features = reg.n_features_in_
371
+ final_outputs.append(reg.predict(X_current[:, :n_features]).reshape(-1, 1))
372
+ return np.mean(np.hstack(final_outputs), axis=1)
373
+
374
+
375
+ from sklearn.datasets import load_diabetes
376
+ from sklearn.model_selection import train_test_split
377
+ from sklearn.metrics import mean_squared_error
378
+
379
+ # Load dataset
380
+ X, y = load_diabetes(return_X_y=True)
381
+ X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
382
+
383
+ # Hyperparameter grid
384
+ param_grid = {
385
+ 'max_depth': [3, 4],
386
+ 'learning_rate': [0.1, 0.05],
387
+ 'n_estimators': [50, 100]
388
+ }
389
+
390
+ # Create and fit the model with GPU enabled
391
+ df_reg = XGBoostDeepForestRegressor(
392
+ n_estimators_per_layer=2,
393
+ max_layers=5,
394
+ early_stopping_rounds=2,
395
+ param_grid=param_grid,
396
+ use_gpu=True, # Enable GPU acceleration
397
+ gpu_id=0 # Default to the first GPU
398
+ )
399
+
400
+ df_reg.fit(X_train, y_train, X_val, y_val)
401
+
402
+ # Final evaluation
403
+ y_pred = df_reg.predict(X_val)
404
+ rmse = np.sqrt(mean_squared_error(y_val, y_pred))
405
+ print("Final RMSE:", rmse)
406
+
407
+ # ----------------------------------------------------------------------------------------------------
408
+
409
+ xgb_params = {
410
+ "objective": "reg:squarederror",
411
+ "random_state": 0,
412
+ 'seed': 0,
413
+ 'n_estimators': 100,
414
+ 'max_depth': 6,
415
+ 'min_child_weight': 4,
416
+ 'subsample': 0.8,
417
+ 'colsample_bytree': 0.8,
418
+ 'gamma': 0,
419
+ 'reg_alpha': 0,
420
+ 'reg_lambda': 1,
421
+ 'learning_rate': 0.05,
422
+ }
423
+
424
+ from xgboost import XGBRegressor
425
+ regr = XGBRegressor(**xgb_params)
426
+
427
+ regr.fit(X_train, y_train)
428
+ y_pred = regr.predict(X_val)
429
+
430
+
431
+ from scipy import stats
432
+
433
+ stats.linregress(y_val, y_pred)
434
+
435
+ '''
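get_metrics expects one DataFrame holding both series under the truth/pred column names (the defaults shown in its signature). A hedged usage sketch on synthetic data — the numbers are made up, only the call pattern matters:

import numpy as np
import pandas as pd
from sciml.pipelines import get_metrics

rng = np.random.default_rng(0)
truth = rng.normal(size=200)
df = pd.DataFrame({'truth': truth, 'pred': truth + rng.normal(scale=0.1, size=200)})

r2, slope, rmse, mbe, mae, intercept, p_value, std_err = get_metrics(df)
print(get_metrics(df, return_dict=True))  # note: despite the name, this returns a one-row DataFrame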
sciml/utils.py CHANGED
@@ -1,46 +1,46 @@
1
- import numpy as np
2
- import pandas as pd
3
- from sklearn.model_selection import ShuffleSplit
4
- from sklearn.model_selection import train_test_split
5
-
6
- # randomly select sites
7
- def random_select(ds, count, num, random_state = 0):
8
- np.random.seed(random_state)
9
- idxs = np.random.choice(np.delete(np.arange(len(ds)), count), num, replace = False)
10
- return np.sort(idxs)
11
-
12
- def split(Xs, ys, return_index = False, test_size = 0.33, random_state = 42):
13
- if return_index:
14
- sss = ShuffleSplit(n_splits=1, test_size = test_size, random_state = random_state)
15
- sss.get_n_splits(Xs, ys)
16
- train_index, test_index = next(sss.split(Xs, ys))
17
- return (train_index, test_index)
18
- else:
19
- X_train, X_test, y_train, y_test = train_test_split(
20
- Xs, ys,
21
- test_size = test_size,
22
- random_state = random_state
23
- )
24
- return (X_train, X_test, y_train, y_test)
25
-
26
- def split_cut(Xs, ys, test_ratio = 0.33):
27
- assert ys.ndim == 2, 'ys must be 2D!'
28
- assert len(Xs) == len(ys), 'Xs and ys should be equally long!'
29
- assert type(Xs) == type(ys), 'Xs and ys should be the same data type!'
30
- if not type(Xs) in [pd.core.frame.DataFrame, np.ndarray]: raise Exception('Only accept numpy ndarray or pandas dataframe')
31
- anchor = int(np.floor(len(ys) * (1 - test_ratio)))
32
-
33
- if type(Xs) == pd.core.frame.DataFrame:
34
- X_train = Xs.iloc[0: anchor, :]
35
- X_test = Xs.iloc[anchor::, :]
36
- y_train = ys.iloc[0: anchor, :]
37
- y_test = ys.iloc[anchor::, :]
38
- else:
39
- X_train = Xs[0: anchor, :]
40
- X_test = Xs[anchor::, :]
41
- y_train = ys[0: anchor, :]
42
- y_test = ys[anchor::, :]
43
-
44
- assert len(X_train) + len(X_test) == len(Xs), 'The sum of train and test lengths must equal the length of Xs/ys!'
45
-
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.model_selection import ShuffleSplit
4
+ from sklearn.model_selection import train_test_split
5
+
6
+ # randomly select sites
7
+ def random_select(ds, count, num, random_state = 0):
8
+ np.random.seed(random_state)
9
+ idxs = np.random.choice(np.delete(np.arange(len(ds)), count), num, replace = False)
10
+ return np.sort(idxs)
11
+
12
+ def split(Xs, ys, return_index = False, test_size = 0.33, random_state = 42):
13
+ if return_index:
14
+ sss = ShuffleSplit(n_splits=1, test_size = test_size, random_state = random_state)
15
+ sss.get_n_splits(Xs, ys)
16
+ train_index, test_index = next(sss.split(Xs, ys))
17
+ return (train_index, test_index)
18
+ else:
19
+ X_train, X_test, y_train, y_test = train_test_split(
20
+ Xs, ys,
21
+ test_size = test_size,
22
+ random_state = random_state
23
+ )
24
+ return (X_train, X_test, y_train, y_test)
25
+
26
+ def split_cut(Xs, ys, test_ratio = 0.33):
27
+ assert ys.ndim == 2, 'ys must be 2D!'
28
+ assert len(Xs) == len(ys), 'Xs and ys should be equally long!'
29
+ assert type(Xs) == type(ys), 'Xs and ys should be the same data type!'
30
+ if not isinstance(Xs, (pd.DataFrame, np.ndarray)): raise TypeError('Xs and ys must be numpy ndarrays or pandas DataFrames')
31
+ anchor = int(np.floor(len(ys) * (1 - test_ratio)))
32
+
33
+ if isinstance(Xs, pd.DataFrame):
34
+ X_train = Xs.iloc[0: anchor, :]
35
+ X_test = Xs.iloc[anchor::, :]
36
+ y_train = ys.iloc[0: anchor, :]
37
+ y_test = ys.iloc[anchor::, :]
38
+ else:
39
+ X_train = Xs[0: anchor, :]
40
+ X_test = Xs[anchor::, :]
41
+ y_train = ys[0: anchor, :]
42
+ y_test = ys[anchor::, :]
43
+
44
+ assert len(X_train) + len(X_test) == len(Xs), 'The sum of train and test lengths must equal the length of Xs/ys!'
45
+
46
46
  return (X_train, X_test, y_train, y_test)
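Unlike split, split_cut never shuffles: it cuts once at floor(len * (1 - test_ratio)), which is the right behavior for chronological data. A small usage sketch (assuming the package is importable):

import numpy as np
from sciml.utils import split_cut

Xs = np.arange(20).reshape(10, 2)  # 10 samples in time order, 2 features
ys = np.arange(10).reshape(10, 1)  # must be 2D and as long as Xs
X_train, X_test, y_train, y_test = split_cut(Xs, ys, test_ratio=0.3)
print(len(X_train), len(X_test))   # 7 3 -- first 70% train, last 30% test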
{sciml-0.0.7.dist-info → sciml-0.0.9.dist-info}/LICENSE RENAMED
@@ -1,21 +1,21 @@
1
- MIT License
2
-
3
- Copyright (c) 2021 Zhu
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in all
13
- copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
1
+ MIT License
2
+
3
+ Copyright (c) 2021 Zhu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
{sciml-0.0.7.dist-info → sciml-0.0.9.dist-info}/METADATA RENAMED
@@ -1,13 +1,13 @@
1
- Metadata-Version: 2.1
2
- Name: sciml
3
- Version: 0.0.7
4
- Summary: draw and basic calculations/conversions
5
- Home-page: https://github.com/soonyenju/sciml
6
- Author: Songyan Zhu
7
- Author-email: zhusy93@gmail.com
8
- License: MIT Licence
9
- Keywords: Scientific machine learning wrappers
10
- Platform: any
11
- License-File: LICENSE
12
-
13
- coming soon
1
+ Metadata-Version: 2.1
2
+ Name: sciml
3
+ Version: 0.0.9
4
+ Summary: draw and basic calculations/conversions
5
+ Home-page: https://github.com/soonyenju/sciml
6
+ Author: Songyan Zhu
7
+ Author-email: zhusy93@gmail.com
8
+ License: MIT Licence
9
+ Keywords: Scientific machine learning wrappers
10
+ Platform: any
11
+ License-File: LICENSE
12
+
13
+ coming soon
sciml-0.0.9.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
1
+ sciml/__init__.py,sha256=wtdlXERN2ik7NT_TQxFdd2gdodBY9vSU1ClSdeJnLm4,59
2
+ sciml/models.py,sha256=BjbliW-KNfzbNdGNgM7nBdJ2SF2z21qCoAvug_v0FEg,10574
3
+ sciml/pipelines.py,sha256=ReNEkQbdFn04D5G2tbxcA7jdSwACy8SnmZ8bFZI_oqE,15702
4
+ sciml/utils.py,sha256=qCdABaTUu3K0R269jI7D_8SO6AqEjphg03CzdxCJR2k,1876
5
+ sciml-0.0.9.dist-info/LICENSE,sha256=hcunSTJmVgRcUNOa1rKl8axtY3Jsy2B4wXDYtQsrAt0,1081
6
+ sciml-0.0.9.dist-info/METADATA,sha256=S5hG3pP3x4yDPe8AJOKn4R-fIuvL-DL1GSKqGqiImSw,326
7
+ sciml-0.0.9.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
8
+ sciml-0.0.9.dist-info/top_level.txt,sha256=dS_7aBCZFKQE3myPy5sh4USjQZCZyGg382-YxUUYcdw,6
9
+ sciml-0.0.9.dist-info/RECORD,,
{sciml-0.0.7.dist-info → sciml-0.0.9.dist-info}/WHEEL RENAMED
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.38.4)
2
+ Generator: bdist_wheel (0.43.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
sciml-0.0.7.dist-info/RECORD REMOVED
@@ -1,8 +0,0 @@
1
- sciml/__init__.py,sha256=Asqzx08kEOBLv_IRE20VlHxZu9XgydyrzIMUDRE-qiU,48
2
- sciml/pipelines.py,sha256=5qfeHdxGhF-GMu-rTiInPv5metXiT32uSENIDFd2Ths,6333
3
- sciml/utils.py,sha256=u5DzQJV4aCZ-p7sY56Fxzj8WDGYOgn1rOTeGzAw0vwY,1831
4
- sciml-0.0.7.dist-info/LICENSE,sha256=dX4jBmkgQPWc_TfYkXtKQzVIgZQWFuHZ8vQjV4sEeV4,1060
5
- sciml-0.0.7.dist-info/METADATA,sha256=363EbWoSVqR9qAdhOfeVD8RiP6DfcalvDiZECJ6LW3s,313
6
- sciml-0.0.7.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
7
- sciml-0.0.7.dist-info/top_level.txt,sha256=dS_7aBCZFKQE3myPy5sh4USjQZCZyGg382-YxUUYcdw,6
8
- sciml-0.0.7.dist-info/RECORD,,