sciml 0.0.7__py3-none-any.whl → 0.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sciml/__init__.py +2 -2
- sciml/models.py +277 -0
- sciml/pipelines.py +435 -174
- sciml/utils.py +45 -45
- {sciml-0.0.7.dist-info → sciml-0.0.9.dist-info}/LICENSE +21 -21
- {sciml-0.0.7.dist-info → sciml-0.0.9.dist-info}/METADATA +13 -13
- sciml-0.0.9.dist-info/RECORD +9 -0
- {sciml-0.0.7.dist-info → sciml-0.0.9.dist-info}/WHEEL +1 -1
- sciml-0.0.7.dist-info/RECORD +0 -8
- {sciml-0.0.7.dist-info → sciml-0.0.9.dist-info}/top_level.txt +0 -0
sciml/__init__.py
CHANGED
@@ -1,2 +1,2 @@
|
|
1
|
-
# coding: utf-8
|
2
|
-
__all__ = ["utils", "pipelines"]
|
1
|
+
# coding: utf-8
|
2
|
+
__all__ = ["utils", "pipelines", "models"]
|
sciml/models.py
ADDED
@@ -0,0 +1,277 @@
|
|
1
|
+
import numpy as np
|
2
|
+
import copy
|
3
|
+
import itertools
|
4
|
+
import warnings
|
5
|
+
from xgboost import XGBRegressor
|
6
|
+
from sklearn.metrics import mean_squared_error
|
7
|
+
from sklearn.model_selection import train_test_split
|
8
|
+
|
9
|
+
class SmartForest:
    """
    SmartForest: a cascade ("deep forest") of XGBoost regressors.

    Every layer trains ``n_estimators_per_layer`` XGBoost regressors. The
    predictions of a layer are appended as extra feature columns for the next
    layer. Two optional mechanisms modify the features:

    - Multi-Grained Scanning (MGS): when ``window_sizes`` is non-empty, one
      XGBoost model per (sliding column window, parameter combination) is
      fitted on the training data, and the predictions of those models replace
      the raw features. The fitted window models are kept so that validation
      and prediction data go through exactly the same transformation.
    - Forget gate: the input columns of each layer are scaled by
      ``1 - w`` with ``w`` drawn from ``[0, forget_factor)``. The weights are
      derived deterministically from the layer index, so the same gate is
      applied at fit time and at predict time.

    Parameters
    ----------
    n_estimators_per_layer : int
        Number of XGBoost regressors per layer.
    max_layers : int
        Maximum number of layers (depth) in the model.
    early_stopping_rounds : int
        Number of consecutive layers without validation improvement before
        training stops. Only used when validation data is passed to ``fit``.
    param_grid : dict
        Maps XGBoost parameter names to lists of candidate values; every
        combination is tried. A falsy value selects the built-in default grid.
    use_gpu : bool
        If True, train with ``tree_method='hist'`` on ``device='cuda:<gpu_id>'``.
    gpu_id : int
        Ordinal of the CUDA device to use (if ``use_gpu=True``).
    window_sizes : list of int, optional
        Sliding window widths for Multi-Grained Scanning. ``None`` or an empty
        list disables MGS.
    forget_factor : float in [0, 1]
        Upper bound of the per-column forget weights; 0 disables the gate.
    verbose : int
        Verbosity level (0 = silent, 1 = progress updates).

    Notes
    -----
    ``predict`` uses every layer stored in ``self.layers``, including layers
    trained after the best validation score was reached. ``get_best_model``
    returns a deep copy of the layers as they were at the best validation RMSE.

    Example
    -------
    >>> model = SmartForest(n_estimators_per_layer=5, max_layers=10, window_sizes=[2, 3], forget_factor=0.2)
    >>> model.fit(X_train, y_train, X_val, y_val)
    >>> y_pred = model.predict(X_val)
    >>> best_model, best_rmse = model.get_best_model()
    """

    def __init__(self, n_estimators_per_layer=5, max_layers=10, early_stopping_rounds=3, param_grid=None,
                 use_gpu=False, gpu_id=0, window_sizes=None, forget_factor=0, verbose=1):
        self.n_estimators_per_layer = n_estimators_per_layer
        self.max_layers = max_layers
        self.early_stopping_rounds = early_stopping_rounds
        self.param_grid = param_grid or {
            "objective": ["reg:squarederror"],
            "random_state": [42],
            'seed': [0],
            'n_estimators': [100],
            'max_depth': [6],
            'min_child_weight': [4],
            'subsample': [0.8],
            'colsample_bytree': [0.8],
            'gamma': [0],
            'reg_alpha': [0],
            'reg_lambda': [1],
            'learning_rate': [0.05],
        }
        self.use_gpu = use_gpu
        self.gpu_id = gpu_id
        # Copy so that instances never share (or mutate) a caller-owned list.
        self.window_sizes = list(window_sizes) if window_sizes is not None else []
        self.forget_factor = forget_factor
        self.layers = []
        self.best_model = None
        self.best_rmse = float("inf")
        self.verbose = verbose
        # (start, window_size, fitted model) triples created by MGS during fit.
        self._mgs_models = []

    def _get_param_combinations(self):
        """Return one dict per combination of the values in ``param_grid``."""
        keys, values = zip(*self.param_grid.items())
        return [dict(zip(keys, v)) for v in itertools.product(*values)]

    def _prepare_params(self, params, random_state=None):
        """Return a copy of ``params`` with GPU settings and an optional seed applied."""
        prepared = dict(params)  # never write into the shared grid entry
        if self.use_gpu:
            prepared['tree_method'] = 'hist'
            prepared['device'] = f'cuda:{self.gpu_id}'
        if random_state is not None:
            prepared['random_state'] = random_state
        return prepared

    def _multi_grained_scanning(self, X, y):
        """
        Transform ``X`` with Multi-Grained Scanning.

        When ``y`` is given, one model per (window, parameter combination) is
        fitted and stored, replacing any previously stored window models.
        When ``y`` is None, the stored models are reused. In both cases the
        result has one prediction column per stored model; if there are no
        stored models, ``X`` is returned unchanged.
        """
        if y is not None:
            self._mgs_models = []
            n_columns = X.shape[1]
            for window_size in self.window_sizes:
                if n_columns < window_size:
                    continue
                for start in range(n_columns - window_size + 1):
                    window = X[:, start:start + window_size]
                    for params in self._get_param_combinations():
                        model = XGBRegressor(**self._prepare_params(params))
                        model.fit(window, y)
                        self._mgs_models.append((start, window_size, model))

        new_features = [
            model.predict(X[:, start:start + window_size]).reshape(-1, 1)
            for start, window_size, model in self._mgs_models
        ]
        return np.hstack(new_features) if new_features else X

    def _apply_forget_gate(self, X, layer_index):
        """
        Scale every column of ``X`` by ``1 - w``, with ``w`` in ``[0, forget_factor)``.

        The weights come from a generator seeded with ``layer_index``, so two
        calls with the same layer index and the same number of columns apply
        the same gate.
        """
        rng = np.random.default_rng(layer_index % (2 ** 32))  # seed must be non-negative
        forget_weights = rng.random(X.shape[1]) * self.forget_factor
        return X * (1 - forget_weights)

    def _predict_layer(self, layer, X, layer_index):
        """Gate ``X`` for ``layer_index`` and return one prediction column per regressor."""
        gated = self._apply_forget_gate(X, layer_index)
        return np.hstack([
            reg.predict(gated[:, :reg.n_features_in_]).reshape(-1, 1)
            for reg in layer
        ])

    def _fit_layer(self, X, y, X_val=None, y_val=None, layer_index=0):
        """
        Train one layer and return ``(layer, output)``.

        ``layer`` is the list of selected regressors and ``output`` holds their
        predictions on the gated training data, one column per regressor.
        With validation data, each slot keeps the parameter combination with
        the lowest validation RMSE; without it, the last combination is kept.
        """
        layer = []
        layer_outputs = []
        param_combos = self._get_param_combinations()
        X = self._apply_forget_gate(X, layer_index)
        if X_val is not None:
            # Score candidates on data that went through the same gate.
            X_val = self._apply_forget_gate(X_val, layer_index)

        for i in range(self.n_estimators_per_layer):
            best_rmse = float('inf')
            best_model = None

            for params in param_combos:
                # A different seed per slot makes the regressors of a layer differ.
                model = XGBRegressor(**self._prepare_params(params, random_state=i))
                model.fit(X, y)

                if X_val is not None:
                    rmse = np.sqrt(mean_squared_error(y_val, model.predict(X_val)))
                    # "best_model is None" keeps a model even if the score is not finite.
                    if best_model is None or rmse < best_rmse:
                        best_rmse = rmse
                        best_model = model
                else:
                    best_model = model

            layer.append(best_model)
            layer_outputs.append(best_model.predict(X).reshape(-1, 1))

        return layer, np.hstack(layer_outputs)

    def fit(self, X, y, X_val=None, y_val=None):
        """
        Train the cascade layer by layer.

        Any state from a previous ``fit`` call is discarded. When ``X_val`` is
        given, the validation RMSE is tracked after every layer, the best
        layers are deep-copied into ``best_model``, and training stops after
        ``early_stopping_rounds`` consecutive layers without improvement.

        Raises
        ------
        ValueError
            If ``X_val`` is given without ``y_val``.
        """
        if X_val is not None and y_val is None:
            raise ValueError("y_val is required when X_val is provided.")

        X = np.asarray(X)
        has_validation = X_val is not None
        if has_validation:
            X_val = np.asarray(X_val)

        # Start from a clean slate so repeated fit() calls do not stack layers.
        self.layers = []
        self.best_model = None
        self.best_rmse = float("inf")

        X_current = self._multi_grained_scanning(X, y)
        # y=None: reuse the window models fitted on the training data.
        X_val_current = self._multi_grained_scanning(X_val, None) if has_validation else None
        no_improve_rounds = 0

        for layer_index in range(self.max_layers):
            if self.verbose:
                print(f"Training Layer {layer_index + 1}")
            layer, output = self._fit_layer(X_current, y, X_val_current, y_val, layer_index)
            self.layers.append(layer)
            X_current = np.hstack([X_current, output])

            if not has_validation:
                continue

            val_output = self._predict_layer(layer, X_val_current, layer_index)
            X_val_current = np.hstack([X_val_current, val_output])

            # The cascade's prediction is the mean of the newest layer's outputs,
            # which is what predict(X_val) would return at this depth.
            rmse = np.sqrt(mean_squared_error(y_val, val_output.mean(axis=1)))
            if self.verbose:
                print(f"Validation RMSE: {rmse:.4f}")

            if rmse < self.best_rmse:
                self.best_rmse = rmse
                self.best_model = copy.deepcopy(self.layers)
                no_improve_rounds = 0
                if self.verbose:
                    print(f"✅ New best RMSE: {self.best_rmse:.4f}")
            else:
                no_improve_rounds += 1
                if no_improve_rounds >= self.early_stopping_rounds:
                    if self.verbose:
                        print("Early stopping triggered.")
                    break

    def predict(self, X):
        """
        Predict with the trained cascade.

        Returns the row-wise mean of the last layer's regressor outputs.

        Raises
        ------
        IndexError
            If the model has no layers, i.e. ``fit`` has not been called.
        """
        if not self.layers:
            raise IndexError("predict() called before fit(): the model has no layers.")

        X_current = self._multi_grained_scanning(np.asarray(X), None)
        output = None
        for layer_index, layer in enumerate(self.layers):
            output = self._predict_layer(layer, X_current, layer_index)
            X_current = np.hstack([X_current, output])

        return np.mean(output, axis=1)

    def get_best_model(self):
        """Return ``(best_model, best_rmse)`` as recorded during the last validated fit."""
        return self.best_model, self.best_rmse
|
223
|
+
|
224
|
+
"""
|
225
|
+
# ============================== Test Example ==============================
|
226
|
+
from sklearn.datasets import load_diabetes
|
227
|
+
from sklearn.datasets import fetch_california_housing
|
228
|
+
from sklearn.model_selection import train_test_split
|
229
|
+
|
230
|
+
|
231
|
+
|
232
|
+
warnings.simplefilter('ignore')
|
233
|
+
|
234
|
+
# X, y = load_diabetes(return_X_y=True) # Using diabetes dataset
|
235
|
+
X, y = fetch_california_housing(return_X_y=True) # Using house price dataset
|
236
|
+
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
|
237
|
+
|
238
|
+
# Hyperparameter grid
|
239
|
+
param_grid = {
|
240
|
+
"objective": ["reg:squarederror"],
|
241
|
+
"random_state": [42],
|
242
|
+
'seed': [0],
|
243
|
+
'n_estimators': [100],
|
244
|
+
'max_depth': [6],
|
245
|
+
'min_child_weight': [4],
|
246
|
+
'subsample': [0.8],
|
247
|
+
'colsample_bytree': [0.8],
|
248
|
+
'gamma': [0],
|
249
|
+
'reg_alpha': [0],
|
250
|
+
'reg_lambda': [1],
|
251
|
+
'learning_rate': [0.05],
|
252
|
+
}
|
253
|
+
|
254
|
+
# Create the model with Multi-Grained Scanning enabled (with window sizes 2 and 3)
|
255
|
+
regr = SmartForest(
|
256
|
+
n_estimators_per_layer = 5,
|
257
|
+
max_layers = 10,
|
258
|
+
early_stopping_rounds = 5,
|
259
|
+
param_grid = param_grid,
|
260
|
+
use_gpu = False,
|
261
|
+
gpu_id = 0,
|
262
|
+
window_sizes = [], # Enables MGS if e.g., [2, 3], else empty disables MGS.
|
263
|
+
forget_factor = 0., # Set forget factor to simulate forget gate behavior
|
264
|
+
verbose = 1
|
265
|
+
)
|
266
|
+
|
267
|
+
regr.fit(X_train, y_train, X_val, y_val)
|
268
|
+
|
269
|
+
# Predict on validation set and evaluate
|
270
|
+
y_pred = regr.predict(X_val)
|
271
|
+
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
|
272
|
+
print("\nFinal RMSE:", rmse)
|
273
|
+
|
274
|
+
# Output best model and RMSE
|
275
|
+
best_model, best_rmse = regr.get_best_model()
|
276
|
+
print("\nBest validation RMSE:", best_rmse)
|
277
|
+
"""
|
sciml/pipelines.py
CHANGED
@@ -1,174 +1,435 @@
|
|
1
|
-
import numpy as np
|
2
|
-
import pandas as pd
|
3
|
-
from scipy import stats
|
4
|
-
from copy import deepcopy
|
5
|
-
from tqdm import tqdm
|
6
|
-
from sklearn.metrics import mean_squared_error
|
7
|
-
from xgboost import XGBRegressor
|
8
|
-
|
9
|
-
def get_metrics(df, truth = 'truth', pred = 'pred', return_dict = False):
|
10
|
-
'''
|
11
|
-
Calculate statistical measures between validation and prediction sequences
|
12
|
-
'''
|
13
|
-
df = df[[truth, pred]].copy().dropna()
|
14
|
-
slope, intercept, r_value, p_value, std_err = stats.linregress(df.dropna()[truth], df.dropna()[pred])
|
15
|
-
r2 = r_value**2
|
16
|
-
mse = mean_squared_error(df.dropna()[truth], df.dropna()[pred])
|
17
|
-
rmse = np.sqrt(mse)
|
18
|
-
mbe = np.mean(df.dropna()[pred] - df.dropna()[truth])
|
19
|
-
mae = (df.dropna()[pred] - df.dropna()[truth]).abs().mean()
|
20
|
-
if return_dict:
|
21
|
-
return pd.DataFrame.from_dict([{
|
22
|
-
'r2': r2,
|
23
|
-
'Slope': slope,
|
24
|
-
'RMSE': rmse,
|
25
|
-
'MBE': mbe,
|
26
|
-
'MAE': mae,
|
27
|
-
'Intercept': intercept,
|
28
|
-
'p-value': p_value,
|
29
|
-
'std_err': std_err
|
30
|
-
}])
|
31
|
-
else:
|
32
|
-
return r2, slope, rmse, mbe, mae, intercept, p_value, std_err
|
33
|
-
|
34
|
-
# ===============================================================================================================================
|
35
|
-
# Machine learning algorithms
|
36
|
-
def train_ml(
|
37
|
-
X_train, y_train, model_name = 'XGB',
|
38
|
-
xgb_params_user = None, rfr_params_user = None,
|
39
|
-
mlp_params_user = None, svr_params_user = None,
|
40
|
-
df21_params_user = None,
|
41
|
-
gpu = False, partial_mode = False
|
42
|
-
):
|
43
|
-
# -------------------------------------------------------------------------
|
44
|
-
# Setup parameters:
|
45
|
-
if xgb_params_user:
|
46
|
-
xgb_params = xgb_params_user
|
47
|
-
else:
|
48
|
-
xgb_params = {
|
49
|
-
"objective": "reg:squarederror",
|
50
|
-
"random_state": 0,
|
51
|
-
'seed': 0,
|
52
|
-
'n_estimators': 100,
|
53
|
-
'max_depth': 6,
|
54
|
-
'min_child_weight': 4,
|
55
|
-
'subsample': 0.8,
|
56
|
-
'colsample_bytree': 0.8,
|
57
|
-
'gamma': 0,
|
58
|
-
'reg_alpha': 0,
|
59
|
-
'reg_lambda': 1,
|
60
|
-
'learning_rate': 0.05,
|
61
|
-
}
|
62
|
-
|
63
|
-
xgb_gpu_params = {
|
64
|
-
'tree_method': 'gpu_hist',
|
65
|
-
'gpu_id': 0,
|
66
|
-
# "n_gpus": 2,
|
67
|
-
}
|
68
|
-
|
69
|
-
if gpu: xgb_params.update(xgb_gpu_params)
|
70
|
-
|
71
|
-
if rfr_params_user:
|
72
|
-
rfr_params = rfr_params_user
|
73
|
-
else:
|
74
|
-
rfr_params = {
|
75
|
-
'max_depth': 20,
|
76
|
-
'min_samples_leaf': 3,
|
77
|
-
'min_samples_split': 12,
|
78
|
-
'n_estimators': 100,
|
79
|
-
'n_jobs': -1
|
80
|
-
}
|
81
|
-
|
82
|
-
if df21_params_user:
|
83
|
-
df21_params = df21_params_user
|
84
|
-
else:
|
85
|
-
df21_params = {
|
86
|
-
'random_state': 1,
|
87
|
-
'verbose' : 0,
|
88
|
-
'predictor': "xgboost",
|
89
|
-
'n_jobs' : -1,
|
90
|
-
'predictor_kwargs' : xgb_params,
|
91
|
-
'partial_mode' : partial_mode
|
92
|
-
}
|
93
|
-
# -------------------------------------------------------------------------
|
94
|
-
# Run:
|
95
|
-
if model_name == "XGB":
|
96
|
-
from xgboost import XGBRegressor
|
97
|
-
regr = XGBRegressor(**xgb_params)
|
98
|
-
elif model_name == "MLP":
|
99
|
-
from sklearn.neural_network import MLPRegressor
|
100
|
-
regr = MLPRegressor(**mlp_params_user)
|
101
|
-
elif model_name == "RFR":
|
102
|
-
from sklearn.ensemble import RandomForestRegressor
|
103
|
-
regr = RandomForestRegressor(**rfr_params)
|
104
|
-
elif model_name == "SVR":
|
105
|
-
from sklearn.svm import SVR
|
106
|
-
regr = SVR(**svr_params_user)
|
107
|
-
elif model_name == "DF21":
|
108
|
-
from deepforest import CascadeForestRegressor
|
109
|
-
# https://deep-forest.readthedocs.io/en/latest/api_reference.html?highlight=CascadeForestRegressor#cascadeforestregressor
|
110
|
-
# predictor: {"forest", "xgboost", "lightgbm"}
|
111
|
-
# regr = CascadeForestRegressor(random_state = 1, verbose = 0, predictor = "xgboost", n_jobs = -1, predictor_kwargs = xgb_params, partial_mode = partial_mode)
|
112
|
-
regr = CascadeForestRegressor(**df21_params)
|
113
|
-
regr.fit(X_train, y_train)
|
114
|
-
return regr
|
115
|
-
|
116
|
-
def test_ml(X_test, y_test, regr):
|
117
|
-
res = y_test.copy() # y_test is 2D pandas dataframe.
|
118
|
-
res.columns = ['truth']
|
119
|
-
res['pred'] = regr.predict(X_test)
|
120
|
-
return res
|
121
|
-
|
122
|
-
def run_ensemble(X_train, y_train, n_models = 10, frac_sample = 0.8):
|
123
|
-
base_params_xgb = {
|
124
|
-
"objective": "reg:squarederror",
|
125
|
-
'seed': 0,
|
126
|
-
"random_state": 0,
|
127
|
-
}
|
128
|
-
params_xgb = deepcopy(base_params_xgb)
|
129
|
-
# dropout-like regularization
|
130
|
-
params_xgb.update({
|
131
|
-
"subsample": 0.8, # Use 80% of the data for each tree
|
132
|
-
"colsample_bytree": 0.8, # Use 80% of the features for each tree
|
133
|
-
})
|
134
|
-
|
135
|
-
models = []
|
136
|
-
for i in tqdm(range(n_models)):
|
137
|
-
# Create a bootstrapped dataset
|
138
|
-
y_resampled = y_train.copy().sample(frac = frac_sample, random_state = i)
|
139
|
-
X_resampled = X_train.copy().loc[y_resampled.index]
|
140
|
-
# print(y_resampled.sort_index().index[0], y_resampled.sort_index().index[-1])
|
141
|
-
|
142
|
-
# Train the XGBoost model
|
143
|
-
params_xgb.update({'random_state': i})
|
144
|
-
model = XGBRegressor(**params_xgb)
|
145
|
-
model.fit(X_resampled, y_resampled)
|
146
|
-
models.append(model)
|
147
|
-
return models
|
148
|
-
|
149
|
-
# ===============================================================================================================================
|
150
|
-
# Deep learning neural networks
|
151
|
-
|
152
|
-
try:
|
153
|
-
from tensorflow import keras
|
154
|
-
from tensorflow.keras import layers
|
155
|
-
from tensorflow.keras import models
|
156
|
-
# from keras.layers import Dropout
|
157
|
-
from keras.callbacks import EarlyStopping
|
158
|
-
from scitbx.stutils import *
|
159
|
-
except Exception as e:
|
160
|
-
print(e)
|
161
|
-
|
162
|
-
def train_lstm(X_train, y_train, nfeature, ntime, verbose = 2, epochs = 200, batch_size = 64):
|
163
|
-
# create and fit the LSTM network
|
164
|
-
model = models.Sequential()
|
165
|
-
model.add(layers.LSTM(64, input_shape=(nfeature, ntime)))
|
166
|
-
model.add(layers.Dropout(0.2))
|
167
|
-
model.add(layers.Dense(16, activation='relu'))
|
168
|
-
model.add(layers.Dropout(0.2))
|
169
|
-
model.add(layers.Dense(1, activation='relu'))
|
170
|
-
model.compile(loss='mean_squared_error', optimizer='adam')
|
171
|
-
# es = EarlyStopping(monitor='loss', mode='min', verbose=1)
|
172
|
-
# model.fit(X_train.reshape(-1, nsites, nfeats), y_train, epochs=100, batch_size=256, verbose=2, callbacks=[es])
|
173
|
-
model.fit(X_train, y_train, epochs = epochs, batch_size = batch_size, verbose=verbose)
|
174
|
-
return model
|
1
|
+
import numpy as np
|
2
|
+
import pandas as pd
|
3
|
+
from scipy import stats
|
4
|
+
from copy import deepcopy
|
5
|
+
from tqdm import tqdm
|
6
|
+
from sklearn.metrics import mean_squared_error
|
7
|
+
from xgboost import XGBRegressor
|
8
|
+
|
9
|
+
def get_metrics(df, truth = 'truth', pred = 'pred', return_dict = False):
    '''
    Calculate statistical measures between validation and prediction sequences.

    Rows where either column is missing are dropped before any statistic is
    computed. The regression is fitted with ``truth`` as x and ``pred`` as y.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the two columns to compare.
    truth, pred : str
        Column names of the reference values and of the predictions.
    return_dict : bool
        If True, return a one-row DataFrame with the columns
        r2, Slope, RMSE, MBE, MAE, Intercept, p-value, std_err.
        Otherwise return the tuple
        (r2, slope, rmse, mbe, mae, intercept, p_value, std_err).
    '''
    # Clean once; every statistic below works on the same paired rows.
    paired = df[[truth, pred]].dropna()
    observed = paired[truth]
    predicted = paired[pred]

    slope, intercept, r_value, p_value, std_err = stats.linregress(observed, predicted)
    r2 = r_value**2

    error = predicted - observed  # positive where the prediction is too high
    rmse = np.sqrt(np.mean(error**2))
    mbe = np.mean(error)
    mae = error.abs().mean()

    if return_dict:
        return pd.DataFrame([{
            'r2': r2,
            'Slope': slope,
            'RMSE': rmse,
            'MBE': mbe,
            'MAE': mae,
            'Intercept': intercept,
            'p-value': p_value,
            'std_err': std_err
        }])
    return r2, slope, rmse, mbe, mae, intercept, p_value, std_err
|
33
|
+
|
34
|
+
# ===============================================================================================================================
|
35
|
+
# Machine learning algorithms
|
36
|
+
def train_ml(
    X_train, y_train, model_name = 'XGB',
    xgb_params_user = None, rfr_params_user = None,
    mlp_params_user = None, svr_params_user = None,
    df21_params_user = None,
    gpu = False, partial_mode = False
):
    '''
    Build the regressor selected by ``model_name``, fit it and return it.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, passed unchanged to ``regr.fit``.
    model_name : str
        One of "XGB", "MLP", "RFR", "SVR", "DF21".
    xgb_params_user, rfr_params_user, df21_params_user : dict, optional
        Constructor arguments replacing the built-in defaults of the
        corresponding model. The dictionaries are copied, never modified.
    mlp_params_user, svr_params_user : dict, optional
        Constructor arguments for MLPRegressor / SVR; the library defaults are
        used when omitted.
    gpu : bool
        If True, add ``tree_method='hist'`` and ``device='cuda:0'`` to the
        XGBoost parameters (these are also handed to DF21 through the default
        ``predictor_kwargs``).
    partial_mode : bool
        Forwarded to CascadeForestRegressor in the default DF21 parameters.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    '''
    # -------------------------------------------------------------------------
    # Setup parameters:
    if xgb_params_user:
        xgb_params = dict(xgb_params_user)  # copy: the GPU update below must not leak to the caller
    else:
        xgb_params = {
            "objective": "reg:squarederror",
            "random_state": 0,
            'seed': 0,
            'n_estimators': 100,
            'max_depth': 6,
            'min_child_weight': 4,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'gamma': 0,
            'reg_alpha': 0,
            'reg_lambda': 1,
            'learning_rate': 0.05,
        }

    if gpu:
        # Same GPU configuration style as SmartForest in this package.
        xgb_params.update({'tree_method': 'hist', 'device': 'cuda:0'})

    if rfr_params_user:
        rfr_params = dict(rfr_params_user)
    else:
        rfr_params = {
            'max_depth': 20,
            'min_samples_leaf': 3,
            'min_samples_split': 12,
            'n_estimators': 100,
            'n_jobs': -1
        }

    if df21_params_user:
        df21_params = dict(df21_params_user)
    else:
        df21_params = {
            'random_state': 1,
            'verbose' : 0,
            'predictor': "xgboost",
            'n_jobs' : -1,
            'predictor_kwargs' : xgb_params,
            'partial_mode' : partial_mode
        }
    # -------------------------------------------------------------------------
    # Run (imports are local so only the selected backend has to be installed):
    if model_name == "XGB":
        from xgboost import XGBRegressor
        regr = XGBRegressor(**xgb_params)
    elif model_name == "MLP":
        from sklearn.neural_network import MLPRegressor
        regr = MLPRegressor(**(mlp_params_user or {}))
    elif model_name == "RFR":
        from sklearn.ensemble import RandomForestRegressor
        regr = RandomForestRegressor(**rfr_params)
    elif model_name == "SVR":
        from sklearn.svm import SVR
        regr = SVR(**(svr_params_user or {}))
    elif model_name == "DF21":
        # https://deep-forest.readthedocs.io/en/latest/api_reference.html?highlight=CascadeForestRegressor#cascadeforestregressor
        # predictor: {"forest", "xgboost", "lightgbm"}
        from deepforest import CascadeForestRegressor
        regr = CascadeForestRegressor(**df21_params)
    else:
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of 'XGB', 'MLP', 'RFR', 'SVR', 'DF21'."
        )
    regr.fit(X_train, y_train)
    return regr
|
115
|
+
|
116
|
+
def test_ml(X_test, y_test, regr):
    '''
    Predict on ``X_test`` and pair the predictions with the true values.

    Parameters
    ----------
    X_test
        Features passed to ``regr.predict``.
    y_test : pandas.DataFrame or pandas.Series
        True values: a single-column DataFrame or a Series. The input object
        is not modified.
    regr
        Fitted model exposing ``predict``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed like ``y_test`` with the columns 'truth' and 'pred'.
    '''
    if isinstance(y_test, pd.Series):
        # A Series has no columns; assigning res['pred'] on it would set a
        # single element instead of adding a column.
        res = y_test.to_frame(name = 'truth')
    else:
        res = y_test.copy() # y_test is 2D pandas dataframe.
        res.columns = ['truth']
    res['pred'] = regr.predict(X_test)
    return res
|
121
|
+
|
122
|
+
def run_ensemble(X_train, y_train, n_models = 10, frac_sample = 0.8):
    '''
    Train ``n_models`` XGBoost regressors, each on a random subset of the rows.

    For model ``i`` a fraction ``frac_sample`` of ``y_train`` is drawn with
    ``random_state = i`` (pandas ``sample`` default, i.e. without replacement),
    and the matching rows of ``X_train`` are selected by index label. The model
    itself is also created with ``random_state = i``.

    Parameters
    ----------
    X_train, y_train : pandas objects sharing the same index labels
    n_models : int
        Number of models to train.
    frac_sample : float
        Fraction of rows used for each model.

    Returns
    -------
    list
        The fitted XGBRegressor instances, in training order.
    '''
    params_xgb = {
        "objective": "reg:squarederror",
        'seed': 0,
        # dropout-like regularization
        "subsample": 0.8, # Use 80% of the data for each tree
        "colsample_bytree": 0.8, # Use 80% of the features for each tree
    }

    models = []
    for i in tqdm(range(n_models)):
        # Draw the subset; sample() and .loc already return new objects, so the
        # full training data does not need to be copied on every iteration.
        y_resampled = y_train.sample(frac = frac_sample, random_state = i)
        X_resampled = X_train.loc[y_resampled.index]

        # Train the XGBoost model with a per-model seed
        model = XGBRegressor(**params_xgb, random_state = i)
        model.fit(X_resampled, y_resampled)
        models.append(model)
    return models
|
148
|
+
|
149
|
+
# ===============================================================================================================================
|
150
|
+
# Deep learning neural networks
|
151
|
+
|
152
|
+
try:
|
153
|
+
from tensorflow import keras
|
154
|
+
from tensorflow.keras import layers
|
155
|
+
from tensorflow.keras import models
|
156
|
+
# from keras.layers import Dropout
|
157
|
+
from keras.callbacks import EarlyStopping
|
158
|
+
from scitbx.stutils import *
|
159
|
+
except Exception as e:
|
160
|
+
print(e)
|
161
|
+
|
162
|
+
def train_lstm(X_train, y_train, nfeature, ntime, verbose = 2, epochs = 200, batch_size = 64,
               lstm_units = 64, dense_units = 16, dropout_rate = 0.2):
    '''
    Create and fit a small LSTM regression network.

    Architecture: LSTM -> Dropout -> Dense(relu) -> Dropout -> Dense(1, relu),
    compiled with mean squared error loss and the Adam optimizer.

    Parameters
    ----------
    X_train, y_train
        Training data passed unchanged to ``model.fit``.
    nfeature, ntime : int
        Passed as ``input_shape=(nfeature, ntime)`` to the LSTM layer.
        NOTE(review): confirm that this axis order matches the layout of
        ``X_train``.
    verbose, epochs, batch_size
        Forwarded to ``model.fit``.
    lstm_units : int
        Number of units of the LSTM layer.
    dense_units : int
        Number of units of the hidden Dense layer.
    dropout_rate : float
        Rate used by both Dropout layers.

    Returns
    -------
    The fitted Keras model.
    '''
    # Relies on the Keras ``models`` / ``layers`` names imported at file level.
    model = models.Sequential()
    model.add(layers.LSTM(lstm_units, input_shape=(nfeature, ntime)))
    model.add(layers.Dropout(dropout_rate))
    model.add(layers.Dense(dense_units, activation='relu'))
    model.add(layers.Dropout(dropout_rate))
    # The relu output layer cannot produce negative predictions.
    model.add(layers.Dense(1, activation='relu'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(X_train, y_train, epochs = epochs, batch_size = batch_size, verbose=verbose)
    return model
|
175
|
+
|
176
|
+
|
177
|
+
'''
|
178
|
+
# ========================================================================================================
|
179
|
+
import numpy as np
|
180
|
+
from xgboost import XGBRegressor
|
181
|
+
from sklearn.metrics import mean_squared_error
|
182
|
+
|
183
|
+
class XGBoostDeepForestRegressor:
|
184
|
+
def __init__(self, n_estimators_per_layer=2, max_layers=20, early_stopping_rounds=2):
|
185
|
+
self.n_estimators_per_layer = n_estimators_per_layer
|
186
|
+
self.max_layers = max_layers
|
187
|
+
self.early_stopping_rounds = early_stopping_rounds
|
188
|
+
self.layers = []
|
189
|
+
|
190
|
+
def _fit_layer(self, X, y):
|
191
|
+
layer = []
|
192
|
+
layer_outputs = []
|
193
|
+
for _ in range(self.n_estimators_per_layer):
|
194
|
+
reg = XGBRegressor()
|
195
|
+
reg.fit(X, y)
|
196
|
+
preds = reg.predict(X).reshape(-1, 1)
|
197
|
+
layer.append(reg)
|
198
|
+
layer_outputs.append(preds)
|
199
|
+
output = np.hstack(layer_outputs)
|
200
|
+
return layer, output
|
201
|
+
|
202
|
+
def fit(self, X, y, X_val=None, y_val=None):
|
203
|
+
X_current = X.copy()
|
204
|
+
best_rmse = float("inf")
|
205
|
+
no_improve_rounds = 0
|
206
|
+
|
207
|
+
for layer_index in range(self.max_layers):
|
208
|
+
print(f"Training Layer {layer_index + 1}")
|
209
|
+
layer, output = self._fit_layer(X_current, y)
|
210
|
+
self.layers.append(layer)
|
211
|
+
X_current = np.hstack([X_current, output])
|
212
|
+
|
213
|
+
if X_val is not None:
|
214
|
+
y_pred = self.predict(X_val)
|
215
|
+
# rmse = mean_squared_error(y_val, y_pred, squared=False)
|
216
|
+
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
|
217
|
+
print(f"Validation RMSE: {rmse:.4f}")
|
218
|
+
|
219
|
+
if rmse < best_rmse:
|
220
|
+
best_rmse = rmse
|
221
|
+
no_improve_rounds = 0
|
222
|
+
else:
|
223
|
+
no_improve_rounds += 1
|
224
|
+
if no_improve_rounds >= self.early_stopping_rounds:
|
225
|
+
print("Early stopping triggered.")
|
226
|
+
break
|
227
|
+
|
228
|
+
def predict(self, X):
|
229
|
+
X_current = X.copy()
|
230
|
+
for layer in self.layers:
|
231
|
+
layer_outputs = []
|
232
|
+
for reg in layer:
|
233
|
+
n_features = reg.n_features_in_
|
234
|
+
preds = reg.predict(X_current[:, :n_features]).reshape(-1, 1)
|
235
|
+
layer_outputs.append(preds)
|
236
|
+
output = np.hstack(layer_outputs)
|
237
|
+
X_current = np.hstack([X_current, output])
|
238
|
+
|
239
|
+
# Final prediction = average of last layer regressors
|
240
|
+
final_outputs = []
|
241
|
+
for reg in self.layers[-1]:
|
242
|
+
n_features = reg.n_features_in_
|
243
|
+
final_outputs.append(reg.predict(X_current[:, :n_features]).reshape(-1, 1))
|
244
|
+
return np.mean(np.hstack(final_outputs), axis=1)
|
245
|
+
|
246
|
+
|
247
|
+
from sklearn.datasets import load_diabetes
|
248
|
+
from sklearn.model_selection import train_test_split
|
249
|
+
from sklearn.metrics import mean_squared_error
|
250
|
+
|
251
|
+
X, y = load_diabetes(return_X_y=True)
|
252
|
+
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
|
253
|
+
|
254
|
+
df_reg = XGBoostDeepForestRegressor(n_estimators_per_layer=2, max_layers=5)
|
255
|
+
df_reg.fit(X_train, y_train, X_val, y_val)
|
256
|
+
|
257
|
+
y_pred = df_reg.predict(X_val)
|
258
|
+
# rmse = mean_squared_error(y_val, y_pred, squared=False)
|
259
|
+
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
|
260
|
+
print("Final RMSE:", rmse)
|
261
|
+
|
262
|
+
# ----------------------------------------------------------------------------------------------------
|
263
|
+
|
264
|
+
import numpy as np
|
265
|
+
from xgboost import XGBRegressor
|
266
|
+
from sklearn.metrics import mean_squared_error
|
267
|
+
import itertools
|
268
|
+
|
269
|
+
class XGBoostDeepForestRegressor:
    """Cascade ("deep forest") of XGBoost regressors.

    Every layer holds ``n_estimators_per_layer`` fitted ``XGBRegressor``
    models.  The predictions of a layer are appended as extra feature
    columns to the input of the next layer; the final prediction is the
    mean of the last layer's outputs.

    Parameters
    ----------
    n_estimators_per_layer : int
        Number of regressors kept in each layer.
    max_layers : int
        Maximum number of layers ``fit`` will train.
    early_stopping_rounds : int
        ``fit`` stops after this many consecutive layers that do not lower
        the validation RMSE (only used when validation data is supplied).
    param_grid : dict or None
        Maps ``XGBRegressor`` keyword names to lists of candidate values.
        ``None`` (or an empty dict) selects a single default combination.
    use_gpu : bool
        When true, models are built with ``tree_method='hist'`` and a CUDA
        ``device``.
    gpu_id : int
        CUDA device ordinal used when ``use_gpu`` is true.
    """

    def __init__(self, n_estimators_per_layer=2, max_layers=20, early_stopping_rounds=2, param_grid=None, use_gpu=True, gpu_id=0):
        self.n_estimators_per_layer = n_estimators_per_layer
        self.max_layers = max_layers
        self.early_stopping_rounds = early_stopping_rounds
        self.param_grid = param_grid or {
            'max_depth': [3],
            'learning_rate': [0.1],
            'n_estimators': [100]
        }
        self.use_gpu = use_gpu
        self.gpu_id = gpu_id
        # One entry per trained layer; each entry is a list of fitted models.
        self.layers = []

    def _get_param_combinations(self):
        """Return every combination of ``param_grid`` as a list of kwargs dicts."""
        keys, values = zip(*self.param_grid.items())
        return [dict(zip(keys, v)) for v in itertools.product(*values)]

    def _model_kwargs(self, params):
        """Return a copy of ``params`` with the GPU settings added when enabled."""
        kwargs = dict(params)
        if self.use_gpu:
            kwargs['tree_method'] = 'hist'  # Use hist method
            # Honour the requested device ordinal instead of always using
            # whatever plain 'cuda' resolves to.
            kwargs['device'] = 'cuda' if self.gpu_id is None else f'cuda:{self.gpu_id}'
        return kwargs

    @staticmethod
    def _layer_outputs(layer, X):
        """Stack the predictions of every model in ``layer`` as columns.

        Each model only receives the first ``n_features_in_`` columns of
        ``X``, i.e. the feature width it was trained on.
        """
        columns = [
            np.asarray(reg.predict(X[:, :reg.n_features_in_])).reshape(-1, 1)
            for reg in layer
        ]
        return np.hstack(columns)

    def _fit_layer(self, X, y, X_val=None, y_val=None):
        """Train one layer and return ``(models, training_predictions)``.

        For every estimator slot all parameter combinations are fitted on
        ``(X, y)``.  With validation data the combination with the lowest
        validation RMSE is kept; without it the last combination is kept.
        """
        layer = []
        layer_outputs = []
        param_combos = self._get_param_combinations()

        # NOTE(review): every slot sees the same data and the same candidates,
        # so the models of one layer are presumably identical unless the
        # parameters introduce randomness -- confirm whether this is intended.
        for _ in range(self.n_estimators_per_layer):
            best_rmse = float('inf')
            best_model = None

            for params in param_combos:
                model = XGBRegressor(**self._model_kwargs(params))
                model.fit(X, y)

                if X_val is None:
                    # Nothing to rank the candidates with: the last one wins.
                    best_model = model
                    continue

                preds_val = model.predict(X_val)
                rmse = np.sqrt(mean_squared_error(y_val, preds_val))
                # ``best_model is None`` keeps a model even when every RMSE
                # is NaN, instead of failing later on ``None.predict``.
                if rmse < best_rmse or best_model is None:
                    best_rmse = rmse
                    best_model = model

            layer.append(best_model)
            layer_outputs.append(best_model.predict(X).reshape(-1, 1))

        return layer, np.hstack(layer_outputs)

    def fit(self, X, y, X_val=None, y_val=None):
        """Train the cascade layer by layer.

        Parameters
        ----------
        X, y : array-like
            Training features (2-D) and targets.
        X_val, y_val : array-like, optional
            Validation data.  When given it is used to pick the parameter
            combination of every model and to stop early.

        Returns
        -------
        self
        """
        X_current = np.asarray(X)
        X_val_current = np.asarray(X_val) if X_val is not None else None

        # Start from scratch so a refit does not stack on stale layers.
        self.layers = []
        best_rmse = float("inf")
        no_improve_rounds = 0

        for layer_index in range(self.max_layers):
            print(f"Training Layer {layer_index + 1}")
            layer, output = self._fit_layer(X_current, y, X_val_current, y_val)
            self.layers.append(layer)
            X_current = np.hstack([X_current, output])

            if X_val_current is None:
                continue

            val_output = self._layer_outputs(layer, X_val_current)
            X_val_current = np.hstack([X_val_current, val_output])

            # Mean of the newest layer's validation outputs is exactly what
            # ``predict(X_val)`` returns, without re-running the whole cascade.
            y_pred = np.mean(val_output, axis=1)
            rmse = np.sqrt(mean_squared_error(y_val, y_pred))
            print(f"Validation RMSE: {rmse:.4f}")

            if rmse < best_rmse:
                best_rmse = rmse
                no_improve_rounds = 0
            else:
                no_improve_rounds += 1
                if no_improve_rounds >= self.early_stopping_rounds:
                    # NOTE(review): the non-improving layers are kept, so the
                    # final model is not the best-scoring one -- confirm.
                    print("Early stopping triggered.")
                    break

        return self

    def predict(self, X):
        """Return the mean prediction of the last layer for ``X``.

        Raises
        ------
        RuntimeError
            If the model has no trained layers.
        """
        if not self.layers:
            raise RuntimeError("XGBoostDeepForestRegressor is not fitted yet; call fit() first.")

        X_current = np.asarray(X)
        output = None
        for layer in self.layers:
            output = self._layer_outputs(layer, X_current)
            X_current = np.hstack([X_current, output])

        # ``output`` already holds the last layer's predictions.
        return np.mean(output, axis=1)
|
373
|
+
|
374
|
+
|
375
|
+
from sklearn.datasets import load_diabetes
|
376
|
+
from sklearn.model_selection import train_test_split
|
377
|
+
from sklearn.metrics import mean_squared_error
|
378
|
+
|
379
|
+
# Demo: fit XGBoostDeepForestRegressor on the scikit-learn diabetes data and
# compare it with a single XGBRegressor trained on the same split.

# Load dataset
X, y = load_diabetes(return_X_y=True)
# Hold out 30% of the rows for validation; fixed seed keeps the split stable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

# Hyperparameter grid
# Every combination (2 x 2 x 2 = 8) is tried for each model in each layer.
param_grid = {
    'max_depth': [3, 4],
    'learning_rate': [0.1, 0.05],
    'n_estimators': [50, 100]
}

# Create and fit the model with GPU enabled
df_reg = XGBoostDeepForestRegressor(
    n_estimators_per_layer=2,
    max_layers=5,
    early_stopping_rounds=2,
    param_grid=param_grid,
    use_gpu=True,  # Enable GPU acceleration
    gpu_id=0  # Default to the first GPU
)

# The validation split is passed to fit, so the RMSE reported below is
# measured on data that also drove model selection and early stopping.
df_reg.fit(X_train, y_train, X_val, y_val)

# Final evaluation
y_pred = df_reg.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print("Final RMSE:", rmse)

# ----------------------------------------------------------------------------------------------------

# Baseline: one plain XGBRegressor with fixed hyperparameters.
xgb_params = {
    "objective": "reg:squarederror",
    "random_state": 0,
    'seed': 0,
    'n_estimators': 100,
    'max_depth': 6,
    'min_child_weight': 4,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'learning_rate': 0.05,
}

from xgboost import XGBRegressor
regr = XGBRegressor(**xgb_params)

regr.fit(X_train, y_train)
# Overwrites the deep-forest predictions stored in y_pred above.
y_pred = regr.predict(X_val)


from scipy import stats

# NOTE(review): the regression result is not stored or printed, so this line
# has no visible effect when run as a script -- presumably notebook residue.
stats.linregress(y_val, y_pred)
|
434
|
+
|
435
|
+
'''
|
sciml/utils.py
CHANGED
@@ -1,46 +1,46 @@
|
|
1
|
-
import numpy as np
|
2
|
-
import pandas as pd
|
3
|
-
from sklearn.model_selection import ShuffleSplit
|
4
|
-
from sklearn.model_selection import train_test_split
|
5
|
-
|
6
|
-
# randomly select sites
|
7
|
-
def random_select(ds, count, num, random_state = 0):
|
8
|
-
np.random.seed(random_state)
|
9
|
-
idxs = np.random.choice(np.delete(np.arange(len(ds)), count), num, replace = False)
|
10
|
-
return np.sort(idxs)
|
11
|
-
|
12
|
-
def split(Xs, ys, return_index = False, test_size = 0.33, random_state = 42):
|
13
|
-
if return_index:
|
14
|
-
sss = ShuffleSplit(n_splits=1, test_size = test_size, random_state = random_state)
|
15
|
-
sss.get_n_splits(Xs, ys)
|
16
|
-
train_index, test_index = next(sss.split(Xs, ys))
|
17
|
-
return (train_index, test_index)
|
18
|
-
else:
|
19
|
-
X_train, X_test, y_train, y_test = train_test_split(
|
20
|
-
Xs, ys,
|
21
|
-
test_size = test_size,
|
22
|
-
random_state = random_state
|
23
|
-
)
|
24
|
-
return (X_train, X_test, y_train, y_test)
|
25
|
-
|
26
|
-
def split_cut(Xs, ys, test_ratio = 0.33):
|
27
|
-
assert ys.ndim == 2, 'ys must be 2D!'
|
28
|
-
assert len(Xs) == len(ys), 'Xs and ys should be equally long!'
|
29
|
-
assert type(Xs) == type(ys), 'Xs and ys should be the same data type!'
|
30
|
-
if not type(Xs) in [pd.core.frame.DataFrame, np.ndarray]: raise Exception('Only accept numpy ndarray or pandas dataframe')
|
31
|
-
anchor = int(np.floor(len(ys) * (1 - test_ratio)))
|
32
|
-
|
33
|
-
if type(Xs) == pd.core.frame.DataFrame:
|
34
|
-
X_train = Xs.iloc[0: anchor, :]
|
35
|
-
X_test = Xs.iloc[anchor::, :]
|
36
|
-
y_train = ys.iloc[0: anchor, :]
|
37
|
-
y_test = ys.iloc[anchor::, :]
|
38
|
-
else:
|
39
|
-
X_train = Xs[0: anchor, :]
|
40
|
-
X_test = Xs[anchor::, :]
|
41
|
-
y_train = ys[0: anchor, :]
|
42
|
-
y_test = ys[anchor::, :]
|
43
|
-
|
44
|
-
assert len(X_train) + len(X_test) == len(Xs), 'The sum of train and test lengths must equal to Xs/ys!'
|
45
|
-
|
1
|
+
import numpy as np
|
2
|
+
import pandas as pd
|
3
|
+
from sklearn.model_selection import ShuffleSplit
|
4
|
+
from sklearn.model_selection import train_test_split
|
5
|
+
|
6
|
+
# randomly select sites
|
7
|
+
def random_select(ds, count, num, random_state = 0):
    """Pick ``num`` distinct positions of ``ds`` at random.

    Parameters
    ----------
    ds : sized
        Only ``len(ds)`` is used, to build the pool ``0 .. len(ds) - 1``.
    count : int or array-like of int
        Position(s) removed from the pool before drawing.
    num : int
        Number of positions to draw (without replacement).
    random_state : int
        Seed of the generator used for the draw.

    Returns
    -------
    numpy.ndarray
        The drawn positions, sorted ascending.
    """
    candidates = np.delete(np.arange(len(ds)), count)
    # A private RandomState yields the same draw as seeding the global
    # generator, but leaves the caller's np.random state untouched.
    rng = np.random.RandomState(random_state)
    idxs = rng.choice(candidates, num, replace = False)
    return np.sort(idxs)
|
11
|
+
|
12
|
+
def split(Xs, ys, return_index = False, test_size = 0.33, random_state = 42):
    """Randomly split ``Xs``/``ys`` into a train and a test part.

    Parameters
    ----------
    Xs, ys : array-like
        Features and targets, passed on to scikit-learn unchanged.
    return_index : bool
        If true, return the row indices of a single ``ShuffleSplit`` draw
        instead of the split data.
    test_size : float or int
        Forwarded to scikit-learn as ``test_size``.
    random_state : int
        Seed forwarded to scikit-learn.

    Returns
    -------
    tuple
        ``(train_index, test_index)`` when ``return_index`` is true,
        otherwise ``(X_train, X_test, y_train, y_test)``.
    """
    if return_index:
        splitter = ShuffleSplit(n_splits=1, test_size = test_size, random_state = random_state)
        # Only one split is configured, so the first item is the whole result.
        train_index, test_index = next(splitter.split(Xs, ys))
        return (train_index, test_index)

    X_train, X_test, y_train, y_test = train_test_split(
        Xs, ys,
        test_size = test_size,
        random_state = random_state
    )
    return (X_train, X_test, y_train, y_test)
|
25
|
+
|
26
|
+
def split_cut(Xs, ys, test_ratio = 0.33):
    """Split ``Xs``/``ys`` at one row position, keeping the row order.

    The first ``floor(len(ys) * (1 - test_ratio))`` rows form the training
    part and the remaining rows form the test part.

    Parameters
    ----------
    Xs, ys : numpy.ndarray or pandas.DataFrame
        Both must be of the same type and the same length; ``ys`` must be 2-D.
    test_ratio : float
        Fraction of rows placed in the test part.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    AssertionError
        If ``ys`` is not 2-D, the lengths differ, or the types differ.
    Exception
        If the inputs are neither ndarrays nor DataFrames.
    """
    # Raised explicitly (not via ``assert``) so the checks still run under
    # ``python -O``; the exception type and messages are unchanged.
    if ys.ndim != 2:
        raise AssertionError('ys must be 2D!')
    if len(Xs) != len(ys):
        raise AssertionError('Xs and ys should be equally long!')
    if type(Xs) is not type(ys):
        raise AssertionError('Xs and ys should be the same data type!')
    if type(Xs) not in [pd.DataFrame, np.ndarray]:
        raise Exception('Only accept numpy ndarray or pandas dataframe')

    anchor = int(np.floor(len(ys) * (1 - test_ratio)))

    # DataFrames need positional ``.iloc`` indexing; ndarrays index directly.
    if type(Xs) is pd.DataFrame:
        X_rows, y_rows = Xs.iloc, ys.iloc
    else:
        X_rows, y_rows = Xs, ys

    X_train = X_rows[0: anchor, :]
    X_test = X_rows[anchor::, :]
    y_train = y_rows[0: anchor, :]
    y_test = y_rows[anchor::, :]

    assert len(X_train) + len(X_test) == len(Xs), 'The sum of train and test lengths must equal to Xs/ys!'

    return (X_train, X_test, y_train, y_test)
|
@@ -1,21 +1,21 @@
|
|
1
|
-
MIT License
|
2
|
-
|
3
|
-
Copyright (c) 2021 Zhu
|
4
|
-
|
5
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
-
of this software and associated documentation files (the "Software"), to deal
|
7
|
-
in the Software without restriction, including without limitation the rights
|
8
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
-
copies of the Software, and to permit persons to whom the Software is
|
10
|
-
furnished to do so, subject to the following conditions:
|
11
|
-
|
12
|
-
The above copyright notice and this permission notice shall be included in all
|
13
|
-
copies or substantial portions of the Software.
|
14
|
-
|
15
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
-
SOFTWARE.
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2021 Zhu
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
@@ -1,13 +1,13 @@
|
|
1
|
-
Metadata-Version: 2.1
|
2
|
-
Name: sciml
|
3
|
-
Version: 0.0.
|
4
|
-
Summary: draw and basic calculations/conversions
|
5
|
-
Home-page: https://github.com/soonyenju/sciml
|
6
|
-
Author: Songyan Zhu
|
7
|
-
Author-email: zhusy93@gmail.com
|
8
|
-
License: MIT Licence
|
9
|
-
Keywords: Scientific machine learning wrappers
|
10
|
-
Platform: any
|
11
|
-
License-File: LICENSE
|
12
|
-
|
13
|
-
coming soon
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: sciml
|
3
|
+
Version: 0.0.9
|
4
|
+
Summary: draw and basic calculations/conversions
|
5
|
+
Home-page: https://github.com/soonyenju/sciml
|
6
|
+
Author: Songyan Zhu
|
7
|
+
Author-email: zhusy93@gmail.com
|
8
|
+
License: MIT Licence
|
9
|
+
Keywords: Scientific machine learning wrappers
|
10
|
+
Platform: any
|
11
|
+
License-File: LICENSE
|
12
|
+
|
13
|
+
coming soon
|
@@ -0,0 +1,9 @@
|
|
1
|
+
sciml/__init__.py,sha256=wtdlXERN2ik7NT_TQxFdd2gdodBY9vSU1ClSdeJnLm4,59
|
2
|
+
sciml/models.py,sha256=BjbliW-KNfzbNdGNgM7nBdJ2SF2z21qCoAvug_v0FEg,10574
|
3
|
+
sciml/pipelines.py,sha256=ReNEkQbdFn04D5G2tbxcA7jdSwACy8SnmZ8bFZI_oqE,15702
|
4
|
+
sciml/utils.py,sha256=qCdABaTUu3K0R269jI7D_8SO6AqEjphg03CzdxCJR2k,1876
|
5
|
+
sciml-0.0.9.dist-info/LICENSE,sha256=hcunSTJmVgRcUNOa1rKl8axtY3Jsy2B4wXDYtQsrAt0,1081
|
6
|
+
sciml-0.0.9.dist-info/METADATA,sha256=S5hG3pP3x4yDPe8AJOKn4R-fIuvL-DL1GSKqGqiImSw,326
|
7
|
+
sciml-0.0.9.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
8
|
+
sciml-0.0.9.dist-info/top_level.txt,sha256=dS_7aBCZFKQE3myPy5sh4USjQZCZyGg382-YxUUYcdw,6
|
9
|
+
sciml-0.0.9.dist-info/RECORD,,
|
sciml-0.0.7.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
|
|
1
|
-
sciml/__init__.py,sha256=Asqzx08kEOBLv_IRE20VlHxZu9XgydyrzIMUDRE-qiU,48
|
2
|
-
sciml/pipelines.py,sha256=5qfeHdxGhF-GMu-rTiInPv5metXiT32uSENIDFd2Ths,6333
|
3
|
-
sciml/utils.py,sha256=u5DzQJV4aCZ-p7sY56Fxzj8WDGYOgn1rOTeGzAw0vwY,1831
|
4
|
-
sciml-0.0.7.dist-info/LICENSE,sha256=dX4jBmkgQPWc_TfYkXtKQzVIgZQWFuHZ8vQjV4sEeV4,1060
|
5
|
-
sciml-0.0.7.dist-info/METADATA,sha256=363EbWoSVqR9qAdhOfeVD8RiP6DfcalvDiZECJ6LW3s,313
|
6
|
-
sciml-0.0.7.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
|
7
|
-
sciml-0.0.7.dist-info/top_level.txt,sha256=dS_7aBCZFKQE3myPy5sh4USjQZCZyGg382-YxUUYcdw,6
|
8
|
-
sciml-0.0.7.dist-info/RECORD,,
|
File without changes
|