sciml 0.0.11__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sciml/pipelines.py CHANGED
@@ -1,226 +1,226 @@
1
- import numpy as np
2
- import pandas as pd
3
- from scipy import stats
4
- from copy import deepcopy
5
- from tqdm import tqdm
6
- from sklearn.metrics import mean_squared_error
7
- from xgboost import XGBRegressor
8
-
9
- def get_metrics(df, truth = 'truth', pred = 'pred', return_dict = False):
10
- '''
11
- Calculate statistical measures between validation and prediction sequences
12
- '''
13
- df = df[[truth, pred]].copy().dropna()
14
- slope, intercept, r_value, p_value, std_err = stats.linregress(df.dropna()[truth], df.dropna()[pred])
15
- r2 = r_value**2
16
- mse = mean_squared_error(df.dropna()[truth], df.dropna()[pred])
17
- rmse = np.sqrt(mse)
18
- mbe = np.mean(df.dropna()[pred] - df.dropna()[truth])
19
- mae = (df.dropna()[pred] - df.dropna()[truth]).abs().mean()
20
- if return_dict:
21
- return pd.DataFrame.from_dict([{
22
- 'r2': r2,
23
- 'Slope': slope,
24
- 'RMSE': rmse,
25
- 'MBE': mbe,
26
- 'MAE': mae,
27
- 'Intercept': intercept,
28
- 'p-value': p_value,
29
- 'std_err': std_err
30
- }])
31
- else:
32
- return r2, slope, rmse, mbe, mae, intercept, p_value, std_err
33
-
34
- # ===============================================================================================================================
35
- # Machine learning algorithms
36
- def train_ml(
37
- X_train, y_train, model_name = 'XGB',
38
- xgb_params_user = None, rfr_params_user = None,
39
- mlp_params_user = None, svr_params_user = None,
40
- df21_params_user = None,
41
- gpu = False, partial_mode = False
42
- ):
43
- # -------------------------------------------------------------------------
44
- # Setup parameters:
45
- if xgb_params_user:
46
- xgb_params = xgb_params_user
47
- else:
48
- xgb_params = {
49
- "objective": "reg:squarederror",
50
- "random_state": 0,
51
- 'seed': 0,
52
- 'n_estimators': 100,
53
- 'max_depth': 6,
54
- 'min_child_weight': 4,
55
- 'subsample': 0.8,
56
- 'colsample_bytree': 0.8,
57
- 'gamma': 0,
58
- 'reg_alpha': 0,
59
- 'reg_lambda': 1,
60
- 'learning_rate': 0.05,
61
- }
62
-
63
- xgb_gpu_params = {
64
- 'tree_method': 'gpu_hist',
65
- 'gpu_id': 0,
66
- # "n_gpus": 2,
67
- }
68
-
69
- if gpu: xgb_params.update(xgb_gpu_params)
70
-
71
- if rfr_params_user:
72
- rfr_params = rfr_params_user
73
- else:
74
- rfr_params = {
75
- 'max_depth': 20,
76
- 'min_samples_leaf': 3,
77
- 'min_samples_split': 12,
78
- 'n_estimators': 100,
79
- 'n_jobs': -1
80
- }
81
-
82
- if df21_params_user:
83
- df21_params = df21_params_user
84
- else:
85
- df21_params = {
86
- 'random_state': 1,
87
- 'verbose' : 0,
88
- 'predictor': "xgboost",
89
- 'n_jobs' : -1,
90
- 'predictor_kwargs' : xgb_params,
91
- 'partial_mode' : partial_mode
92
- }
93
- # -------------------------------------------------------------------------
94
- # Run:
95
- if model_name == "XGB":
96
- from xgboost import XGBRegressor
97
- regr = XGBRegressor(**xgb_params)
98
- elif model_name == "MLP":
99
- from sklearn.neural_network import MLPRegressor
100
- regr = MLPRegressor(**mlp_params_user)
101
- elif model_name == "RFR":
102
- from sklearn.ensemble import RandomForestRegressor
103
- regr = RandomForestRegressor(**rfr_params)
104
- elif model_name == "SVR":
105
- from sklearn.svm import SVR
106
- regr = SVR(**svr_params_user)
107
- elif model_name == "DF21":
108
- from deepforest import CascadeForestRegressor
109
- # https://deep-forest.readthedocs.io/en/latest/api_reference.html?highlight=CascadeForestRegressor#cascadeforestregressor
110
- # predictor: {"forest", "xgboost", "lightgbm"}
111
- # regr = CascadeForestRegressor(random_state = 1, verbose = 0, predictor = "xgboost", n_jobs = -1, predictor_kwargs = xgb_params, partial_mode = partial_mode)
112
- regr = CascadeForestRegressor(**df21_params)
113
- regr.fit(X_train, y_train)
114
- return regr
115
-
116
- def test_ml(X_test, y_test, regr):
117
- res = y_test.copy() # y_test is 2D pandas dataframe.
118
- res.columns = ['truth']
119
- res['pred'] = regr.predict(X_test)
120
- return res
121
-
122
- def run_ensemble(X_train, y_train, n_models = 10, frac_sample = 0.8):
123
- base_params_xgb = {
124
- "objective": "reg:squarederror",
125
- 'seed': 0,
126
- "random_state": 0,
127
- }
128
- params_xgb = deepcopy(base_params_xgb)
129
- # dropout-like regularization
130
- params_xgb.update({
131
- "subsample": 0.8, # Use 80% of the data for each tree
132
- "colsample_bytree": 0.8, # Use 80% of the features for each tree
133
- })
134
-
135
- models = []
136
- for i in tqdm(range(n_models)):
137
- # Create a bootstrapped dataset
138
- y_resampled = y_train.copy().sample(frac = frac_sample, random_state = i)
139
- X_resampled = X_train.copy().loc[y_resampled.index]
140
- # print(y_resampled.sort_index().index[0], y_resampled.sort_index().index[-1])
141
-
142
- # Train the XGBoost model
143
- params_xgb.update({'random_state': i})
144
- model = XGBRegressor(**params_xgb)
145
- model.fit(X_resampled, y_resampled)
146
- models.append(model)
147
- return models
148
-
149
- # ===============================================================================================================================
150
- # Deep learning neural networks
151
-
152
- try:
153
- from tensorflow import keras
154
- from tensorflow.keras import layers
155
- from tensorflow.keras import models
156
- # from keras.layers import Dropout
157
- from keras.callbacks import EarlyStopping
158
- from scitbx.utils import *
159
- except Exception as e:
160
- print(e)
161
-
162
- def train_lstm(X_train, y_train, nfeature, ntime, verbose = 2, epochs = 200, batch_size = 64):
163
- # create and fit the LSTM network
164
- model = models.Sequential()
165
- model.add(layers.LSTM(64, input_shape=(nfeature, ntime)))
166
- model.add(layers.Dropout(0.2))
167
- model.add(layers.Dense(16, activation='relu'))
168
- model.add(layers.Dropout(0.2))
169
- model.add(layers.Dense(1, activation='relu'))
170
- model.compile(loss='mean_squared_error', optimizer='adam')
171
- # es = EarlyStopping(monitor='loss', mode='min', verbose=1)
172
- # model.fit(X_train.reshape(-1, nsites, nfeats), y_train, epochs=100, batch_size=256, verbose=2, callbacks=[es])
173
- model.fit(X_train, y_train, epochs = epochs, batch_size = batch_size, verbose=verbose)
174
- return model
175
-
176
- # ===============================================================================================================================
177
- # Training utils
178
- import numpy as np
179
- import pandas as pd
180
- from sklearn.model_selection import ShuffleSplit
181
- from sklearn.model_selection import train_test_split
182
-
183
- # randomly select sites
184
- def random_select(ds, count, num, random_state = 0):
185
- np.random.seed(random_state)
186
- idxs = np.random.choice(np.delete(np.arange(len(ds)), count), num, replace = False)
187
- return np.sort(idxs)
188
-
189
- def split(Xs, ys, return_index = False, test_size = 0.33, random_state = 42):
190
- if return_index:
191
- sss = ShuffleSplit(n_splits=1, test_size = test_size, random_state = random_state)
192
- sss.get_n_splits(Xs, ys)
193
- train_index, test_index = next(sss.split(Xs, ys))
194
- return (train_index, test_index)
195
- else:
196
- X_train, X_test, y_train, y_test = train_test_split(
197
- Xs, ys,
198
- test_size = test_size,
199
- random_state = random_state
200
- )
201
- return (X_train, X_test, y_train, y_test)
202
-
203
- def split_cut(Xs, ys, test_ratio = 0.33):
204
- """
205
- Split the timeseries into before and after halves
206
- """
207
- assert ys.ndim == 2, 'ys must be 2D!'
208
- assert len(Xs) == len(ys), 'Xs and ys should be equally long!'
209
- assert type(Xs) == type(ys), 'Xs and ys should be the same data type!'
210
- if not type(Xs) in [pd.core.frame.DataFrame, np.ndarray]: raise Exception('Only accept numpy ndarray or pandas dataframe')
211
- anchor = int(np.floor(len(ys) * (1 - test_ratio)))
212
-
213
- if type(Xs) == pd.core.frame.DataFrame:
214
- X_train = Xs.iloc[0: anchor, :]
215
- X_test = Xs.iloc[anchor::, :]
216
- y_train = ys.iloc[0: anchor, :]
217
- y_test = ys.iloc[anchor::, :]
218
- else:
219
- X_train = Xs[0: anchor, :]
220
- X_test = Xs[anchor::, :]
221
- y_train = ys[0: anchor, :]
222
- y_test = ys[anchor::, :]
223
-
224
- assert len(X_train) + len(X_test) == len(Xs), 'The sum of train and test lengths must equal to Xs/ys!'
225
-
1
+ import numpy as np
2
+ import pandas as pd
3
+ from scipy import stats
4
+ from copy import deepcopy
5
+ from tqdm import tqdm
6
+ from sklearn.metrics import mean_squared_error
7
+ from xgboost import XGBRegressor
8
+
9
def get_metrics(df, truth = 'truth', pred = 'pred', return_dict = False):
    '''
    Calculate statistical measures between validation and prediction sequences

    Rows in which either the `truth` or the `pred` value is NaN are dropped
    once, up front; every metric is computed on the remaining pairs.

    Parameters
    ----------
    df : pandas DataFrame containing at least the `truth` and `pred` columns.
    truth : name of the column with the reference values.
    pred : name of the column with the predicted values.
    return_dict : if True, return a one-row pandas DataFrame (not a dict,
        despite the flag name) with the columns
        r2, Slope, RMSE, MBE, MAE, Intercept, p-value, std_err.
        If False, return the plain tuple described below.

    Returns
    -------
    (r2, slope, rmse, mbe, mae, intercept, p_value, std_err), or the one-row
    DataFrame when `return_dict` is True. The regression is `pred` regressed
    on `truth`; MBE is mean(pred - truth), so a positive value means the
    predictions are too high on average.
    '''
    # Clean once; the original re-ran dropna() for every single metric.
    pairs = df[[truth, pred]].dropna()
    observed = pairs[truth]
    predicted = pairs[pred]

    slope, intercept, r_value, p_value, std_err = stats.linregress(observed, predicted)
    r2 = r_value**2

    # All error metrics derive from the same residual vector.
    residuals = predicted - observed
    rmse = np.sqrt(np.mean(residuals**2))
    mbe = np.mean(residuals)
    mae = residuals.abs().mean()

    if return_dict:
        return pd.DataFrame([{
            'r2': r2,
            'Slope': slope,
            'RMSE': rmse,
            'MBE': mbe,
            'MAE': mae,
            'Intercept': intercept,
            'p-value': p_value,
            'std_err': std_err
        }])
    return r2, slope, rmse, mbe, mae, intercept, p_value, std_err
33
+
34
+ # ===============================================================================================================================
35
+ # Machine learning algorithms
36
def train_ml(
    X_train, y_train, model_name = 'XGB',
    xgb_params_user = None, rfr_params_user = None,
    mlp_params_user = None, svr_params_user = None,
    df21_params_user = None,
    gpu = False, partial_mode = False
):
    '''
    Fit one regressor, chosen by `model_name`, on the training data.

    Parameters
    ----------
    X_train, y_train : training features and targets, handed unchanged to the
        selected estimator's `fit`.
    model_name : one of
        'XGB'  - xgboost.XGBRegressor
        'MLP'  - sklearn.neural_network.MLPRegressor
        'RFR'  - sklearn.ensemble.RandomForestRegressor
        'SVR'  - sklearn.svm.SVR
        'DF21' - deepforest.CascadeForestRegressor
    xgb_params_user, rfr_params_user, df21_params_user : optional dict of
        constructor keyword arguments. When falsy (None or an empty dict) the
        defaults defined in this function are used instead.
    mlp_params_user, svr_params_user : optional dict of constructor keyword
        arguments. When falsy the estimator is built with its library defaults.
    gpu : if True, the XGBoost GPU settings are merged into the XGB
        parameters (user supplied or default). The default DF21 parameters
        embed those XGB parameters as `predictor_kwargs`; user supplied DF21
        parameters are used as given.
    partial_mode : only used in the default DF21 parameters.

    Returns
    -------
    The fitted estimator.

    Raises
    ------
    ValueError : if `model_name` is not one of the supported names.
    '''
    supported_models = ('XGB', 'MLP', 'RFR', 'SVR', 'DF21')
    if model_name not in supported_models:
        # Fail early with a clear message instead of an unbound `regr` later on.
        raise ValueError(
            'Unknown model_name {!r}; expected one of {}'.format(model_name, supported_models)
        )

    # -------------------------------------------------------------------------
    # Setup parameters:
    if xgb_params_user:
        # Copy: the GPU update below must never leak into the caller's dict.
        xgb_params = dict(xgb_params_user)
    else:
        xgb_params = {
            "objective": "reg:squarederror",
            "random_state": 0,
            'seed': 0,
            'n_estimators': 100,
            'max_depth': 6,
            'min_child_weight': 4,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'gamma': 0,
            'reg_alpha': 0,
            'reg_lambda': 1,
            'learning_rate': 0.05,
        }

    if gpu:
        # NOTE(review): newer XGBoost releases appear to prefer
        # tree_method='hist' with device='cuda' over these two keys - confirm
        # against the XGBoost version this package is pinned to.
        xgb_params.update({
            'tree_method': 'gpu_hist',
            'gpu_id': 0,
            # "n_gpus": 2,
        })

    # -------------------------------------------------------------------------
    # Run (each backend is imported lazily so only the selected one is required):
    if model_name == "XGB":
        from xgboost import XGBRegressor
        regr = XGBRegressor(**xgb_params)
    elif model_name == "MLP":
        from sklearn.neural_network import MLPRegressor
        # `**None` would raise a TypeError; fall back to the library defaults.
        regr = MLPRegressor(**(mlp_params_user or {}))
    elif model_name == "RFR":
        from sklearn.ensemble import RandomForestRegressor
        rfr_params = rfr_params_user or {
            'max_depth': 20,
            'min_samples_leaf': 3,
            'min_samples_split': 12,
            'n_estimators': 100,
            'n_jobs': -1
        }
        regr = RandomForestRegressor(**rfr_params)
    elif model_name == "SVR":
        from sklearn.svm import SVR
        regr = SVR(**(svr_params_user or {}))
    else:  # "DF21"
        from deepforest import CascadeForestRegressor
        # https://deep-forest.readthedocs.io/en/latest/api_reference.html?highlight=CascadeForestRegressor#cascadeforestregressor
        # predictor: {"forest", "xgboost", "lightgbm"}
        df21_params = df21_params_user or {
            'random_state': 1,
            'verbose' : 0,
            'predictor': "xgboost",
            'n_jobs' : -1,
            'predictor_kwargs' : xgb_params,
            'partial_mode' : partial_mode
        }
        regr = CascadeForestRegressor(**df21_params)

    regr.fit(X_train, y_train)
    return regr
115
+
116
def test_ml(X_test, y_test, regr):
    '''
    Pair the held-out targets with the model's predictions.

    Parameters
    ----------
    X_test : features passed as-is to `regr.predict`.
    y_test : single-column pandas DataFrame, or a pandas Series, holding the
        reference values. It is copied, never modified.
    regr : any object exposing `predict(X_test)`.

    Returns
    -------
    pandas DataFrame indexed like `y_test` with the columns 'truth' (the
    values of `y_test`) and 'pred' (the output of `regr.predict`).
    '''
    # A Series has no `columns` to rename, so promote it to a one-column frame.
    truth = y_test.to_frame() if isinstance(y_test, pd.Series) else y_test
    res = truth.copy()
    # Raises if y_test carries more than one column.
    res.columns = ['truth']
    res['pred'] = regr.predict(X_test)
    return res


# The `test_` prefix matches pytest's default collection pattern; opt out so
# importing this helper into a test module does not register it as a test.
test_ml.__test__ = False
121
+
122
def run_ensemble(X_train, y_train, n_models = 10, frac_sample = 0.8):
    '''
    Train an ensemble of XGBoost regressors on random subsets of the data.

    Model number i is fitted on a fraction `frac_sample` of the training rows,
    drawn WITHOUT replacement via pandas `sample(random_state = i)` (a
    subsample, not a classical bootstrap), and is built with random_state = i.

    Parameters
    ----------
    X_train, y_train : pandas objects sharing the same index labels; the
        feature rows are looked up with `X_train.loc[<sampled labels>]`.
        Assumes the index labels are unique - TODO confirm with callers.
    n_models : number of models to train.
    frac_sample : fraction of rows given to each model.

    Returns
    -------
    list of fitted XGBRegressor objects, in training order.
    '''
    shared_params = {
        "objective": "reg:squarederror",
        'seed': 0,
        # dropout-like regularization
        "subsample": 0.8,         # Use 80% of the data for each tree
        "colsample_bytree": 0.8,  # Use 80% of the features for each tree
    }

    models = []
    for i in tqdm(range(n_models)):
        # `sample` and `.loc[labels]` already return new objects, so the
        # inputs need no defensive full copy on every iteration.
        y_resampled = y_train.sample(frac = frac_sample, random_state = i)
        X_resampled = X_train.loc[y_resampled.index]

        # Fresh parameter dict per model; nothing is mutated across iterations.
        # NOTE(review): 'seed' (fixed at 0) and 'random_state' (= i) are both
        # passed, exactly as before; which one XGBoost honours should be
        # confirmed against the installed version.
        params_xgb = dict(shared_params, random_state = i)
        model = XGBRegressor(**params_xgb)
        model.fit(X_resampled, y_resampled)
        models.append(model)
    return models
148
+
149
+ # ===============================================================================================================================
150
+ # Deep learning neural networks
151
+
152
+ try:
153
+ from tensorflow import keras
154
+ from tensorflow.keras import layers
155
+ from tensorflow.keras import models
156
+ # from keras.layers import Dropout
157
+ from keras.callbacks import EarlyStopping
158
+ from scitbx.utils import *
159
+ except Exception as e:
160
+ print(e)
161
+
162
def train_lstm(X_train, y_train, nfeature, ntime, verbose = 2, epochs = 200, batch_size = 64):
    '''
    Build, compile and fit a small LSTM regression network.

    Layer stack: LSTM(64) -> Dropout(0.2) -> Dense(16, relu) -> Dropout(0.2)
    -> Dense(1, relu), trained with mean squared error and the Adam optimizer.
    The ReLU on the output layer means predictions are never negative.

    Parameters
    ----------
    X_train, y_train : training inputs and targets given to `model.fit`.
    nfeature, ntime : passed to the LSTM layer as input_shape = (nfeature, ntime).
        NOTE(review): Keras recurrent layers read input_shape as
        (timesteps, features) - confirm that X_train is laid out as
        (samples, nfeature, ntime).
    verbose, epochs, batch_size : forwarded to `model.fit`.

    Returns
    -------
    The fitted Keras model.
    '''
    # Same layers, in the same order, handed to the constructor in one go.
    network = models.Sequential([
        layers.LSTM(64, input_shape = (nfeature, ntime)),
        layers.Dropout(0.2),
        layers.Dense(16, activation = 'relu'),
        layers.Dropout(0.2),
        layers.Dense(1, activation = 'relu'),
    ])
    network.compile(loss = 'mean_squared_error', optimizer = 'adam')
    network.fit(
        X_train, y_train,
        epochs = epochs,
        batch_size = batch_size,
        verbose = verbose,
    )
    return network
175
+
176
+ # ===============================================================================================================================
177
+ # Training utils
178
+ import numpy as np
179
+ import pandas as pd
180
+ from sklearn.model_selection import ShuffleSplit
181
+ from sklearn.model_selection import train_test_split
182
+
183
+ # randomly select sites
184
def random_select(ds, count, num, random_state = 0):
    '''
    Draw `num` distinct positions of `ds`, skipping the positions in `count`.

    Parameters
    ----------
    ds : any sized object; only `len(ds)` is used.
    count : position(s) to exclude, in any form accepted by `np.delete`.
    num : how many positions to draw (without replacement).
    random_state : seed. Note that this reseeds NumPy's GLOBAL random
        generator on every call.

    Returns
    -------
    numpy array with the drawn positions in ascending order.
    '''
    np.random.seed(random_state)
    every_position = np.arange(len(ds))
    candidates = np.delete(every_position, count)
    drawn = np.random.choice(candidates, num, replace = False)
    return np.sort(drawn)
188
+
189
def split(Xs, ys, return_index = False, test_size = 0.33, random_state = 42):
    '''
    Randomly split the samples into a training and a test part.

    Parameters
    ----------
    Xs, ys : features and targets with the same number of samples.
    return_index : if False, return the split data itself; if True, return
        only the index arrays of a single shuffled split.
    test_size : share of the samples assigned to the test part (forwarded to
        scikit-learn).
    random_state : seed forwarded to scikit-learn.

    Returns
    -------
    (X_train, X_test, y_train, y_test) when `return_index` is False,
    (train_index, test_index) as produced by `ShuffleSplit.split` otherwise.
    '''
    if not return_index:
        X_train, X_test, y_train, y_test = train_test_split(
            Xs, ys,
            test_size = test_size,
            random_state = random_state
        )
        return (X_train, X_test, y_train, y_test)

    # One shuffled split is all that is needed; take the first (only) pair.
    # The earlier `sss.get_n_splits(Xs, ys)` call only returned a number that
    # was thrown away, so it has been dropped.
    sss = ShuffleSplit(n_splits = 1, test_size = test_size, random_state = random_state)
    train_index, test_index = next(sss.split(Xs, ys))
    return (train_index, test_index)
202
+
203
def split_cut(Xs, ys, test_ratio = 0.33):
    """
    Split a time-ordered dataset into a leading training part and a trailing
    test part, without shuffling.

    The cut position is floor(len(ys) * (1 - test_ratio)): rows before it form
    the training part, the remaining rows the test part.

    Parameters
    ----------
    Xs, ys : both pandas DataFrames or both numpy ndarrays (subclasses are
        accepted), equally long. `ys` must be 2D; `Xs` is sliced as 2D too.
    test_ratio : share of the rows, taken from the end, used for testing.

    Returns
    -------
    (X_train, X_test, y_train, y_test), slices of the inputs.

    Raises
    ------
    AssertionError : `ys` is not 2D, the lengths differ, or `Xs` and `ys` are
        not the same kind of container.
    TypeError : the inputs are neither DataFrames nor ndarrays.
    """
    def container_kind(obj):
        # Subclasses count as their base container; anything else is "itself".
        if isinstance(obj, pd.DataFrame):
            return 'frame'
        if isinstance(obj, np.ndarray):
            return 'array'
        return type(obj)

    # Explicit raises instead of `assert` keep these checks alive under
    # `python -O`, while callers still see the same AssertionError as before.
    if ys.ndim != 2:
        raise AssertionError('ys must be 2D!')
    if len(Xs) != len(ys):
        raise AssertionError('Xs and ys should be equally long!')
    kind = container_kind(Xs)
    if kind != container_kind(ys):
        raise AssertionError('Xs and ys should be the same data type!')
    if kind not in ('frame', 'array'):
        raise TypeError('Only accept numpy ndarray or pandas dataframe')

    anchor = int(np.floor(len(ys) * (1 - test_ratio)))

    if kind == 'frame':
        # Positional slicing, independent of the index labels.
        X_train, X_test = Xs.iloc[:anchor, :], Xs.iloc[anchor:, :]
        y_train, y_test = ys.iloc[:anchor, :], ys.iloc[anchor:, :]
    else:
        X_train, X_test = Xs[:anchor, :], Xs[anchor:, :]
        y_train, y_test = ys[:anchor, :], ys[anchor:, :]

    return (X_train, X_test, y_train, y_test)