sciml 0.0.8__py3-none-any.whl → 0.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sciml/__init__.py +1 -1
- sciml/ccc.py +36 -0
- sciml/metrics.py +123 -0
- sciml/models.py +14 -7
- sciml/pipelines.py +47 -256
- sciml/regress2.py +217 -0
- {sciml-0.0.8.dist-info → sciml-0.0.10.dist-info}/METADATA +1 -1
- sciml-0.0.10.dist-info/RECORD +11 -0
- sciml/utils.py +0 -46
- sciml-0.0.8.dist-info/RECORD +0 -9
- {sciml-0.0.8.dist-info → sciml-0.0.10.dist-info}/LICENSE +0 -0
- {sciml-0.0.8.dist-info → sciml-0.0.10.dist-info}/WHEEL +0 -0
- {sciml-0.0.8.dist-info → sciml-0.0.10.dist-info}/top_level.txt +0 -0
sciml/__init__.py
CHANGED
@@ -1,2 +1,2 @@
  # coding: utf-8
- __all__ = ["
+ __all__ = ["pipelines", "models", "metrics", "regress2", "ccc"]
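For orientation, a minimal sketch of what the expanded `__all__` enables on the import side (illustrative only, not part of the diff; it relies on standard Python behavior that `from package import *` loads the submodules named in `__all__`):

import sciml
from sciml import *                                         # names pipelines, models, metrics, regress2, ccc
from sciml.ccc import concordance_correlation_coefficient   # module added in 0.0.10
from sciml.regress2 import regress2                         # module added in 0.0.10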
sciml/ccc.py
ADDED
@@ -0,0 +1,36 @@
+ # https://rowannicholls.github.io/python/statistics/agreement/correlation_coefficients.html#lins-concordance-correlation-coefficient-ccc
+ # Lin LIK (1989). “A concordance correlation coefficient to evaluate reproducibility”. Biometrics. 45 (1):255-268.
+ import numpy as np
+ import pandas as pd
+
+ def concordance_correlation_coefficient(y_true, y_pred):
+     """Concordance correlation coefficient."""
+     # Remove NaNs
+     df = pd.DataFrame({
+         'y_true': y_true,
+         'y_pred': y_pred
+     })
+     df = df.dropna()
+     y_true = df['y_true']
+     y_pred = df['y_pred']
+     # Pearson product-moment correlation coefficients
+     cor = np.corrcoef(y_true, y_pred)[0][1]
+     # Mean
+     mean_true = np.mean(y_true)
+     mean_pred = np.mean(y_pred)
+     # Variance
+     var_true = np.var(y_true)
+     var_pred = np.var(y_pred)
+     # Standard deviation
+     sd_true = np.std(y_true)
+     sd_pred = np.std(y_pred)
+     # Calculate CCC
+     numerator = 2 * cor * sd_true * sd_pred
+     denominator = var_true + var_pred + (mean_true - mean_pred)**2
+     return numerator / denominator
+
+
+ # y_true = [3, -0.5, 2, 7, np.NaN]
+ # y_pred = [2.5, 0.0, 2, 8, 3]
+ # ccc = concordance_correlation_coefficient(y_true, y_pred)
+ # print(ccc)
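A small usage sketch of the new module, adapted from the commented example at the bottom of the file (the row containing NaN is dropped inside the function before the coefficient is computed):

import numpy as np
from sciml.ccc import concordance_correlation_coefficient

y_true = [3, -0.5, 2, 7, np.nan]   # the pair with NaN is removed by dropna()
y_pred = [2.5, 0.0, 2, 8, 3]
print(concordance_correlation_coefficient(y_true, y_pred))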
sciml/metrics.py
ADDED
@@ -0,0 +1,123 @@
+ import numpy as np
+ import pandas as pd
+ from scipy import stats
+ from sklearn.metrics import explained_variance_score, max_error, mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score, mean_poisson_deviance, mean_gamma_deviance, mean_tweedie_deviance
+
+ def stats_summary(df):
+     min_ = df.min().to_frame().T
+     Q1 = df.quantile(0.25).to_frame().T
+     median_ = df.quantile(0.5).to_frame().T
+     mean_ = df.mean().to_frame().T
+     Q3 = df.quantile(0.75).to_frame().T
+     max_ = df.max().to_frame().T
+     df_stats = pd.concat([min_, Q1, median_, mean_, Q3, max_])
+     df_stats.index = ["Min", "Q1", "Median", "Mean", "Q3", "Max"]
+     return df_stats
+
+ def stats_measures(x, y, return_dict = False):
+     slope, intercept, rvalue, pvalue, stderr = stats.linregress(x, y)
+     mse = mean_squared_error(x, y)
+     r2 = rvalue ** 2
+     rmse = np.sqrt(mse)
+     mbe = (y - x).mean()
+     if return_dict:
+         return {
+             "R2": r2,
+             "SLOPE": slope,
+             "RMSE": rmse,
+             "MBE": mbe
+         }
+     else:
+         return [r2, slope, rmse, mbe]
+
+ def stats_measures_full(x, y):
+     # from sklearn.metrics import mean_absolute_percentage_error
+     slope, intercept, rvalue, pvalue, stderr = stats.linregress(x, y)
+     mse = mean_squared_error(x, y)
+     r2 = rvalue ** 2
+     rmse = np.sqrt(mse)
+     mbe = (y - x).mean()
+     # ----------------------------------------------------------------
+     pearsonr = stats.pearsonr(x, y)
+     evs = explained_variance_score(x, y)
+     me = max_error(x, y)
+     mae = mean_absolute_error(x, y)
+     msle = mean_squared_log_error(x, y)
+     meae = median_absolute_error(x, y)
+     r2_score = r2_score(x, y)
+     mpd = mean_poisson_deviance(x, y)
+     mgd = mean_gamma_deviance(x, y)
+     mtd = mean_tweedie_deviance(x, y)
+     return {
+         "R2": r2,
+         "SLOPE": slope,
+         "RMSE": rmse,
+         "MBE": mbe,
+         "INTERCEPT": intercept,
+         "PVALUE": pvalue,
+         "STDERR": stderr,
+         "PEARSON": pearsonr,
+         "EXPLAINED_VARIANCE": evs,
+         "MAXERR": me,
+         "MAE": mae,
+         "MSLE": msle,
+         "MEDIAN_AE": meae,
+         "R2_SCORE": r2_score,
+         "MPD": mpd,
+         "MGD": mgd,
+         "MTD": mtd
+     }
+
+ def stats_measures_df(df, name1, name2, return_dict = False):
+     slope, intercept, rvalue, pvalue, stderr = stats.linregress(df[name1], df[name2])
+     mse = mean_squared_error(df[name1], df[name2])
+     r2 = rvalue ** 2
+     rmse = np.sqrt(mse)
+     mbe = (df[name2] - df[name1]).mean()
+     if return_dict:
+         return {
+             "R2": r2,
+             "SLOPE": slope,
+             "RMSE": rmse,
+             "MBE": mbe
+         }
+     else:
+         return [r2, slope, rmse, mbe]
+
+
+
+ def get_r2(x, y):
+     try:
+         x_bar = x.mean()
+     except:
+         x_bar = np.mean(x)
+
+     r2 = 1 - np.sum((x - y)**2) / np.sum((x - x_bar)**2)
+     return r2
+
+ def get_rmse(observations, estimates):
+     return np.sqrt(((estimates - observations) ** 2).mean())
+
+ def calculate_R2(y_true, y_pred):
+     """
+     Calculate the R^2 (coefficient of determination).
+
+     Args:
+         y_true (array-like): Actual values of the dependent variable.
+         y_pred (array-like): Predicted values of the dependent variable.
+
+     Returns:
+         float: The R^2 value.
+     """
+     y_true = np.array(y_true)
+     y_pred = np.array(y_pred)
+
+     # Residual sum of squares
+     ss_res = np.sum((y_true - y_pred) ** 2)
+
+     # Total sum of squares
+     ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
+
+     # R^2 calculation
+     R2 = 1 - (ss_res / ss_tot)
+     return R2
sciml/models.py
CHANGED
@@ -142,6 +142,9 @@ class SmartForest:
                      params['tree_method'] = 'hist'
                      params['device'] = 'cuda'

+                 params = params.copy()  # Prevent modification from affecting the next loop iteration
+                 params['random_state'] = i  # Use a different random seed for each model to enhance diversity
+
                  model = XGBRegressor(**params)
                  model.fit(X, y)

@@ -220,11 +223,15 @@ class SmartForest:

  """
  # ============================== Test Example ==============================
+ import warnings
+ import numpy as np
  from sklearn.datasets import load_diabetes
+ from sklearn.datasets import fetch_california_housing
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import mean_squared_error

-
-
- X, y = load_diabetes(return_X_y=True)
+ # X, y = load_diabetes(return_X_y=True) # Using diabetes dataset
+ X, y = fetch_california_housing(return_X_y=True) # Using house price dataset
  X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

  # Hyperparameter grid
@@ -244,7 +251,7 @@ param_grid = {
  }

  # Create the model with Multi-Grained Scanning enabled (with window sizes 2 and 3)
-
+ regr = SmartForest(
      n_estimators_per_layer = 5,
      max_layers = 10,
      early_stopping_rounds = 5,
@@ -256,14 +263,14 @@ df_reg = SmartForest(
      verbose = 1
  )

-
+ regr.fit(X_train, y_train, X_val, y_val)

  # Predict on validation set and evaluate
- y_pred =
+ y_pred = regr.predict(X_val)
  rmse = np.sqrt(mean_squared_error(y_val, y_pred))
  print("\nFinal RMSE:", rmse)

  # Output best model and RMSE
- best_model, best_rmse =
+ best_model, best_rmse = regr.get_best_model()
  print("\nBest validation RMSE:", best_rmse)
  """
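The two lines added inside the SmartForest training loop copy the hyperparameter dict before seeding it per model; a minimal standalone sketch of that pattern (the names below are illustrative, not sciml API):

base_params = {"max_depth": 3, "learning_rate": 0.1}

configs = []
for i in range(3):
    params = base_params.copy()   # without .copy(), every iteration would mutate the same shared dict
    params["random_state"] = i    # distinct seed per model to increase ensemble diversity
    configs.append(params)

print(configs)  # three dicts differing only in random_state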
sciml/pipelines.py
CHANGED
@@ -155,7 +155,7 @@ try:
      from tensorflow.keras import models
      # from keras.layers import Dropout
      from keras.callbacks import EarlyStopping
-     from scitbx.
+     from scitbx.utils import *
  except Exception as e:
      print(e)

@@ -173,263 +173,54 @@ def train_lstm(X_train, y_train, nfeature, ntime, verbose = 2, epochs = 200, bat
      model.fit(X_train, y_train, epochs = epochs, batch_size = batch_size, verbose=verbose)
      return model

-
-
- # ========================================================================================================
- import numpy as np
- from xgboost import XGBRegressor
- from sklearn.metrics import mean_squared_error
-
- class XGBoostDeepForestRegressor:
-     def __init__(self, n_estimators_per_layer=2, max_layers=20, early_stopping_rounds=2):
-         self.n_estimators_per_layer = n_estimators_per_layer
-         self.max_layers = max_layers
-         self.early_stopping_rounds = early_stopping_rounds
-         self.layers = []
-
-     def _fit_layer(self, X, y):
-         layer = []
-         layer_outputs = []
-         for _ in range(self.n_estimators_per_layer):
-             reg = XGBRegressor()
-             reg.fit(X, y)
-             preds = reg.predict(X).reshape(-1, 1)
-             layer.append(reg)
-             layer_outputs.append(preds)
-         output = np.hstack(layer_outputs)
-         return layer, output
-
-     def fit(self, X, y, X_val=None, y_val=None):
-         X_current = X.copy()
-         best_rmse = float("inf")
-         no_improve_rounds = 0
-
-         for layer_index in range(self.max_layers):
-             print(f"Training Layer {layer_index + 1}")
-             layer, output = self._fit_layer(X_current, y)
-             self.layers.append(layer)
-             X_current = np.hstack([X_current, output])
-
-             if X_val is not None:
-                 y_pred = self.predict(X_val)
-                 # rmse = mean_squared_error(y_val, y_pred, squared=False)
-                 rmse = np.sqrt(mean_squared_error(y_val, y_pred))
-                 print(f"Validation RMSE: {rmse:.4f}")
-
-                 if rmse < best_rmse:
-                     best_rmse = rmse
-                     no_improve_rounds = 0
-                 else:
-                     no_improve_rounds += 1
-                     if no_improve_rounds >= self.early_stopping_rounds:
-                         print("Early stopping triggered.")
-                         break
-
-     def predict(self, X):
-         X_current = X.copy()
-         for layer in self.layers:
-             layer_outputs = []
-             for reg in layer:
-                 n_features = reg.n_features_in_
-                 preds = reg.predict(X_current[:, :n_features]).reshape(-1, 1)
-                 layer_outputs.append(preds)
-             output = np.hstack(layer_outputs)
-             X_current = np.hstack([X_current, output])
-
-         # Final prediction = average of last layer regressors
-         final_outputs = []
-         for reg in self.layers[-1]:
-             n_features = reg.n_features_in_
-             final_outputs.append(reg.predict(X_current[:, :n_features]).reshape(-1, 1))
-         return np.mean(np.hstack(final_outputs), axis=1)
-
-
- from sklearn.datasets import load_diabetes
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import mean_squared_error
-
- X, y = load_diabetes(return_X_y=True)
- X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
-
- df_reg = XGBoostDeepForestRegressor(n_estimators_per_layer=2, max_layers=5)
- df_reg.fit(X_train, y_train, X_val, y_val)
-
- y_pred = df_reg.predict(X_val)
- # rmse = mean_squared_error(y_val, y_pred, squared=False)
- rmse = np.sqrt(mean_squared_error(y_val, y_pred))
- print("Final RMSE:", rmse)
-
- # ----------------------------------------------------------------------------------------------------
-
+ # ===============================================================================================================================
+ # Training utils
  import numpy as np
-
- from sklearn.
- import itertools
-
- class XGBoostDeepForestRegressor:
-     def __init__(self, n_estimators_per_layer=2, max_layers=20, early_stopping_rounds=2, param_grid=None, use_gpu=True, gpu_id=0):
-         self.n_estimators_per_layer = n_estimators_per_layer
-         self.max_layers = max_layers
-         self.early_stopping_rounds = early_stopping_rounds
-         self.param_grid = param_grid or {
-             'max_depth': [3],
-             'learning_rate': [0.1],
-             'n_estimators': [100]
-         }
-         self.use_gpu = use_gpu
-         self.gpu_id = gpu_id
-         self.layers = []
-
-     def _get_param_combinations(self):
-         keys, values = zip(*self.param_grid.items())
-         return [dict(zip(keys, v)) for v in itertools.product(*values)]
-
-     def _fit_layer(self, X, y, X_val=None, y_val=None):
-         layer = []
-         layer_outputs = []
-         param_combos = self._get_param_combinations()
-
-         for i in range(self.n_estimators_per_layer):
-             best_rmse = float('inf')
-             best_model = None
-
-             for params in param_combos:
-                 # Set GPU support parameters in XGBRegressor
-                 if self.use_gpu:
-                     params['tree_method'] = 'hist'  # Use hist method
-                     params['device'] = 'cuda'  # Enable CUDA for GPU
-
-                 model = XGBRegressor(**params)
-                 model.fit(X, y)
-
-                 if X_val is not None:
-                     preds_val = model.predict(X_val)
-                     rmse = np.sqrt(mean_squared_error(y_val, preds_val))
-                     if rmse < best_rmse:
-                         best_rmse = rmse
-                         best_model = model
-                 else:
-                     best_model = model
-
-             final_model = best_model
-             preds = final_model.predict(X).reshape(-1, 1)
-             layer.append(final_model)
-             layer_outputs.append(preds)
-
-         output = np.hstack(layer_outputs)
-         return layer, output
-
-     def fit(self, X, y, X_val=None, y_val=None):
-         X_current = X.copy()
-         X_val_current = X_val.copy() if X_val is not None else None
-
-         best_rmse = float("inf")
-         no_improve_rounds = 0
-
-         for layer_index in range(self.max_layers):
-             print(f"Training Layer {layer_index + 1}")
-             layer, output = self._fit_layer(X_current, y, X_val_current, y_val)
-             self.layers.append(layer)
-             X_current = np.hstack([X_current, output])
-
-             if X_val is not None:
-                 val_outputs = []
-                 for reg in layer:
-                     n_features = reg.n_features_in_
-                     preds = reg.predict(X_val_current[:, :n_features]).reshape(-1, 1)
-                     val_outputs.append(preds)
-                 val_output = np.hstack(val_outputs)
-                 X_val_current = np.hstack([X_val_current, val_output])
-
-                 y_pred = self.predict(X_val)
-                 rmse = np.sqrt(mean_squared_error(y_val, y_pred))
-                 print(f"Validation RMSE: {rmse:.4f}")
-
-                 if rmse < best_rmse:
-                     best_rmse = rmse
-                     no_improve_rounds = 0
-                 else:
-                     no_improve_rounds += 1
-                     if no_improve_rounds >= self.early_stopping_rounds:
-                         print("Early stopping triggered.")
-                         break
-
-     def predict(self, X):
-         X_current = X.copy()
-         for layer in self.layers:
-             layer_outputs = []
-             for reg in layer:
-                 n_features = reg.n_features_in_
-                 preds = reg.predict(X_current[:, :n_features]).reshape(-1, 1)
-                 layer_outputs.append(preds)
-             output = np.hstack(layer_outputs)
-             X_current = np.hstack([X_current, output])
-
-         final_outputs = []
-         for reg in self.layers[-1]:
-             n_features = reg.n_features_in_
-             final_outputs.append(reg.predict(X_current[:, :n_features]).reshape(-1, 1))
-         return np.mean(np.hstack(final_outputs), axis=1)
-
-
- from sklearn.datasets import load_diabetes
+ import pandas as pd
+ from sklearn.model_selection import ShuffleSplit
  from sklearn.model_selection import train_test_split
- from sklearn.metrics import mean_squared_error
-
- # Load dataset
- X, y = load_diabetes(return_X_y=True)
- X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
-
- # Hyperparameter grid
- param_grid = {
-     'max_depth': [3, 4],
-     'learning_rate': [0.1, 0.05],
-     'n_estimators': [50, 100]
- }
-
- # Create and fit the model with GPU enabled
- df_reg = XGBoostDeepForestRegressor(
-     n_estimators_per_layer=2,
-     max_layers=5,
-     early_stopping_rounds=2,
-     param_grid=param_grid,
-     use_gpu=True,  # Enable GPU acceleration
-     gpu_id=0  # Default to the first GPU
- )
-
- df_reg.fit(X_train, y_train, X_val, y_val)
-
- # Final evaluation
- y_pred = df_reg.predict(X_val)
- rmse = np.sqrt(mean_squared_error(y_val, y_pred))
- print("Final RMSE:", rmse)
-
- # ----------------------------------------------------------------------------------------------------
-
- xgb_params = {
-     "objective": "reg:squarederror",
-     "random_state": 0,
-     'seed': 0,
-     'n_estimators': 100,
-     'max_depth': 6,
-     'min_child_weight': 4,
-     'subsample': 0.8,
-     'colsample_bytree': 0.8,
-     'gamma': 0,
-     'reg_alpha': 0,
-     'reg_lambda': 1,
-     'learning_rate': 0.05,
- }

-
-
-
-
-
-
-
-
+ # randomly select sites
+ def random_select(ds, count, num, random_state = 0):
+     np.random.seed(random_state)
+     idxs = np.random.choice(np.delete(np.arange(len(ds)), count), num, replace = False)
+     return np.sort(idxs)
+
+ def split(Xs, ys, return_index = False, test_size = 0.33, random_state = 42):
+     if return_index:
+         sss = ShuffleSplit(n_splits=1, test_size = test_size, random_state = random_state)
+         sss.get_n_splits(Xs, ys)
+         train_index, test_index = next(sss.split(Xs, ys))
+         return (train_index, test_index)
+     else:
+         X_train, X_test, y_train, y_test = train_test_split(
+             Xs, ys,
+             test_size = test_size,
+             random_state = random_state
+         )
+         return (X_train, X_test, y_train, y_test)
+
+ def split_cut(Xs, ys, test_ratio = 0.33):
+     """
+     Split the timeseries into before and after halves
+     """
+     assert ys.ndim == 2, 'ys must be 2D!'
+     assert len(Xs) == len(ys), 'Xs and ys should be equally long!'
+     assert type(Xs) == type(ys), 'Xs and ys should be the same data type!'
+     if not type(Xs) in [pd.core.frame.DataFrame, np.ndarray]: raise Exception('Only accept numpy ndarray or pandas dataframe')
+     anchor = int(np.floor(len(ys) * (1 - test_ratio)))
+
+     if type(Xs) == pd.core.frame.DataFrame:
+         X_train = Xs.iloc[0: anchor, :]
+         X_test = Xs.iloc[anchor::, :]
+         y_train = ys.iloc[0: anchor, :]
+         y_test = ys.iloc[anchor::, :]
+     else:
+         X_train = Xs[0: anchor, :]
+         X_test = Xs[anchor::, :]
+         y_train = ys[0: anchor, :]
+         y_test = ys[anchor::, :]

-
+     assert len(X_train) + len(X_test) == len(Xs), 'The sum of train and test lengths must equal to Xs/ys!'

-
+     return (X_train, X_test, y_train, y_test)
sciml/regress2.py
ADDED
@@ -0,0 +1,217 @@
+ # Model type I and II regression, including RMA (reduced major axis regression)
+
+ """
+ Credit: UMaine MISC Lab; emmanuel.boss@maine.edu
+ http://misclab.umeoce.maine.edu/
+ https://github.com/OceanOptics
+ ------------------------------------------------------------------------------
+ MIT License
+
+ Copyright (c) [year] [fullname]
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ """
+
+ import statsmodels.api as sm
+ import numpy as np
+
+
+ def regress2(_x, _y, _method_type_1 = "ordinary least square",
+              _method_type_2 = "reduced major axis",
+              _weight_x = [], _weight_y = [], _need_intercept = True):
+     # Regression Type II based on statsmodels
+     # Type II regressions are recommended if there is variability on both x and y
+     # It's computing the linear regression type I for (x,y) and (y,x)
+     # and then average relationship with one of the type II methods
+     #
+     # INPUT:
+     #   _x <np.array>
+     #   _y <np.array>
+     #   _method_type_1 <str> method to use for regression type I:
+     #     ordinary least square or OLS <default>
+     #     weighted least square or WLS
+     #     robust linear model or RLM
+     #   _method_type_2 <str> method to use for regression type II:
+     #     major axis
+     #     reduced major axis <default> (also known as geometric mean)
+     #     arithmetic mean
+     #   _need_intercept <bool>
+     #     True <default> add a constant to relation (y = a x + b)
+     #     False force relation by 0 (y = a x)
+     #   _weight_x <np.array> containing the weigth of x
+     #   _weigth_y <np.array> containing the weigth of y
+     #
+     # OUTPUT:
+     #   slope
+     #   intercept
+     #   r
+     #   std_slope
+     #   std_intercept
+     #   predict
+     #
+     # REQUIRE:
+     #   numpy
+     #   statsmodels
+     #
+     # The code is based on the matlab function of MBARI.
+     # AUTHOR: Nils Haentjens
+     # REFERENCE: https://www.mbari.org/products/research-software/matlab-scripts-linear-regressions/
+
+     # Check input
+     if _method_type_2 != "reduced major axis" and _method_type_1 != "ordinary least square":
+         raise ValueError("'" + _method_type_2 + "' only supports '" + _method_type_1 + "' method as type 1.")
+
+     # Set x, y depending on intercept requirement
+     if _need_intercept:
+         x_intercept = sm.add_constant(_x)
+         y_intercept = sm.add_constant(_y)
+
+     # Compute Regression Type I (if type II requires it)
+     if (_method_type_2 == "reduced major axis" or
+             _method_type_2 == "geometric mean"):
+         if _method_type_1 == "OLS" or _method_type_1 == "ordinary least square":
+             if _need_intercept:
+                 [intercept_a, slope_a] = sm.OLS(_y, x_intercept).fit().params
+                 [intercept_b, slope_b] = sm.OLS(_x, y_intercept).fit().params
+             else:
+                 slope_a = sm.OLS(_y, _x).fit().params
+                 slope_b = sm.OLS(_x, _y).fit().params
+         elif _method_type_1 == "WLS" or _method_type_1 == "weighted least square":
+             if _need_intercept:
+                 [intercept_a, slope_a] = sm.WLS(
+                     _y, x_intercept, weights=1. / _weight_y).fit().params
+                 [intercept_b, slope_b] = sm.WLS(
+                     _x, y_intercept, weights=1. / _weight_x).fit().params
+             else:
+                 slope_a = sm.WLS(_y, _x, weights=1. / _weight_y).fit().params
+                 slope_b = sm.WLS(_x, _y, weights=1. / _weight_x).fit().params
+         elif _method_type_1 == "RLM" or _method_type_1 == "robust linear model":
+             if _need_intercept:
+                 [intercept_a, slope_a] = sm.RLM(_y, x_intercept).fit().params
+                 [intercept_b, slope_b] = sm.RLM(_x, y_intercept).fit().params
+             else:
+                 slope_a = sm.RLM(_y, _x).fit().params
+                 slope_b = sm.RLM(_x, _y).fit().params
+         else:
+             raise ValueError("Invalid literal for _method_type_1: " + _method_type_1)
+
+     # Compute Regression Type II
+     if (_method_type_2 == "reduced major axis" or
+             _method_type_2 == "geometric mean"):
+         # Transpose coefficients
+         if _need_intercept:
+             intercept_b = -intercept_b / slope_b
+             slope_b = 1 / slope_b
+         # Check if correlated in same direction
+         if np.sign(slope_a) != np.sign(slope_b):
+             raise RuntimeError('Type I regressions of opposite sign.')
+         # Compute Reduced Major Axis Slope
+         slope = np.sign(slope_a) * np.sqrt(slope_a * slope_b)
+         if _need_intercept:
+             # Compute Intercept (use mean for least square)
+             if _method_type_1 == "OLS" or _method_type_1 == "ordinary least square":
+                 intercept = np.mean(_y) - slope * np.mean(_x)
+             else:
+                 intercept = np.median(_y) - slope * np.median(_x)
+         else:
+             intercept = 0
+         # Compute r
+         r = np.sign(slope_a) * np.sqrt(slope_a / slope_b)
+         # Compute predicted values
+         predict = slope * _x + intercept
+         # Compute standard deviation of the slope and the intercept
+         n = len(_x)
+         diff = _y - predict
+         Sx2 = np.sum(np.multiply(_x, _x))
+         den = n * Sx2 - np.sum(_x) ** 2
+         s2 = np.sum(np.multiply(diff, diff)) / (n - 2)
+         std_slope = np.sqrt(n * s2 / den)
+         if _need_intercept:
+             std_intercept = np.sqrt(Sx2 * s2 / den)
+         else:
+             std_intercept = 0
+     elif (_method_type_2 == "Pearson's major axis" or
+           _method_type_2 == "major axis"):
+         if not _need_intercept:
+             raise ValueError("Invalid value for _need_intercept: " + str(_need_intercept))
+         xm = np.mean(_x)
+         ym = np.mean(_y)
+         xp = _x - xm
+         yp = _y - ym
+         sumx2 = np.sum(np.multiply(xp, xp))
+         sumy2 = np.sum(np.multiply(yp, yp))
+         sumxy = np.sum(np.multiply(xp, yp))
+         slope = ((sumy2 - sumx2 + np.sqrt((sumy2 - sumx2)**2 + 4 * sumxy**2)) /
+                  (2 * sumxy))
+         intercept = ym - slope * xm
+         # Compute r
+         r = sumxy / np.sqrt(sumx2 * sumy2)
+         # Compute standard deviation of the slope and the intercept
+         n = len(_x)
+         std_slope = (slope / r) * np.sqrt((1 - r ** 2) / n)
+         sigx = np.sqrt(sumx2 / (n - 1))
+         sigy = np.sqrt(sumy2 / (n - 1))
+         std_i1 = (sigy - sigx * slope) ** 2
+         std_i2 = (2 * sigx * sigy) + ((xm ** 2 * slope * (1 + r)) / r ** 2)
+         std_intercept = np.sqrt((std_i1 + ((1 - r) * slope * std_i2)) / n)
+         # Compute predicted values
+         predict = slope * _x + intercept
+     elif _method_type_2 == "arithmetic mean":
+         if not _need_intercept:
+             raise ValueError("Invalid value for _need_intercept: " + str(_need_intercept))
+         n = len(_x)
+         sg = np.floor(n / 2)
+         # Sort x and y in order of x
+         sorted_index = sorted(range(len(_x)), key=lambda i: _x[i])
+         x_w = np.array([_x[i] for i in sorted_index])
+         y_w = np.array([_y[i] for i in sorted_index])
+         x1 = x_w[1:sg + 1]
+         x2 = x_w[sg:n]
+         y1 = y_w[1:sg + 1]
+         y2 = y_w[sg:n]
+         x1m = np.mean(x1)
+         x2m = np.mean(x2)
+         y1m = np.mean(y1)
+         y2m = np.mean(y2)
+         xm = (x1m + x2m) / 2
+         ym = (y1m + y2m) / 2
+         slope = (x2m - x1m) / (y2m - y1m)
+         intercept = ym - xm * slope
+         # r (to verify)
+         r = []
+         # Compute predicted values
+         predict = slope * _x + intercept
+         # Compute standard deviation of the slope and the intercept
+         std_slope = []
+         std_intercept = []
+
+     # Return all that
+     return {"slope": float(slope), "intercept": intercept, "r": r,
+             "std_slope": std_slope, "std_intercept": std_intercept,
+             "predict": predict}
+
+
+ # if __name__ == '__main__':
+ #     x = np.linspace(0, 10, 100)
+ #     # Add random error on y
+ #     e = np.random.normal(size=len(x))
+ #     y = x + e
+ #     results = regress2(x, y, _method_type_2="reduced major axis",
+ #                        _need_intercept=False)
+ #     # print(results)
sciml-0.0.10.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
+ sciml/__init__.py,sha256=BqRVu5DbfbnxksBXhe4gH_uulPdqTjSaSO1LvGkc37Q,79
+ sciml/ccc.py,sha256=AE1l46hvh18_Q9_BQufMjsGF9-JfsTw2hrT1CbgBHE8,1210
+ sciml/metrics.py,sha256=ICEeH6jwmpdx9jxwYSzB_YTvbyBq9AEUYqkZiVS1ZGs,3577
+ sciml/models.py,sha256=qc2LgdpSkq9kGMnLKZTnyuwzytCu6R8hyU5i6PaI7Qw,10345
+ sciml/pipelines.py,sha256=NGBwl5vA0Uq5GO-VtIow_k42K7HoVwxPQrkW-jINflY,8381
+ sciml/regress2.py,sha256=GSZ4IqmyF9u3PGOhHIKV0Rb_C2pI8eJ3jGJBa1IrEXM,8978
+ sciml-0.0.10.dist-info/LICENSE,sha256=dX4jBmkgQPWc_TfYkXtKQzVIgZQWFuHZ8vQjV4sEeV4,1060
+ sciml-0.0.10.dist-info/METADATA,sha256=iMcI6kpM6IX2oBhx9JwmI77JiX2bZPWI93dHta_jkCM,314
+ sciml-0.0.10.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+ sciml-0.0.10.dist-info/top_level.txt,sha256=dS_7aBCZFKQE3myPy5sh4USjQZCZyGg382-YxUUYcdw,6
+ sciml-0.0.10.dist-info/RECORD,,
sciml/utils.py
DELETED
@@ -1,46 +0,0 @@
- import numpy as np
- import pandas as pd
- from sklearn.model_selection import ShuffleSplit
- from sklearn.model_selection import train_test_split
-
- # randomly select sites
- def random_select(ds, count, num, random_state = 0):
-     np.random.seed(random_state)
-     idxs = np.random.choice(np.delete(np.arange(len(ds)), count), num, replace = False)
-     return np.sort(idxs)
-
- def split(Xs, ys, return_index = False, test_size = 0.33, random_state = 42):
-     if return_index:
-         sss = ShuffleSplit(n_splits=1, test_size = test_size, random_state = random_state)
-         sss.get_n_splits(Xs, ys)
-         train_index, test_index = next(sss.split(Xs, ys))
-         return (train_index, test_index)
-     else:
-         X_train, X_test, y_train, y_test = train_test_split(
-             Xs, ys,
-             test_size = test_size,
-             random_state = random_state
-         )
-         return (X_train, X_test, y_train, y_test)
-
- def split_cut(Xs, ys, test_ratio = 0.33):
-     assert ys.ndim == 2, 'ys must be 2D!'
-     assert len(Xs) == len(ys), 'Xs and ys should be equally long!'
-     assert type(Xs) == type(ys), 'Xs and ys should be the same data type!'
-     if not type(Xs) in [pd.core.frame.DataFrame, np.ndarray]: raise Exception('Only accept numpy ndarray or pandas dataframe')
-     anchor = int(np.floor(len(ys) * (1 - test_ratio)))
-
-     if type(Xs) == pd.core.frame.DataFrame:
-         X_train = Xs.iloc[0: anchor, :]
-         X_test = Xs.iloc[anchor::, :]
-         y_train = ys.iloc[0: anchor, :]
-         y_test = ys.iloc[anchor::, :]
-     else:
-         X_train = Xs[0: anchor, :]
-         X_test = Xs[anchor::, :]
-         y_train = ys[0: anchor, :]
-         y_test = ys[anchor::, :]
-
-     assert len(X_train) + len(X_test) == len(Xs), 'The sum of train and test lengths must equal to Xs/ys!'
-
-     return (X_train, X_test, y_train, y_test)
sciml-0.0.8.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
- sciml/__init__.py,sha256=6iQAGgCEMuw4yoLBzZDax46a45LZgzEeNSHQMdmcBSQ,58
- sciml/models.py,sha256=p6cw3SxTQaOtFhJx8KdW0Z2QtxBlSBlVPHETTNCjJ2w,9880
- sciml/pipelines.py,sha256=CJolleJakoEQc-EV-v6NovP3bDb1hif7SvObXdaLXdY,15268
- sciml/utils.py,sha256=u5DzQJV4aCZ-p7sY56Fxzj8WDGYOgn1rOTeGzAw0vwY,1831
- sciml-0.0.8.dist-info/LICENSE,sha256=dX4jBmkgQPWc_TfYkXtKQzVIgZQWFuHZ8vQjV4sEeV4,1060
- sciml-0.0.8.dist-info/METADATA,sha256=uMCtigVwS2e0abqbvfbLZca6iZnkdDTBXtbjdg34yIA,313
- sciml-0.0.8.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
- sciml-0.0.8.dist-info/top_level.txt,sha256=dS_7aBCZFKQE3myPy5sh4USjQZCZyGg382-YxUUYcdw,6
- sciml-0.0.8.dist-info/RECORD,,
File without changes
|
File without changes
|
File without changes
|