sciml 0.0.9__py3-none-any.whl → 0.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sciml/__init__.py +1 -1
- sciml/ccc.py +36 -0
- sciml/metrics.py +123 -0
- sciml/models.py +524 -5
- sciml/pipelines.py +47 -256
- sciml/regress2.py +217 -0
- {sciml-0.0.9.dist-info → sciml-0.0.11.dist-info}/METADATA +1 -1
- sciml-0.0.11.dist-info/RECORD +11 -0
- sciml/utils.py +0 -46
- sciml-0.0.9.dist-info/RECORD +0 -9
- {sciml-0.0.9.dist-info → sciml-0.0.11.dist-info}/LICENSE +0 -0
- {sciml-0.0.9.dist-info → sciml-0.0.11.dist-info}/WHEEL +0 -0
- {sciml-0.0.9.dist-info → sciml-0.0.11.dist-info}/top_level.txt +0 -0
sciml/__init__.py
CHANGED
@@ -1,2 +1,2 @@
 # coding: utf-8
-__all__ = ["
+__all__ = ["pipelines", "models", "metrics", "regress2", "ccc"]
sciml/ccc.py
ADDED
@@ -0,0 +1,36 @@
+# https://rowannicholls.github.io/python/statistics/agreement/correlation_coefficients.html#lins-concordance-correlation-coefficient-ccc
+# Lin LIK (1989). “A concordance correlation coefficient to evaluate reproducibility”. Biometrics. 45 (1): 255-268.
+import numpy as np
+import pandas as pd
+
+def concordance_correlation_coefficient(y_true, y_pred):
+    """Concordance correlation coefficient."""
+    # Remove NaNs
+    df = pd.DataFrame({
+        'y_true': y_true,
+        'y_pred': y_pred
+    })
+    df = df.dropna()
+    y_true = df['y_true']
+    y_pred = df['y_pred']
+    # Pearson product-moment correlation coefficients
+    cor = np.corrcoef(y_true, y_pred)[0][1]
+    # Mean
+    mean_true = np.mean(y_true)
+    mean_pred = np.mean(y_pred)
+    # Variance
+    var_true = np.var(y_true)
+    var_pred = np.var(y_pred)
+    # Standard deviation
+    sd_true = np.std(y_true)
+    sd_pred = np.std(y_pred)
+    # Calculate CCC
+    numerator = 2 * cor * sd_true * sd_pred
+    denominator = var_true + var_pred + (mean_true - mean_pred)**2
+    return numerator / denominator
+
+
+# y_true = [3, -0.5, 2, 7, np.NaN]
+# y_pred = [2.5, 0.0, 2, 8, 3]
+# ccc = concordance_correlation_coefficient(y_true, y_pred)
+# print(ccc)
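For reference, the function above implements Lin's (1989) concordance correlation coefficient. With Pearson correlation $\rho$ and population moments (ddof = 0, matching the `np.var`/`np.std` calls):

$$\rho_c = \frac{2\,\rho\,\sigma_{\mathrm{true}}\,\sigma_{\mathrm{pred}}}{\sigma_{\mathrm{true}}^{2} + \sigma_{\mathrm{pred}}^{2} + \left(\mu_{\mathrm{true}} - \mu_{\mathrm{pred}}\right)^{2}}$$

On the commented example inputs (the pair containing NaN is dropped first), this evaluates to roughly 0.977.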
sciml/metrics.py
ADDED
@@ -0,0 +1,123 @@
+import numpy as np
+import pandas as pd
+from scipy import stats
+from sklearn.metrics import explained_variance_score, max_error, mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score, mean_poisson_deviance, mean_gamma_deviance, mean_tweedie_deviance
+
+def stats_summary(df):
+    min_ = df.min().to_frame().T
+    Q1 = df.quantile(0.25).to_frame().T
+    median_ = df.quantile(0.5).to_frame().T
+    mean_ = df.mean().to_frame().T
+    Q3 = df.quantile(0.75).to_frame().T
+    max_ = df.max().to_frame().T
+    df_stats = pd.concat([min_, Q1, median_, mean_, Q3, max_])
+    df_stats.index = ["Min", "Q1", "Median", "Mean", "Q3", "Max"]
+    return df_stats
+
+def stats_measures(x, y, return_dict = False):
+    slope, intercept, rvalue, pvalue, stderr = stats.linregress(x, y)
+    mse = mean_squared_error(x, y)
+    r2 = rvalue ** 2
+    rmse = np.sqrt(mse)
+    mbe = (y - x).mean()
+    if return_dict:
+        return {
+            "R2": r2,
+            "SLOPE": slope,
+            "RMSE": rmse,
+            "MBE": mbe
+        }
+    else:
+        return [r2, slope, rmse, mbe]
+
+def stats_measures_full(x, y):
+    # from sklearn.metrics import mean_absolute_percentage_error
+    slope, intercept, rvalue, pvalue, stderr = stats.linregress(x, y)
+    mse = mean_squared_error(x, y)
+    r2 = rvalue ** 2
+    rmse = np.sqrt(mse)
+    mbe = (y - x).mean()
+    # ----------------------------------------------------------------
+    pearsonr = stats.pearsonr(x, y)
+    evs = explained_variance_score(x, y)
+    me = max_error(x, y)
+    mae = mean_absolute_error(x, y)
+    msle = mean_squared_log_error(x, y)
+    meae = median_absolute_error(x, y)
+    r2_score = r2_score(x, y)
+    mpd = mean_poisson_deviance(x, y)
+    mgd = mean_gamma_deviance(x, y)
+    mtd = mean_tweedie_deviance(x, y)
+    return {
+        "R2": r2,
+        "SLOPE": slope,
+        "RMSE": rmse,
+        "MBE": mbe,
+        "INTERCEPT": intercept,
+        "PVALUE": pvalue,
+        "STDERR": stderr,
+        "PEARSON": pearsonr,
+        "EXPLAINED_VARIANCE": evs,
+        "MAXERR": me,
+        "MAE": mae,
+        "MSLE": msle,
+        "MEDIAN_AE": meae,
+        "R2_SCORE": r2_score,
+        "MPD": mpd,
+        "MGD": mgd,
+        "MTD": mtd
+    }
+
+def stats_measures_df(df, name1, name2, return_dict = False):
+    slope, intercept, rvalue, pvalue, stderr = stats.linregress(df[name1], df[name2])
+    mse = mean_squared_error(df[name1], df[name2])
+    r2 = rvalue ** 2
+    rmse = np.sqrt(mse)
+    mbe = (df[name2] - df[name1]).mean()
+    if return_dict:
+        return {
+            "R2": r2,
+            "SLOPE": slope,
+            "RMSE": rmse,
+            "MBE": mbe
+        }
+    else:
+        return [r2, slope, rmse, mbe]
+
+
+
+def get_r2(x, y):
+    try:
+        x_bar = x.mean()
+    except:
+        x_bar = np.mean(x)
+
+    r2 = 1 - np.sum((x - y)**2) / np.sum((x - x_bar)**2)
+    return r2
+
+def get_rmse(observations, estimates):
+    return np.sqrt(((estimates - observations) ** 2).mean())
+
+def calculate_R2(y_true, y_pred):
+    """
+    Calculate the R^2 (coefficient of determination).
+
+    Args:
+        y_true (array-like): Actual values of the dependent variable.
+        y_pred (array-like): Predicted values of the dependent variable.
+
+    Returns:
+        float: The R^2 value.
+    """
+    y_true = np.array(y_true)
+    y_pred = np.array(y_pred)
+
+    # Residual sum of squares
+    ss_res = np.sum((y_true - y_pred) ** 2)
+
+    # Total sum of squares
+    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
+
+    # R^2 calculation
+    R2 = 1 - (ss_res / ss_tot)
+    return R2
sciml/models.py
CHANGED
@@ -223,13 +223,12 @@ class SmartForest:
 
 """
 # ============================== Test Example ==============================
+import warnings
+import numpy as np
 from sklearn.datasets import load_diabetes
 from sklearn.datasets import fetch_california_housing
 from sklearn.model_selection import train_test_split
-
-
-
-warnings.simplefilter('ignore')
+from sklearn.metrics import mean_squared_error
 
 # X, y = load_diabetes(return_X_y=True) # Using diabetes dataset
 X, y = fetch_california_housing(return_X_y=True) # Using house price dataset
@@ -274,4 +273,524 @@ print("\nFinal RMSE:", rmse)
 # Output best model and RMSE
 best_model, best_rmse = regr.get_best_model()
 print("\nBest validation RMSE:", best_rmse)
-"""
+"""
+
+# ============================================================================================================================================================
+
+import numpy as np
+import copy
+import itertools
+from scipy import ndimage
+from xgboost import XGBRegressor
+from sklearn.metrics import mean_squared_error
+from sklearn.model_selection import train_test_split
+
+class SmartForest4D:
+    """
+    SmartForest4D is an ensemble learning model designed to handle complex 4D input data
+    (samples, time, spatial, features). It integrates ideas from gradient-boosted decision trees
+    (XGBoost) with LSTM-style forget gates and spatial max pooling.
+
+    The model builds layers of regressors, each layer taking the previous output as part of its
+    input (deep forest style). A forget gate mechanism is applied along the time dimension to
+    emphasize recent temporal information. Spatial max pooling is used to reduce dimensionality
+    across spatial units before flattening and feeding into the regressors.
+
+    Parameters:
+    -----------
+    n_estimators_per_layer : int
+        Number of XGBoost regressors per layer.
+
+    max_layers : int
+        Maximum number of layers in the deep forest.
+
+    early_stopping_rounds : int
+        Number of rounds without improvement on the validation set before early stopping.
+
+    param_grid : dict
+        Dictionary of hyperparameter lists to search over for XGBoost.
+
+    use_gpu : bool
+        Whether to use GPU for training XGBoost models.
+
+    gpu_id : int
+        GPU device ID to use if use_gpu is True.
+
+    kernel: np.ndarray
+        Convolutional kernel for spatial processing.
+        # ===============================
+        # 0. Do nothing
+        # ===============================
+        identity_kernel = np.array([
+            [0, 0, 0],
+            [0, 1, 0],
+            [0, 0, 0]
+        ])
+
+        # ===============================
+        # 1. Sobel Edge Detection Kernels
+        # ===============================
+        sobel_x = np.array([
+            [-1, 0, 1],
+            [-2, 0, 2],
+            [-1, 0, 1]
+        ])
+
+        sobel_y = np.array([
+            [-1, -2, -1],
+            [ 0,  0,  0],
+            [ 1,  2,  1]
+        ])
+
+        # ===============================
+        # 2. Gaussian Blur Kernel (3x3)
+        # ===============================
+        gaussian_kernel = (1/16) * np.array([
+            [1, 2, 1],
+            [2, 4, 2],
+            [1, 2, 1]
+        ])
+
+        # ===============================
+        # 3. Morphological Structuring Element (3x3 cross)
+        # Used in binary dilation/erosion
+        # ===============================
+        morph_kernel = np.array([
+            [0, 1, 0],
+            [1, 1, 1],
+            [0, 1, 0]
+        ])
+
+        # ===============================
+        # 4. Sharpening Kernel
+        # Enhances edges and contrast
+        # ===============================
+        sharpen_kernel = np.array([
+            [ 0, -1,  0],
+            [-1,  5, -1],
+            [ 0, -1,  0]
+        ])
+
+        # ===============================
+        # 5. Embossing Kernel
+        # Creates a 3D-like shadowed effect
+        # ===============================
+        emboss_kernel = np.array([
+            [-2, -1, 0],
+            [-1,  1, 1],
+            [ 0,  1, 2]
+        ])
+
+    spatial_h : int
+        The height of the 2D grid for the flattened spatial dimension.
+
+    spatial_w : int
+        The width of the 2D grid for the flattened spatial dimension.
+
+    forget_factor : float
+        Exponential decay rate applied along the time axis. Higher values mean stronger forgetting.
+
+    verbose : int
+        Verbosity level for training output.
+
+    Attributes:
+    -----------
+    layers : list
+        List of trained layers, each containing a list of regressors.
+
+    best_model : list
+        The set of layers corresponding to the best validation RMSE seen during training.
+
+    best_rmse : float
+        The lowest RMSE achieved on the validation set.
+
+    Methods:
+    --------
+    fit(X, y, X_val=None, y_val=None):
+        Train the SmartForest4D model on the given 4D input data.
+
+    predict(X):
+        Predict targets for new 4D input data using the trained model.
+
+    get_best_model():
+        Return the best set of layers and corresponding RMSE.
+
+    Notes:
+    ------
+    - The product of spatial_h and spatial_w must equal spatial_size (spatial_h * spatial_w = spatial_size).
+
+    Example:
+    --------
+    >>> model = SmartForest4D(n_estimators_per_layer=5, max_layers=10, early_stopping_rounds=3, forget_factor=0.3, verbose=1)
+    >>> model.fit(X_train, y_train, X_val, y_val)
+    >>> y_pred = model.predict(X_val)
+    >>> best_model, best_rmse = model.get_best_model()
+    """
+    def __init__(self, n_estimators_per_layer=5, max_layers=10, early_stopping_rounds=3, param_grid=None,
+                 use_gpu=False, gpu_id=0, kernel = np.array([[0, 0, 0], [0, 1, 0], [0, 0, 0]]), spatial_h=None, spatial_w=None,
+                 forget_factor=0.0, verbose=1):
+        self.n_estimators_per_layer = n_estimators_per_layer
+        self.max_layers = max_layers
+        self.early_stopping_rounds = early_stopping_rounds
+        self.param_grid = param_grid or {
+            "objective": ["reg:squarederror"],
+            "random_state": [42],
+            'n_estimators': [100],
+            'max_depth': [6],
+            'min_child_weight': [4],
+            'subsample': [0.8],
+            'colsample_bytree': [0.8],
+            'gamma': [0],
+            'reg_alpha': [0],
+            'reg_lambda': [1],
+            'learning_rate': [0.05],
+        }
+        self.use_gpu = use_gpu
+        self.gpu_id = gpu_id
+        self.kernel = kernel
+        self.spatial_h = spatial_h
+        self.spatial_w = spatial_w
+        self.forget_factor = forget_factor
+        self.layers = []
+        self.best_model = None
+        self.best_rmse = float("inf")
+        self.verbose = verbose
+        if (self.spatial_h is None) or (self.spatial_w is None):
+            raise ValueError("Please specify spatial_h and spatial_w")
+
+    def _get_param_combinations(self):
+        keys, values = zip(*self.param_grid.items())
+        return [dict(zip(keys, v)) for v in itertools.product(*values)]
+
+    def _prepare_input(self, X, y=None, apply_forget=False, layer_index=0):
+        # Ensure 4D: (samples, time, spatial, features)
+        if X.ndim == 2:
+            X = X[:, np.newaxis, np.newaxis, :]
+        elif X.ndim == 3:
+            X = X[:, :, np.newaxis, :]
+        elif X.ndim == 4:
+            pass
+        else:
+            raise ValueError("Input must be 2D, 3D, or 4D.")
+
+        n_samples, n_time, n_spatial, n_features = X.shape
+
+        if apply_forget and self.forget_factor > 0:
+            decay = np.exp(-self.forget_factor * np.arange(n_time))[::-1]
+            decay = decay / decay.sum()
+            decay = decay.reshape(1, n_time, 1, 1)
+            X = X * decay
+
+        # Apply convolutional kernels:
+        if n_spatial != 1:
+            if self.spatial_h * self.spatial_w != n_spatial: raise ValueError("spatial_h * spatial_w != n_spatial")
+            X_out = np.zeros_like(X)
+            for sample in range(X.shape[0]):
+                for t in range(X.shape[1]):
+                    for f in range(X.shape[3]):
+                        spatial_2d = X[sample, t, :, f].reshape(self.spatial_h, self.spatial_w)
+                        # Apply 2D convolution
+                        filtered = ndimage.convolve(spatial_2d, self.kernel, mode='constant', cval=0.0)
+                        # Flatten back to (n_spatial,) and store
+                        X_out[sample, t, :, f] = filtered.reshape(n_spatial)
+            X = X_out; del(X_out)
+        # Max pooling over spatial dim
+        X_pooled = X.max(axis=2)  # (samples, time, features)
+        X_flattened = X_pooled.reshape(n_samples, -1)  # (samples, time * features)
+        return X_flattened
+
+    def _fit_layer(self, X, y, X_val=None, y_val=None, layer_index=0):
+        layer = []
+        layer_outputs = []
+        param_combos = self._get_param_combinations()
+
+        for i in range(self.n_estimators_per_layer):
+            best_rmse = float('inf')
+            best_model = None
+
+            for params in param_combos:
+                if self.use_gpu:
+                    params['tree_method'] = 'hist'
+                    params['device'] = 'cuda'
+
+                params = params.copy()
+                params['random_state'] = i
+
+                model = XGBRegressor(**params)
+                model.fit(X, y)
+
+                if X_val is not None:
+                    preds_val = model.predict(X_val)
+                    rmse = np.sqrt(mean_squared_error(y_val, preds_val))
+                    if rmse < best_rmse:
+                        best_rmse = rmse
+                        best_model = model
+                else:
+                    best_model = model
+
+            preds = best_model.predict(X).reshape(-1, 1)
+            layer.append(best_model)
+            layer_outputs.append(preds)
+
+        output = np.hstack(layer_outputs)
+        return layer, output
+
+    def fit(self, X, y, X_val=None, y_val=None):
+        y = y.ravel()
+        X_current = self._prepare_input(X, apply_forget=True)
+        X_val_current = self._prepare_input(X_val, apply_forget=True) if X_val is not None else None
+
+        no_improve_rounds = 0
+
+        for layer_index in range(self.max_layers):
+            if self.verbose:
+                print(f"Training Layer {layer_index + 1}")
+
+            layer, output = self._fit_layer(X_current, y, X_val_current, y_val, layer_index)
+            self.layers.append(layer)
+            X_current = np.hstack([X_current, output])
+
+            if X_val is not None:
+                val_outputs = []
+                for reg in layer:
+                    n_features = reg.n_features_in_
+                    preds = reg.predict(X_val_current[:, :n_features]).reshape(-1, 1)
+                    val_outputs.append(preds)
+                val_output = np.hstack(val_outputs)
+                X_val_current = np.hstack([X_val_current, val_output])
+
+                y_pred = self.predict(X_val)
+                rmse = np.sqrt(mean_squared_error(y_val, y_pred))
+                if self.verbose:
+                    print(f"Validation RMSE: {rmse:.4f}")
+
+                if rmse < self.best_rmse:
+                    self.best_rmse = rmse
+                    self.best_model = copy.deepcopy(self.layers)
+                    no_improve_rounds = 0
+                    if self.verbose:
+                        print(f"✅ New best RMSE: {self.best_rmse:.4f}")
+                else:
+                    no_improve_rounds += 1
+                    if no_improve_rounds >= self.early_stopping_rounds:
+                        if self.verbose:
+                            print("Early stopping triggered.")
+                        break
+
+    def predict(self, X):
+        X_current = self._prepare_input(X, apply_forget=True)
+
+        for layer in self.layers:
+            layer_outputs = []
+            for reg in layer:
+                n_features = reg.n_features_in_
+                preds = reg.predict(X_current[:, :n_features]).reshape(-1, 1)
+                layer_outputs.append(preds)
+            output = np.hstack(layer_outputs)
+            X_current = np.hstack([X_current, output])
+
+        final_outputs = []
+        for reg in self.layers[-1]:
+            n_features = reg.n_features_in_
+            final_outputs.append(reg.predict(X_current[:, :n_features]).reshape(-1, 1))
+        return np.mean(np.hstack(final_outputs), axis=1)
+
+    def get_best_model(self):
+        return self.best_model, self.best_rmse
+
+"""
+# ============================== Test Example ==============================
+import numpy as np
+import copy
+import itertools
+from scipy import ndimage
+from xgboost import XGBRegressor
+from sklearn.metrics import mean_squared_error
+from sklearn.model_selection import train_test_split
+
+# Generate synthetic 4D data: (samples, time, spatial, features)
+# time order is like [t (today), t - 1 (yesterday), t - 2, ...]
+n_samples = 200
+n_time = 5
+n_spatial = 4
+n_features = 5
+
+np.random.seed(42)
+X = np.random.rand(n_samples, n_time, n_spatial, n_features)
+y = X[:, :3, :2, :4].mean(axis=(1, 2, 3)) + 0.1 * np.random.randn(n_samples)
+y = y.ravel()
+
+# Split
+X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
+
+# Train model
+model = SmartForest4D(
+    n_estimators_per_layer=5,
+    max_layers=20,
+    early_stopping_rounds=5,
+    spatial_h = 2,
+    spatial_w = 2,
+    forget_factor=0.1,
+    verbose=1
+)
+model.fit(X_train, y_train, X_val, y_val)
+
+# Predict
+y_pred = model.predict(X_val)
+rmse = np.sqrt(mean_squared_error(y_val, y_pred))
+print("\n✅ Final RMSE on validation set:", rmse)
+
+
+# Output best model and RMSE
+best_model, best_rmse = model.get_best_model()
+print("\nBest validation RMSE:", best_rmse)
+"""
+
+# ============================================================================================================================================================
+# Function mode
+
+import tensorflow as tf
+from tensorflow import keras
+from tensorflow.keras import layers
+from tensorflow.keras.models import load_model
+
+def srcnn(learning_rate=0.001):
+    """
+    Builds and compiles a Super-Resolution Convolutional Neural Network (SRCNN) model
+    that fuses features from both low-resolution and high-resolution images.
+
+    This model uses two parallel input streams:
+    - A low-resolution input which undergoes upscaling through convolutional layers.
+    - A high-resolution input from which texture features are extracted and fused with the low-resolution stream.
+
+    Args:
+        save_path (str, optional): Path to save the compiled model. If None, the model is not saved.
+        learning_rate (float): Learning rate for the Adam optimizer.
+
+    Returns:
+        keras.Model: A compiled Keras model ready for training.
+    """
+    # Input layers
+    lowres_input = layers.Input(shape=(None, None, 1))   # Low-resolution input
+    highres_input = layers.Input(shape=(None, None, 1))  # High-resolution image
+
+    # Feature extraction from high-resolution image
+    highres_features = layers.Conv2D(64, (3, 3), activation="relu", padding="same")(highres_input)
+    highres_features = layers.Conv2D(128, (3, 3), activation="relu", padding="same")(highres_features)
+
+    # Processing low-resolution input
+    x = layers.Conv2D(64, (3, 3), activation="relu", padding="same")(lowres_input)
+    x = layers.Conv2D(128, (3, 3), activation="relu", padding="same")(x)
+
+    # Fusion of high-resolution image textures
+    fusion = layers.Concatenate()([x, highres_features])
+    fusion = layers.Conv2D(128, (3, 3), activation="relu", padding="same")(fusion)
+    fusion = layers.Conv2D(64, (3, 3), activation="relu", padding="same")(fusion)
+
+    # Output
+    output = layers.Conv2D(1, (3, 3), activation="sigmoid", padding="same")(fusion)
+
+    model = keras.Model(inputs=[lowres_input, highres_input], outputs=output)
+    model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate), loss="mse")
+
+    return model
+
+def print_model(model):
+    return model.summary()
+
+def train(lowres_data, highres_data, epochs=100, batch_size=1, verbose=1, save_path=None):
+    model = srcnn()
+    # Train SRCNN
+    model.fit([modis_data_1, s2_data], s2_data, epochs=epochs, batch_size=batch_size, verbose=verbose)
+    # Save the complete model
+    # Recommended in newer versions of Keras (TensorFlow 2.11+): e.g., 'texture_fusion_model.keras'
+    if save_path: model.save(save_path)
+
+def apply(model, lowres_data_app, highres_data):
+    super_resolved = model.predict([lowres_data_app, highres_data]).squeeze()
+    super_resolved = xr.DataArray(
+        super_resolved,
+        dims = ("latitude", "longitude"),
+        coords={"latitude": highres_data.latitude, "longitude": highres_data.longitude},
+        name="super_res"
+    )
+    return super_resolved
+
+def load_model(save_path):
+    model = load_model('texture_fusion_model.keras')
+
+# ------------------------------------------------------------------------------------------------------------------------------------------------------------
+# Class mode
+
+import numpy as np
+import xarray as xr
+import tensorflow as tf
+from tensorflow import keras
+from tensorflow.keras import layers
+from tensorflow.keras.callbacks import EarlyStopping
+
+class TextureFusionSRCNN:
+    def __init__(self, learning_rate=0.001):
+        self.learning_rate = learning_rate
+        self.model = self._build_model()
+
+    def _build_model(self):
+        # Input layers
+        lowres_input = layers.Input(shape=(None, None, 1))   # Low-resolution input
+        highres_input = layers.Input(shape=(None, None, 1))  # High-resolution image
+
+        # Feature extraction from high-resolution image
+        highres_features = layers.Conv2D(64, (3, 3), activation="relu", padding="same")(highres_input)
+        highres_features = layers.Conv2D(128, (3, 3), activation="relu", padding="same")(highres_features)
+
+        # Processing low-resolution input
+        x = layers.Conv2D(64, (3, 3), activation="relu", padding="same")(lowres_input)
+        x = layers.Conv2D(128, (3, 3), activation="relu", padding="same")(x)
+
+        # Fusion of high-resolution image textures
+        fusion = layers.Concatenate()([x, highres_features])
+        fusion = layers.Conv2D(128, (3, 3), activation="relu", padding="same")(fusion)
+        fusion = layers.Conv2D(64, (3, 3), activation="relu", padding="same")(fusion)
+
+        # Output
+        output = layers.Conv2D(1, (3, 3), activation="sigmoid", padding="same")(fusion)
+
+        model = keras.Model(inputs=[lowres_input, highres_input], outputs=output)
+        model.compile(optimizer=keras.optimizers.Adam(learning_rate=self.learning_rate), loss="mse")
+
+        return model
+
+    def summary(self):
+        return self.model.summary()
+
+    def train(self, lowres_data, highres_data, epochs=100, batch_size=1, verbose=1, save_path=None):
+        early_stop = EarlyStopping(
+            monitor='loss',            # You can change to 'val_loss' if you add validation
+            patience=10,               # Number of epochs with no improvement after which training will be stopped
+            restore_best_weights=True
+        )
+
+        self.model.fit(
+            [lowres_data, highres_data], highres_data,
+            epochs=epochs,
+            batch_size=batch_size,
+            verbose=verbose,
+            callbacks=[early_stop]
+        )
+
+        if save_path:
+            self.model.save(save_path)
+
+    def apply(self, lowres_data_app, highres_data):
+        super_resolved = self.model.predict([lowres_data_app, highres_data]).squeeze()
+        return super_resolved
+
+    @staticmethod
+    def load(save_path):
+        model = keras.models.load_model(save_path)
+        instance = TextureFusionSRCNN()
+        instance.model = model
+        return instance
+
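A note on the function-mode SRCNN helpers above: `train` fits on `modis_data_1` and `s2_data`, names never defined in the module (its own `lowres_data`/`highres_data` arguments go unused), and `load_model` rebinds the imported Keras loader to itself (so the call recurses), hardcodes the filename, and returns nothing. A minimal sketch of the apparent intent; `train_fixed` and `load_model_fixed` are hypothetical names, not the released API:

# Sketch assuming srcnn() from this module is in scope.
from tensorflow import keras

def train_fixed(lowres_data, highres_data, epochs=100, batch_size=1, verbose=1, save_path=None):
    model = srcnn()
    # Fit on the arguments actually passed in: (low-res, high-res) -> high-res target
    model.fit([lowres_data, highres_data], highres_data,
              epochs=epochs, batch_size=batch_size, verbose=verbose)
    if save_path:
        model.save(save_path)  # e.g. 'texture_fusion_model.keras' (TF 2.11+ native format)
    return model

def load_model_fixed(save_path):
    # Use the path argument and return the loaded model
    return keras.models.load_model(save_path)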
sciml/pipelines.py
CHANGED
@@ -155,7 +155,7 @@ try:
     from tensorflow.keras import models
     # from keras.layers import Dropout
     from keras.callbacks import EarlyStopping
-    from scitbx.
+    from scitbx.utils import *
 except Exception as e:
     print(e)
 
@@ -173,263 +173,54 @@ def train_lstm(X_train, y_train, nfeature, ntime, verbose = 2, epochs = 200, bat
     model.fit(X_train, y_train, epochs = epochs, batch_size = batch_size, verbose=verbose)
     return model
 
-
-
-# ========================================================================================================
-import numpy as np
-from xgboost import XGBRegressor
-from sklearn.metrics import mean_squared_error
-
-class XGBoostDeepForestRegressor:
-    def __init__(self, n_estimators_per_layer=2, max_layers=20, early_stopping_rounds=2):
-        self.n_estimators_per_layer = n_estimators_per_layer
-        self.max_layers = max_layers
-        self.early_stopping_rounds = early_stopping_rounds
-        self.layers = []
-
-    def _fit_layer(self, X, y):
-        layer = []
-        layer_outputs = []
-        for _ in range(self.n_estimators_per_layer):
-            reg = XGBRegressor()
-            reg.fit(X, y)
-            preds = reg.predict(X).reshape(-1, 1)
-            layer.append(reg)
-            layer_outputs.append(preds)
-        output = np.hstack(layer_outputs)
-        return layer, output
-
-    def fit(self, X, y, X_val=None, y_val=None):
-        X_current = X.copy()
-        best_rmse = float("inf")
-        no_improve_rounds = 0
-
-        for layer_index in range(self.max_layers):
-            print(f"Training Layer {layer_index + 1}")
-            layer, output = self._fit_layer(X_current, y)
-            self.layers.append(layer)
-            X_current = np.hstack([X_current, output])
-
-            if X_val is not None:
-                y_pred = self.predict(X_val)
-                # rmse = mean_squared_error(y_val, y_pred, squared=False)
-                rmse = np.sqrt(mean_squared_error(y_val, y_pred))
-                print(f"Validation RMSE: {rmse:.4f}")
-
-                if rmse < best_rmse:
-                    best_rmse = rmse
-                    no_improve_rounds = 0
-                else:
-                    no_improve_rounds += 1
-                    if no_improve_rounds >= self.early_stopping_rounds:
-                        print("Early stopping triggered.")
-                        break
-
-    def predict(self, X):
-        X_current = X.copy()
-        for layer in self.layers:
-            layer_outputs = []
-            for reg in layer:
-                n_features = reg.n_features_in_
-                preds = reg.predict(X_current[:, :n_features]).reshape(-1, 1)
-                layer_outputs.append(preds)
-            output = np.hstack(layer_outputs)
-            X_current = np.hstack([X_current, output])
-
-        # Final prediction = average of last layer regressors
-        final_outputs = []
-        for reg in self.layers[-1]:
-            n_features = reg.n_features_in_
-            final_outputs.append(reg.predict(X_current[:, :n_features]).reshape(-1, 1))
-        return np.mean(np.hstack(final_outputs), axis=1)
-
-
-from sklearn.datasets import load_diabetes
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import mean_squared_error
-
-X, y = load_diabetes(return_X_y=True)
-X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
-
-df_reg = XGBoostDeepForestRegressor(n_estimators_per_layer=2, max_layers=5)
-df_reg.fit(X_train, y_train, X_val, y_val)
-
-y_pred = df_reg.predict(X_val)
-# rmse = mean_squared_error(y_val, y_pred, squared=False)
-rmse = np.sqrt(mean_squared_error(y_val, y_pred))
-print("Final RMSE:", rmse)
-
-# ----------------------------------------------------------------------------------------------------
-
+# ===============================================================================================================================
+# Training utils
 import numpy as np
-
-from sklearn.
-import itertools
-
-class XGBoostDeepForestRegressor:
-    def __init__(self, n_estimators_per_layer=2, max_layers=20, early_stopping_rounds=2, param_grid=None, use_gpu=True, gpu_id=0):
-        self.n_estimators_per_layer = n_estimators_per_layer
-        self.max_layers = max_layers
-        self.early_stopping_rounds = early_stopping_rounds
-        self.param_grid = param_grid or {
-            'max_depth': [3],
-            'learning_rate': [0.1],
-            'n_estimators': [100]
-        }
-        self.use_gpu = use_gpu
-        self.gpu_id = gpu_id
-        self.layers = []
-
-    def _get_param_combinations(self):
-        keys, values = zip(*self.param_grid.items())
-        return [dict(zip(keys, v)) for v in itertools.product(*values)]
-
-    def _fit_layer(self, X, y, X_val=None, y_val=None):
-        layer = []
-        layer_outputs = []
-        param_combos = self._get_param_combinations()
-
-        for i in range(self.n_estimators_per_layer):
-            best_rmse = float('inf')
-            best_model = None
-
-            for params in param_combos:
-                # Set GPU support parameters in XGBRegressor
-                if self.use_gpu:
-                    params['tree_method'] = 'hist'  # Use hist method
-                    params['device'] = 'cuda'       # Enable CUDA for GPU
-
-                model = XGBRegressor(**params)
-                model.fit(X, y)
-
-                if X_val is not None:
-                    preds_val = model.predict(X_val)
-                    rmse = np.sqrt(mean_squared_error(y_val, preds_val))
-                    if rmse < best_rmse:
-                        best_rmse = rmse
-                        best_model = model
-                else:
-                    best_model = model
-
-            final_model = best_model
-            preds = final_model.predict(X).reshape(-1, 1)
-            layer.append(final_model)
-            layer_outputs.append(preds)
-
-        output = np.hstack(layer_outputs)
-        return layer, output
-
-    def fit(self, X, y, X_val=None, y_val=None):
-        X_current = X.copy()
-        X_val_current = X_val.copy() if X_val is not None else None
-
-        best_rmse = float("inf")
-        no_improve_rounds = 0
-
-        for layer_index in range(self.max_layers):
-            print(f"Training Layer {layer_index + 1}")
-            layer, output = self._fit_layer(X_current, y, X_val_current, y_val)
-            self.layers.append(layer)
-            X_current = np.hstack([X_current, output])
-
-            if X_val is not None:
-                val_outputs = []
-                for reg in layer:
-                    n_features = reg.n_features_in_
-                    preds = reg.predict(X_val_current[:, :n_features]).reshape(-1, 1)
-                    val_outputs.append(preds)
-                val_output = np.hstack(val_outputs)
-                X_val_current = np.hstack([X_val_current, val_output])
-
-                y_pred = self.predict(X_val)
-                rmse = np.sqrt(mean_squared_error(y_val, y_pred))
-                print(f"Validation RMSE: {rmse:.4f}")
-
-                if rmse < best_rmse:
-                    best_rmse = rmse
-                    no_improve_rounds = 0
-                else:
-                    no_improve_rounds += 1
-                    if no_improve_rounds >= self.early_stopping_rounds:
-                        print("Early stopping triggered.")
-                        break
-
-    def predict(self, X):
-        X_current = X.copy()
-        for layer in self.layers:
-            layer_outputs = []
-            for reg in layer:
-                n_features = reg.n_features_in_
-                preds = reg.predict(X_current[:, :n_features]).reshape(-1, 1)
-                layer_outputs.append(preds)
-            output = np.hstack(layer_outputs)
-            X_current = np.hstack([X_current, output])
-
-        final_outputs = []
-        for reg in self.layers[-1]:
-            n_features = reg.n_features_in_
-            final_outputs.append(reg.predict(X_current[:, :n_features]).reshape(-1, 1))
-        return np.mean(np.hstack(final_outputs), axis=1)
-
-
-from sklearn.datasets import load_diabetes
+import pandas as pd
+from sklearn.model_selection import ShuffleSplit
 from sklearn.model_selection import train_test_split
-from sklearn.metrics import mean_squared_error
-
-# Load dataset
-X, y = load_diabetes(return_X_y=True)
-X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
-
-# Hyperparameter grid
-param_grid = {
-    'max_depth': [3, 4],
-    'learning_rate': [0.1, 0.05],
-    'n_estimators': [50, 100]
-}
-
-# Create and fit the model with GPU enabled
-df_reg = XGBoostDeepForestRegressor(
-    n_estimators_per_layer=2,
-    max_layers=5,
-    early_stopping_rounds=2,
-    param_grid=param_grid,
-    use_gpu=True,   # Enable GPU acceleration
-    gpu_id=0        # Default to the first GPU
-)
-
-df_reg.fit(X_train, y_train, X_val, y_val)
-
-# Final evaluation
-y_pred = df_reg.predict(X_val)
-rmse = np.sqrt(mean_squared_error(y_val, y_pred))
-print("Final RMSE:", rmse)
-
-# ----------------------------------------------------------------------------------------------------
-
-xgb_params = {
-    "objective": "reg:squarederror",
-    "random_state": 0,
-    'seed': 0,
-    'n_estimators': 100,
-    'max_depth': 6,
-    'min_child_weight': 4,
-    'subsample': 0.8,
-    'colsample_bytree': 0.8,
-    'gamma': 0,
-    'reg_alpha': 0,
-    'reg_lambda': 1,
-    'learning_rate': 0.05,
-}
 
-
-
-
-
-
-
-
-
+# randomly select sites
+def random_select(ds, count, num, random_state = 0):
+    np.random.seed(random_state)
+    idxs = np.random.choice(np.delete(np.arange(len(ds)), count), num, replace = False)
+    return np.sort(idxs)
+
+def split(Xs, ys, return_index = False, test_size = 0.33, random_state = 42):
+    if return_index:
+        sss = ShuffleSplit(n_splits=1, test_size = test_size, random_state = random_state)
+        sss.get_n_splits(Xs, ys)
+        train_index, test_index = next(sss.split(Xs, ys))
+        return (train_index, test_index)
+    else:
+        X_train, X_test, y_train, y_test = train_test_split(
+            Xs, ys,
+            test_size = test_size,
+            random_state = random_state
+        )
+        return (X_train, X_test, y_train, y_test)
+
+def split_cut(Xs, ys, test_ratio = 0.33):
+    """
+    Split the timeseries into before and after halves
+    """
+    assert ys.ndim == 2, 'ys must be 2D!'
+    assert len(Xs) == len(ys), 'Xs and ys should be equally long!'
+    assert type(Xs) == type(ys), 'Xs and ys should be the same data type!'
+    if not type(Xs) in [pd.core.frame.DataFrame, np.ndarray]: raise Exception('Only accept numpy ndarray or pandas dataframe')
+    anchor = int(np.floor(len(ys) * (1 - test_ratio)))
+
+    if type(Xs) == pd.core.frame.DataFrame:
+        X_train = Xs.iloc[0: anchor, :]
+        X_test = Xs.iloc[anchor::, :]
+        y_train = ys.iloc[0: anchor, :]
+        y_test = ys.iloc[anchor::, :]
+    else:
+        X_train = Xs[0: anchor, :]
+        X_test = Xs[anchor::, :]
+        y_train = ys[0: anchor, :]
+        y_test = ys[anchor::, :]
 
-
+    assert len(X_train) + len(X_test) == len(Xs), 'The sum of train and test lengths must equal to Xs/ys!'
 
-
+    return (X_train, X_test, y_train, y_test)
sciml/regress2.py
ADDED
@@ -0,0 +1,217 @@
+# Model type I and II regression, including RMA (reduced major axis regression)
+
+"""
+Credit: UMaine MISC Lab; emmanuel.boss@maine.edu
+http://misclab.umeoce.maine.edu/
+https://github.com/OceanOptics
+------------------------------------------------------------------------------
+MIT License
+
+Copyright (c) [year] [fullname]
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+
+import statsmodels.api as sm
+import numpy as np
+
+
+def regress2(_x, _y, _method_type_1 = "ordinary least square",
+             _method_type_2 = "reduced major axis",
+             _weight_x = [], _weight_y = [], _need_intercept = True):
+    # Regression Type II based on statsmodels
+    # Type II regressions are recommended if there is variability on both x and y
+    # It's computing the linear regression type I for (x,y) and (y,x)
+    # and then average relationship with one of the type II methods
+    #
+    # INPUT:
+    #   _x <np.array>
+    #   _y <np.array>
+    #   _method_type_1 <str> method to use for regression type I:
+    #     ordinary least square or OLS <default>
+    #     weighted least square or WLS
+    #     robust linear model or RLM
+    #   _method_type_2 <str> method to use for regression type II:
+    #     major axis
+    #     reduced major axis <default> (also known as geometric mean)
+    #     arithmetic mean
+    #   _need_intercept <bool>
+    #     True <default> add a constant to relation (y = a x + b)
+    #     False force relation by 0 (y = a x)
+    #   _weight_x <np.array> containing the weight of x
+    #   _weight_y <np.array> containing the weight of y
+    #
+    # OUTPUT:
+    #   slope
+    #   intercept
+    #   r
+    #   std_slope
+    #   std_intercept
+    #   predict
+    #
+    # REQUIRE:
+    #   numpy
+    #   statsmodels
+    #
+    # The code is based on the matlab function of MBARI.
+    # AUTHOR: Nils Haentjens
+    # REFERENCE: https://www.mbari.org/products/research-software/matlab-scripts-linear-regressions/
+
+    # Check input
+    if _method_type_2 != "reduced major axis" and _method_type_1 != "ordinary least square":
+        raise ValueError("'" + _method_type_2 + "' only supports '" + _method_type_1 + "' method as type 1.")
+
+    # Set x, y depending on intercept requirement
+    if _need_intercept:
+        x_intercept = sm.add_constant(_x)
+        y_intercept = sm.add_constant(_y)
+
+    # Compute Regression Type I (if type II requires it)
+    if (_method_type_2 == "reduced major axis" or
+            _method_type_2 == "geometric mean"):
+        if _method_type_1 == "OLS" or _method_type_1 == "ordinary least square":
+            if _need_intercept:
+                [intercept_a, slope_a] = sm.OLS(_y, x_intercept).fit().params
+                [intercept_b, slope_b] = sm.OLS(_x, y_intercept).fit().params
+            else:
+                slope_a = sm.OLS(_y, _x).fit().params
+                slope_b = sm.OLS(_x, _y).fit().params
+        elif _method_type_1 == "WLS" or _method_type_1 == "weighted least square":
+            if _need_intercept:
+                [intercept_a, slope_a] = sm.WLS(
+                    _y, x_intercept, weights=1. / _weight_y).fit().params
+                [intercept_b, slope_b] = sm.WLS(
+                    _x, y_intercept, weights=1. / _weight_x).fit().params
+            else:
+                slope_a = sm.WLS(_y, _x, weights=1. / _weight_y).fit().params
+                slope_b = sm.WLS(_x, _y, weights=1. / _weight_x).fit().params
+        elif _method_type_1 == "RLM" or _method_type_1 == "robust linear model":
+            if _need_intercept:
+                [intercept_a, slope_a] = sm.RLM(_y, x_intercept).fit().params
+                [intercept_b, slope_b] = sm.RLM(_x, y_intercept).fit().params
+            else:
+                slope_a = sm.RLM(_y, _x).fit().params
+                slope_b = sm.RLM(_x, _y).fit().params
+        else:
+            raise ValueError("Invalid literal for _method_type_1: " + _method_type_1)
+
+    # Compute Regression Type II
+    if (_method_type_2 == "reduced major axis" or
+            _method_type_2 == "geometric mean"):
+        # Transpose coefficients
+        if _need_intercept:
+            intercept_b = -intercept_b / slope_b
+        slope_b = 1 / slope_b
+        # Check if correlated in same direction
+        if np.sign(slope_a) != np.sign(slope_b):
+            raise RuntimeError('Type I regressions of opposite sign.')
+        # Compute Reduced Major Axis Slope
+        slope = np.sign(slope_a) * np.sqrt(slope_a * slope_b)
+        if _need_intercept:
+            # Compute Intercept (use mean for least square)
+            if _method_type_1 == "OLS" or _method_type_1 == "ordinary least square":
+                intercept = np.mean(_y) - slope * np.mean(_x)
+            else:
+                intercept = np.median(_y) - slope * np.median(_x)
+        else:
+            intercept = 0
+        # Compute r
+        r = np.sign(slope_a) * np.sqrt(slope_a / slope_b)
+        # Compute predicted values
+        predict = slope * _x + intercept
+        # Compute standard deviation of the slope and the intercept
+        n = len(_x)
+        diff = _y - predict
+        Sx2 = np.sum(np.multiply(_x, _x))
+        den = n * Sx2 - np.sum(_x) ** 2
+        s2 = np.sum(np.multiply(diff, diff)) / (n - 2)
+        std_slope = np.sqrt(n * s2 / den)
+        if _need_intercept:
+            std_intercept = np.sqrt(Sx2 * s2 / den)
+        else:
+            std_intercept = 0
+    elif (_method_type_2 == "Pearson's major axis" or
+            _method_type_2 == "major axis"):
+        if not _need_intercept:
+            raise ValueError("Invalid value for _need_intercept: " + str(_need_intercept))
+        xm = np.mean(_x)
+        ym = np.mean(_y)
+        xp = _x - xm
+        yp = _y - ym
+        sumx2 = np.sum(np.multiply(xp, xp))
+        sumy2 = np.sum(np.multiply(yp, yp))
+        sumxy = np.sum(np.multiply(xp, yp))
+        slope = ((sumy2 - sumx2 + np.sqrt((sumy2 - sumx2)**2 + 4 * sumxy**2)) /
+                 (2 * sumxy))
+        intercept = ym - slope * xm
+        # Compute r
+        r = sumxy / np.sqrt(sumx2 * sumy2)
+        # Compute standard deviation of the slope and the intercept
+        n = len(_x)
+        std_slope = (slope / r) * np.sqrt((1 - r ** 2) / n)
+        sigx = np.sqrt(sumx2 / (n - 1))
+        sigy = np.sqrt(sumy2 / (n - 1))
+        std_i1 = (sigy - sigx * slope) ** 2
+        std_i2 = (2 * sigx * sigy) + ((xm ** 2 * slope * (1 + r)) / r ** 2)
+        std_intercept = np.sqrt((std_i1 + ((1 - r) * slope * std_i2)) / n)
+        # Compute predicted values
+        predict = slope * _x + intercept
+    elif _method_type_2 == "arithmetic mean":
+        if not _need_intercept:
+            raise ValueError("Invalid value for _need_intercept: " + str(_need_intercept))
+        n = len(_x)
+        sg = np.floor(n / 2)
+        # Sort x and y in order of x
+        sorted_index = sorted(range(len(_x)), key=lambda i: _x[i])
+        x_w = np.array([_x[i] for i in sorted_index])
+        y_w = np.array([_y[i] for i in sorted_index])
+        x1 = x_w[1:sg + 1]
+        x2 = x_w[sg:n]
+        y1 = y_w[1:sg + 1]
+        y2 = y_w[sg:n]
+        x1m = np.mean(x1)
+        x2m = np.mean(x2)
+        y1m = np.mean(y1)
+        y2m = np.mean(y2)
+        xm = (x1m + x2m) / 2
+        ym = (y1m + y2m) / 2
+        slope = (x2m - x1m) / (y2m - y1m)
+        intercept = ym - xm * slope
+        # r (to verify)
+        r = []
+        # Compute predicted values
+        predict = slope * _x + intercept
+        # Compute standard deviation of the slope and the intercept
+        std_slope = []
+        std_intercept = []
+
+    # Return all that
+    return {"slope": float(slope), "intercept": intercept, "r": r,
+            "std_slope": std_slope, "std_intercept": std_intercept,
+            "predict": predict}
+
+
+# if __name__ == '__main__':
+#     x = np.linspace(0, 10, 100)
+#     # Add random error on y
+#     e = np.random.normal(size=len(x))
+#     y = x + e
+#     results = regress2(x, y, _method_type_2="reduced major axis",
+#                        _need_intercept=False)
+#     # print(results)
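For orientation, the reduced-major-axis branch above is the standard geometric-mean estimator. With type-I OLS slopes $b_{y \cdot x}$ (y on x) and $b_{x \cdot y}$ (x on y),

$$b_{\mathrm{RMA}} = \operatorname{sign}(r)\,\sqrt{\frac{b_{y \cdot x}}{b_{x \cdot y}}} = \operatorname{sign}(r)\,\frac{s_y}{s_x}, \qquad a_{\mathrm{RMA}} = \bar{y} - b_{\mathrm{RMA}}\,\bar{x},$$

which is exactly what the code computes: after the transpose step `slope_b = 1 / slope_b`, the product `slope_a * slope_b` equals $b_{y \cdot x} / b_{x \cdot y}$.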
sciml-0.0.11.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
+sciml/__init__.py,sha256=OglTSUWcPOHOFqTObFkbwIpv-ZStEl-iMHRZG9aT2pU,80
+sciml/ccc.py,sha256=uQryOK1y2w3iLPhC2AScXFfbcvc5gMXjRAibYD38GkQ,1245
+sciml/metrics.py,sha256=wLO1bka7GeXEbMT-w3ZZAwFt1TH0A4U3wf1-TkGtDuM,3699
+sciml/models.py,sha256=UD8wOOJTeVrjzHM_OZmfNIGq32l4g4-6ZZoPwodcMhU,30104
+sciml/pipelines.py,sha256=vhWbyoOi7-7F7v65ShMFi0aEBj08JhjHh-JDAxdc65c,8606
+sciml/regress2.py,sha256=GvD3eQPRdzNSvOBhdcKd08NDg56CHlNZSQgwx5aN_bY,9194
+sciml-0.0.11.dist-info/LICENSE,sha256=hcunSTJmVgRcUNOa1rKl8axtY3Jsy2B4wXDYtQsrAt0,1081
+sciml-0.0.11.dist-info/METADATA,sha256=1FCJe3TgTnVW2jQLKfP0CRf2u2ghuJF1P-2dPWOjExg,327
+sciml-0.0.11.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+sciml-0.0.11.dist-info/top_level.txt,sha256=dS_7aBCZFKQE3myPy5sh4USjQZCZyGg382-YxUUYcdw,6
+sciml-0.0.11.dist-info/RECORD,,
sciml/utils.py
DELETED
@@ -1,46 +0,0 @@
-import numpy as np
-import pandas as pd
-from sklearn.model_selection import ShuffleSplit
-from sklearn.model_selection import train_test_split
-
-# randomly select sites
-def random_select(ds, count, num, random_state = 0):
-    np.random.seed(random_state)
-    idxs = np.random.choice(np.delete(np.arange(len(ds)), count), num, replace = False)
-    return np.sort(idxs)
-
-def split(Xs, ys, return_index = False, test_size = 0.33, random_state = 42):
-    if return_index:
-        sss = ShuffleSplit(n_splits=1, test_size = test_size, random_state = random_state)
-        sss.get_n_splits(Xs, ys)
-        train_index, test_index = next(sss.split(Xs, ys))
-        return (train_index, test_index)
-    else:
-        X_train, X_test, y_train, y_test = train_test_split(
-            Xs, ys,
-            test_size = test_size,
-            random_state = random_state
-        )
-        return (X_train, X_test, y_train, y_test)
-
-def split_cut(Xs, ys, test_ratio = 0.33):
-    assert ys.ndim == 2, 'ys must be 2D!'
-    assert len(Xs) == len(ys), 'Xs and ys should be equally long!'
-    assert type(Xs) == type(ys), 'Xs and ys should be the same data type!'
-    if not type(Xs) in [pd.core.frame.DataFrame, np.ndarray]: raise Exception('Only accept numpy ndarray or pandas dataframe')
-    anchor = int(np.floor(len(ys) * (1 - test_ratio)))
-
-    if type(Xs) == pd.core.frame.DataFrame:
-        X_train = Xs.iloc[0: anchor, :]
-        X_test = Xs.iloc[anchor::, :]
-        y_train = ys.iloc[0: anchor, :]
-        y_test = ys.iloc[anchor::, :]
-    else:
-        X_train = Xs[0: anchor, :]
-        X_test = Xs[anchor::, :]
-        y_train = ys[0: anchor, :]
-        y_test = ys[anchor::, :]
-
-    assert len(X_train) + len(X_test) == len(Xs), 'The sum of train and test lengths must equal to Xs/ys!'
-
-    return (X_train, X_test, y_train, y_test)
sciml-0.0.9.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-sciml/__init__.py,sha256=wtdlXERN2ik7NT_TQxFdd2gdodBY9vSU1ClSdeJnLm4,59
-sciml/models.py,sha256=BjbliW-KNfzbNdGNgM7nBdJ2SF2z21qCoAvug_v0FEg,10574
-sciml/pipelines.py,sha256=ReNEkQbdFn04D5G2tbxcA7jdSwACy8SnmZ8bFZI_oqE,15702
-sciml/utils.py,sha256=qCdABaTUu3K0R269jI7D_8SO6AqEjphg03CzdxCJR2k,1876
-sciml-0.0.9.dist-info/LICENSE,sha256=hcunSTJmVgRcUNOa1rKl8axtY3Jsy2B4wXDYtQsrAt0,1081
-sciml-0.0.9.dist-info/METADATA,sha256=S5hG3pP3x4yDPe8AJOKn4R-fIuvL-DL1GSKqGqiImSw,326
-sciml-0.0.9.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-sciml-0.0.9.dist-info/top_level.txt,sha256=dS_7aBCZFKQE3myPy5sh4USjQZCZyGg382-YxUUYcdw,6
-sciml-0.0.9.dist-info/RECORD,,
{sciml-0.0.9.dist-info → sciml-0.0.11.dist-info}/LICENSE
File without changes

{sciml-0.0.9.dist-info → sciml-0.0.11.dist-info}/WHEEL
File without changes

{sciml-0.0.9.dist-info → sciml-0.0.11.dist-info}/top_level.txt
File without changes