sciml 0.0.8__py3-none-any.whl → 0.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sciml/__init__.py CHANGED
@@ -1,2 +1,2 @@
  # coding: utf-8
- __all__ = ["utils", "pipelines", "models"]
+ __all__ = ["pipelines", "models", "metrics", "regress2", "ccc"]
sciml/ccc.py ADDED
@@ -0,0 +1,36 @@
+ # https://rowannicholls.github.io/python/statistics/agreement/correlation_coefficients.html#lins-concordance-correlation-coefficient-ccc
+ # Lin LIK (1989). “A concordance correlation coefficient to evaluate reproducibility”. Biometrics. 45 (1):255-268.
+ import numpy as np
+ import pandas as pd
+
+ def concordance_correlation_coefficient(y_true, y_pred):
+     """Concordance correlation coefficient."""
+     # Remove NaNs (rows with a NaN in either column are dropped)
+     df = pd.DataFrame({
+         'y_true': y_true,
+         'y_pred': y_pred
+     })
+     df = df.dropna()
+     y_true = df['y_true']
+     y_pred = df['y_pred']
+     # Pearson product-moment correlation coefficient
+     cor = np.corrcoef(y_true, y_pred)[0][1]
+     # Means
+     mean_true = np.mean(y_true)
+     mean_pred = np.mean(y_pred)
+     # Population variances
+     var_true = np.var(y_true)
+     var_pred = np.var(y_pred)
+     # Population standard deviations
+     sd_true = np.std(y_true)
+     sd_pred = np.std(y_pred)
+     # Calculate CCC
+     numerator = 2 * cor * sd_true * sd_pred
+     denominator = var_true + var_pred + (mean_true - mean_pred)**2
+     return numerator / denominator
+
+
+ # y_true = [3, -0.5, 2, 7, np.nan]
+ # y_pred = [2.5, 0.0, 2, 8, 3]
+ # ccc = concordance_correlation_coefficient(y_true, y_pred)
+ # print(ccc)
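For quick reference, a minimal usage sketch of the new module (the sample values are illustrative only, taken from the commented example above):

    import numpy as np
    from sciml.ccc import concordance_correlation_coefficient

    y_true = [3, -0.5, 2, 7, np.nan]  # illustrative data; the NaN row is dropped internally
    y_pred = [2.5, 0.0, 2, 8, 3]
    print(concordance_correlation_coefficient(y_true, y_pred))  # ~0.977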
sciml/metrics.py ADDED
@@ -0,0 +1,123 @@
+ import numpy as np
+ import pandas as pd
+ from scipy import stats
+ from sklearn.metrics import explained_variance_score, max_error, mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score, mean_poisson_deviance, mean_gamma_deviance, mean_tweedie_deviance
+
+ def stats_summary(df):
+     min_ = df.min().to_frame().T
+     Q1 = df.quantile(0.25).to_frame().T
+     median_ = df.quantile(0.5).to_frame().T
+     mean_ = df.mean().to_frame().T
+     Q3 = df.quantile(0.75).to_frame().T
+     max_ = df.max().to_frame().T
+     df_stats = pd.concat([min_, Q1, median_, mean_, Q3, max_])
+     df_stats.index = ["Min", "Q1", "Median", "Mean", "Q3", "Max"]
+     return df_stats
+
+ def stats_measures(x, y, return_dict = False):
+     slope, intercept, rvalue, pvalue, stderr = stats.linregress(x, y)
+     mse = mean_squared_error(x, y)
+     r2 = rvalue ** 2
+     rmse = np.sqrt(mse)
+     mbe = (y - x).mean()
+     if return_dict:
+         return {
+             "R2": r2,
+             "SLOPE": slope,
+             "RMSE": rmse,
+             "MBE": mbe
+         }
+     else:
+         return [r2, slope, rmse, mbe]
+
+ def stats_measures_full(x, y):
+     # from sklearn.metrics import mean_absolute_percentage_error
+     slope, intercept, rvalue, pvalue, stderr = stats.linregress(x, y)
+     mse = mean_squared_error(x, y)
+     r2 = rvalue ** 2
+     rmse = np.sqrt(mse)
+     mbe = (y - x).mean()
+     # ----------------------------------------------------------------
+     pearsonr = stats.pearsonr(x, y)
+     evs = explained_variance_score(x, y)
+     me = max_error(x, y)
+     mae = mean_absolute_error(x, y)
+     msle = mean_squared_log_error(x, y)
+     meae = median_absolute_error(x, y)
+     r2s = r2_score(x, y) # renamed local so the imported r2_score function is not shadowed
+     mpd = mean_poisson_deviance(x, y)
+     mgd = mean_gamma_deviance(x, y)
+     mtd = mean_tweedie_deviance(x, y)
+     return {
+         "R2": r2,
+         "SLOPE": slope,
+         "RMSE": rmse,
+         "MBE": mbe,
+         "INTERCEPT": intercept,
+         "PVALUE": pvalue,
+         "STDERR": stderr,
+         "PEARSON": pearsonr,
+         "EXPLAINED_VARIANCE": evs,
+         "MAXERR": me,
+         "MAE": mae,
+         "MSLE": msle,
+         "MEDIAN_AE": meae,
+         "R2_SCORE": r2s,
+         "MPD": mpd,
+         "MGD": mgd,
+         "MTD": mtd
+     }
+
+ def stats_measures_df(df, name1, name2, return_dict = False):
+     slope, intercept, rvalue, pvalue, stderr = stats.linregress(df[name1], df[name2])
+     mse = mean_squared_error(df[name1], df[name2])
+     r2 = rvalue ** 2
+     rmse = np.sqrt(mse)
+     mbe = (df[name2] - df[name1]).mean()
+     if return_dict:
+         return {
+             "R2": r2,
+             "SLOPE": slope,
+             "RMSE": rmse,
+             "MBE": mbe
+         }
+     else:
+         return [r2, slope, rmse, mbe]
+
+
+
+ def get_r2(x, y):
+     try:
+         x_bar = x.mean()
+     except AttributeError:
+         x_bar = np.mean(x)
+
+     r2 = 1 - np.sum((x - y)**2) / np.sum((x - x_bar)**2)
+     return r2
+
+ def get_rmse(observations, estimates):
+     return np.sqrt(((estimates - observations) ** 2).mean())
+
+ def calculate_R2(y_true, y_pred):
+     """
+     Calculate the R^2 (coefficient of determination).
+
+     Args:
+         y_true (array-like): Actual values of the dependent variable.
+         y_pred (array-like): Predicted values of the dependent variable.
+
+     Returns:
+         float: The R^2 value.
+     """
+     y_true = np.array(y_true)
+     y_pred = np.array(y_pred)
+
+     # Residual sum of squares
+     ss_res = np.sum((y_true - y_pred) ** 2)
+
+     # Total sum of squares
+     ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
+
+     # R^2 calculation
+     R2 = 1 - (ss_res / ss_tot)
+     return R2
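A minimal sketch of calling the new metrics helpers (the array values below are illustrative, not from the package):

    import numpy as np
    from sciml.metrics import stats_measures

    x = np.array([1.0, 2.0, 3.0, 4.0])  # illustrative observations
    y = np.array([1.1, 1.9, 3.2, 3.8])  # illustrative estimates
    r2, slope, rmse, mbe = stats_measures(x, y)  # default list return order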
sciml/models.py CHANGED
@@ -142,6 +142,9 @@ class SmartForest:
  params['tree_method'] = 'hist'
  params['device'] = 'cuda'

+ params = params.copy() # Prevent modification from affecting the next loop iteration
+ params['random_state'] = i # Use a different random seed for each model to enhance diversity
+
  model = XGBRegressor(**params)
  model.fit(X, y)

@@ -220,11 +223,15 @@ class SmartForest:

  """
  # ============================== Test Example ==============================
+ import warnings
+ import numpy as np
  from sklearn.datasets import load_diabetes
+ from sklearn.datasets import fetch_california_housing
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import mean_squared_error

- warnings.simplefilter('ignore')
-
- X, y = load_diabetes(return_X_y=True)
+ # X, y = load_diabetes(return_X_y=True) # Using diabetes dataset
+ X, y = fetch_california_housing(return_X_y=True) # Using house price dataset
  X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

  # Hyperparameter grid
@@ -244,7 +251,7 @@ param_grid = {
  }

  # Create the model with Multi-Grained Scanning enabled (with window sizes 2 and 3)
- df_reg = SmartForest(
+ regr = SmartForest(
      n_estimators_per_layer = 5,
      max_layers = 10,
      early_stopping_rounds = 5,
@@ -256,14 +263,14 @@ df_reg = SmartForest(
      verbose = 1
  )

- df_reg.fit(X_train, y_train, X_val, y_val)
+ regr.fit(X_train, y_train, X_val, y_val)

  # Predict on validation set and evaluate
- y_pred = df_reg.predict(X_val)
+ y_pred = regr.predict(X_val)
  rmse = np.sqrt(mean_squared_error(y_val, y_pred))
  print("\nFinal RMSE:", rmse)

  # Output best model and RMSE
- best_model, best_rmse = df_reg.get_best_model()
+ best_model, best_rmse = regr.get_best_model()
  print("\nBest validation RMSE:", best_rmse)
  """
sciml/pipelines.py CHANGED
@@ -155,7 +155,7 @@ try:
      from tensorflow.keras import models
      # from keras.layers import Dropout
      from keras.callbacks import EarlyStopping
-     from scitbx.stutils import *
+     from scitbx.utils import *
  except Exception as e:
      print(e)

@@ -173,263 +173,54 @@ def train_lstm(X_train, y_train, nfeature, ntime, verbose = 2, epochs = 200, bat
      model.fit(X_train, y_train, epochs = epochs, batch_size = batch_size, verbose=verbose)
      return model

-
- '''
- # ========================================================================================================
- import numpy as np
- from xgboost import XGBRegressor
- from sklearn.metrics import mean_squared_error
-
- class XGBoostDeepForestRegressor:
-     def __init__(self, n_estimators_per_layer=2, max_layers=20, early_stopping_rounds=2):
-         self.n_estimators_per_layer = n_estimators_per_layer
-         self.max_layers = max_layers
-         self.early_stopping_rounds = early_stopping_rounds
-         self.layers = []
-
-     def _fit_layer(self, X, y):
-         layer = []
-         layer_outputs = []
-         for _ in range(self.n_estimators_per_layer):
-             reg = XGBRegressor()
-             reg.fit(X, y)
-             preds = reg.predict(X).reshape(-1, 1)
-             layer.append(reg)
-             layer_outputs.append(preds)
-         output = np.hstack(layer_outputs)
-         return layer, output
-
-     def fit(self, X, y, X_val=None, y_val=None):
-         X_current = X.copy()
-         best_rmse = float("inf")
-         no_improve_rounds = 0
-
-         for layer_index in range(self.max_layers):
-             print(f"Training Layer {layer_index + 1}")
-             layer, output = self._fit_layer(X_current, y)
-             self.layers.append(layer)
-             X_current = np.hstack([X_current, output])
-
-             if X_val is not None:
-                 y_pred = self.predict(X_val)
-                 # rmse = mean_squared_error(y_val, y_pred, squared=False)
-                 rmse = np.sqrt(mean_squared_error(y_val, y_pred))
-                 print(f"Validation RMSE: {rmse:.4f}")
-
-                 if rmse < best_rmse:
-                     best_rmse = rmse
-                     no_improve_rounds = 0
-                 else:
-                     no_improve_rounds += 1
-                     if no_improve_rounds >= self.early_stopping_rounds:
-                         print("Early stopping triggered.")
-                         break
-
-     def predict(self, X):
-         X_current = X.copy()
-         for layer in self.layers:
-             layer_outputs = []
-             for reg in layer:
-                 n_features = reg.n_features_in_
-                 preds = reg.predict(X_current[:, :n_features]).reshape(-1, 1)
-                 layer_outputs.append(preds)
-             output = np.hstack(layer_outputs)
-             X_current = np.hstack([X_current, output])
-
-         # Final prediction = average of last layer regressors
-         final_outputs = []
-         for reg in self.layers[-1]:
-             n_features = reg.n_features_in_
-             final_outputs.append(reg.predict(X_current[:, :n_features]).reshape(-1, 1))
-         return np.mean(np.hstack(final_outputs), axis=1)
-
-
- from sklearn.datasets import load_diabetes
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import mean_squared_error
-
- X, y = load_diabetes(return_X_y=True)
- X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
-
- df_reg = XGBoostDeepForestRegressor(n_estimators_per_layer=2, max_layers=5)
- df_reg.fit(X_train, y_train, X_val, y_val)
-
- y_pred = df_reg.predict(X_val)
- # rmse = mean_squared_error(y_val, y_pred, squared=False)
- rmse = np.sqrt(mean_squared_error(y_val, y_pred))
- print("Final RMSE:", rmse)
-
- # ----------------------------------------------------------------------------------------------------
-
+ # ===============================================================================================================================
+ # Training utils
  import numpy as np
- from xgboost import XGBRegressor
- from sklearn.metrics import mean_squared_error
- import itertools
-
- class XGBoostDeepForestRegressor:
-     def __init__(self, n_estimators_per_layer=2, max_layers=20, early_stopping_rounds=2, param_grid=None, use_gpu=True, gpu_id=0):
-         self.n_estimators_per_layer = n_estimators_per_layer
-         self.max_layers = max_layers
-         self.early_stopping_rounds = early_stopping_rounds
-         self.param_grid = param_grid or {
-             'max_depth': [3],
-             'learning_rate': [0.1],
-             'n_estimators': [100]
-         }
-         self.use_gpu = use_gpu
-         self.gpu_id = gpu_id
-         self.layers = []
-
-     def _get_param_combinations(self):
-         keys, values = zip(*self.param_grid.items())
-         return [dict(zip(keys, v)) for v in itertools.product(*values)]
-
-     def _fit_layer(self, X, y, X_val=None, y_val=None):
-         layer = []
-         layer_outputs = []
-         param_combos = self._get_param_combinations()
-
-         for i in range(self.n_estimators_per_layer):
-             best_rmse = float('inf')
-             best_model = None
-
-             for params in param_combos:
-                 # Set GPU support parameters in XGBRegressor
-                 if self.use_gpu:
-                     params['tree_method'] = 'hist' # Use hist method
-                     params['device'] = 'cuda' # Enable CUDA for GPU
-
-                 model = XGBRegressor(**params)
-                 model.fit(X, y)
-
-                 if X_val is not None:
-                     preds_val = model.predict(X_val)
-                     rmse = np.sqrt(mean_squared_error(y_val, preds_val))
-                     if rmse < best_rmse:
-                         best_rmse = rmse
-                         best_model = model
-                 else:
-                     best_model = model
-
-             final_model = best_model
-             preds = final_model.predict(X).reshape(-1, 1)
-             layer.append(final_model)
-             layer_outputs.append(preds)
-
-         output = np.hstack(layer_outputs)
-         return layer, output
-
-     def fit(self, X, y, X_val=None, y_val=None):
-         X_current = X.copy()
-         X_val_current = X_val.copy() if X_val is not None else None
-
-         best_rmse = float("inf")
-         no_improve_rounds = 0
-
-         for layer_index in range(self.max_layers):
-             print(f"Training Layer {layer_index + 1}")
-             layer, output = self._fit_layer(X_current, y, X_val_current, y_val)
-             self.layers.append(layer)
-             X_current = np.hstack([X_current, output])
-
-             if X_val is not None:
-                 val_outputs = []
-                 for reg in layer:
-                     n_features = reg.n_features_in_
-                     preds = reg.predict(X_val_current[:, :n_features]).reshape(-1, 1)
-                     val_outputs.append(preds)
-                 val_output = np.hstack(val_outputs)
-                 X_val_current = np.hstack([X_val_current, val_output])
-
-                 y_pred = self.predict(X_val)
-                 rmse = np.sqrt(mean_squared_error(y_val, y_pred))
-                 print(f"Validation RMSE: {rmse:.4f}")
-
-                 if rmse < best_rmse:
-                     best_rmse = rmse
-                     no_improve_rounds = 0
-                 else:
-                     no_improve_rounds += 1
-                     if no_improve_rounds >= self.early_stopping_rounds:
-                         print("Early stopping triggered.")
-                         break
-
-     def predict(self, X):
-         X_current = X.copy()
-         for layer in self.layers:
-             layer_outputs = []
-             for reg in layer:
-                 n_features = reg.n_features_in_
-                 preds = reg.predict(X_current[:, :n_features]).reshape(-1, 1)
-                 layer_outputs.append(preds)
-             output = np.hstack(layer_outputs)
-             X_current = np.hstack([X_current, output])
-
-         final_outputs = []
-         for reg in self.layers[-1]:
-             n_features = reg.n_features_in_
-             final_outputs.append(reg.predict(X_current[:, :n_features]).reshape(-1, 1))
-         return np.mean(np.hstack(final_outputs), axis=1)
-
-
- from sklearn.datasets import load_diabetes
+ import pandas as pd
+ from sklearn.model_selection import ShuffleSplit
  from sklearn.model_selection import train_test_split
- from sklearn.metrics import mean_squared_error
-
- # Load dataset
- X, y = load_diabetes(return_X_y=True)
- X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
-
- # Hyperparameter grid
- param_grid = {
-     'max_depth': [3, 4],
-     'learning_rate': [0.1, 0.05],
-     'n_estimators': [50, 100]
- }
-
- # Create and fit the model with GPU enabled
- df_reg = XGBoostDeepForestRegressor(
-     n_estimators_per_layer=2,
-     max_layers=5,
-     early_stopping_rounds=2,
-     param_grid=param_grid,
-     use_gpu=True, # Enable GPU acceleration
-     gpu_id=0 # Default to the first GPU
- )
-
- df_reg.fit(X_train, y_train, X_val, y_val)
-
- # Final evaluation
- y_pred = df_reg.predict(X_val)
- rmse = np.sqrt(mean_squared_error(y_val, y_pred))
- print("Final RMSE:", rmse)
-
- # ----------------------------------------------------------------------------------------------------
-
- xgb_params = {
-     "objective": "reg:squarederror",
-     "random_state": 0,
-     'seed': 0,
-     'n_estimators': 100,
-     'max_depth': 6,
-     'min_child_weight': 4,
-     'subsample': 0.8,
-     'colsample_bytree': 0.8,
-     'gamma': 0,
-     'reg_alpha': 0,
-     'reg_lambda': 1,
-     'learning_rate': 0.05,
- }

- from xgboost import XGBRegressor
- regr = XGBRegressor(**xgb_params)
-
- regr.fit(X_train, y_train)
- y_pred = regr.predict(X_val)
-
-
- from scipy import stats
+ # randomly select sites
+ def random_select(ds, count, num, random_state = 0):
+     np.random.seed(random_state)
+     idxs = np.random.choice(np.delete(np.arange(len(ds)), count), num, replace = False)
+     return np.sort(idxs)
+
+ def split(Xs, ys, return_index = False, test_size = 0.33, random_state = 42):
+     if return_index:
+         sss = ShuffleSplit(n_splits=1, test_size = test_size, random_state = random_state)
+         sss.get_n_splits(Xs, ys)
+         train_index, test_index = next(sss.split(Xs, ys))
+         return (train_index, test_index)
+     else:
+         X_train, X_test, y_train, y_test = train_test_split(
+             Xs, ys,
+             test_size = test_size,
+             random_state = random_state
+         )
+         return (X_train, X_test, y_train, y_test)
+
+ def split_cut(Xs, ys, test_ratio = 0.33):
+     """
+     Split the timeseries into before and after halves
+     """
+     assert ys.ndim == 2, 'ys must be 2D!'
+     assert len(Xs) == len(ys), 'Xs and ys should be equally long!'
+     assert type(Xs) == type(ys), 'Xs and ys should be the same data type!'
+     if not type(Xs) in [pd.core.frame.DataFrame, np.ndarray]: raise Exception('Only accept numpy ndarray or pandas dataframe')
+     anchor = int(np.floor(len(ys) * (1 - test_ratio)))
+
+     if type(Xs) == pd.core.frame.DataFrame:
+         X_train = Xs.iloc[0: anchor, :]
+         X_test = Xs.iloc[anchor::, :]
+         y_train = ys.iloc[0: anchor, :]
+         y_test = ys.iloc[anchor::, :]
+     else:
+         X_train = Xs[0: anchor, :]
+         X_test = Xs[anchor::, :]
+         y_train = ys[0: anchor, :]
+         y_test = ys[anchor::, :]

- stats.linregress(y_val, y_pred)
+     assert len(X_train) + len(X_test) == len(Xs), 'The sum of train and test lengths must equal to Xs/ys!'

- '''
+     return (X_train, X_test, y_train, y_test)
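A minimal sketch of the split helpers now living in sciml.pipelines (synthetic shapes; note that split_cut expects 2D targets and preserves time order):

    import numpy as np
    from sciml.pipelines import split, split_cut

    Xs = np.random.rand(100, 5)   # synthetic features
    ys = np.random.rand(100, 1)   # synthetic 2D targets
    X_train, X_test, y_train, y_test = split(Xs, ys, test_size=0.33)  # random split
    X_tr, X_te, y_tr, y_te = split_cut(Xs, ys, test_ratio=0.33)       # chronological cut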
sciml/regress2.py ADDED
@@ -0,0 +1,217 @@
+ # Model type I and II regression, including RMA (reduced major axis regression)
+
+ """
+ Credit: UMaine MISC Lab; emmanuel.boss@maine.edu
+ http://misclab.umeoce.maine.edu/
+ https://github.com/OceanOptics
+ ------------------------------------------------------------------------------
+ MIT License
+
+ Copyright (c) [year] [fullname]
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ """
+
+ import statsmodels.api as sm
+ import numpy as np
+
+
+ def regress2(_x, _y, _method_type_1 = "ordinary least square",
+              _method_type_2 = "reduced major axis",
+              _weight_x = [], _weight_y = [], _need_intercept = True):
+     # Regression Type II based on statsmodels
+     # Type II regressions are recommended if there is variability on both x and y
+     # It computes the type I linear regression for (x,y) and (y,x)
+     # and then averages the relationship with one of the type II methods
+     #
+     # INPUT:
+     #   _x <np.array>
+     #   _y <np.array>
+     #   _method_type_1 <str> method to use for regression type I:
+     #     ordinary least square or OLS <default>
+     #     weighted least square or WLS
+     #     robust linear model or RLM
+     #   _method_type_2 <str> method to use for regression type II:
+     #     major axis
+     #     reduced major axis <default> (also known as geometric mean)
+     #     arithmetic mean
+     #   _need_intercept <bool>
+     #     True <default> add a constant to the relation (y = a x + b)
+     #     False force the relation through 0 (y = a x)
+     #   _weight_x <np.array> containing the weights of x
+     #   _weight_y <np.array> containing the weights of y
+     #
+     # OUTPUT:
+     #   slope
+     #   intercept
+     #   r
+     #   std_slope
+     #   std_intercept
+     #   predict
+     #
+     # REQUIRE:
+     #   numpy
+     #   statsmodels
+     #
+     # The code is based on the MATLAB function of MBARI.
+     # AUTHOR: Nils Haentjens
+     # REFERENCE: https://www.mbari.org/products/research-software/matlab-scripts-linear-regressions/
+
+     # Check input
+     if _method_type_2 != "reduced major axis" and _method_type_1 != "ordinary least square":
+         raise ValueError("'" + _method_type_2 + "' only supports '" + _method_type_1 + "' method as type 1.")
+
+     # Set x, y depending on intercept requirement
+     if _need_intercept:
+         x_intercept = sm.add_constant(_x)
+         y_intercept = sm.add_constant(_y)
+
+     # Compute Regression Type I (if type II requires it)
+     if (_method_type_2 == "reduced major axis" or
+             _method_type_2 == "geometric mean"):
+         if _method_type_1 == "OLS" or _method_type_1 == "ordinary least square":
+             if _need_intercept:
+                 [intercept_a, slope_a] = sm.OLS(_y, x_intercept).fit().params
+                 [intercept_b, slope_b] = sm.OLS(_x, y_intercept).fit().params
+             else:
+                 slope_a = sm.OLS(_y, _x).fit().params
+                 slope_b = sm.OLS(_x, _y).fit().params
+         elif _method_type_1 == "WLS" or _method_type_1 == "weighted least square":
+             if _need_intercept:
+                 [intercept_a, slope_a] = sm.WLS(
+                     _y, x_intercept, weights=1. / _weight_y).fit().params
+                 [intercept_b, slope_b] = sm.WLS(
+                     _x, y_intercept, weights=1. / _weight_x).fit().params
+             else:
+                 slope_a = sm.WLS(_y, _x, weights=1. / _weight_y).fit().params
+                 slope_b = sm.WLS(_x, _y, weights=1. / _weight_x).fit().params
+         elif _method_type_1 == "RLM" or _method_type_1 == "robust linear model":
+             if _need_intercept:
+                 [intercept_a, slope_a] = sm.RLM(_y, x_intercept).fit().params
+                 [intercept_b, slope_b] = sm.RLM(_x, y_intercept).fit().params
+             else:
+                 slope_a = sm.RLM(_y, _x).fit().params
+                 slope_b = sm.RLM(_x, _y).fit().params
+         else:
+             raise ValueError("Invalid literal for _method_type_1: " + _method_type_1)
+
+     # Compute Regression Type II
+     if (_method_type_2 == "reduced major axis" or
+             _method_type_2 == "geometric mean"):
+         # Transpose coefficients
+         if _need_intercept:
+             intercept_b = -intercept_b / slope_b
+         slope_b = 1 / slope_b
+         # Check if correlated in same direction
+         if np.sign(slope_a) != np.sign(slope_b):
+             raise RuntimeError('Type I regressions of opposite sign.')
+         # Compute Reduced Major Axis Slope
+         slope = np.sign(slope_a) * np.sqrt(slope_a * slope_b)
+         if _need_intercept:
+             # Compute Intercept (use mean for least square)
+             if _method_type_1 == "OLS" or _method_type_1 == "ordinary least square":
+                 intercept = np.mean(_y) - slope * np.mean(_x)
+             else:
+                 intercept = np.median(_y) - slope * np.median(_x)
+         else:
+             intercept = 0
+         # Compute r
+         r = np.sign(slope_a) * np.sqrt(slope_a / slope_b)
+         # Compute predicted values
+         predict = slope * _x + intercept
+         # Compute standard deviation of the slope and the intercept
+         n = len(_x)
+         diff = _y - predict
+         Sx2 = np.sum(np.multiply(_x, _x))
+         den = n * Sx2 - np.sum(_x) ** 2
+         s2 = np.sum(np.multiply(diff, diff)) / (n - 2)
+         std_slope = np.sqrt(n * s2 / den)
+         if _need_intercept:
+             std_intercept = np.sqrt(Sx2 * s2 / den)
+         else:
+             std_intercept = 0
+     elif (_method_type_2 == "Pearson's major axis" or
+           _method_type_2 == "major axis"):
+         if not _need_intercept:
+             raise ValueError("Invalid value for _need_intercept: " + str(_need_intercept))
+         xm = np.mean(_x)
+         ym = np.mean(_y)
+         xp = _x - xm
+         yp = _y - ym
+         sumx2 = np.sum(np.multiply(xp, xp))
+         sumy2 = np.sum(np.multiply(yp, yp))
+         sumxy = np.sum(np.multiply(xp, yp))
+         slope = ((sumy2 - sumx2 + np.sqrt((sumy2 - sumx2)**2 + 4 * sumxy**2)) /
+                  (2 * sumxy))
+         intercept = ym - slope * xm
+         # Compute r
+         r = sumxy / np.sqrt(sumx2 * sumy2)
+         # Compute standard deviation of the slope and the intercept
+         n = len(_x)
+         std_slope = (slope / r) * np.sqrt((1 - r ** 2) / n)
+         sigx = np.sqrt(sumx2 / (n - 1))
+         sigy = np.sqrt(sumy2 / (n - 1))
+         std_i1 = (sigy - sigx * slope) ** 2
+         std_i2 = (2 * sigx * sigy) + ((xm ** 2 * slope * (1 + r)) / r ** 2)
+         std_intercept = np.sqrt((std_i1 + ((1 - r) * slope * std_i2)) / n)
+         # Compute predicted values
+         predict = slope * _x + intercept
+     elif _method_type_2 == "arithmetic mean":
+         if not _need_intercept:
+             raise ValueError("Invalid value for _need_intercept: " + str(_need_intercept))
+         n = len(_x)
+         sg = int(np.floor(n / 2)) # cast to int so it can be used as a slice index
+         # Sort x and y in order of x
+         sorted_index = sorted(range(len(_x)), key=lambda i: _x[i])
+         x_w = np.array([_x[i] for i in sorted_index])
+         y_w = np.array([_y[i] for i in sorted_index])
+         x1 = x_w[1:sg + 1]
+         x2 = x_w[sg:n]
+         y1 = y_w[1:sg + 1]
+         y2 = y_w[sg:n]
+         x1m = np.mean(x1)
+         x2m = np.mean(x2)
+         y1m = np.mean(y1)
+         y2m = np.mean(y2)
+         xm = (x1m + x2m) / 2
+         ym = (y1m + y2m) / 2
+         slope = (x2m - x1m) / (y2m - y1m)
+         intercept = ym - xm * slope
+         # r (to verify)
+         r = []
+         # Compute predicted values
+         predict = slope * _x + intercept
+         # Compute standard deviation of the slope and the intercept
+         std_slope = []
+         std_intercept = []
+
+     # Return all that
+     return {"slope": float(slope), "intercept": intercept, "r": r,
+             "std_slope": std_slope, "std_intercept": std_intercept,
+             "predict": predict}
+
+
+ # if __name__ == '__main__':
+ #     x = np.linspace(0, 10, 100)
+ #     # Add random error on y
+ #     e = np.random.normal(size=len(x))
+ #     y = x + e
+ #     results = regress2(x, y, _method_type_2="reduced major axis",
+ #                        _need_intercept=False)
+ #     # print(results)
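A minimal sketch of calling regress2, adapted from the commented example above (synthetic data; the default type I method is ordinary least square):

    import numpy as np
    from sciml.regress2 import regress2

    x = np.linspace(0, 10, 100)
    y = x + np.random.normal(size=len(x))  # add random error on y
    results = regress2(x, y, _method_type_2="reduced major axis")
    print(results["slope"], results["intercept"])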
{sciml-0.0.8.dist-info → sciml-0.0.10.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sciml
- Version: 0.0.8
+ Version: 0.0.10
  Summary: draw and basic calculations/conversions
  Home-page: https://github.com/soonyenju/sciml
  Author: Songyan Zhu
sciml-0.0.10.dist-info/RECORD ADDED
@@ -0,0 +1,11 @@
+ sciml/__init__.py,sha256=BqRVu5DbfbnxksBXhe4gH_uulPdqTjSaSO1LvGkc37Q,79
+ sciml/ccc.py,sha256=AE1l46hvh18_Q9_BQufMjsGF9-JfsTw2hrT1CbgBHE8,1210
+ sciml/metrics.py,sha256=ICEeH6jwmpdx9jxwYSzB_YTvbyBq9AEUYqkZiVS1ZGs,3577
+ sciml/models.py,sha256=qc2LgdpSkq9kGMnLKZTnyuwzytCu6R8hyU5i6PaI7Qw,10345
+ sciml/pipelines.py,sha256=NGBwl5vA0Uq5GO-VtIow_k42K7HoVwxPQrkW-jINflY,8381
+ sciml/regress2.py,sha256=GSZ4IqmyF9u3PGOhHIKV0Rb_C2pI8eJ3jGJBa1IrEXM,8978
+ sciml-0.0.10.dist-info/LICENSE,sha256=dX4jBmkgQPWc_TfYkXtKQzVIgZQWFuHZ8vQjV4sEeV4,1060
+ sciml-0.0.10.dist-info/METADATA,sha256=iMcI6kpM6IX2oBhx9JwmI77JiX2bZPWI93dHta_jkCM,314
+ sciml-0.0.10.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+ sciml-0.0.10.dist-info/top_level.txt,sha256=dS_7aBCZFKQE3myPy5sh4USjQZCZyGg382-YxUUYcdw,6
+ sciml-0.0.10.dist-info/RECORD,,
sciml/utils.py DELETED
@@ -1,46 +0,0 @@
1
- import numpy as np
2
- import pandas as pd
3
- from sklearn.model_selection import ShuffleSplit
4
- from sklearn.model_selection import train_test_split
5
-
6
- # randomly select sites
7
- def random_select(ds, count, num, random_state = 0):
8
- np.random.seed(random_state)
9
- idxs = np.random.choice(np.delete(np.arange(len(ds)), count), num, replace = False)
10
- return np.sort(idxs)
11
-
12
- def split(Xs, ys, return_index = False, test_size = 0.33, random_state = 42):
13
- if return_index:
14
- sss = ShuffleSplit(n_splits=1, test_size = test_size, random_state = random_state)
15
- sss.get_n_splits(Xs, ys)
16
- train_index, test_index = next(sss.split(Xs, ys))
17
- return (train_index, test_index)
18
- else:
19
- X_train, X_test, y_train, y_test = train_test_split(
20
- Xs, ys,
21
- test_size = test_size,
22
- random_state = random_state
23
- )
24
- return (X_train, X_test, y_train, y_test)
25
-
26
- def split_cut(Xs, ys, test_ratio = 0.33):
27
- assert ys.ndim == 2, 'ys must be 2D!'
28
- assert len(Xs) == len(ys), 'Xs and ys should be equally long!'
29
- assert type(Xs) == type(ys), 'Xs and ys should be the same data type!'
30
- if not type(Xs) in [pd.core.frame.DataFrame, np.ndarray]: raise Exception('Only accept numpy ndarray or pandas dataframe')
31
- anchor = int(np.floor(len(ys) * (1 - test_ratio)))
32
-
33
- if type(Xs) == pd.core.frame.DataFrame:
34
- X_train = Xs.iloc[0: anchor, :]
35
- X_test = Xs.iloc[anchor::, :]
36
- y_train = ys.iloc[0: anchor, :]
37
- y_test = ys.iloc[anchor::, :]
38
- else:
39
- X_train = Xs[0: anchor, :]
40
- X_test = Xs[anchor::, :]
41
- y_train = ys[0: anchor, :]
42
- y_test = ys[anchor::, :]
43
-
44
- assert len(X_train) + len(X_test) == len(Xs), 'The sum of train and test lengths must equal to Xs/ys!'
45
-
46
- return (X_train, X_test, y_train, y_test)
@@ -1,9 +0,0 @@
1
- sciml/__init__.py,sha256=6iQAGgCEMuw4yoLBzZDax46a45LZgzEeNSHQMdmcBSQ,58
2
- sciml/models.py,sha256=p6cw3SxTQaOtFhJx8KdW0Z2QtxBlSBlVPHETTNCjJ2w,9880
3
- sciml/pipelines.py,sha256=CJolleJakoEQc-EV-v6NovP3bDb1hif7SvObXdaLXdY,15268
4
- sciml/utils.py,sha256=u5DzQJV4aCZ-p7sY56Fxzj8WDGYOgn1rOTeGzAw0vwY,1831
5
- sciml-0.0.8.dist-info/LICENSE,sha256=dX4jBmkgQPWc_TfYkXtKQzVIgZQWFuHZ8vQjV4sEeV4,1060
6
- sciml-0.0.8.dist-info/METADATA,sha256=uMCtigVwS2e0abqbvfbLZca6iZnkdDTBXtbjdg34yIA,313
7
- sciml-0.0.8.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
8
- sciml-0.0.8.dist-info/top_level.txt,sha256=dS_7aBCZFKQE3myPy5sh4USjQZCZyGg382-YxUUYcdw,6
9
- sciml-0.0.8.dist-info/RECORD,,
File without changes