PySAR 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pySAR/model.py ADDED
@@ -0,0 +1,559 @@
1
+ ################################################################################
2
+ ################# Model #################
3
+ ################################################################################
4
+
5
+ from sklearn.neighbors import KNeighborsRegressor
6
+ from sklearn.svm import SVR
7
+ from sklearn.linear_model import Lasso, LinearRegression, Ridge, SGDRegressor, ElasticNet
8
+ from sklearn.tree import DecisionTreeRegressor
9
+ from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
10
+ from sklearn.gaussian_process import GaussianProcessRegressor
11
+ from sklearn.cross_decomposition import PLSRegression
12
+ from sklearn.preprocessing import StandardScaler
13
+ from sklearn.model_selection import GridSearchCV, train_test_split
14
+ from sklearn.metrics import get_scorer_names
15
+ from sklearn.feature_selection import SelectKBest, f_regression, VarianceThreshold, RFE, SelectFromModel, SequentialFeatureSelector
16
+ from difflib import get_close_matches
17
+ from copy import deepcopy
18
+ import os
19
+ import pickle
20
+ import pandas as pd
21
+ import numpy as np
22
+ # np.seterr is intentionally NOT set globally; divide/invalid warnings are suppressed
23
+ # locally via np.errstate() at the call sites where they are expected.
24
+
25
+ from .evaluate import Evaluate
26
+
27
class Model():
    """
    Class for building, fitting and training a various range of predictive
    regression models and all their related methods and attributes. The
    model class supports the following regression algorithms: PLS Regression,
    Random Forest, AdaBoost, Bagging, Decision Tree, GradientBoost, Linear
    Regression, Lasso, Ridge, ElasticNet, Support Vector Regression, Stochastic
    Gradient Descent, K Nearest Neighbours (KNN), Extra Trees, Histogram-based
    Gradient Boosting and Gaussian Process Regression.

    Once a model object has been built and fitted to the training data and
    labels, it can then be used for predicting the sought activity/fitness
    value for unseen test sequences.

    Parameters
    ==========
    :X: np.ndarray
        training data.
    :Y: np.ndarray
        training data labels.
    :algorithm: str
        sklearn regression algorithm to build and fit model with. Value can be
        an approximate representation of model name, for example: 'plsreg' will
        initialise an instance of the PLSRegression model etc. The accepted
        names are the keys of MODEL_CONSTRUCTORS.
    :parameters: dict (default=None)
        parameters to use for specific sklearn model when building regression
        model. None, [] or "" are all treated as {}, meaning all of the models'
        default parameters are used. Keys that are not parameters of the chosen
        model are ignored. Refer to sci-kit learn for full list of available
        input parameters for each model: https://scikit-learn.org/stable/index.html.
    :test_split: float (default=0.2)
        proportion of the test data to use for building model, default of 0.2 is
        recommended, meaning 80% of the data used for training and 20% for testing.

    Methods
    =======
    get_model():
        build model using inputted parameters.
    train_test_split(test_split=0.2, scale=True, random_state=None, shuffle=True):
        get train-test split of dataset.
    fit():
        fit model.
    predict():
        predict activity values using trained model and test data.
    save(save_folder, model_name="model.pkl"):
        save model to save_folder.
    model_fitted():
        return if model has been fitted (true or false)
    hyperparameter_tuning(param_grid=None, metric='r2', cv=5, n_jobs=None, verbose=2):
        complete hyperparameter tuning of model and its associated parameters.
    feature_selection(method=""):
        undertake feature selection using technique specified by method input
        parameter to find optimal selection of features for maximum predictability
        in model. Supported feature selection methods include SelectKBest, chi2,
        VarianceThreshold, RFE, SelectFromModel and SequentialFeatureSelector.
    """
    #registry of accepted algorithm names/aliases -> sklearn estimator class;
    #this is the single source of truth for which algorithm names are valid
    MODEL_CONSTRUCTORS = {
        'plsregression': PLSRegression,
        'randomforestregressor': RandomForestRegressor,
        'adaboostregressor': AdaBoostRegressor,
        'baggingregressor': BaggingRegressor,
        'decisiontreeregressor': DecisionTreeRegressor,
        'linearregression': LinearRegression,
        'lasso': Lasso,
        'ridge': Ridge,
        'sgd': SGDRegressor,
        'stochasticgradientdescent': SGDRegressor,
        'gbr': GradientBoostingRegressor,
        'gradientboost': GradientBoostingRegressor,
        'gradientboostingregressor': GradientBoostingRegressor,
        'svr': SVR,
        'supportvectorregression': SVR,
        'knn': KNeighborsRegressor,
        'kneighborsregressor': KNeighborsRegressor,
        'knearestneighbors': KNeighborsRegressor,
        'elasticnet': ElasticNet,
        'extratreesregressor': ExtraTreesRegressor,
        'extratrees': ExtraTreesRegressor,
        'histgradientboostingregressor': HistGradientBoostingRegressor,
        'histgradientboosting': HistGradientBoostingRegressor,
        'hgbr': HistGradientBoostingRegressor,
        'gaussianprocessregressor': GaussianProcessRegressor,
        'gaussianprocess': GaussianProcessRegressor,
        'gpr': GaussianProcessRegressor,
    }

    def __init__(self, X, Y, algorithm, parameters=None, test_split=0.2):

        self.algorithm = algorithm
        self.test_split = test_split
        self.X = X
        self.Y = Y

        #if no model parameters input, then set to {} meaning default models' parameters are used
        if parameters is None or parameters == [] or parameters == "":
            self.parameters = {}
        else:
            self.parameters = parameters

        #valid model names are taken from the constructor registry so the two can never
        #drift apart; a separately maintained list previously omitted 'gradientboost',
        #which made that name fuzzy-match to 'histgradientboosting' instead
        self.valid_models = list(self.MODEL_CONSTRUCTORS)

        #raise error if algorithm parameter isnt string type
        if not isinstance(self.algorithm, str):
            raise TypeError(f"Algorithm input parameter must be a string, got type {type(self.algorithm)}.")

        #get closest match of valid model from the input algorithm parameter value using difflib
        model_matches = get_close_matches(self.algorithm.lower().strip(),
            self.valid_models, cutoff=0.5)

        #if no valid model is close enough to the input then raise error
        if not model_matches:
            raise ValueError(f'Input algorithm {self.algorithm} not found in list of available valid models\n{self.valid_models}.')

        #best match is returned first
        self.algorithm = model_matches[0]

        #create instance of algorithm object using its sklearn constructor
        self.model = self.get_model()

        #set model_fit to None, specifies if model has been fit or not
        self.model_fit = None

    def get_model(self):
        """
        Create instance of model type specified by the 'algorithm' attribute. If
        'parameters' = {} then default parameters of sklearn model are used, else set
        the parameters of the model to the values specified in 'parameters'. Any
        key in 'parameters' that is not a parameter of the chosen model is ignored.

        Parameters
        ==========
        None

        Returns
        =======
        :model: sklearn.model
            instantiated regression model with default or user-specified parameters.

        Raises
        ======
        ValueError: if the algorithm attribute is not a key of MODEL_CONSTRUCTORS.
        """
        constructor = self.MODEL_CONSTRUCTORS.get(self.algorithm.lower().strip())
        if constructor is None:
            raise ValueError('Input Algorithm {} not found in available valid models:\n{}'.
                format(self.algorithm, self.valid_models))

        #a default instance is built only to discover which parameter names the model accepts
        valid_parameter_names = set(constructor().get_params())
        parameters = {
            key: value for key, value in self.parameters.items()
            if key in valid_parameter_names
        }

        #an empty dict unpacks to no arguments, i.e. the models' defaults
        return constructor(**parameters)

    def train_test_split(self, test_split=0.2, scale=True, random_state=None, shuffle=True):
        """
        Split the X and Y input features and labels into random train and test
        subsets. By default a 80:20 split will be used, whereby 80% of the data
        will be used for training and 20% for testing. By default the features
        are standardised: the scaler is fitted on the training subset only and
        then applied to both subsets. By default data is shuffled before the
        split and random state is None.

        Parameters
        ==========
        :test_split: float (default=0.2)
            proportion of the total dataset to use for testing, rest used for training.
            Values outside of the open interval (0, 1) are replaced by 0.2.
        :scale: bool (default=True)
            if true then scale the features such that they are standardised.
        :random_state : int (default=None)
            Controls the shuffling applied to the data before applying the split.
            Popular integer random seeds are 0 and 42, None by default.
        :shuffle: bool (default=True)
            Whether or not to shuffle the data before splitting.

        Returns
        =======
        :self.X_train, self.X_test, self.Y_train, self.Y_test: np.ndarray
            splitted training and test data features and labels.

        Raises
        ======
        ValueError: if X and Y are not of the same length.
        """
        #validate that X and Y arrays are of the same size
        if len(self.X) != len(self.Y):
            raise ValueError('X and Y input parameters must be of the same length - X: {}, Y: {}.'.
                format(len(self.X), len(self.Y)))

        #work on local array copies/views so the X and Y attributes are not mutated
        X_values = self.X.values if isinstance(self.X, (pd.DataFrame, pd.Series)) else self.X
        Y_values = self.Y.values if isinstance(self.Y, (pd.DataFrame, pd.Series)) else self.Y

        X_values = np.asarray(X_values)
        Y_values = np.asarray(Y_values)

        #reshape anything that is not already 2D into a single column
        if X_values.ndim != 2:
            X_values = np.reshape(X_values, (-1, 1))
        if Y_values.ndim != 2:
            Y_values = np.reshape(Y_values, (-1, 1))

        #if invalid test size input then set to default 0.2
        if test_split <= 0 or test_split >= 1:
            test_split = 0.2

        #setting test_split attribute
        self.test_split = test_split

        #split X and Y into training and test data
        X_train, X_test, Y_train, Y_test = train_test_split(X_values, Y_values,
            test_size=test_split, random_state=random_state, shuffle=shuffle)

        #scale training data X after splitting to avoid test-set leakage
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

        #set X and Y attributes, labels are flattened to 1D
        self.X_train = X_train
        self.X_test = X_test
        self.Y_train = np.reshape(Y_train, (len(Y_train),))
        self.Y_test = np.reshape(Y_test, (len(Y_test),))

        return self.X_train, self.X_test, self.Y_train, self.Y_test

    def fit(self):
        """
        Fit model to training data and labels.

        Parameters
        ==========
        None

        Returns
        =======
        :self.model_fit: sklearn.model
            fitted sklearn model of type specified by algorithm attribute.

        Raises
        ======
        AttributeError: if train_test_split() has not been called yet.
        """
        #same exception type as the bare attribute lookup raised, with an actionable message
        if not (hasattr(self, 'X_train') and hasattr(self, 'Y_train')):
            raise AttributeError('train_test_split() must be called before fit().')

        self.model_fit = self.model.fit(self.X_train, self.Y_train)
        return self.model_fit

    def predict(self):
        """
        Predict the target values of unseen test data using the
        trained model.

        Parameters
        ==========
        None

        Returns
        =======
        :self.model_fit.predict(self.X_test): np.ndarray
            array of predicted target values for unseen test data.

        Raises
        ======
        AttributeError: if the model has not been fitted yet.
        """
        #same exception type as calling .predict on None raised, with an actionable message
        if self.model_fit is None:
            raise AttributeError('fit() must be called before predict().')

        return self.model_fit.predict(self.X_test)

    def save(self, save_folder, model_name="model.pkl"):
        """
        Save model (self.model) to specified save_folder in pickle format.

        Parameters
        ==========
        :save_folder: str
            folder to save model to.
        :model_name: str
            filename for model, ".pkl" is appended if it has a different or
            no extension.

        Returns
        =======
        None

        Security
        ========
        Models are serialized using pickle. Never load pickle files from untrusted
        sources; deserialization of malicious data can execute arbitrary code.
        """
        #append pickle file extension if not present in filename
        if os.path.splitext(model_name)[1].lower() != ".pkl":
            model_name = model_name + ".pkl"

        #set save path to folder + filename
        save_path = os.path.join(save_folder, model_name)

        #save model in pickle format; pickling failures are reported rather than raised
        try:
            with open(save_path, 'wb') as file:
                pickle.dump(self.model, file)
        except pickle.PickleError:
            print(f"Error pickling model with path: {save_path}.")

    def hyperparameter_tuning(self, param_grid=None, metric='r2', cv=5, n_jobs=None, verbose=2):
        """
        Hyperparameter tuning of model to find its optimal arrangement of parameters
        using a Grid Search. The results are printed and the fitted grid search
        object is stored in the grid_result attribute.

        Parameters
        ==========
        :param_grid: dict (default=None)
            dictionary/grid of selected models' parameters and the potential values of each
            that you want to tune. Keys that are not parameters of the model are ignored.
        :metric: str (default=r2)
            scoring metric used to evaluate the performance of the cross-validated
            model on the test set, R2 by default. List of available scoring metrics
            can be found in documentation:
            https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
        :cv: int (default=5)
            Determines the cross-validation splitting strategy, a CV fold of 5 is used by
            default and whenever the input is not an int between 5 and 10.
        :n_jobs : int (default=None)
            Number of jobs to run in parallel. None means 1 job.
        :verbose: int (default=2)
            verbosity of output during tuning process. The values and what they mean
            for this parameter can be found on the documentation:
            https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

        Returns
        =======
        None

        Raises
        ======
        RuntimeError: if train_test_split() has not been called yet.
        TypeError: if param_grid is not a dict.
        ValueError: if metric is not an available sklearn scoring metric.
        """
        #raise error if train_test_split() hasn't been called yet
        if not hasattr(self, 'X_train') or self.X_train is None:
            raise RuntimeError(
                'train_test_split() must be called before hyperparameter_tuning().'
            )

        #default to empty dict if not provided
        if param_grid is None:
            param_grid = {}

        #input 'param_grid' parameter must be a dict, if not raise error
        if not isinstance(param_grid, dict):
            raise TypeError(f'param_grid argument must be of type dict, got type {type(param_grid)}.')

        #input metric must be in available scoring metrics, if not raise error
        valid_scorers = sorted(get_scorer_names())
        if metric not in valid_scorers:
            raise ValueError(
                f"Invalid scoring metric {metric} not in list of available Sklearn Scoring Metrics:\n{valid_scorers}."
            )

        #cv must be of type int and be between 5 and 10, if not then default of 5 is used
        if not isinstance(cv, int) or cv < 5 or cv > 10:
            cv = 5

        #copy to avoid mutating caller's dict; filter out parameter names invalid for this model
        model_parameter_names = self.model.get_params()
        param_grid = {p: v for p, v in param_grid.items() if p in model_parameter_names}

        #create deep copy of model so the grid search never touches self.model
        model_copy = deepcopy(self.model)

        #grid search of hyperparameter space for model
        grid_search = GridSearchCV(estimator=model_copy, param_grid=param_grid,
            cv=cv, scoring=metric, n_jobs=n_jobs, verbose=verbose, error_score=0)

        #fit X and Y to best model found in grid search
        grid_result = grid_search.fit(self.X_train, self.Y_train)

        #predict values of unseen test data using best found model
        best_model_pred = grid_result.predict(self.X_test)

        #create instance of Evaluate class and calculate metrics from best model
        evaluation = Evaluate(self.Y_test, best_model_pred)

        #best_score_ is measured with the requested metric, so label it accordingly;
        #the default metric keeps its original 'R2' label
        score_label = 'R2' if metric == 'r2' else metric

        #print out results of grid search
        print('\n#############################################################')
        print('################### Hyperparameter Results ###################')
        print('#############################################################\n')

        print('######################### Parameters ########################\n')
        print(f'# Best Params: {grid_result.best_params_}')
        print(f'# Model Type: {repr(self)}')
        print(f'# Scoring Metric: {metric}')
        print(f'# Number of CV folds: {cv}')
        print(f'# Test Split: {self.test_split}\n')

        print('######################### Metrics ###########################\n')
        print(f'# Best Score ({score_label}): {grid_result.best_score_}')
        print(f'# RMSE: {evaluation.rmse} ')
        print(f'# MSE: {evaluation.mse} ')
        print(f'# MAE: {evaluation.mae}')
        print(f'# RPD: {evaluation.rpd}')
        print(f'# Explained Variance: {evaluation.explained_var}\n')
        print('##############################################################')

        self.grid_result = grid_result

    def model_fitted(self):
        """
        Return if model has been fitted, true or false.

        Parameters
        ==========
        None

        Returns
        =======
        :True/False: bool
            true if model (self.model) has been fitted, false if not.
        """
        return self.model_fit is not None

    def feature_selection(self, method=""):
        """
        Feature selection/dimensionality reduction on dataset and models.
        Return the best applicable features found using the technique selected
        from method input parameter. The selection is run on the full X and Y
        attributes. If no supported method is close enough to the input then
        SelectKBest is used. The 'chi2' option selects the 2 best features
        using f_regression scoring.

        Parameters
        ==========
        :method: str (default="")
            feature selection method to use.

        Returns
        =======
        :X_new: np.ndarray
            best found features.

        References
        ==========
        [1] https://scikit-learn.org/stable/modules/feature_selection.html
        """
        #list of available sklearn feature selection techniques
        valid_feature_selection = ["selectkbest", "chi2", "variancethreshold", "rfe",
            "selectfrommodel", "sequentialfeatureselector"]

        #get closest valid feature selection method
        feature_matches = get_close_matches(method.lower().strip(),
            valid_feature_selection, cutoff=0.6)
        selected_method = feature_matches[0] if feature_matches else "selectkbest"

        #build the selector for the chosen method; selected_method is always one of the
        #valid names above, so the final branch is the 'chi2' option
        if selected_method == "selectkbest":
            selector = SelectKBest(f_regression, k=1)
        elif selected_method == "variancethreshold":
            selector = VarianceThreshold(1)
        elif selected_method == "rfe":
            selector = RFE(self.model, n_features_to_select=5, step=1)
        elif selected_method == "sequentialfeatureselector":
            selector = SequentialFeatureSelector(self.model, n_features_to_select=3)
        elif selected_method == "selectfrommodel":
            selector = SelectFromModel(estimator=deepcopy(self.model))
        else:
            selector = SelectKBest(f_regression, k=2)

        return selector.fit_transform(self.X, self.Y)

    ###################### Getters & Setters ######################

    @property
    def X(self):
        return self._X

    @X.setter
    def X(self, val):
        self._X = val

    @property
    def Y(self):
        return self._Y

    @Y.setter
    def Y(self, val):
        self._Y = val

    @property
    def model(self):
        return self._model

    @model.setter
    def model(self, val):
        self._model = val

    @property
    def test_split(self):
        return self._test_split

    @test_split.setter
    def test_split(self, val):
        self._test_split = val

    @property
    def valid_models(self):
        return self._valid_models

    @valid_models.setter
    def valid_models(self, val):
        self._valid_models = val

    @property
    def parameters(self):
        return self._parameters

    @parameters.setter
    def parameters(self, val):
        self._parameters = val

    @property
    def algorithm(self):
        return self._algorithm

    @algorithm.setter
    def algorithm(self, val):
        self._algorithm = val

    @property
    def model_fit(self):
        return self._model_fit

    @model_fit.setter
    def model_fit(self, val):
        self._model_fit = val

    def __str__(self):
        return (
            f"Model of type {type(self.model).__name__} using parameters {self.parameters}, "
            f"model has been fitted = {self.model_fitted()}."
        )

    def __repr__(self):
        """ Object representation of class instance. """
        return type(self.model).__name__

    def __eq__(self, other):
        """ Checking if 2 sklearn models are the same. """
        #let Python fall back to its default handling for unrelated types
        #instead of failing on a missing 'model' attribute
        if not isinstance(other, Model):
            return NotImplemented
        return self.model == other.model

    def __sizeof__(self):
        """ Get size of sklearn model. """
        return self.model.__sizeof__()
pySAR/plots.py ADDED
@@ -0,0 +1,92 @@
1
+ ################################################################################
2
+ ################# Plots #################
3
+ ################################################################################
4
+
5
+ import matplotlib.pyplot as plt
6
+ import seaborn as sns
7
+ import numpy as np
8
+ from pathlib import Path
9
+
10
+ from .globals_ import OUTPUT_FOLDER, CURRENT_DATETIME
11
+
12
def plot_reg(Y_true, Y_pred, r2, output_folder="", show_plot=False, filename="model_regression_plot.png"):
    """
    Draw and save a regression plot of the predicted activity values (x-axis)
    against the observed activity values (y-axis), annotated with the R2 score.

    Parameters
    ==========
    :Y_true: np.ndarray
        array of observed values.
    :Y_pred: np.ndarray
        array of predicted values.
    :r2: float
        r2 score value.
    :output_folder: str (default="")
        output folder to store regression plot to, the current datetime is appended
        to the folder name. If empty or None the OUTPUT_FOLDER global var is used.
    :show_plot: bool (default=False)
        whether to display plot or not when function is run, if False the plot is just
        saved to output folder.
    :filename: str (default="model_regression_plot.png")
        output filename for saved plot image, ".png" is appended if it has no extension.

    Returns
    =======
    :save_path: str
        full output path of saved regression plot.

    Raises
    ======
    TypeError: if Y_true/Y_pred cannot be converted to float arrays or r2 is not numeric.
    ValueError: if the arrays are empty, differ in length or hold non-finite values,
        if r2 is not finite, or if filename is not a non-empty string.
    """
    # Flatten both inputs into 1D float vectors, rejecting anything non-numeric.
    try:
        observed = np.asarray(Y_true, dtype=float).ravel()
        predicted = np.asarray(Y_pred, dtype=float).ravel()
    except (TypeError, ValueError):
        raise TypeError("Y_true and Y_pred must be numeric array-like inputs.")

    n_observed, n_predicted = observed.shape[0], predicted.shape[0]
    if 0 in (observed.size, predicted.size):
        raise ValueError("Y_true and Y_pred must be non-empty arrays.")
    if n_observed != n_predicted:
        raise ValueError(f"Y_true and Y_pred must have same length, got {n_observed} and {n_predicted}.")
    if not np.isfinite(observed).all() or not np.isfinite(predicted).all():
        raise ValueError("Y_true and Y_pred must contain only finite numeric values.")

    # The score must be a finite number since it is formatted into the annotation.
    try:
        score = float(r2)
    except (TypeError, ValueError):
        raise TypeError(f"r2 must be a numeric value, got {type(r2)}.")
    if not np.isfinite(score):
        raise ValueError(f"r2 must be a finite numeric value, got {score}.")

    # A usable filename is required; fall back to a png extension when none is given.
    if not (isinstance(filename, str) and filename.strip() != ""):
        raise ValueError("filename must be a non-empty string.")
    if not Path(filename).suffix:
        filename = f"{filename}.png"

    # Pick the destination directory and create it if needed.
    use_default_folder = output_folder in ("", None)
    destination = Path(OUTPUT_FOLDER) if use_default_folder else Path(f"{output_folder}_{CURRENT_DATETIME}")
    destination.mkdir(parents=True, exist_ok=True)
    save_path = destination / filename

    bold = dict(weight="bold")
    fig, ax = plt.subplots(figsize=(8, 8))
    try:
        # Predicted values go on x and observed on y, matching the axis labels below.
        sns.regplot(x=predicted, y=observed, marker="+", truncate=False, fit_reg=True, ax=ax)
        ax.text(0.15, 0.92, f"R2: {score:.3f}", ha="left", va="top", fontsize=15, color="green",
            fontweight="bold", transform=ax.transAxes)
        ax.set_xlabel("Predicted Value", fontdict=bold, fontsize=12)
        ax.set_ylabel("Observed Value", fontdict=bold, fontsize=12)
        ax.set_title("Observed vs Predicted values for protein activity", fontdict=bold, fontsize=15)

        fig.savefig(save_path, dpi=300, bbox_inches="tight")

        # Optionally display the figure briefly without blocking the caller.
        if show_plot:
            plt.show(block=False)
            plt.pause(3)

        return str(save_path)
    finally:
        # Always release the figure, whether or not plotting succeeded.
        plt.close(fig)
pySAR/py.typed ADDED
File without changes