PyPI - PySAR - Versions diffs - 2.5.0__py3-none-any.whl - Mend

PySAR 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

docs/conf.py +53 -0
pySAR/__init__.py +28 -0
pySAR/descriptors.py +2893 -0
pySAR/encoding.py +986 -0
pySAR/evaluate.py +231 -0
pySAR/globals_.py +21 -0
pySAR/model.py +559 -0
pySAR/plots.py +92 -0
pySAR/py.typed +0 -0
pySAR/pyDSP.py +582 -0
pySAR/pySAR.py +962 -0
pySAR/utils.py +283 -0
pysar-2.5.0.dist-info/METADATA +740 -0
pysar-2.5.0.dist-info/RECORD +17 -0
pysar-2.5.0.dist-info/WHEEL +5 -0
pysar-2.5.0.dist-info/licenses/LICENSE +21 -0
pysar-2.5.0.dist-info/top_level.txt +2 -0

pySAR/model.py ADDED Viewed

@@ -0,0 +1,559 @@
+################################################################################
+#################                    Model                     #################
+################################################################################
+from sklearn.neighbors import KNeighborsRegressor
+from sklearn.svm import SVR
+from sklearn.linear_model import Lasso, LinearRegression, Ridge, SGDRegressor, ElasticNet
+from sklearn.tree import DecisionTreeRegressor
+from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.cross_decomposition import PLSRegression
+from sklearn.preprocessing import StandardScaler
+from sklearn.model_selection import GridSearchCV, train_test_split
+from sklearn.metrics import get_scorer_names
+from sklearn.feature_selection import SelectKBest, f_regression, VarianceThreshold, RFE, SelectFromModel, SequentialFeatureSelector
+from difflib import get_close_matches
+from copy import deepcopy
+import os
+import pickle
+import pandas as pd
+import numpy as np
+# np.seterr is intentionally NOT set globally; divide/invalid warnings are suppressed
+# locally via np.errstate() at the call sites where they are expected.
+from .evaluate import Evaluate
+class Model():
+    """
+    Class for building, fitting and training a various range of predictive
+    regression models and all their related methods and attributes. The
+    model class supports the following regression algorithms: PLS Regression,
+    Random Forest, AdaBoost, Bagging, Decision Tree, GradientBoost, Linear
+    Regression, Lasso, Ridge, ElasticNet, Support Vector Regression, Stochastic
+    Gradient Descent, K Nearest Neighbours (KNN), Extra Trees, Histogram-based
+    Gradient Boosting and Gaussian Process Regression.
+    Once a model object has been built and fitted to the training data and
+    labels, it can then be used for predicting the sought activity/fitness
+    value for unseen test sequences.
+    Parameters
+    ==========
+    :X: np.ndarray
+        training data.
+    :Y: np.ndarray
+        training data labels.
+    :algorithm: str
+        sklearn regression algorithm to build and fit model with. Value can be
+        an approximate representation of model name, for example: 'plsreg' will
+        initialise an instance of the PLSRegression model etc. Available
+        algorithms listed above.
+    :parameters: dict (default={})
+        parameters to use for specific sklearn model when building regression
+        model, by default it is set to {}, meaning all of the models' default
+        parameters are used. Refer to sci-kit learn for full list of available
+        input parameters for each model: https://scikit-learn.org/stable/index.html.
+    :test_split: float (default=0.2)
+        proportion of the test data to use for building model, default of 0.2 is
+        recommended, meaning 80% of the data used for training and 20% for testing.
+    Methods
+    =======
+    get_model():
+        build model using inputted parameters.
+    train_test_split(test_split=0.2, scale=True, random_state=None, shuffle=True):
+        get train-test split of dataset.
+    fit():
+        fit model.
+    predict():
+        predict activity values using trained model and test data.
+    save(save_folder):
+        save fitted model to save_folder.
+    model_fitted():
+        return if model has been fitted (true or false)
+    hyperparameter_tuning(param_grid=None, metric='r2', cv=5, n_jobs=None, verbose=2):
+        complete hyperparameter tuning of model and its associated parameters.
+    feature_selection(method=""):
+        undertake feature selection using technique specified by method input
+        parameter to find optimal selection of features for maximum predictability
+        in model. Supported feature selection methods include SelectKBest, chi2,
+        VarianceThreshold, RFE, SelectFromModel and SequentialFeatureSelector.
+    """
+    MODEL_CONSTRUCTORS = {
+        'plsregression': PLSRegression,
+        'randomforestregressor': RandomForestRegressor,
+        'adaboostregressor': AdaBoostRegressor,
+        'baggingregressor': BaggingRegressor,
+        'decisiontreeregressor': DecisionTreeRegressor,
+        'linearregression': LinearRegression,
+        'lasso': Lasso,
+        'ridge': Ridge,
+        'sgd': SGDRegressor,
+        'stochasticgradientdescent': SGDRegressor,
+        'gbr': GradientBoostingRegressor,
+        'gradientboost': GradientBoostingRegressor,
+        'gradientboostingregressor': GradientBoostingRegressor,
+        'svr': SVR,
+        'supportvectorregression': SVR,
+        'knn': KNeighborsRegressor,
+        'kneighborsregressor': KNeighborsRegressor,
+        'knearestneighbors': KNeighborsRegressor,
+        'elasticnet': ElasticNet,
+        'extratreesregressor': ExtraTreesRegressor,
+        'extratrees': ExtraTreesRegressor,
+        'histgradientboostingregressor': HistGradientBoostingRegressor,
+        'histgradientboosting': HistGradientBoostingRegressor,
+        'hgbr': HistGradientBoostingRegressor,
+        'gaussianprocessregressor': GaussianProcessRegressor,
+        'gaussianprocess': GaussianProcessRegressor,
+        'gpr': GaussianProcessRegressor,
+    }
+    def __init__(self, X, Y, algorithm, parameters=None, test_split=0.2):
+        self.algorithm = algorithm
+        self.test_split = test_split
+        self.X = X
+        self.Y = Y
+        #if no model parameters input, then set to {} meaning default models' parameters are used
+        if parameters is None or parameters == [] or parameters == "":
+            self.parameters = {}
+        else:
+            self.parameters = parameters
+        #list of valid models available to use for this class
+        self.valid_models = ['plsregression', 'randomforestregressor', 'adaboostregressor',\
+                            'baggingregressor', 'decisiontreeregressor', 'gbr',
+                            'gradientboostingregressor', 'linearregression', 'lasso', 'ridge',
+                            'svr', 'supportvectorregression', 'sgd', 'stochasticgradientdescent',
+                            'kneighborsregressor', 'knearestneighbors', 'knn', 'elasticnet',
+                            'extratreesregressor', 'extratrees', 'histgradientboostingregressor',
+                            'histgradientboosting', 'hgbr', 'gaussianprocessregressor',
+                            'gaussianprocess', 'gpr']
+        #raise error if algorithm parameter isnt string type
+        if not(isinstance(self.algorithm, str)):
+            raise TypeError(f"Algorithm input parameter must be a string, got type {type(self.algorithm)}.")
+        #get closest match of valid model from the input algorithm parameter value using difflib
+        model_matches = get_close_matches(self.algorithm.lower().strip(),[item.lower().strip() \
+            for item in self.valid_models], cutoff=0.5)
+        #if algorithm is a valid model then set it to self.algorithm, else raise error
+        if (model_matches!=[]):
+            self.algorithm = model_matches[0]
+        else:
+            raise ValueError(f'Input algorithm {self.algorithm} not found in list of available valid models\n{self.valid_models}.')
+        #create instance of algorithm object using its sklearn constructor
+        self.model = self.get_model()
+        #set model_fit to None, specifies if model has been fit or not
+        self.model_fit = None
+    def get_model(self):
+        """
+        Create instance of model type specified by input 'algorithm' argument. If
+        input 'parameters' = {} then default parameters of sklearn model are used, else set
+        the parameters of the model to the values specified in the 'parameters' input.
+        Parameters
+        ==========
+        None
+        Returns
+        =======
+        :model: sklearn.model
+            instantiated regression model with default or user-specified parameters.
+        """
+        constructor = self.MODEL_CONSTRUCTORS.get(self.algorithm.lower().strip())
+        if constructor is None:
+            raise ValueError('Input Algorithm {} not found in available valid models:\n{}'.
+                format(self.algorithm, self.valid_models))
+        valid_parameter_names = set(constructor().get_params().keys())
+        parameters = {
+            key: value for key, value in self.parameters.items()
+            if key in valid_parameter_names
+        }
+        return constructor(**parameters) if parameters else constructor()
+    def train_test_split(self, test_split=0.2, scale=True, random_state=None, shuffle=True):
+        """
+        Split the X and Y input features and labels into random train and test
+        subsets. By default a 80:20 split will be used, whereby 80% of the data
+        will be used for training and 20% for testing. By default the input will
+        be scaled first such that the mean is removed and features scaled to unit
+        variance. By default data is shuffled before the split and random state is None.
+        Parameters
+        ==========
+        :scale: bool (default=True)
+            if true then scale the features such that they are standardised.
+        :test_split: float (default=0.2)
+            proportion of the total dataset to use for testing, rest used for training.
+        :random_state : float (default=None)
+            Controls the shuffling applied to the data before applying the split.
+            Popular integer random seeds are 0 and 42, None by default.
+        :shuffle: bool (default=True)
+            Whether or not to shuffle the data before splitting.
+        Returns
+        =======
+        :self.X_train, self.X_test, self.Y_train, self.Y_test: np.ndarray
+            splitted training and test data features and labels.
+        """
+        #validate that X and Y arrays are of the same size
+        if (len(self.X) != len(self.Y)):
+            raise ValueError('X and Y input parameters must be of the same length - X: {}, Y: {}.'.
+                format(len(self.X), len(self.Y)))
+        #reshape input arrays to 2D arrays without mutating the original attributes
+        X_values = self.X.values if isinstance(self.X, (pd.DataFrame, pd.Series)) else self.X
+        Y_values = self.Y.values if isinstance(self.Y, (pd.DataFrame, pd.Series)) else self.Y
+        X_values = np.asarray(X_values)
+        Y_values = np.asarray(Y_values)
+        if (X_values.ndim != 2):
+            X_values = np.reshape(X_values, (-1,1))
+        if (Y_values.ndim != 2):
+            Y_values = np.reshape(Y_values, (-1,1))
+        #if invalid test size input then set to default 0.2
+        if (test_split <= 0 or test_split >=1):
+            test_split = 0.2
+        #setting test_split attribute
+        self.test_split = test_split
+        #split X and Y into training and test data
+        X_train, X_test, Y_train, Y_test = train_test_split(X_values, Y_values,
+            test_size=test_split, random_state=random_state, shuffle=shuffle)
+        #scale training data X after splitting to avoid test-set leakage
+        if (scale):
+            scaler = StandardScaler()
+            X_train = scaler.fit_transform(X_train)
+            X_test = scaler.transform(X_test)
+        #set X and Y attributes
+        self.X_train = X_train
+        self.X_test = X_test
+        self.Y_train = np.reshape(Y_train, (len(Y_train),))
+        self.Y_test = np.reshape(Y_test, (len(Y_test),))
+        return self.X_train, self.X_test, self.Y_train, self.Y_test
+    def fit(self):
+        """
+        Fit model to training data and labels.
+        Parameters
+        ==========
+        None
+        Returns
+        =======
+        :self.model_fit: np.ndarray
+            fitted sklearn model of type specified by algorithm attribute.
+        """
+        self.model_fit = self.model.fit(self.X_train, self.Y_train)
+        return self.model_fit
+    def predict(self):
+        """
+        Predict the target values of unseen test data using the
+        trained model.
+        Parameters
+        ==========
+        None
+        Returns
+        =======
+        :self.model_fit.predict(self.X_test): np.ndarray
+            array of predicted target values for unseen test data.
+        """
+        return self.model_fit.predict(self.X_test)
+    def save(self, save_folder, model_name="model.pkl"):
+        """
+        Save fitted model to specified save_folder.
+        Parameters
+        ==========
+        :save_folder: str
+            folder to save model to.
+        :model_name: str
+            filename for model.
+        Returns
+        =======
+        None
+        Security
+        ========
+        Models are serialized using pickle. Never load pickle files from untrusted
+        sources; deserialization of malicious data can execute arbitrary code.
+        """
+        #append pickle file extension if not present in filename
+        if (os.path.splitext(model_name)[1].lower() != ".pkl"):
+            model_name = model_name + ".pkl"
+        #set save path to folder + filename
+        save_path = os.path.join(save_folder, model_name)
+        #save model in pickle format
+        try:
+            with open(save_path, 'wb') as file:
+                pickle.dump(self.model, file)
+        except (pickle.PickleError):
+            print(f"Error pickling model with path: {save_path}.")
+    def hyperparameter_tuning(self, param_grid=None, metric='r2', cv=5, n_jobs=None, verbose=2):
+        """
+        Hyperparameter tuning of model to find its optimal arrangement of parameters
+        using a Grid Search.
+        Parameters
+        ==========
+        :param_grid: dict (default=None)
+            dictionary/grid of selected models' parameters and the potential values of each
+            that you want to tune.
+        :metric: str (default=r2)
+            scoring metric used to evaluate the performance of the cross-validated
+            model on the test set, R2 by default. List of available scoring metrics
+            can be found in documentation:
+            https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
+        :cv: int (default=5)
+            Determines the cross-validation splitting strategy, a CV fold of 5 is used by default.
+        :n_jobs : int (default=None)
+            Number of jobs to run in parallel. None means 1 job.
+        :verbose: int (default=2)
+            verbosity of output during tuning process. The values and what they mean
+            for this parameter can be found on the documentation:
+            https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
+        Returns
+        =======
+        None
+        """
+        #raise error if train_test_split() hasn't been called yet
+        if not hasattr(self, 'X_train') or self.X_train is None:
+            raise RuntimeError(
+                'train_test_split() must be called before hyperparameter_tuning().'
+            )
+        #default to empty dict if not provided
+        if param_grid is None:
+            param_grid = {}
+        #input 'param_grid' parameter must be a dict, if not raise error
+        if not (isinstance(param_grid, dict)):
+            raise TypeError(f'param_grid argument must be of type dict, got type {type(param_grid)}.')
+        #input metric must be in available scoring metrics, if not raise error
+        valid_scorers = sorted(get_scorer_names())
+        if (metric not in valid_scorers):
+            raise ValueError(
+                f"Invalid scoring metric {metric} not in list of available Sklearn Scoring Metrics:\n{valid_scorers}."
+            )
+        #cv must be of type int and be between 5 and 10, if not then default of 5 is used
+        if not isinstance(cv, int) or cv < 5 or cv > 10:
+            cv = 5
+        #copy to avoid mutating caller's dict; filter out parameter names invalid for this model
+        param_grid = {p: v for p, v in param_grid.items() if p in self.model.get_params()}
+        #create deep copy of model
+        model_copy = deepcopy(self.model)
+        #grid search of hyperparameter space for model
+        grid_search = GridSearchCV(estimator=model_copy, param_grid=param_grid, \
+            cv=cv, scoring=metric, n_jobs=n_jobs, verbose=verbose, error_score=0)
+        #fit X and Y to best model found in grid search
+        grid_result = grid_search.fit(self.X_train, self.Y_train)
+        #predict values of unseen test data using best found model
+        best_model_pred = grid_result.predict(self.X_test)
+        #create instance of Evaluate class and calculate metrics from best model
+        evaluation = Evaluate(self.Y_test,best_model_pred)
+        #print out results of grid search
+        print('\n#############################################################')
+        print('################### Hyperparameter Results ###################')
+        print('#############################################################\n')
+        print('######################### Parameters ########################\n')
+        print(f'# Best Params: {grid_result.best_params_}')
+        print(f'# Model Type: {repr(self)}')
+        print(f'# Scoring Metric: {metric}')
+        print(f'# Number of CV folds: {cv}')
+        print(f'# Test Split: {self.test_split}\n')
+        print('######################### Metrics ###########################\n')
+        print(f'# Best Score (R2): {grid_result.best_score_}')
+        print(f'# RMSE: {evaluation.rmse} ')
+        print(f'# MSE: {evaluation.mse} ')
+        print(f'# MAE: {evaluation.mae}')
+        print(f'# RPD: {evaluation.rpd}')
+        print(f'# Explained Variance: {evaluation.explained_var}\n')
+        print('##############################################################')
+        self.grid_result = grid_result
+    def model_fitted(self):
+        """
+        Return if model has been fitted, true or false.
+        Parameters
+        ==========
+        None
+        Returns
+        =======
+        :True/False: bool
+            true if model (self.model) has been fitted, false if not.
+        """
+        return (self.model_fit is not None)
+    def feature_selection(self, method=""):
+        """
+        Feature selection/dimensionality reduction on dataset and models.
+        Return the best applicable features found using the technique selected
+        from method input parameter.
+        Parameters
+        ==========
+        :method: str (default="")
+            feature selection method to use.
+        Returns
+        =======
+        :X_new: np.ndarray
+            best found features using training data.
+        References
+        ==========
+        [1] https://scikit-learn.org/stable/modules/feature_selection.html
+        """
+        #list of available sklearn feature selection techniques
+        valid_feature_selection = ["selectkbest", "chi2", "variancethreshold", "rfe",
+            "selectfrommodel", "sequentialfeatureselector"]
+        #get closest valid feature selection method
+        feature_matches = get_close_matches(method.lower().strip(), [item.lower().strip() \
+            for item in valid_feature_selection], cutoff=0.6)
+        selected_method = feature_matches[0] if feature_matches else "selectkbest"
+        #apply feature selection method according to input parameter
+        if (selected_method == 'selectkbest'):
+            X_new = SelectKBest(f_regression, k=1).fit_transform(self.X, self.Y)
+        elif (selected_method == "variancethreshold"):
+            X_new = VarianceThreshold(1).fit_transform(self.X, self.Y)
+        elif (selected_method == "chi2"):
+            X_new = SelectKBest(f_regression, k=2).fit_transform(self.X, self.Y)
+        elif (selected_method == "rfe"):
+            selector = RFE(self.model, n_features_to_select=5, step=1)
+            X_new = selector.fit_transform(self.X, self.Y)
+        elif (selected_method == "sequentialfeatureselector"):
+            selector = SequentialFeatureSelector(self.model, n_features_to_select=3)
+            X_new = selector.fit_transform(self.X, self.Y)
+        elif (selected_method == "selectfrommodel"):
+            selector = SelectFromModel(estimator=deepcopy(self.model))
+            X_new = selector.fit_transform(self.X, self.Y)
+        else:
+            X_new = SelectKBest(f_regression, k=2).fit_transform(self.X, self.Y)
+        return X_new
+######################          Getters & Setters          ######################
+    #training data features, kept on the private _X attribute; set without validation
+    @property
+    def X(self):
+        return self._X
+    @X.setter
+    def X(self, val):
+        self._X = val
+    #training data labels, kept on the private _Y attribute; set without validation
+    @property
+    def Y(self):
+        return self._Y
+    @Y.setter
+    def Y(self, val):
+        self._Y = val
+    #estimator object returned by get_model(), kept on the private _model attribute
+    @property
+    def model(self):
+        return self._model
+    @model.setter
+    def model(self, val):
+        self._model = val
+    #test set proportion, kept on the private _test_split attribute
+    @property
+    def test_split(self):
+        return self._test_split
+    @test_split.setter
+    def test_split(self, val):
+        self._test_split = val
+    #list of model names accepted for the algorithm input
+    @property
+    def valid_models(self):
+        return self._valid_models
+    @valid_models.setter
+    def valid_models(self,val):
+        self._valid_models = val
+    #model parameters passed in at construction ({} when none were given)
+    @property
+    def parameters(self):
+        return self._parameters
+    @parameters.setter
+    def parameters(self,val):
+        self._parameters = val
+    #algorithm name; replaced in __init__ by the closest matching valid model name
+    @property
+    def algorithm(self):
+        return self._algorithm
+    @algorithm.setter
+    def algorithm(self,val):
+        self._algorithm = val
+    #result of fit(); None until the model has been fitted
+    @property
+    def model_fit(self):
+        return self._model_fit
+    @model_fit.setter
+    def model_fit(self,val):
+        self._model_fit = val
+    def __str__(self):
+        return (
+            f"Model of type {type(self.model).__name__} using parameters {self.parameters}, "
+            f"model has been fitted = {self.model_fitted()}."
+        )
+    def __repr__(self):
+        """ Object representation of class instance. """
+        return type(self.model).__name__
+    def __eq__(self, other):
+        """ Checking if 2 sklearn models are the same. """
+        return self.model == other.model
+    def __sizeof__(self):
+        """ Get size of sklearn model. """
+        return self.model.__sizeof__()

pySAR/plots.py ADDED Viewed

@@ -0,0 +1,92 @@
+################################################################################
+#################                    Plots                     #################
+################################################################################
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+from pathlib import Path
+from .globals_ import OUTPUT_FOLDER, CURRENT_DATETIME
+def plot_reg(Y_true, Y_pred, r2, output_folder="", show_plot=False, filename="model_regression_plot.png"):
+    """
+    Plot regression plot of observed (Y_true) vs predicted activity values (Y_pred).
+    Parameters
+    ==========
+    :Y_true: np.ndarray
+        array of observed values.
+    :Y_pred: np.ndarray
+        array of predicted values.
+    :r2: float
+        r2 score value.
+    :output_folder: str (default="")
+        output folder to store regression plot to, if empty input it will be stored in
+        the OUTPUT_FOLDER global var.
+    :show_plot: bool (default=False)
+        whether to display plot or not when function is run, if False the plot is just
+        saved to output folder.
+    :filename: str (default="model_regression_plot.png")
+        output filename for saved plot image.
+    Returns
+    =======
+    :save_path: str
+        full output path of saved regression plot.
+    """
+    # Validate inputs and normalize to 1D float arrays for plotting.
+    try:
+        y_true = np.asarray(Y_true, dtype=float).reshape(-1)
+        y_pred = np.asarray(Y_pred, dtype=float).reshape(-1)
+    except (TypeError, ValueError):
+        raise TypeError("Y_true and Y_pred must be numeric array-like inputs.")
+    if y_true.size == 0 or y_pred.size == 0:
+        raise ValueError("Y_true and Y_pred must be non-empty arrays.")
+    if y_true.shape[0] != y_pred.shape[0]:
+        raise ValueError(f"Y_true and Y_pred must have same length, got {y_true.shape[0]} and {y_pred.shape[0]}.")
+    if not (np.isfinite(y_true).all() and np.isfinite(y_pred).all()):
+        raise ValueError("Y_true and Y_pred must contain only finite numeric values.")
+    try:
+        r2 = float(r2)
+    except (TypeError, ValueError):
+        raise TypeError(f"r2 must be a numeric value, got {type(r2)}.")
+    if not np.isfinite(r2):
+        raise ValueError(f"r2 must be a finite numeric value, got {r2}.")
+    if not isinstance(filename, str) or filename.strip() == "":
+        raise ValueError("filename must be a non-empty string.")
+    if Path(filename).suffix == "":
+        filename = f"{filename}.png"
+    # Resolve output folder and ensure it exists.
+    if output_folder in ("", None):
+        target_dir = Path(OUTPUT_FOLDER)
+    else:
+        target_dir = Path(f"{output_folder}_{CURRENT_DATETIME}")
+    target_dir.mkdir(parents=True, exist_ok=True)
+    save_path = target_dir / filename
+    fig, ax = plt.subplots(figsize=(8, 8))
+    try:
+        # Plot predicted values against observed values to match axis labels.
+        sns.regplot(x=y_pred, y=y_true, marker="+", truncate=False, fit_reg=True, ax=ax)
+        r2_annotation = f"R2: {r2:.3f}"
+        ax.text(0.15, 0.92, r2_annotation, ha="left", va="top", fontsize=15, color="green",
+            fontweight="bold", transform=ax.transAxes)
+        ax.set_xlabel("Predicted Value", fontdict=dict(weight="bold"), fontsize=12)
+        ax.set_ylabel("Observed Value", fontdict=dict(weight="bold"), fontsize=12)
+        ax.set_title("Observed vs Predicted values for protein activity", fontdict=dict(weight="bold"), fontsize=15)
+        fig.savefig(save_path, dpi=300, bbox_inches="tight")
+        if show_plot:
+            plt.show(block=False)
+            plt.pause(3)
+        return str(save_path)
+    finally:
+        plt.close(fig)

pySAR/py.typed ADDED Viewed

File without changes