PyPI - chemotools - Versions diffs - 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl - Mend

chemotools 0.1.8py3-none-any.whl → 0.1.10py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

chemotools/augmentation/__init__.py +4 -0
chemotools/augmentation/_add_noise.py +70 -49
chemotools/augmentation/_fractional_shift.py +203 -0
chemotools/augmentation/_gaussian_broadening.py +136 -0
chemotools/augmentation/_index_shift.py +116 -101
chemotools/outliers/__init__.py +7 -0
chemotools/outliers/_base.py +180 -0
chemotools/outliers/_utils.py +91 -0
chemotools/outliers/dmodx.py +146 -0
chemotools/outliers/hotelling_t2.py +155 -0
chemotools/outliers/leverage.py +150 -0
chemotools/outliers/q_residuals.py +225 -0
chemotools/outliers/studentized_residuals.py +197 -0
{chemotools-0.1.8.dist-info → chemotools-0.1.10.dist-info}/METADATA +1 -1
{chemotools-0.1.8.dist-info → chemotools-0.1.10.dist-info}/RECORD +17 -7
{chemotools-0.1.8.dist-info → chemotools-0.1.10.dist-info}/WHEEL +1 -1
{chemotools-0.1.8.dist-info → chemotools-0.1.10.dist-info}/LICENSE +0 -0

chemotools/outliers/hotelling_t2.py ADDED Viewed

@@ -0,0 +1,155 @@
+from typing import Optional, Union
+import numpy as np
+from sklearn.cross_decomposition._pls import _PLS
+from sklearn.decomposition._base import _BasePCA
+from sklearn.pipeline import Pipeline
+from sklearn.utils.validation import validate_data, check_is_fitted
+from scipy.stats import f as f_distribution
+from ._base import _ModelResidualsBase, ModelTypes
+class HotellingT2(_ModelResidualsBase):
+    """
+    Calculate Hotelling's T-squared statistics for PCA or PLS like models.
+    Parameters
+    ----------
+    model : Union[ModelType, Pipeline]
+        A fitted PCA/PLS model or Pipeline ending with such a model
+    confidence : float, default=0.95
+        Confidence level for statistical calculations (between 0 and 1)
+    Attributes
+    ----------
+    model_ : ModelType
+        The fitted model of type _BasePCA or _PLS
+    preprocessing_ : Optional[Pipeline]
+        Preprocessing steps before the model
+    n_features_in_ : int
+        Number of features in the input data
+    n_components_ : int
+        Number of components in the model
+    n_samples_ : int
+        Number of samples used to train the model
+    critical_value_ : float
+        The calculated critical value for outlier detection
+    References
+    ----------
+    Johan A. Westerhuis, Stephen P. Gurden, Age K. Smilde (2001) Generalized contribution plots in multivariate statistical process
+    monitoring  Chemometrics and Intelligent Laboratory Systems 51 2000 95–114
+    """
+    def __init__(
+        self, model: Union[ModelTypes, Pipeline], confidence: float = 0.95
+    ) -> None:
+        super().__init__(model, confidence)
+    def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "HotellingT2":
+        """
+        Fit the model to the input data.
+        This step calculates the critical value for the outlier detection. In the DmodX method,
+        the critical value is not depend on the input data but on the model parameters.
+        """
+        X = validate_data(
+            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+        )
+        self.critical_value_ = self._calculate_critical_value()
+        return self
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        """Identify outliers in the input data.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data
+        Returns
+        -------
+        ndarray of shape (n_samples,)
+            Boolean array indicating outliers
+        """
+        # Check the estimator has been fitted
+        check_is_fitted(self, ["critical_value_"])
+        # Validate the input data
+        X = validate_data(
+            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+        )
+        # Calculate the Hotelling's T-squared statistics
+        hotelling_t2_values = self.predict_residuals(X, y=None, validate=False)
+        return np.where(hotelling_t2_values > self.critical_value_, -1, 1)
+    def predict_residuals(
+        self, X: np.ndarray, y: Optional[np.ndarray], validate: bool = True
+    ) -> np.ndarray:
+        """Calculate Hotelling's T-squared statistics for input data.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data
+        Returns
+        -------
+        ndarray of shape (n_samples,)
+            Hotellin's T-squared statistics for each sample
+        """
+        # Check the estimator has been fitted
+        check_is_fitted(self, ["critical_value_"])
+        # Validate the input data
+        if validate:
+            X = validate_data(
+                self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+            )
+        # Apply preprocessing steps
+        if self.preprocessing_:
+            X = self.preprocessing_.transform(X)
+        # Calculate the Hotelling's T-squared statistics
+        if isinstance(self.model_, _BasePCA):
+            # For PCA-like models
+            variances = self.model_.explained_variance_
+        if isinstance(self.model_, _PLS):
+            # For PLS-like models
+            variances = np.var(self.model_.x_scores_, axis=0)
+        # Equivalent to X @ model.components_.T for _BasePCA and X @ model.x_rotations_ for _PLS
+        X_transformed = self.model_.transform(X)
+        return np.sum((X_transformed**2) / variances, axis=1)
+    def _calculate_critical_value(self, X: Optional[np.ndarray] = None) -> float:
+        """
+        Calculate the critical value for the Hotelling's T-squared statistics.
+        Returns
+        -------
+        float
+            The critical value for the Hotelling's T-squared statistics
+        """
+        critical_value = f_distribution.ppf(
+            self.confidence, self.n_components_, self.n_samples_ - self.n_components_
+        )
+        return (
+            critical_value
+            * self.n_components_
+            * (self.n_samples_ - 1)
+            / (self.n_samples_ - self.n_components_)
+        )

chemotools/outliers/leverage.py ADDED Viewed

@@ -0,0 +1,150 @@
+from typing import Optional, Union
+import numpy as np
+from sklearn.pipeline import Pipeline
+from sklearn.utils.validation import validate_data, check_is_fitted
+from ._base import _ModelResidualsBase, ModelTypes
+class Leverage(_ModelResidualsBase):
+    """
+    Calculate the leverage of the training samples on  the latent space of a PCA or PLS models.
+    This method allows to detect datapoints with high leverage in the model.
+    Parameters
+    ----------
+    model : Union[ModelType, Pipeline]
+        A fitted PCA/PLS model or Pipeline ending with such a model
+    Attributes
+    ----------
+    model_ : ModelType
+        The fitted model of type _BasePCA or _PLS
+    preprocessing_ : Optional[Pipeline]
+        Preprocessing steps before the model
+    References
+    ----------
+    """
+    def __init__(
+        self, model: Union[ModelTypes, Pipeline], confidence: float = 0.95
+    ) -> None:
+        super().__init__(model, confidence)
+    def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "Leverage":
+        """
+        Fit the model to the input data.
+        Parameters
+        """
+        X = validate_data(
+            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+        )
+        if self.preprocessing_:
+            X = self.preprocessing_.fit_transform(X)
+        # Compute the critical threshold
+        self.critical_value_ = self._calculate_critical_value(X)
+        return self
+    def predict(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray:
+        """Calculate Leverage for training data on the model.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data
+        Returns
+        -------
+        ndarray of shape (n_samples,)
+            Bool with samples with a leverage above the critical value
+        """
+        # Check the estimator has been fitted
+        check_is_fitted(self, ["critical_value_"])
+        # Validate the input data
+        X = validate_data(
+            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+        )
+        # Preprocess the data
+        if self.preprocessing_:
+            X = self.preprocessing_.transform(X)
+        # Calculate outliers based on samples with too high leverage
+        leverage = calculate_leverage(self.model_, X)
+        return np.where(leverage > self.critical_value_, -1, 1)
+    def predict_residuals(
+        self, X: np.ndarray, y: Optional[np.ndarray], validate: bool = True
+    ) -> np.ndarray:
+        """Calculate the leverage of the samples.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data
+        Returns
+        -------
+        np.ndarray
+            Leverage of the samples
+        """
+        # Check the estimator has been fitted
+        check_is_fitted(self, ["critical_value_"])
+        # Validate the input data
+        if validate:
+            X = validate_data(self, X, ensure_2d=True, dtype=np.float64)
+        # Apply preprocessing if available
+        if self.preprocessing_:
+            X = self.preprocessing_.transform(X)
+        # Calculate the leverage
+        return calculate_leverage(self.model_, X)
+    def _calculate_critical_value(self, X: Optional[np.ndarray]) -> float:
+        """Calculate the critical value for outlier detection using the percentile outlier method."""
+        # Calculate the leverage of the samples
+        leverage = calculate_leverage(self.model_, X)
+        # Calculate the critical value
+        return np.percentile(leverage, self.confidence * 100)
+def calculate_leverage(model: ModelTypes, X: Optional[np.ndarray]) -> np.ndarray:
+    """
+    Calculate the leverage of the training samples in a PLS/PCA-like model.
+    Parameters
+    ----------
+    model : Union[_BasePCA, _PLS]
+        A fitted PCA/PLS model
+    X : np.ndarray
+        Preprocessed input data
+    Returns
+    -------
+    np.ndarray
+        Leverage of the samples
+    """
+    X_transformed = model.transform(X)
+    X_hat = (
+        X_transformed @ np.linalg.inv(X_transformed.T @ X_transformed) @ X_transformed.T
+    )
+    return np.diag(X_hat)

chemotools/outliers/q_residuals.py ADDED Viewed

@@ -0,0 +1,225 @@
+from typing import Optional, Literal, Union
+import numpy as np
+from scipy.stats import norm, chi2
+from sklearn.pipeline import Pipeline
+from sklearn.utils.validation import validate_data, check_is_fitted
+from ._base import _ModelResidualsBase, ModelTypes
+class QResiduals(_ModelResidualsBase):
+    """
+    Calculate Q residuals (Squared Prediction Error - SPE) for PCA or PLS models.
+    Parameters
+    ----------
+    model : Union[ModelType, Pipeline]
+        A fitted PCA/PLS model or Pipeline ending with such a model.
+    confidence : float, default=0.95
+        Confidence level for statistical calculations (between 0 and 1).
+    method : str, default="chi-square"
+        The method used to compute the confidence threshold for Q residuals.
+        Options:
+        - "chi-square" : Uses mean and standard deviation to approximate Q residuals threshold.
+        - "jackson-mudholkar" : Uses eigenvalue-based analytical approximation.
+        - "percentile" : Uses empirical percentile threshold.
+    Attributes
+    ----------
+    model_ : ModelType
+        The fitted model of type _BasePCA or _PLS.
+    preprocessing_ : Optional[Pipeline]
+        Preprocessing steps before the model.
+    n_features_in_ : int
+        Number of features in the input data.
+    n_components_ : int
+        Number of components in the model.
+    n_samples_ : int
+        Number of samples used to train the model.
+    critical_value_ : float
+        The calculated critical value for outlier detection.
+    References
+    ----------
+    Johan A. Westerhuis, Stephen P. Gurden, Age K. Smilde (2001) Generalized contribution plots in multivariate statistical process
+    monitoring  Chemometrics and Intelligent Laboratory Systems 51 2000 95–114
+    """
+    def __init__(
+        self,
+        model: Union[ModelTypes, Pipeline],
+        confidence: float = 0.95,
+        method: Literal["chi-square", "jackson-mudholkar", "percentile"] = "percentile",
+    ) -> None:
+        self.method = method
+        super().__init__(model, confidence)
+    def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "QResiduals":
+        """
+        Fit the Q Residuals model by computing residuals from the training set.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training data.
+        Returns
+        -------
+        self : object
+            Fitted instance of QResiduals.
+        """
+        X = validate_data(self, X, ensure_2d=True, dtype=np.float64)
+        if self.preprocessing_:
+            X = self.preprocessing_.fit_transform(X)
+        # Compute the critical threshold using the chosen method
+        self.critical_value_ = self._calculate_critical_value(X)
+        return self
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        """Identify outliers in the input data based on Q residuals threshold.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data.
+        Returns
+        -------
+        ndarray of shape (n_samples,)
+            Boolean array indicating outliers (-1 for outliers, 1 for normal data).
+        """
+        # Check the estimator has been fitted
+        check_is_fitted(self, ["critical_value_"])
+        # Validate the input data
+        X = validate_data(
+            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+        )
+        # Calculate outliers based on the Q residuals
+        Q_residuals = self.predict_residuals(X, validate=False)
+        return np.where(Q_residuals > self.critical_value_, -1, 1)
+    def predict_residuals(
+        self, X: np.ndarray, y: Optional[np.ndarray] = None, validate: bool = True
+    ) -> np.ndarray:
+        """Calculate Q residuals (Squared Prediction Error - SPE) for input data.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data.
+        validate : bool, default=True
+            Whether to validate the input data.
+        Returns
+        -------
+        ndarray of shape (n_samples,)
+            Q residuals for each sample.
+        """
+        # Check the estimator has been fitted
+        check_is_fitted(self, ["critical_value_"])
+        # Validate the input data
+        if validate:
+            X = validate_data(self, X, ensure_2d=True, dtype=np.float64)
+        # Apply preprocessing if available
+        if self.preprocessing_:
+            X = self.preprocessing_.transform(X)
+        # Compute reconstruction error (Q residuals)
+        X_transformed = self.model_.transform(X)
+        X_reconstructed = self.model_.inverse_transform(X_transformed)
+        Q_residuals = np.sum((X - X_reconstructed) ** 2, axis=1)
+        return Q_residuals
+    def _calculate_critical_value(
+        self,
+        X: Optional[np.ndarray] = None,
+    ) -> float:
+        """Calculate the critical value for outlier detection.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data.
+        X_reconstructed : array-like of shape (n_samples, n_features)
+            Reconstructed input data.
+        method : str Literal["chi-square", "jackson-mudholkar", "percentile"]
+            The method used to compute the confidence threshold for Q residuals.
+        Returns
+        -------
+        float
+            The calculated critical value for outlier detection.
+        """
+        # Compute Q residuals for training data
+        X_transformed = self.model_.transform(X)
+        X_reconstructed = self.model_.inverse_transform(X_transformed)
+        residuals = X - X_reconstructed
+        if self.method == "chi-square":
+            return self._chi_square_threshold(residuals)
+        elif self.method == "jackson-mudholkar":
+            return self._jackson_mudholkar_threshold(residuals)
+        elif self.method == "percentile":
+            Q_residuals = np.sum((residuals) ** 2, axis=1)
+            return self._percentile_threshold(Q_residuals)
+        else:
+            raise ValueError(
+                "Invalid method. Choose from 'chi-square', 'jackson-mudholkar', or 'percentile'."
+            )
+    def _chi_square_threshold(self, residuals: np.ndarray) -> float:
+        """Compute Q residual threshold using Chi-Square Approximation."""
+        eigenvalues = np.linalg.trace(np.cov(residuals.T))
+        theta_1 = np.sum(eigenvalues)
+        theta_2 = np.sum(eigenvalues**2)
+        # Degrees of freedom approximation
+        g = theta_2 / theta_1
+        h = (2 * theta_1**2) / theta_2
+        # Compute chi-square critical value at given confidence level
+        chi_critical = chi2.ppf(self.confidence, df=h)
+        # Compute final Q residual threshold
+        return g * chi_critical
+    def _jackson_mudholkar_threshold(self, residuals: np.ndarray) -> float:
+        """Compute Q residual threshold using Jackson & Mudholkar’s analytical method."""
+        eigenvalues = np.linalg.trace(np.cov(residuals.T))
+        theta_1 = np.sum(eigenvalues)
+        theta_2 = np.sum(eigenvalues**2)
+        theta_3 = np.sum(eigenvalues**3)
+        z_alpha = norm.ppf(self.confidence)
+        h0 = 1 - (2 * theta_1 * theta_3) / (3 * theta_2**2)
+        term1 = theta_2 * h0 * (1 - h0) / theta_1**2
+        term2 = np.sqrt(z_alpha * 2 * theta_2 * h0**2) / theta_1
+        return theta_1 * (1 - term1 + term2) ** (1 / h0)
+    def _percentile_threshold(self, Q_residuals: np.ndarray) -> float:
+        """Compute Q residual threshold using the empirical percentile method."""
+        return np.percentile(Q_residuals, self.confidence * 100)

chemotools 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

chemotools 0.1.8py3-none-any.whl → 0.1.10py3-none-any.whl