PyPI - chemotools - Versions diffs - 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl - Mend

chemotools 0.1.10 (py3-none-any.whl) → 0.1.11 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

chemotools/feature_selection/__init__.py +3 -1
chemotools/feature_selection/_base.py +88 -0
chemotools/feature_selection/_sr_selector.py +137 -0
chemotools/feature_selection/_vip_selector.py +129 -0
chemotools/outliers/_base.py +75 -67
chemotools/outliers/dmodx.py +26 -8
chemotools/outliers/hotelling_t2.py +11 -10
chemotools/outliers/leverage.py +15 -14
chemotools/outliers/q_residuals.py +19 -16
chemotools/outliers/studentized_residuals.py +17 -16
chemotools/outliers/utils.py +51 -0
{chemotools-0.1.10.dist-info → chemotools-0.1.11.dist-info}/METADATA +2 -2
{chemotools-0.1.10.dist-info → chemotools-0.1.11.dist-info}/RECORD +15 -12
{chemotools-0.1.10.dist-info → chemotools-0.1.11.dist-info}/WHEEL +1 -1
chemotools/outliers/_utils.py +0 -91
{chemotools-0.1.10.dist-info → chemotools-0.1.11.dist-info}/LICENSE +0 -0

chemotools/feature_selection/__init__.py CHANGED Viewed

@@ -1,4 +1,6 @@
 from ._index_selector import IndexSelector
 from ._range_cut import RangeCut
+from ._sr_selector import SRSelector
+from ._vip_selector import VIPSelector
-__all__ = ["IndexSelector", "RangeCut"]
+__all__ = ["IndexSelector", "RangeCut", "SRSelector", "VIPSelector"]

chemotools/feature_selection/_base.py ADDED Viewed

@@ -0,0 +1,88 @@
+from abc import ABC, abstractmethod
+from typing import Union
+import numpy as np
+from sklearn.base import BaseEstimator
+from sklearn.cross_decomposition._pls import _PLS
+from sklearn.feature_selection._base import SelectorMixin
+from sklearn.pipeline import Pipeline
+from sklearn.utils.validation import check_is_fitted
+ModelTypes = Union[_PLS, Pipeline]
+class _PLSFeatureSelectorBase(ABC, BaseEstimator, SelectorMixin):
+    """Feature selection base class for _PLS-like models.
+    Parameters
+    ----------
+    model : Union[_PLS, Pipeline]
+        A fitted  _PLS models or Pipeline ending with such a model
+    threshold : float
+        The threshold for feature selection. Features with importance
+        above this threshold will be selected.
+    Attributes
+    ----------
+    estimator_ : ModelTypes
+        The fitted model of type _BasePCA or _PLS
+    feature_scores_ : np.ndarray
+        The calculated feature scores based on the selected method.
+    support_mask : np.ndarray
+        The boolean mask indicating which features are selected.
+    """
+    def __init__(
+        self,
+        model: Union[_PLS, Pipeline],
+    ) -> None:
+        self.estimator_ = _validate_and_extract_model(model)
+    @abstractmethod
+    def _calculate_features(self, X: np.ndarray) -> np.ndarray:
+        """Calculate the residuals of the model.
+        Returns
+        -------
+        ndarray of shape (n_samples,)
+            The residuals of the model
+        """
+def _validate_and_extract_model(
+    model: Union[_PLS, Pipeline],
+) -> _PLS:
+    """Validate and extract the model.
+    Parameters
+    ----------
+    model : Union[_PLS, Pipeline]
+        A fitted _PLS model or Pipeline ending with such a model
+    Returns
+    -------
+    _PLS
+        The extracted estimator
+    Raises
+    ------
+    TypeError
+        If the model is not of type _BasePCA or _PLS or a Pipeline ending with one of these types or if the model is not fitted
+    """
+    if isinstance(model, Pipeline):
+        estimator = model[-1]
+    else:
+        estimator = model
+    if not isinstance(estimator, _PLS):
+        raise TypeError(
+            "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
+        )
+    check_is_fitted(model)
+    return estimator

chemotools/feature_selection/_sr_selector.py ADDED Viewed

@@ -0,0 +1,137 @@
+import numpy as np
+from sklearn.utils.validation import validate_data
+from ._base import _PLSFeatureSelectorBase
+class SRSelector(_PLSFeatureSelectorBase):
+    """
+    This selector is used to select features that contribute significantly
+    to the latent variables in a PLS regression model using the Selectivity
+    Ratio (SR) method.
+    Parameters
+    ----------
+    - model: Union[_PLS, Pipeline]
+        The PLS regression model or a pipeline with a PLS regression model as last step.
+    - threshold: float, default=1.0
+        The threshold for feature selection. Features with importance
+        above this threshold will be selected.
+    Attributes
+    ----------
+    estimator_ : ModelTypes
+        The fitted model of type _BasePCA or _PLS
+    feature_scores_ : np.ndarray
+        The calculated feature scores based on the selected method.
+    support_mask_ : np.ndarray
+        The boolean mask indicating which features are selected.
+    Methods
+    -------
+    fit(X, y=None)
+        Fit the transformer to the input data. It calculates the feature scores and the feature_mask.
+    """
+    def __init__(
+        self,
+        model,
+        threshold: float = 1.0,
+    ):
+        self.model = model
+        self.threshold = threshold
+        super().__init__(self.model)
+    def fit(self, X: np.ndarray, y=None) -> "SRSelector":
+        """
+        Fit the transformer to calculate the feature scores and the support mask.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The input data to fit the transformer to.
+        y : None
+            Ignored.
+        Returns
+        -------
+        self : SRSelector
+            The fitted transformer.
+        """
+        # Check that X is a 2D array and has only finite values
+        X = validate_data(
+            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+        )
+        # Calculate the SR scores
+        self.feature_scores_ = self._calculate_features(X)
+        # Calculate the support mask
+        self.support_mask_ = self._get_support_mask()
+        return self
+    def _get_support_mask(self) -> np.ndarray:
+        """
+        Get the support mask based on the feature scores and threshold.
+        Features with scores above the threshold are selected.
+        Parameters
+        ----------
+        self : SRSelector
+            The fitted transformer.
+        Returns
+        -------
+        support_mask_ : np.ndarray
+            The boolean mask indicating which features are selected.
+        """
+        return self.feature_scores_ > self.threshold
+    def _calculate_features(self, X: np.ndarray) -> np.ndarray:
+        """
+        Vectorized Selectivity Ratio calculation from a fitted _PLS
+        like model.
+        Parameters:
+        ----------
+        - self: SRSelector
+            The fitted transformer.
+        - X: array-like of shape (n_samples, n_features)
+            The input training data to calculate the feature scores from.
+        Returns
+        -------
+        feature_scores_ : np.ndarray
+            The calculated feature scores based on the selected method.
+        """
+        bpls = self.estimator_.coef_
+        bpls_norm = bpls.T / np.linalg.norm(bpls)
+        # Handle 1D case correctly
+        if bpls.ndim == 1:
+            bpls_norm = bpls_norm.reshape(-1, 1)
+        # Project X onto the regression vector
+        ttp = X @ bpls_norm
+        ptp = X.T @ np.linalg.pinv(ttp).T
+        # Predicted part of X
+        X_hat = ttp @ ptp.T
+        # Compute squared norms directly
+        total_ss = np.linalg.norm(X, axis=0) ** 2
+        explained_ss = np.linalg.norm(X_hat, axis=0) ** 2
+        # Calculate residual sum of squares
+        residual_ss = total_ss - explained_ss
+        # Stability: avoid division by zero
+        epsilon = 1e-12
+        # Calculate Selectivity Ratio
+        return explained_ss / (residual_ss + epsilon)

chemotools/feature_selection/_vip_selector.py ADDED Viewed

@@ -0,0 +1,129 @@
+import numpy as np
+from sklearn.utils.validation import validate_data
+from ._base import _PLSFeatureSelectorBase
+class VIPSelector(_PLSFeatureSelectorBase):
+    """
+    This selector is used to select features that contribute significantly
+    to the latent variables in a PLS regression model using the Variables
+    Importance in Projection (VIP) method.
+    Parameters
+    ----------
+    - model: Union[_PLS, Pipeline]
+        The PLS regression model or a pipeline with a PLS regression model as last step.
+    - threshold: float, default=1.0
+        The threshold for feature selection. Features with importance
+        above this threshold will be selected.
+    Attributes
+    ----------
+    estimator_ : ModelTypes
+        The fitted model of type _BasePCA or _PLS
+    feature_scores_ : np.ndarray
+        The calculated feature scores based on the selected method.
+    support_mask_ : np.ndarray
+        The boolean mask indicating which features are selected.
+    Methods
+    -------
+    fit(X, y=None)
+        Fit the transformer to the input data. It calculates the feature scores and the feature_mask.
+    """
+    def __init__(
+        self,
+        model,
+        threshold: float = 1.0,
+    ):
+        self.model = model
+        self.threshold = threshold
+        super().__init__(self.model)
+    def fit(self, X: np.ndarray, y=None) -> "VIPSelector":
+        """
+        Fit the transformer to calculate the feature scores and the support mask.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The input data to fit the transformer to.
+        y : None
+            Ignored.
+        Returns
+        -------
+        self : VIPSelector
+            The fitted transformer.
+        """
+        # Check that X is a 2D array and has only finite values
+        X = validate_data(
+            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+        )
+        # Calculate the VIP scores
+        self.feature_scores_ = self._calculate_features(X)
+        # Calculate the support mask
+        self.support_mask_ = self._get_support_mask()
+        return self
+    def _get_support_mask(self) -> np.ndarray:
+        """
+        Get the support mask based on the feature scores and threshold.
+        Features with scores above the threshold are selected.
+        Parameters
+        ----------
+        self : VIPSelector
+            The fitted transformer.
+        Returns
+        -------
+        support_mask_ : np.ndarray
+            The boolean mask indicating which features are selected.
+        """
+        return self.feature_scores_ > self.threshold
+    def _calculate_features(self, X: np.ndarray) -> np.ndarray:
+        """
+        Calculate the VIP scores based on the fitted model.
+        Parameters
+        ----------
+        self : VIPSelector
+            The fitted transformer.
+        Returns
+        -------
+        feature_scores_ : np.ndarray
+            The calculated feature scores based on the selected method.
+        """
+        # Calculate sum of squares of y_loadings and x_scores
+        sum_of_squares_y_loadings = (
+            np.linalg.norm(self.estimator_.y_loadings_, ord=2, axis=0) ** 2
+        )
+        sum_of_squares_x_scores = (
+            np.linalg.norm(self.estimator_.x_scores_, ord=2, axis=0) ** 2
+        )
+        # Calculate the sum of squares
+        sum_of_squares = sum_of_squares_y_loadings * sum_of_squares_x_scores
+        # Calculate the numerator
+        numerator = self.estimator_.n_features_in_ * np.sum(
+            sum_of_squares * self.estimator_.x_weights_**2,
+            axis=1,
+        )
+        # Calculate the denominator
+        denominator = np.sum(sum_of_squares, axis=0)
+        # Calculate the VIP scores
+        return np.sqrt(numerator / denominator)

chemotools/outliers/_base.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Union, Optional
+from typing import Optional, Tuple, Union
 import numpy as np
@@ -9,7 +9,6 @@ from sklearn.cross_decomposition._pls import _PLS
 from sklearn.pipeline import Pipeline
 from sklearn.utils.validation import check_is_fitted
-from ._utils import validate_confidence, validate_and_extract_model
 ModelTypes = Union[_BasePCA, _PLS]
@@ -29,10 +28,10 @@ class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
     Attributes
     ----------
-    model_ : ModelTypes
+    estimator_ : ModelTypes
         The fitted model of type _BasePCA or _PLS
-    preprocessing_ : Optional[Pipeline]
+    transformer_ : Optional[Pipeline]
         Preprocessing steps before the model
     n_features_in_ : int
@@ -54,13 +53,13 @@ class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
         confidence: float,
     ) -> None:
         (
-            self.model_,
-            self.preprocessing_,
+            self.estimator_,
+            self.transformer_,
             self.n_features_in_,
             self.n_components_,
             self.n_samples_,
-        ) = validate_and_extract_model(model)
-        self.confidence = validate_confidence(confidence)
+        ) = _validate_and_extract_model(model)
+        self.confidence = _validate_confidence(confidence)
     def fit_predict_residuals(
         self, X: np.ndarray, y: Optional[np.ndarray] = None
@@ -96,7 +95,7 @@ class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
         """
     @abstractmethod
-    def _calculate_critical_value(self, X: Optional[np.ndarray]) -> float:
+    def _calculate_critical_value(self, X: np.ndarray) -> float:
         """Calculate the critical value for outlier detection.
         Returns
@@ -106,75 +105,84 @@ class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
         """
-class _ModelDiagnosticsBase(ABC):
-    """Base class for model diagnostics methods. This does not implement outlier detection algorithms,
-    but rather implements methods that are used to assess trained models.
+def _get_model_parameters(model: ModelTypes) -> Tuple[int, int, int]:
+    """
+    Get the number of features, components and samples from a model with PLS or PCA. types.
     Parameters
     ----------
-    model : Union[ModelTypes, Pipeline]
-        A fitted PCA/PLS model or Pipeline ending with such a model
-    Attributes
-    ----------
-    model_ : ModelTypes
-        The fitted model of type _BasePCA or _PLS
-    preprocessing_ : Optional[Pipeline]
-        Preprocessing steps before the model
+    model : ModelType
+        A fitted model of type _BasePCA or _PLS
+    Returns
+    -------
+    Tuple[int, int, int]
+        The number of features, components and samples in the model
     """
+    if isinstance(model, _BasePCA):
+        return model.n_features_in_, model.n_components_, model.n_samples_
+    elif isinstance(model, _PLS):
+        return model.n_features_in_, model.n_components, len(model.x_scores_)
+    else:
+        raise ValueError(
+            "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
+        )
-    def __init__(self, model: Union[ModelTypes, Pipeline]):
-        self.model_, self.preprocessing_ = self._validate_and_extract_model(model)
-    def _validate_and_extract_model(self, model):
-        """Validate and extract the model and preprocessing steps.
+def _validate_confidence(confidence: float) -> float:
+    """Validate parameters using sklearn conventions.
-        Parameters
-        ----------
-        model : Union[ModelTypes, Pipeline]
-            A fitted PCA/PLS model or Pipeline ending with such a model
+    Parameters
+    ----------
+    confidence : float
+        Confidence level for statistical calculations (between 0 and 1)
-        Returns
-        -------
-        Tuple[ModelTypes, Optional[Pipeline]]
-            The extracted model and preprocessing steps
+    Returns
+    -------
+    float
+        The validated confidence level
-        Raises
-        ------
-        ValueError
-            If the model is not of type _BasePCA or _PLS or a Pipeline ending with one of these types or if the model is not fitted
-        """
-        if isinstance(model, Pipeline):
-            preprocessing = model[:-1]
-            model = model[-1]
-        else:
-            preprocessing = None
-        if isinstance(model, (_BasePCA, _PLS)):
-            check_is_fitted(model)
-        else:
-            raise ValueError(
-                "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
-            )
-        check_is_fitted(model)
-        return model, preprocessing
+    Raises
+    ------
+    ValueError
+        If confidence is not between 0 and 1
+    """
+    if not 0 < confidence < 1:
+        raise ValueError("Confidence must be between 0 and 1")
+    return confidence
-    @abstractmethod
-    def predict(self, X: np.ndarray, y: Optional[np.ndarray]) -> np.ndarray:
-        """Predict the output of the model.
-        Parameters
-        ----------
-        X : array-like of shape (n_samples, n_features)
-            Input data
+def _validate_and_extract_model(
+    model: Union[ModelTypes, Pipeline],
+) -> Tuple[ModelTypes, Optional[Pipeline], int, int, int]:
+    """Validate and extract the model and preprocessing steps.
-        y : array-like of shape (n_samples,), default=None
-            Target values
+    Parameters
+    ----------
+    model : Union[ModelTypes, Pipeline]
+        A fitted PCA/PLS model or Pipeline ending with such a model
-        Returns
-        -------
-        ndarray of shape (n_samples,)
-            Predicted values
-        """
+    Returns
+    -------
+    Tuple[ModelTypes, Optional[Pipeline]]
+        The extracted model and preprocessing steps
+    Raises
+    ------
+    ValueError
+        If the model is not of type _BasePCA or _PLS or a Pipeline ending with one of these types or if the model is not fitted
+    """
+    if isinstance(model, Pipeline):
+        preprocessing = model[:-1]
+        model = model[-1]
+    else:
+        preprocessing = None
+    if not isinstance(model, (_BasePCA, _PLS)):
+        raise ValueError(
+            "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
+        )
+    check_is_fitted(model)
+    n_features_in, n_components, n_samples = _get_model_parameters(model)
+    return model, preprocessing, n_features_in, n_components, n_samples

chemotools/outliers/dmodx.py CHANGED Viewed

@@ -7,6 +7,7 @@ from scipy.stats import f as f_distribution
 from ._base import _ModelResidualsBase, ModelTypes
+from .utils import calculate_residual_spectrum
 class DModX(_ModelResidualsBase):
@@ -25,10 +26,10 @@ class DModX(_ModelResidualsBase):
     Attributes
     ----------
-    model_ : ModelType
+    estimator_ : ModelType
         The fitted model of type _BasePCA or _PLS
-    preprocessing_ : Optional[Pipeline]
+    transformer_ : Optional[Pipeline]
         Preprocessing steps before the model
     n_features_in_ : int
@@ -42,6 +43,9 @@ class DModX(_ModelResidualsBase):
     critical_value_ : float
         The calculated critical value for outlier detection
+    train_spe_: float
+        The training sum of squared errors (SSE) for the model normalized by degrees of freedom
     """
     def __init__(
@@ -49,6 +53,7 @@ class DModX(_ModelResidualsBase):
         model: Union[ModelTypes, Pipeline],
         confidence: float = 0.95,
     ) -> None:
+        model, confidence = model, confidence
         super().__init__(model, confidence)
     def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "DModX":
@@ -62,7 +67,18 @@ class DModX(_ModelResidualsBase):
             self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
         )
+        # Calculate the critical value
         self.critical_value_ = self._calculate_critical_value()
+        # Calculate the degrees of freedom normalized SPE of the training set
+        residuals = calculate_residual_spectrum(X, self.estimator_)
+        squared_errors = np.sum((residuals) ** 2, axis=1)
+        self.train_spe_ = np.sqrt(
+            squared_errors
+            / (self.n_samples_ - self.n_components_ - 1)
+            * (self.n_features_in_ - self.n_components_)
+        )
         return self
     def predict(self, X: np.ndarray) -> np.ndarray:
@@ -118,15 +134,17 @@ class DModX(_ModelResidualsBase):
             )
         # Apply preprocessing if available
-        if self.preprocessing_:
-            X = self.preprocessing_.transform(X)
+        if self.transformer_:
+            X = self.transformer_.transform(X)
         # Calculate the DModX statistics
-        X_transformed = self.model_.transform(X)
-        X_reconstructed = self.model_.inverse_transform(X_transformed)
-        squared_errors = np.sum((X - X_reconstructed) ** 2, axis=1)
+        residual = calculate_residual_spectrum(X, self.estimator_)
+        squared_errors = np.sum((residual) ** 2, axis=1)
-        return np.sqrt(squared_errors / (self.n_features_in_ - self.n_components_))
+        return (
+            np.sqrt(squared_errors / (self.n_features_in_ - self.n_components_))
+            / self.train_spe_
+        )
     def _calculate_critical_value(self, X: Optional[np.ndarray] = None) -> float:
         """Calculate F-distribution based critical value.

chemotools/outliers/hotelling_t2.py CHANGED Viewed

@@ -24,10 +24,10 @@ class HotellingT2(_ModelResidualsBase):
     Attributes
     ----------
-    model_ : ModelType
+    estimator_ : ModelType
         The fitted model of type _BasePCA or _PLS
-    preprocessing_ : Optional[Pipeline]
+    transformer_ : Optional[Pipeline]
         Preprocessing steps before the model
     n_features_in_ : int
@@ -51,6 +51,7 @@ class HotellingT2(_ModelResidualsBase):
     def __init__(
         self, model: Union[ModelTypes, Pipeline], confidence: float = 0.95
     ) -> None:
+        self.model, self.confidence = model, confidence
         super().__init__(model, confidence)
     def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "HotellingT2":
@@ -93,7 +94,7 @@ class HotellingT2(_ModelResidualsBase):
         return np.where(hotelling_t2_values > self.critical_value_, -1, 1)
     def predict_residuals(
-        self, X: np.ndarray, y: Optional[np.ndarray], validate: bool = True
+        self, X: np.ndarray, y: Optional[np.ndarray] = None, validate: bool = True
     ) -> np.ndarray:
         """Calculate Hotelling's T-squared statistics for input data.
@@ -117,20 +118,20 @@ class HotellingT2(_ModelResidualsBase):
             )
         # Apply preprocessing steps
-        if self.preprocessing_:
-            X = self.preprocessing_.transform(X)
+        if self.transformer_:
+            X = self.transformer_.transform(X)
         # Calculate the Hotelling's T-squared statistics
-        if isinstance(self.model_, _BasePCA):
+        if isinstance(self.estimator_, _BasePCA):
             # For PCA-like models
-            variances = self.model_.explained_variance_
+            variances = self.estimator_.explained_variance_
-        if isinstance(self.model_, _PLS):
+        if isinstance(self.estimator_, _PLS):
             # For PLS-like models
-            variances = np.var(self.model_.x_scores_, axis=0)
+            variances = np.var(self.estimator_.x_scores_, axis=0)
         # Equivalent to X @ model.components_.T for _BasePCA and X @ model.x_rotations_ for _PLS
-        X_transformed = self.model_.transform(X)
+        X_transformed = self.estimator_.transform(X)
         return np.sum((X_transformed**2) / variances, axis=1)

chemotools/outliers/leverage.py CHANGED Viewed

@@ -20,10 +20,10 @@ class Leverage(_ModelResidualsBase):
     Attributes
     ----------
-    model_ : ModelType
+    estimator_ : ModelType
         The fitted model of type _BasePCA or _PLS
-    preprocessing_ : Optional[Pipeline]
+    transformer_ : Optional[Pipeline]
         Preprocessing steps before the model
     References
@@ -34,6 +34,7 @@ class Leverage(_ModelResidualsBase):
     def __init__(
         self, model: Union[ModelTypes, Pipeline], confidence: float = 0.95
     ) -> None:
+        model, confidence = model, confidence
         super().__init__(model, confidence)
     def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "Leverage":
@@ -47,8 +48,8 @@ class Leverage(_ModelResidualsBase):
             self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
         )
-        if self.preprocessing_:
-            X = self.preprocessing_.fit_transform(X)
+        if self.transformer_:
+            X = self.transformer_.fit_transform(X)
         # Compute the critical threshold
         self.critical_value_ = self._calculate_critical_value(X)
@@ -77,15 +78,15 @@ class Leverage(_ModelResidualsBase):
         )
         # Preprocess the data
-        if self.preprocessing_:
-            X = self.preprocessing_.transform(X)
+        if self.transformer_:
+            X = self.transformer_.transform(X)
         # Calculate outliers based on samples with too high leverage
-        leverage = calculate_leverage(self.model_, X)
+        leverage = calculate_leverage(X, self.estimator_)
         return np.where(leverage > self.critical_value_, -1, 1)
     def predict_residuals(
-        self, X: np.ndarray, y: Optional[np.ndarray], validate: bool = True
+        self, X: np.ndarray, y: Optional[np.ndarray] = None, validate: bool = True
     ) -> np.ndarray:
         """Calculate the leverage of the samples.
@@ -107,23 +108,23 @@ class Leverage(_ModelResidualsBase):
             X = validate_data(self, X, ensure_2d=True, dtype=np.float64)
         # Apply preprocessing if available
-        if self.preprocessing_:
-            X = self.preprocessing_.transform(X)
+        if self.transformer_:
+            X = self.transformer_.transform(X)
         # Calculate the leverage
-        return calculate_leverage(self.model_, X)
+        return calculate_leverage(X, self.estimator_)
-    def _calculate_critical_value(self, X: Optional[np.ndarray]) -> float:
+    def _calculate_critical_value(self, X: np.ndarray) -> float:
         """Calculate the critical value for outlier detection using the percentile outlier method."""
         # Calculate the leverage of the samples
-        leverage = calculate_leverage(self.model_, X)
+        leverage = calculate_leverage(X, self.estimator_)
         # Calculate the critical value
         return np.percentile(leverage, self.confidence * 100)
-def calculate_leverage(model: ModelTypes, X: Optional[np.ndarray]) -> np.ndarray:
+def calculate_leverage(X: np.ndarray, model: ModelTypes) -> np.ndarray:
     """
     Calculate the leverage of the training samples in a PLS/PCA-like model.

chemotools/outliers/q_residuals.py CHANGED Viewed

@@ -7,6 +7,7 @@ from sklearn.pipeline import Pipeline
 from sklearn.utils.validation import validate_data, check_is_fitted
 from ._base import _ModelResidualsBase, ModelTypes
+from .utils import calculate_residual_spectrum
 class QResiduals(_ModelResidualsBase):
@@ -21,7 +22,7 @@ class QResiduals(_ModelResidualsBase):
     confidence : float, default=0.95
         Confidence level for statistical calculations (between 0 and 1).
-    method : str, default="chi-square"
+    method : str, default="jackson-mudholkar"
         The method used to compute the confidence threshold for Q residuals.
         Options:
         - "chi-square" : Uses mean and standard deviation to approximate Q residuals threshold.
@@ -30,10 +31,10 @@ class QResiduals(_ModelResidualsBase):
     Attributes
     ----------
-    model_ : ModelType
+    estimator_ : ModelType
         The fitted model of type _BasePCA or _PLS.
-    preprocessing_ : Optional[Pipeline]
+    transformer_ : Optional[Pipeline]
         Preprocessing steps before the model.
     n_features_in_ : int
@@ -58,9 +59,11 @@ class QResiduals(_ModelResidualsBase):
         self,
         model: Union[ModelTypes, Pipeline],
         confidence: float = 0.95,
-        method: Literal["chi-square", "jackson-mudholkar", "percentile"] = "percentile",
+        method: Literal[
+            "chi-square", "jackson-mudholkar", "percentile"
+        ] = "jackson-mudholkar",
     ) -> None:
-        self.method = method
+        self.model, self.confidence, self.method = model, confidence, method
         super().__init__(model, confidence)
     def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "QResiduals":
@@ -79,8 +82,8 @@ class QResiduals(_ModelResidualsBase):
         """
         X = validate_data(self, X, ensure_2d=True, dtype=np.float64)
-        if self.preprocessing_:
-            X = self.preprocessing_.fit_transform(X)
+        if self.transformer_:
+            X = self.transformer_.fit_transform(X)
         # Compute the critical threshold using the chosen method
         self.critical_value_ = self._calculate_critical_value(X)
@@ -138,19 +141,18 @@ class QResiduals(_ModelResidualsBase):
             X = validate_data(self, X, ensure_2d=True, dtype=np.float64)
         # Apply preprocessing if available
-        if self.preprocessing_:
-            X = self.preprocessing_.transform(X)
+        if self.transformer_:
+            X = self.transformer_.transform(X)
         # Compute reconstruction error (Q residuals)
-        X_transformed = self.model_.transform(X)
-        X_reconstructed = self.model_.inverse_transform(X_transformed)
-        Q_residuals = np.sum((X - X_reconstructed) ** 2, axis=1)
+        residual = calculate_residual_spectrum(X, self.estimator_)
+        Q_residuals = np.sum(residual**2, axis=1)
         return Q_residuals
     def _calculate_critical_value(
         self,
-        X: Optional[np.ndarray] = None,
+        X: np.ndarray,
     ) -> float:
         """Calculate the critical value for outlier detection.
@@ -172,17 +174,18 @@ class QResiduals(_ModelResidualsBase):
         """
         # Compute Q residuals for training data
-        X_transformed = self.model_.transform(X)
-        X_reconstructed = self.model_.inverse_transform(X_transformed)
-        residuals = X - X_reconstructed
+        residuals = calculate_residual_spectrum(X, self.estimator_)
         if self.method == "chi-square":
             return self._chi_square_threshold(residuals)
         elif self.method == "jackson-mudholkar":
             return self._jackson_mudholkar_threshold(residuals)
         elif self.method == "percentile":
             Q_residuals = np.sum((residuals) ** 2, axis=1)
             return self._percentile_threshold(Q_residuals)
         else:
             raise ValueError(
                 "Invalid method. Choose from 'chi-square', 'jackson-mudholkar', or 'percentile'."

chemotools/outliers/studentized_residuals.py CHANGED Viewed

@@ -21,10 +21,10 @@ class StudentizedResiduals(_ModelResidualsBase):
     Attributes
     ----------
-    model_ : ModelType
+    estimator_ : ModelType
         The fitted model of type _BasePCA or _PLS
-    preprocessing_ : Optional[Pipeline]
+    transformer_ : Optional[Pipeline]
         Preprocessing steps before the model
     References
@@ -33,6 +33,7 @@ class StudentizedResiduals(_ModelResidualsBase):
     """
     def __init__(self, model: Union[_PLS, Pipeline], confidence=0.95) -> None:
+        self.model, self.confidence = model, confidence
         super().__init__(model, confidence)
     def fit(self, X: np.ndarray, y: Optional[np.ndarray]) -> "StudentizedResiduals":
@@ -53,18 +54,18 @@ class StudentizedResiduals(_ModelResidualsBase):
         )
         # Preprocess the data
-        if self.preprocessing_:
-            X = self.preprocessing_.transform(X)
+        if self.transformer_:
+            X = self.transformer_.transform(X)
         # Calculate y residuals
-        y_residuals = y - self.model_.predict(X)
+        y_residuals = y - self.estimator_.predict(X)
         y_residuals = (
             y_residuals.reshape(-1, 1) if len(y_residuals.shape) == 1 else y_residuals
         )
         # Calculate the studentized residuals
         studentized_residuals = calculate_studentized_residuals(
-            self.model_, X, y_residuals
+            self.estimator_, X, y_residuals
         )
         # Calculate the critical threshold
@@ -97,18 +98,18 @@ class StudentizedResiduals(_ModelResidualsBase):
         )
         # Preprocess the data
-        if self.preprocessing_:
-            X = self.preprocessing_.transform(X)
+        if self.transformer_:
+            X = self.transformer_.transform(X)
         # Calculate y residuals
-        y_residuals = y - self.model_.predict(X)
+        y_residuals = y - self.estimator_.predict(X)
         y_residuals = (
             y_residuals.reshape(-1, 1) if len(y_residuals.shape) == 1 else y_residuals
         )
         # Calculate the studentized residuals
         studentized_residuals = calculate_studentized_residuals(
-            self.model_, X, y_residuals
+            self.estimator_, X, y_residuals
         )
         return np.where(studentized_residuals > self.critical_value_, -1, 1)
@@ -138,18 +139,18 @@ class StudentizedResiduals(_ModelResidualsBase):
             X = validate_data(self, X, ensure_2d=True, dtype=np.float64)
         # Apply preprocessing if available
-        if self.preprocessing_:
-            X = self.preprocessing_.transform(X)
+        if self.transformer_:
+            X = self.transformer_.transform(X)
         # Calculate y residuals
-        y_residuals = y - self.model_.predict(X)
+        y_residuals = y - self.estimator_.predict(X)
         y_residuals = (
             y_residuals.reshape(-1, 1) if len(y_residuals.shape) == 1 else y_residuals
         )
-        return calculate_studentized_residuals(self.model_, X, y_residuals)
+        return calculate_studentized_residuals(self.estimator_, X, y_residuals)
-    def _calculate_critical_value(self, X: Optional[np.ndarray]) -> float:
+    def _calculate_critical_value(self, X: np.ndarray) -> float:
         """Calculate the critical value for outlier detection.
         Parameters
@@ -189,7 +190,7 @@ def calculate_studentized_residuals(
     """
     # Calculate the leverage of the samples
-    leverage = calculate_leverage(model, X)
+    leverage = calculate_leverage(X, model)
     # Calculate the standard deviation of the residuals
     std = np.sqrt(np.sum(y_residuals**2, axis=0) / (X.shape[0] - model.n_components))

chemotools/outliers/utils.py ADDED Viewed

@@ -0,0 +1,51 @@
+import numpy as np
+from ._base import ModelTypes
+def calculate_decoded_spectrum(X: np.ndarray, estimator: ModelTypes):
+    """
+    Calculate the decoded spectrum for a given transformed (preprocessed!!) spectrum and estimator from the latent space.
+    Parameters
+    ----------
+    spectrum : np.ndarray
+        The transformed spectrum data.
+    estimator : ModelTypes
+        The fitted PCA or PLS model.
+    Returns
+    -------
+    np.ndarray
+        The decoded spectrum.
+    """
+    # Project the transformed spectrum onto the latent space
+    X_transformed = estimator.transform(X)
+    # Decode the spectrum back to the original space
+    return estimator.inverse_transform(X_transformed)
+def calculate_residual_spectrum(X: np.ndarray, estimator: ModelTypes):
+    """
+    Calculate the residual spectrum for a given transformed (preprocessed!!) spectrum and estimator.
+    Parameters
+    ----------
+    spectrum : np.ndarray
+        The transformed spectrum data.
+    estimator : ModelTypes
+        The fitted PCA or PLS model.
+    Returns
+    -------
+    np.ndarray
+        The residual spectrum.
+    """
+    # Compute the reconstruction error (Q residuals)
+    decoded_spectrum = calculate_decoded_spectrum(X, estimator)
+    # Calculate the residual
+    return X - decoded_spectrum

{chemotools-0.1.10.dist-info → chemotools-0.1.11.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: chemotools
-Version: 0.1.10
+Version: 0.1.11
 Summary: chemotools: A Python Package that Integrates Chemometrics and scikit-learn
 License: MIT
 Author: Pau Cabaneros
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: numpy (>=2.0.0,<3.0.0)
 Requires-Dist: pandas (>=2.0.0,<3.0.0)
 Requires-Dist: polars (>=1.17.0,<2.0.0)
-Requires-Dist: pyarrow (>=18.0.0,<19.0.0)
+Requires-Dist: pyarrow (>=18,<21)
 Requires-Dist: scikit-learn (>=1.4.0,<2.0.0)
 Description-Content-Type: text/markdown

{chemotools-0.1.10.dist-info → chemotools-0.1.11.dist-info}/RECORD RENAMED Viewed

@@ -27,17 +27,20 @@ chemotools/datasets/data/train_spectra.csv,sha256=iVF19W52NHlbqq8BbLomn8n47kSPT0
 chemotools/derivative/__init__.py,sha256=FkckdzO30jrRWPGpIU3cfnaTtxPtNT5Tb2G9F9PmVTw,134
 chemotools/derivative/_norris_william.py,sha256=rMY_yntpiB5fbSM1tPph4AaGmF1k-HqJp7o48ijePBs,4958
 chemotools/derivative/_savitzky_golay.py,sha256=CuCrKoLmrB1YmJ4ihIykgkL3tO3frqkStMogtsVhO3A,3632
-chemotools/feature_selection/__init__.py,sha256=1_i28hIxijjwhMypTy1w2fLbzXXVkKD5IYzzY8ZSuHw,117
+chemotools/feature_selection/__init__.py,sha256=e_GFVawlDNEQv3EqrGSXUr5cvDN1jckoxe2C2jRwVl8,222
+chemotools/feature_selection/_base.py,sha256=SIH6kl9AePVWTByL0OvJFfc2j3idqs7lm_7Zi1YMp4Y,2311
 chemotools/feature_selection/_index_selector.py,sha256=lNTP2b7P3doWl30KiAr3Xd2HOMxeUmj24MuqoXl4Voc,3556
 chemotools/feature_selection/_range_cut.py,sha256=lVVVC30ZsK2z9jsDGb_z6l8Ty2I89yM05_dIDbMP73Q,3564
+chemotools/feature_selection/_sr_selector.py,sha256=OaXkt3t_NvymgDy6R15ig87jhcb-vM7i63LgtsNdfZo,3969
+chemotools/feature_selection/_vip_selector.py,sha256=ZK3bhdpl3nBYt6xmuHq2IvWtpgJ8ZdElH06xnCFA-Xs,3835
 chemotools/outliers/__init__.py,sha256=wpdlyqU34n1Pb9kGCM4idhcok35WAakxEhzP0xeKaZw,272
-chemotools/outliers/_base.py,sha256=zx9z_Snkvq5YWBoRi_-kRr3a-Q7jTz1zVlrGWycUTb4,5232
-chemotools/outliers/_utils.py,sha256=SDrrDvgEVQyPuKdh0Rw0DD4a8LShbNAQLRwSLICtiYU,2720
-chemotools/outliers/dmodx.py,sha256=R9LaQpUJeDv4GJ0hroKOlFcFbsfQRtrHWD_EI3-TX7Y,4521
-chemotools/outliers/hotelling_t2.py,sha256=Ga1qmlurF_fps9igaTUGOrnUOctIJEYqoCdb468KhY4,5006
-chemotools/outliers/leverage.py,sha256=zgxG2F7ZCf5wRVJeezHSJ2gaUDTP2CvK38Rr-hR6niA,4215
-chemotools/outliers/q_residuals.py,sha256=6_h73A1YxHBcQtjAXOAp1Rb7egHJwj0DQ0MKdnj6aBQ,7647
-chemotools/outliers/studentized_residuals.py,sha256=rF0wObKQV5DCa8THkZcuwdu7u4mBk-dbOHth5tj5cqM,5830
+chemotools/outliers/_base.py,sha256=zl0LhRKjpvj5IbYc3su6zEZ7YZ0pDSR3yqNWt2qBjNA,5374
+chemotools/outliers/dmodx.py,sha256=sgizal_BDlqWTZNT8y2D_ImcKAJejXt6vqvFYk4Vqi0,5152
+chemotools/outliers/hotelling_t2.py,sha256=g_IOQD_rhKb3cjIJkn5OTto6bYClQtqXunG_02BSIs8,5087
+chemotools/outliers/leverage.py,sha256=hNQ_x68LPPTDZvSJP_eRqu3GoeV3OBU37VC_XTFEzvw,4250
+chemotools/outliers/q_residuals.py,sha256=sg7u8ockQvSSnXwNM4U-GITB-5OcbsDMX6Oig_TcONM,7598
+chemotools/outliers/studentized_residuals.py,sha256=1L-GiutuO1x9s3UKMOBpmhs2Q-UuDtfG2YLELIxiiao,5890
+chemotools/outliers/utils.py,sha256=SAjvtjl9oWHrQnkqGnDfYE4WWAgiL1RwnKmW-ql5TIc,1304
 chemotools/scale/__init__.py,sha256=eztqcHg-TKE1Rr0N9ArfytHk8teuqVfi4SZi2DS96vc,175
 chemotools/scale/_min_max_scaler.py,sha256=YvqRkV2pXu-viQrpjzWcp9KmSSCYSoubSnrZHRLqgKQ,3011
 chemotools/scale/_norm_scaler.py,sha256=CHWSir2q-pL1hxzw_ZB45yi4mw-SkJ4YOa1CUL4nm2I,2568
@@ -53,7 +56,7 @@ chemotools/smooth/_median_filter.py,sha256=9ndTJCwrZirWlvDNldiigMddy79KIGq9OwwYN
 chemotools/smooth/_savitzky_golay_filter.py,sha256=27iFUWxdL9_7oZabR0R5L0ZTpBmYfVUjx2XCTukihBE,3509
 chemotools/smooth/_whittaker_smooth.py,sha256=lpLAyf4GdyDW4ulT1nyEoK6xQEl2cVUKquawQdGWbHU,3571
 chemotools/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-chemotools-0.1.10.dist-info/LICENSE,sha256=qtyOy2wDQVX9hxp58h3T-6Lmfv-mSCHoSRkcLUdM9bg,1070
-chemotools-0.1.10.dist-info/METADATA,sha256=fRgOO8cS2JNtNWz_CEG0uKvncSHEJ8myfhm2IOz3y-4,5240
-chemotools-0.1.10.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
-chemotools-0.1.10.dist-info/RECORD,,
+chemotools-0.1.11.dist-info/LICENSE,sha256=qtyOy2wDQVX9hxp58h3T-6Lmfv-mSCHoSRkcLUdM9bg,1070
+chemotools-0.1.11.dist-info/METADATA,sha256=Ne8xEa1cZUhbP-I4D1CFVvy8fhJANUjsY5cXRpNVV1k,5232
+chemotools-0.1.11.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+chemotools-0.1.11.dist-info/RECORD,,

{chemotools-0.1.10.dist-info → chemotools-0.1.11.dist-info}/WHEEL RENAMED Viewed

@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: poetry-core 2.1.1
+Generator: poetry-core 2.1.3
 Root-Is-Purelib: true
 Tag: py3-none-any

chemotools/outliers/_utils.py DELETED Viewed

@@ -1,91 +0,0 @@
-from typing import Optional, Tuple, Union
-from sklearn.cross_decomposition._pls import _PLS
-from sklearn.decomposition._base import _BasePCA
-from sklearn.pipeline import Pipeline
-from sklearn.utils.validation import check_is_fitted
-ModelTypes = Union[_BasePCA, _PLS]
-def get_model_parameters(model: ModelTypes) -> Tuple[int, int, int]:
-    """
-    Get the number of features, components and samples from a model with PLS or PCA. types.
-    Parameters
-    ----------
-    model : ModelType
-        A fitted model of type _BasePCA or _PLS
-    Returns
-    -------
-    Tuple[int, int, int]
-        The number of features, components and samples in the model
-    """
-    if isinstance(model, _BasePCA):
-        return model.n_features_in_, model.n_components_, model.n_samples_
-    elif isinstance(model, _PLS):
-        return model.n_features_in_, model.n_components, len(model.x_scores_)
-    else:
-        raise ValueError(
-            "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
-        )
-def validate_confidence(confidence: float) -> float:
-    """Validate parameters using sklearn conventions.
-    Parameters
-    ----------
-    confidence : float
-        Confidence level for statistical calculations (between 0 and 1)
-    Returns
-    -------
-    float
-        The validated confidence level
-    Raises
-    ------
-    ValueError
-        If confidence is not between 0 and 1
-    """
-    if not 0 < confidence < 1:
-        raise ValueError("Confidence must be between 0 and 1")
-    return confidence
-def validate_and_extract_model(
-    model: Union[ModelTypes, Pipeline],
-) -> Tuple[ModelTypes, Optional[Pipeline], int, int, int]:
-    """Validate and extract the model and preprocessing steps.
-    Parameters
-    ----------
-    model : Union[ModelTypes, Pipeline]
-        A fitted PCA/PLS model or Pipeline ending with such a model
-    Returns
-    -------
-    Tuple[ModelTypes, Optional[Pipeline]]
-        The extracted model and preprocessing steps
-    Raises
-    ------
-    ValueError
-        If the model is not of type _BasePCA or _PLS or a Pipeline ending with one of these types or if the model is not fitted
-    """
-    if isinstance(model, Pipeline):
-        preprocessing = model[:-1]
-        model = model[-1]
-    else:
-        preprocessing = None
-    if not isinstance(model, (_BasePCA, _PLS)):
-        raise ValueError(
-            "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
-        )
-    check_is_fitted(model)
-    n_features_in, n_components, n_samples = get_model_parameters(model)
-    return model, preprocessing, n_features_in, n_components, n_samples

{chemotools-0.1.10.dist-info → chemotools-0.1.11.dist-info}/LICENSE RENAMED Viewed

File without changes

chemotools 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl

chemotools 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl