chemotools 0.1.10__tar.gz → 0.1.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chemotools-0.1.10 → chemotools-0.1.11}/PKG-INFO +2 -2
- chemotools-0.1.11/chemotools/feature_selection/__init__.py +6 -0
- chemotools-0.1.11/chemotools/feature_selection/_base.py +88 -0
- chemotools-0.1.11/chemotools/feature_selection/_sr_selector.py +137 -0
- chemotools-0.1.11/chemotools/feature_selection/_vip_selector.py +129 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/_base.py +75 -67
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/dmodx.py +26 -8
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/hotelling_t2.py +11 -10
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/leverage.py +15 -14
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/q_residuals.py +19 -16
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/studentized_residuals.py +17 -16
- chemotools-0.1.11/chemotools/outliers/utils.py +51 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/pyproject.toml +2 -2
- chemotools-0.1.10/chemotools/feature_selection/__init__.py +0 -4
- chemotools-0.1.10/chemotools/outliers/_utils.py +0 -91
- {chemotools-0.1.10 → chemotools-0.1.11}/LICENSE +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/README.md +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/__init__.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/augmentation/__init__.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/augmentation/_add_noise.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/augmentation/_baseline_shift.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/augmentation/_fractional_shift.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/augmentation/_gaussian_broadening.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/augmentation/_index_shift.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/augmentation/_spectrum_scale.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/__init__.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_air_pls.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_ar_pls.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_constant_baseline_correction.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_cubic_spline_correction.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_linear_correction.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_non_negative.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_polynomial_correction.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_subtract_reference.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/__init__.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/_base.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/data/__init__.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/data/coffee_labels.csv +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/data/coffee_spectra.csv +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/data/fermentation_hplc.csv +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/data/fermentation_spectra.csv +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/data/train_hplc.csv +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/data/train_spectra.csv +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/derivative/__init__.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/derivative/_norris_william.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/derivative/_savitzky_golay.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/feature_selection/_index_selector.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/feature_selection/_range_cut.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/__init__.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scale/__init__.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scale/_min_max_scaler.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scale/_norm_scaler.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scale/_point_scaler.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scatter/__init__.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scatter/_extended_multiplicative_scatter_correction.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scatter/_multiplicative_scatter_correction.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scatter/_robust_normal_variate.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scatter/_standard_normal_variate.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/smooth/__init__.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/smooth/_mean_filter.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/smooth/_median_filter.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/smooth/_savitzky_golay_filter.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/smooth/_whittaker_smooth.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/utils/__init__.py +0 -0
{chemotools-0.1.10 → chemotools-0.1.11}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: chemotools
-Version: 0.1.10
+Version: 0.1.11
 Summary: chemotools: A Python Package that Integrates Chemometrics and scikit-learn
 License: MIT
 Author: Pau Cabaneros
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: numpy (>=2.0.0,<3.0.0)
 Requires-Dist: pandas (>=2.0.0,<3.0.0)
 Requires-Dist: polars (>=1.17.0,<2.0.0)
-Requires-Dist: pyarrow (>=18
+Requires-Dist: pyarrow (>=18,<21)
 Requires-Dist: scikit-learn (>=1.4.0,<2.0.0)
 Description-Content-Type: text/markdown
 
chemotools-0.1.11/chemotools/feature_selection/_base.py (new file)
@@ -0,0 +1,88 @@
+from abc import ABC, abstractmethod
+from typing import Union
+
+import numpy as np
+
+from sklearn.base import BaseEstimator
+from sklearn.cross_decomposition._pls import _PLS
+from sklearn.feature_selection._base import SelectorMixin
+from sklearn.pipeline import Pipeline
+from sklearn.utils.validation import check_is_fitted
+
+ModelTypes = Union[_PLS, Pipeline]
+
+
+class _PLSFeatureSelectorBase(ABC, BaseEstimator, SelectorMixin):
+    """Feature selection base class for _PLS-like models.
+
+
+    Parameters
+    ----------
+    model : Union[_PLS, Pipeline]
+        A fitted _PLS model or Pipeline ending with such a model
+
+    threshold : float
+        The threshold for feature selection. Features with importance
+        above this threshold will be selected.
+
+    Attributes
+    ----------
+    estimator_ : ModelTypes
+        The fitted model of type _PLS
+
+    feature_scores_ : np.ndarray
+        The calculated feature scores based on the selected method.
+
+    support_mask_ : np.ndarray
+        The boolean mask indicating which features are selected.
+    """
+
+    def __init__(
+        self,
+        model: Union[_PLS, Pipeline],
+    ) -> None:
+        self.estimator_ = _validate_and_extract_model(model)
+
+    @abstractmethod
+    def _calculate_features(self, X: np.ndarray) -> np.ndarray:
+        """Calculate the feature scores of the model.
+
+        Returns
+        -------
+        ndarray of shape (n_features,)
+            The feature scores of the model
+        """
+
+
+def _validate_and_extract_model(
+    model: Union[_PLS, Pipeline],
+) -> _PLS:
+    """Validate and extract the model.
+
+    Parameters
+    ----------
+    model : Union[_PLS, Pipeline]
+        A fitted _PLS model or Pipeline ending with such a model
+
+    Returns
+    -------
+    _PLS
+        The extracted estimator
+
+    Raises
+    ------
+    TypeError
+        If the model is not of type _PLS or a Pipeline ending with a _PLS model, or if the model is not fitted
+    """
+    if isinstance(model, Pipeline):
+        estimator = model[-1]
+    else:
+        estimator = model
+
+    if not isinstance(estimator, _PLS):
+        raise TypeError(
+            "Model not a valid model. Must be of base type _PLS or a Pipeline ending with a _PLS model."
+        )
+
+    check_is_fitted(model)
+    return estimator
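To make the base-class contract above concrete, here is a minimal sketch of a custom selector built on it. `AbsCoefSelector`, its scoring rule, and the manual `n_features_in_` bookkeeping are illustrative assumptions, not part of the release:

import numpy as np
from chemotools.feature_selection._base import _PLSFeatureSelectorBase

class AbsCoefSelector(_PLSFeatureSelectorBase):
    """Hypothetical selector: score each feature by |PLS regression coefficient|."""

    def __init__(self, model, threshold: float = 0.0):
        self.model = model
        self.threshold = threshold
        super().__init__(self.model)  # validates the model and sets self.estimator_

    def fit(self, X, y=None):
        X = np.asarray(X, dtype=float)
        self.n_features_in_ = X.shape[1]  # lets SelectorMixin.transform validate widths
        self.feature_scores_ = self._calculate_features(X)
        return self

    def _get_support_mask(self) -> np.ndarray:
        # SelectorMixin.transform() and get_support() rely on this mask
        return self.feature_scores_ > self.threshold

    def _calculate_features(self, X: np.ndarray) -> np.ndarray:
        # coef_ has shape (n_targets, n_features) in recent scikit-learn releases
        return np.abs(self.estimator_.coef_).max(axis=0)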
chemotools-0.1.11/chemotools/feature_selection/_sr_selector.py (new file)
@@ -0,0 +1,137 @@
+import numpy as np
+from sklearn.utils.validation import validate_data
+
+from ._base import _PLSFeatureSelectorBase
+
+
+class SRSelector(_PLSFeatureSelectorBase):
+    """
+    Selects features that contribute significantly to the latent variables
+    in a PLS regression model, using the Selectivity Ratio (SR) method.
+
+    Parameters
+    ----------
+    model : Union[_PLS, Pipeline]
+        The PLS regression model or a pipeline whose last step is a PLS regression model.
+
+    threshold : float, default=1.0
+        The threshold for feature selection. Features with importance
+        above this threshold will be selected.
+
+    Attributes
+    ----------
+    estimator_ : ModelTypes
+        The fitted model of type _PLS
+
+    feature_scores_ : np.ndarray
+        The calculated feature scores based on the selected method.
+
+    support_mask_ : np.ndarray
+        The boolean mask indicating which features are selected.
+
+    Methods
+    -------
+    fit(X, y=None)
+        Fit the transformer to the input data. It calculates the feature scores and the support mask.
+    """
+
+    def __init__(
+        self,
+        model,
+        threshold: float = 1.0,
+    ):
+        self.model = model
+        self.threshold = threshold
+        super().__init__(self.model)
+
+    def fit(self, X: np.ndarray, y=None) -> "SRSelector":
+        """
+        Fit the transformer to calculate the feature scores and the support mask.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The input data to fit the transformer to.
+
+        y : None
+            Ignored.
+
+        Returns
+        -------
+        self : SRSelector
+            The fitted transformer.
+        """
+        # Check that X is a 2D array and has only finite values
+        X = validate_data(
+            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+        )
+
+        # Calculate the SR scores
+        self.feature_scores_ = self._calculate_features(X)
+
+        # Calculate the support mask
+        self.support_mask_ = self._get_support_mask()
+
+        return self
+
+    def _get_support_mask(self) -> np.ndarray:
+        """
+        Get the support mask based on the feature scores and threshold.
+        Features with scores above the threshold are selected.
+
+        Returns
+        -------
+        support_mask_ : np.ndarray
+            The boolean mask indicating which features are selected.
+        """
+        return self.feature_scores_ > self.threshold
+
+    def _calculate_features(self, X: np.ndarray) -> np.ndarray:
+        """
+        Vectorized Selectivity Ratio calculation from a fitted _PLS-like model.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The input training data to calculate the feature scores from.
+
+        Returns
+        -------
+        feature_scores_ : np.ndarray
+            The calculated feature scores based on the selected method.
+        """
+        bpls = self.estimator_.coef_
+        bpls_norm = bpls.T / np.linalg.norm(bpls)
+
+        # Handle 1D case correctly
+        if bpls.ndim == 1:
+            bpls_norm = bpls_norm.reshape(-1, 1)
+
+        # Project X onto the regression vector
+        ttp = X @ bpls_norm
+        ptp = X.T @ np.linalg.pinv(ttp).T
+
+        # Predicted part of X
+        X_hat = ttp @ ptp.T
+
+        # Compute squared norms directly
+        total_ss = np.linalg.norm(X, axis=0) ** 2
+        explained_ss = np.linalg.norm(X_hat, axis=0) ** 2
+
+        # Calculate residual sum of squares
+        residual_ss = total_ss - explained_ss
+
+        # Stability: avoid division by zero
+        epsilon = 1e-12
+
+        # Calculate Selectivity Ratio
+        return explained_ss / (residual_ss + epsilon)
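In short, the score for feature j is SR_j = explained_ss_j / residual_ss_j: the variance of column j explained by the target-projected component, relative to its residual variance. A minimal usage sketch on synthetic data (assuming the new `chemotools.feature_selection` __init__ re-exports `SRSelector`):

import numpy as np
from sklearn.cross_decomposition import PLSRegression
from chemotools.feature_selection import SRSelector  # import path is an assumption

# Synthetic spectra: 50 samples x 100 wavelengths (illustrative only)
rng = np.random.default_rng(0)
X = rng.normal(size=(50, 100))
y = X[:, 10] + 0.5 * X[:, 40] + rng.normal(scale=0.1, size=50)

# The selector expects an already fitted PLS model (or a Pipeline ending in one)
pls = PLSRegression(n_components=2).fit(X, y)

selector = SRSelector(pls, threshold=1.0).fit(X)
X_selected = selector.transform(X)             # keeps columns with SR > threshold
print(selector.feature_scores_.shape)          # (100,)
print(int(selector.get_support().sum()), "features selected")

Because `_PLSFeatureSelectorBase` validates the model at construction time, the PLS step must be trained before the selector is built.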
chemotools-0.1.11/chemotools/feature_selection/_vip_selector.py (new file)
@@ -0,0 +1,129 @@
+import numpy as np
+from sklearn.utils.validation import validate_data
+
+from ._base import _PLSFeatureSelectorBase
+
+
+class VIPSelector(_PLSFeatureSelectorBase):
+    """
+    Selects features that contribute significantly to the latent variables
+    in a PLS regression model, using the Variable Importance in Projection
+    (VIP) method.
+
+    Parameters
+    ----------
+    model : Union[_PLS, Pipeline]
+        The PLS regression model or a pipeline whose last step is a PLS regression model.
+
+    threshold : float, default=1.0
+        The threshold for feature selection. Features with importance
+        above this threshold will be selected.
+
+    Attributes
+    ----------
+    estimator_ : ModelTypes
+        The fitted model of type _PLS
+
+    feature_scores_ : np.ndarray
+        The calculated feature scores based on the selected method.
+
+    support_mask_ : np.ndarray
+        The boolean mask indicating which features are selected.
+
+    Methods
+    -------
+    fit(X, y=None)
+        Fit the transformer to the input data. It calculates the feature scores and the support mask.
+    """
+
+    def __init__(
+        self,
+        model,
+        threshold: float = 1.0,
+    ):
+        self.model = model
+        self.threshold = threshold
+        super().__init__(self.model)
+
+    def fit(self, X: np.ndarray, y=None) -> "VIPSelector":
+        """
+        Fit the transformer to calculate the feature scores and the support mask.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The input data to fit the transformer to.
+
+        y : None
+            Ignored.
+
+        Returns
+        -------
+        self : VIPSelector
+            The fitted transformer.
+        """
+        # Check that X is a 2D array and has only finite values
+        X = validate_data(
+            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+        )
+
+        # Calculate the VIP scores
+        self.feature_scores_ = self._calculate_features(X)
+
+        # Calculate the support mask
+        self.support_mask_ = self._get_support_mask()
+
+        return self
+
+    def _get_support_mask(self) -> np.ndarray:
+        """
+        Get the support mask based on the feature scores and threshold.
+        Features with scores above the threshold are selected.
+
+        Returns
+        -------
+        support_mask_ : np.ndarray
+            The boolean mask indicating which features are selected.
+        """
+        return self.feature_scores_ > self.threshold
+
+    def _calculate_features(self, X: np.ndarray) -> np.ndarray:
+        """
+        Calculate the VIP scores based on the fitted model.
+
+        Returns
+        -------
+        feature_scores_ : np.ndarray
+            The calculated feature scores based on the selected method.
+        """
+        # Calculate sum of squares of y_loadings and x_scores
+        sum_of_squares_y_loadings = (
+            np.linalg.norm(self.estimator_.y_loadings_, ord=2, axis=0) ** 2
+        )
+        sum_of_squares_x_scores = (
+            np.linalg.norm(self.estimator_.x_scores_, ord=2, axis=0) ** 2
+        )
+
+        # Calculate the sum of squares
+        sum_of_squares = sum_of_squares_y_loadings * sum_of_squares_x_scores
+
+        # Calculate the numerator
+        numerator = self.estimator_.n_features_in_ * np.sum(
+            sum_of_squares * self.estimator_.x_weights_**2,
+            axis=1,
+        )
+
+        # Calculate the denominator
+        denominator = np.sum(sum_of_squares, axis=0)
+
+        # Calculate the VIP scores
+        return np.sqrt(numerator / denominator)
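For reference, the quantity `_calculate_features` computes above is the usual VIP score. With p = `n_features_in_`, A components, x-weights w_a, x-scores t_a, and y-loadings q_a:

\mathrm{VIP}_j = \sqrt{\frac{p \sum_{a=1}^{A} \mathrm{SS}_a \, w_{ja}^{2}}{\sum_{a=1}^{A} \mathrm{SS}_a}},
\qquad
\mathrm{SS}_a = \lVert q_a \rVert^{2} \, \lVert t_a \rVert^{2}

This coincides with the textbook definition (which divides each w_{ja}^2 by \lVert w_a \rVert^2) because scikit-learn's PLS produces unit-norm weight vectors; the default threshold of 1.0 follows the common "VIP greater than one" selection rule.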
{chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/_base.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import
+from typing import Optional, Tuple, Union
 
 import numpy as np
 
@@ -9,7 +9,6 @@ from sklearn.cross_decomposition._pls import _PLS
 from sklearn.pipeline import Pipeline
 from sklearn.utils.validation import check_is_fitted
 
-from ._utils import validate_confidence, validate_and_extract_model
 
 ModelTypes = Union[_BasePCA, _PLS]
 
@@ -29,10 +28,10 @@ class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
 
     Attributes
     ----------
-    model_ : ModelTypes
+    estimator_ : ModelTypes
        The fitted model of type _BasePCA or _PLS
 
-    preprocessing_ : Optional[Pipeline]
+    transformer_ : Optional[Pipeline]
        Preprocessing steps before the model
 
    n_features_in_ : int
@@ -54,13 +53,13 @@ class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
        confidence: float,
    ) -> None:
        (
-            self.model_,
-            self.preprocessing_,
+            self.estimator_,
+            self.transformer_,
            self.n_features_in_,
            self.n_components_,
            self.n_samples_,
-        ) = validate_and_extract_model(model)
-        self.confidence = validate_confidence(confidence)
+        ) = _validate_and_extract_model(model)
+        self.confidence = _validate_confidence(confidence)
 
    def fit_predict_residuals(
        self, X: np.ndarray, y: Optional[np.ndarray] = None
@@ -96,7 +95,7 @@ class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
        """
 
    @abstractmethod
-    def _calculate_critical_value(self, X:
+    def _calculate_critical_value(self, X: np.ndarray) -> float:
        """Calculate the critical value for outlier detection.
 
        Returns
@@ -106,75 +105,84 @@ class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
        """
 
 
-    """
-
+def _get_model_parameters(model: ModelTypes) -> Tuple[int, int, int]:
+    """
+    Get the number of features, components and samples from a PLS- or PCA-type model.
 
    Parameters
    ----------
-    model :
-        A fitted
-
-    Attributes
-    ----------
-    model_ : ModelTypes
-        The fitted model of type _BasePCA or _PLS
-
-    preprocessing_ : Optional[Pipeline]
-        Preprocessing steps before the model
+    model : ModelTypes
+        A fitted model of type _BasePCA or _PLS
 
+    Returns
+    -------
+    Tuple[int, int, int]
+        The number of features, components and samples in the model
    """
+    if isinstance(model, _BasePCA):
+        return model.n_features_in_, model.n_components_, model.n_samples_
+    elif isinstance(model, _PLS):
+        return model.n_features_in_, model.n_components, len(model.x_scores_)
+    else:
+        raise ValueError(
+            "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
+        )
 
-    def __init__(self, model: Union[ModelTypes, Pipeline]):
-        self.model_, self.preprocessing_ = self._validate_and_extract_model(model)
 
+def _validate_confidence(confidence: float) -> float:
+    """Validate parameters using sklearn conventions.
 
+    Parameters
+    ----------
+    confidence : float
+        Confidence level for statistical calculations (between 0 and 1)
 
+    Returns
+    -------
+    float
+        The validated confidence level
 
+    Raises
+    ------
+    ValueError
+        If confidence is not between 0 and 1
+    """
+    if not 0 < confidence < 1:
+        raise ValueError("Confidence must be between 0 and 1")
+    return confidence
 
-    else:
-        preprocessing = None
-
-    if isinstance(model, (_BasePCA, _PLS)):
-        check_is_fitted(model)
-    else:
-        raise ValueError(
-            "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
-        )
-    check_is_fitted(model)
-    return model, preprocessing
 
-    @abstractmethod
-    def predict(self, X: np.ndarray, y: Optional[np.ndarray]) -> np.ndarray:
-        """Predict the output of the model.
 
+def _validate_and_extract_model(
+    model: Union[ModelTypes, Pipeline],
+) -> Tuple[ModelTypes, Optional[Pipeline], int, int, int]:
+    """Validate and extract the model and preprocessing steps.
 
+    Parameters
+    ----------
+    model : Union[ModelTypes, Pipeline]
+        A fitted PCA/PLS model or Pipeline ending with such a model
 
+    Returns
+    -------
+    Tuple[ModelTypes, Optional[Pipeline], int, int, int]
+        The extracted model and preprocessing steps, and the number of features, components and samples
+
+    Raises
+    ------
+    ValueError
+        If the model is not of type _BasePCA or _PLS or a Pipeline ending with one of these types, or if the model is not fitted
+    """
+    if isinstance(model, Pipeline):
+        preprocessing = model[:-1]
+        model = model[-1]
+    else:
+        preprocessing = None
+
+    if not isinstance(model, (_BasePCA, _PLS)):
+        raise ValueError(
+            "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
+        )
+
+    check_is_fitted(model)
+    n_features_in, n_components, n_samples = _get_model_parameters(model)
+    return model, preprocessing, n_features_in, n_components, n_samples
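A quick sketch of what the refactored module-level helpers return. They are private helpers, so this is purely illustrative and assumes the layout shown above:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from chemotools.outliers._base import _validate_and_extract_model, _validate_confidence

rng = np.random.default_rng(1)
X = rng.normal(size=(30, 8))

# A fitted pipeline ending in a PCA model is accepted; the last step is extracted
pipe = make_pipeline(StandardScaler(), PCA(n_components=3)).fit(X)

model, transformer, n_features, n_components, n_samples = _validate_and_extract_model(pipe)
print(n_features, n_components, n_samples)  # 8 3 30

confidence = _validate_confidence(0.95)     # raises ValueError outside (0, 1)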
{chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/dmodx.py
@@ -7,6 +7,7 @@ from scipy.stats import f as f_distribution
 
 
 from ._base import _ModelResidualsBase, ModelTypes
+from .utils import calculate_residual_spectrum
 
 
 class DModX(_ModelResidualsBase):
@@ -25,10 +26,10 @@ class DModX(_ModelResidualsBase):
 
    Attributes
    ----------
-    model_ : ModelTypes
+    estimator_ : ModelTypes
        The fitted model of type _BasePCA or _PLS
 
-    preprocessing_ : Optional[Pipeline]
+    transformer_ : Optional[Pipeline]
        Preprocessing steps before the model
 
    n_features_in_ : int
@@ -42,6 +43,9 @@ class DModX(_ModelResidualsBase):
 
    critical_value_ : float
        The calculated critical value for outlier detection
+
+    train_spe_ : float
+        The training sum of squared errors (SSE) for the model, normalized by degrees of freedom
    """
 
    def __init__(
@@ -49,6 +53,7 @@ class DModX(_ModelResidualsBase):
        model: Union[ModelTypes, Pipeline],
        confidence: float = 0.95,
    ) -> None:
+        model, confidence = model, confidence
        super().__init__(model, confidence)
 
    def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "DModX":
@@ -62,7 +67,18 @@ class DModX(_ModelResidualsBase):
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )
 
+        # Calculate the critical value
        self.critical_value_ = self._calculate_critical_value()
+
+        # Calculate the degrees-of-freedom normalized SPE of the training set
+        residuals = calculate_residual_spectrum(X, self.estimator_)
+        squared_errors = np.sum((residuals) ** 2, axis=1)
+        self.train_spe_ = np.sqrt(
+            squared_errors
+            / (self.n_samples_ - self.n_components_ - 1)
+            * (self.n_features_in_ - self.n_components_)
+        )
+
        return self
 
    def predict(self, X: np.ndarray) -> np.ndarray:
@@ -118,15 +134,17 @@ class DModX(_ModelResidualsBase):
        )
 
        # Apply preprocessing if available
-        if self.preprocessing_:
-            X = self.preprocessing_.transform(X)
+        if self.transformer_:
+            X = self.transformer_.transform(X)
 
        # Calculate the DModX statistics
-        squared_errors = np.sum((X - X_reconstructed) ** 2, axis=1)
+        residual = calculate_residual_spectrum(X, self.estimator_)
+        squared_errors = np.sum((residual) ** 2, axis=1)
 
-        return
+        return (
+            np.sqrt(squared_errors / (self.n_features_in_ - self.n_components_))
+            / self.train_spe_
+        )
 
    def _calculate_critical_value(self, X: Optional[np.ndarray] = None) -> float:
        """Calculate F-distribution based critical value.
|