PyPI - chemotools - Versions diffs - 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl - Mend

chemotools 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

chemotools/augmentation/__init__.py +4 -0
chemotools/augmentation/_add_noise.py +70 -49
chemotools/augmentation/_fractional_shift.py +203 -0
chemotools/augmentation/_gaussian_broadening.py +136 -0
chemotools/augmentation/_index_shift.py +116 -101
chemotools/outliers/__init__.py +7 -0
chemotools/outliers/_base.py +180 -0
chemotools/outliers/_utils.py +91 -0
chemotools/outliers/dmodx.py +146 -0
chemotools/outliers/hotelling_t2.py +155 -0
chemotools/outliers/leverage.py +150 -0
chemotools/outliers/q_residuals.py +225 -0
chemotools/outliers/studentized_residuals.py +197 -0
{chemotools-0.1.8.dist-info → chemotools-0.1.10.dist-info}/METADATA +1 -1
{chemotools-0.1.8.dist-info → chemotools-0.1.10.dist-info}/RECORD +17 -7
{chemotools-0.1.8.dist-info → chemotools-0.1.10.dist-info}/WHEEL +1 -1
{chemotools-0.1.8.dist-info → chemotools-0.1.10.dist-info}/LICENSE +0 -0

chemotools/augmentation/__init__.py CHANGED Viewed

@@ -1,5 +1,7 @@
 from ._add_noise import AddNoise
 from ._baseline_shift import BaselineShift
+from ._fractional_shift import FractionalShift
+from ._gaussian_broadening import GaussianBroadening
 from ._index_shift import IndexShift
 from ._spectrum_scale import SpectrumScale
@@ -7,6 +9,8 @@ from ._spectrum_scale import SpectrumScale
 __all__ = [
     "AddNoise",
     "BaselineShift",
+    "FractionalShift",
+    "GaussianBroadening",
     "IndexShift",
     "SpectrumScale",
 ]

chemotools/augmentation/_add_noise.py CHANGED Viewed

@@ -6,72 +6,95 @@ from sklearn.utils.validation import check_is_fitted, validate_data
 class AddNoise(TransformerMixin, OneToOneFeatureMixin, BaseEstimator):
-    """
-    Add normal noise to the input data.
+    """Add noise to input data from various probability distributions.
+    This transformer adds random noise from specified probability distributions
+    to the input data. Supported distributions include Gaussian, Poisson, and
+    exponential.
+    Parameters
+    ----------
+    distribution : {'gaussian', 'poisson', 'exponential'}, default='gaussian'
+        The probability distribution to sample noise from.
+    scale : float, default=0.0
+        Scale parameter for the noise distribution:
+        - For gaussian: standard deviation
+        - For poisson: multiplication factor for sampled values
+        - For exponential: scale parameter (1/λ)
+    random_state : int, optional
+        Random seed for reproducibility.
+    Attributes
+    ----------
+    n_features_in_ : int
+        Number of features in the training data.
     """
     def __init__(
         self,
-        noise_distribution: Literal["gaussian", "poisson", "exponential"] = "gaussian",
+        distribution: Literal["gaussian", "poisson", "exponential"] = "gaussian",
         scale: float = 0.0,
         random_state: Optional[int] = None,
     ):
-        self.noise_distribution = noise_distribution
+        self.distribution = distribution
         self.scale = scale
         self.random_state = random_state
     def fit(self, X: np.ndarray, y=None) -> "AddNoise":
-        """
-        Fit the transformer to the input data.
+        """Fit the transformer to the input data.
         Parameters
         ----------
-        X : np.ndarray of shape (n_samples, n_features)
-            The input data to fit the transformer to.
+        X : array-like of shape (n_samples, n_features)
+            Training data.
         y : None
-            Ignored.
+            Ignored. Present for API consistency.
         Returns
         -------
-        self : NormalNoise
-            The fitted transformer.
+        self : AddNoise
+            Fitted transformer.
+        Raises
+        ------
+        ValueError
+            If X is not a 2D array or contains non-finite values.
         """
         # Check that X is a 2D array and has only finite values
         X = validate_data(
             self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
         )
-        # Set the number of features
-        self.n_features_in_ = X.shape[1]
-        # Set the fitted attribute to True
-        self._is_fitted = True
         # Instantiate the random number generator
         self._rng = np.random.default_rng(self.random_state)
         return self
     def transform(self, X: np.ndarray, y=None) -> np.ndarray:
-        """
-        Transform the input data by adding random normal noise.
+        """Transform the input data by adding random noise.
         Parameters
         ----------
-        X : np.ndarray of shape (n_samples, n_features)
-            The input data to transform.
+        X : array-like of shape (n_samples, n_features)
+            Input data to transform.
         y : None
-            Ignored.
+            Ignored. Present for API consistency.
         Returns
         -------
-        X_ : np.ndarray of shape (n_samples, n_features)
-            The transformed data.
+        X_noisy : ndarray of shape (n_samples, n_features)
+            Transformed data with added noise.
+        Raises
+        ------
+        ValueError
+            If X has different number of features than the training data,
+            or if an invalid noise distribution is specified.
         """
         # Check that the estimator is fitted
-        check_is_fitted(self, "_is_fitted")
+        check_is_fitted(self, "n_features_in_")
         # Check that X is a 2D array and has only finite values
         X_ = validate_data(
@@ -84,31 +107,29 @@ class AddNoise(TransformerMixin, OneToOneFeatureMixin, BaseEstimator):
             dtype=np.float64,
         )
-        # Check that the number of features is the same as the fitted data
-        if X_.shape[1] != self.n_features_in_:
+        # Select the noise function based on the selected distribution
+        noise_func = {
+            "gaussian": self._add_gaussian_noise,
+            "poisson": self._add_poisson_noise,
+            "exponential": self._add_exponential_noise,
+        }.get(self.distribution)
+        if noise_func is None:
             raise ValueError(
-                f"Expected {self.n_features_in_} features but got {X_.shape[1]}"
+                f"Invalid noise distribution: {self.distribution}. "
+                "Expected one of: gaussian, poisson, exponential"
             )
-        # Calculate the standard normal variate
-        for i, x in enumerate(X_):
-            match self.noise_distribution:
-                case "gaussian":
-                    X_[i] = self._add_gaussian_noise(x)
-                case "poisson":
-                    X_[i] = self._add_poisson_noise(x)
-                case "exponential":
-                    X_[i] = self._add_exponential_noise(x)
-                case _:
-                    raise ValueError("Invalid noise distribution")
-        return X_.reshape(-1, 1) if X_.ndim == 1 else X_
+        return noise_func(X_)
-    def _add_gaussian_noise(self, x) -> np.ndarray:
-        return x + self._rng.normal(0, self.scale, size=x.shape)
+    def _add_gaussian_noise(self, X: np.ndarray) -> np.ndarray:
+        """Add Gaussian noise to the input array."""
+        return X + self._rng.normal(0, self.scale, size=X.shape)
-    def _add_poisson_noise(self, x) -> np.ndarray:
-        return self._rng.poisson(x, size=x.shape) * self.scale
+    def _add_poisson_noise(self, X: np.ndarray) -> np.ndarray:
+        """Add Poisson noise to the input array."""
+        return X + self._rng.poisson(X, size=X.shape) * self.scale
-    def _add_exponential_noise(self, x) -> np.ndarray:
-        return x + self._rng.exponential(self.scale, size=x.shape)
+    def _add_exponential_noise(self, X: np.ndarray) -> np.ndarray:
+        """Add exponential noise to the input array."""
+        return X + self._rng.exponential(self.scale, size=X.shape)

chemotools/augmentation/_fractional_shift.py ADDED Viewed

@@ -0,0 +1,203 @@
+from typing import Literal, Optional
+import numpy as np
+from scipy.interpolate import CubicSpline
+from scipy import stats
+from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
+from sklearn.utils.validation import check_is_fitted, validate_data
+class FractionalShift(TransformerMixin, OneToOneFeatureMixin, BaseEstimator):
+    """
+    Shift the spectrum by a fractional amount, allowing shifts below one index.
+    Parameters
+    ----------
+    shift : float, default=0.0
+        Maximum amount by which the data is randomly shifted.
+        The actual shift is a random float between -shift and shift.
+    padding_mode : {'zeros', 'constant', 'wrap', 'extend', 'mirror', 'linear'}, default='linear'
+        Specifies how to handle padding when shifting the data:
+            - 'zeros': Pads with zeros.
+            - 'constant': Pads with a constant value defined by `pad_value`.
+            - 'wrap': Circular shift (wraps around).
+            - 'extend': Extends using edge values.
+            - 'mirror': Mirrors the signal.
+            - 'linear': Uses linear regression on 5 points to extrapolate values.
+    pad_value : float, default=0.0
+        The value used for padding when `padding_mode='constant'`.
+    random_state : int, optional, default=None
+        The random seed for reproducibility.
+    """
+    def __init__(
+        self,
+        shift: float = 0.0,
+        padding_mode: Literal[
+            "zeros", "constant", "extend", "mirror", "linear"
+        ] = "linear",
+        pad_value: float = 0.0,
+        random_state: Optional[int] = None,
+    ):
+        self.shift = shift
+        self.padding_mode = padding_mode
+        self.pad_value = pad_value
+        self.random_state = random_state
+    def fit(self, X: np.ndarray, y=None) -> "FractionalShift":
+        """
+        Fit the transformer to the input data.
+        Parameters
+        ----------
+        X : np.ndarray of shape (n_samples, n_features)
+            The input data to fit the transformer to.
+        y : None
+            Ignored.
+        Returns
+        -------
+        self : FractionalShift
+            The fitted transformer.
+        """
+        X = validate_data(
+            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+        )
+        self._rng = np.random.default_rng(self.random_state)
+        return self
+    def transform(self, X: np.ndarray, y=None) -> np.ndarray:
+        """
+        Transform the input data by shifting the spectrum.
+        Parameters
+        ----------
+        X : np.ndarray of shape (n_samples, n_features)
+            The input data to transform.
+        y : None
+            Ignored.
+        Returns
+        -------
+        X_ : np.ndarray of shape (n_samples, n_features)
+            The transformed data with the applied shifts.
+        """
+        check_is_fitted(self, "n_features_in_")
+        X_ = validate_data(
+            self,
+            X,
+            y="no_validation",
+            ensure_2d=True,
+            copy=True,
+            reset=False,
+            dtype=np.float64,
+        )
+        for i, x in enumerate(X_):
+            X_[i] = self._shift_signal(x)
+        return X_.reshape(-1, 1) if X_.ndim == 1 else X_
+    def _shift_signal(self, x: np.ndarray) -> np.ndarray:
+        """
+        Shifts a signal by a fractional amount using cubic spline interpolation.
+        Parameters
+        ----------
+        x : np.ndarray of shape (n_features,)
+            The input signal to shift.
+        Returns
+        -------
+        shifted_signal : np.ndarray of shape (n_features,)
+            The shifted signal.
+        """
+        shift = self._rng.uniform(-self.shift, self.shift)
+        n = len(x)
+        indices = np.arange(n)
+        shifted_indices = indices + shift
+        # Create cubic spline interpolator
+        spline = CubicSpline(indices, x, bc_type="not-a-knot")
+        shifted_signal = spline(shifted_indices)
+        # Determine padding direction and length
+        if shift >= 0:
+            pad_length = len(shifted_indices[shifted_indices >= n - 1])
+            pad_left = False
+        else:
+            pad_length = len(shifted_indices[shifted_indices < 0])
+            pad_left = True
+        # Handle padding based on mode
+        if self.padding_mode == "zeros":
+            shifted_signal[shifted_indices < 0] = 0
+            shifted_signal[shifted_indices >= n - 1] = 0
+        elif self.padding_mode == "constant":
+            shifted_signal[shifted_indices < 0] = self.pad_value
+            shifted_signal[shifted_indices >= n - 1] = self.pad_value
+        elif self.padding_mode == "mirror":
+            if pad_left:
+                pad_values = x[pad_length - 1 :: -1]
+                shifted_signal[shifted_indices < 0] = pad_values[:pad_length]
+            else:
+                pad_values = x[:-1][::-1]
+                shifted_signal[shifted_indices >= n - 1] = pad_values[:pad_length]
+        elif self.padding_mode == "extend":
+            if pad_left:
+                shifted_signal[shifted_indices < 0] = x[0]
+            else:
+                shifted_signal[shifted_indices >= n - 1] = x[-1]
+        elif self.padding_mode == "linear":
+            if pad_left:
+                # Use first 5 points for regression
+                if len(x) < 5:
+                    points = x[: len(x)]  # Use all points if less than 5
+                else:
+                    points = x[:5]
+                x_coords = np.arange(len(points))
+                # Reshape arrays for linregress
+                x_coords = x_coords.reshape(-1)
+                points = points.reshape(-1)
+                # Perform regression
+                slope, intercept, _, _, _ = stats.linregress(x_coords, points)
+                # Generate new points using linear regression
+                new_x = np.arange(-pad_length, 0)
+                extrapolated = slope * new_x + intercept
+                shifted_signal[shifted_indices < 0] = extrapolated
+            else:
+                # Use last 5 points for regression
+                if len(x) < 5:
+                    points = x[-len(x) :]  # Use all points if less than 5
+                else:
+                    points = x[-5:]
+                x_coords = np.arange(len(points))
+                # Reshape arrays for linregress
+                x_coords = x_coords.reshape(-1)
+                points = points.reshape(-1)
+                # Perform regression
+                slope, intercept, _, _, _ = stats.linregress(x_coords, points)
+                # Generate new points using linear regression
+                new_x = np.arange(len(points), len(points) + pad_length)
+                extrapolated = slope * new_x + intercept
+                shifted_signal[shifted_indices >= n] = extrapolated
+        else:
+            raise ValueError(f"Unknown padding mode: {self.padding_mode}")
+        return shifted_signal

chemotools/augmentation/_gaussian_broadening.py ADDED Viewed

@@ -0,0 +1,136 @@
+from typing import Literal, Optional
+import numpy as np
+from scipy.ndimage import gaussian_filter1d
+from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
+from sklearn.utils.validation import check_is_fitted, validate_data
+class GaussianBroadening(TransformerMixin, OneToOneFeatureMixin, BaseEstimator):
+    """
+    Transform spectral data by broadening peaks using Gaussian convolution.
+    This transformer applies Gaussian smoothing to broaden peaks in spectral data.
+    For each signal, a random sigma is chosen between 0 and the specified sigma value.
+    Parameters
+    ----------
+    sigma : float, default=1.0
+        Maximum standard deviation for the Gaussian kernel.
+        The actual sigma used will be randomly chosen between 0 and this value.
+    mode : {'reflect', 'constant', 'nearest', 'mirror', 'wrap'}, default='reflect'
+        The mode parameter determines how the input array is extended when
+        the filter overlaps a border. Default is 'reflect'.
+    pad_value : float, default=0.0
+        Value to fill past edges of input if mode is 'constant'.
+    random_state : int, optional, default=None
+        Random state for reproducible sigma selection.
+    truncate : float, default=4.0
+        Truncate the filter at this many standard deviations.
+        Larger values increase computation time but improve accuracy.
+    """
+    def __init__(
+        self,
+        sigma: float = 1.0,
+        mode: Literal["reflect", "constant", "nearest", "mirror", "wrap"] = "reflect",
+        pad_value: float = 0.0,
+        random_state: Optional[int] = None,
+        truncate: float = 4.0,
+    ):
+        self.sigma = sigma
+        self.mode = mode
+        self.pad_value = pad_value
+        self.random_state = random_state
+        self.truncate = truncate
+    def fit(self, X: np.ndarray, y=None) -> "GaussianBroadening":
+        """
+        Fit the transformer to the data (in this case, only validates input).
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data to validate.
+        y : None
+            Ignored.
+        Returns
+        -------
+        self : GaussianBroadening
+            The fitted transformer.
+        """
+        X = validate_data(
+            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+        )
+        # Validate sigma parameter
+        if not isinstance(self.sigma, (int, float)):
+            raise ValueError("sigma must be a number")
+        if self.sigma < 0:
+            raise ValueError("sigma must be non-negative")
+        # Initialize random number generator
+        self._rng = np.random.default_rng(self.random_state)
+        return self
+    def transform(self, X: np.ndarray, y=None) -> np.ndarray:
+        """
+        Apply Gaussian broadening to the input data.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The data to transform.
+        y : None
+            Ignored.
+        Returns
+        -------
+        X_transformed : ndarray of shape (n_samples, n_features)
+            The transformed data with broadened peaks.
+        """
+        check_is_fitted(self, "n_features_in_")
+        X_ = validate_data(
+            self,
+            X,
+            y="no_validation",
+            ensure_2d=True,
+            copy=True,
+            reset=False,
+            dtype=np.float64,
+        )
+        # Transform each sample
+        for i, x in enumerate(X_):
+            X_[i] = self._broaden_signal(x)
+        return X_
+    def _broaden_signal(self, x: np.ndarray) -> np.ndarray:
+        """
+        Apply Gaussian broadening to a single signal.
+        Parameters
+        ----------
+        x : ndarray of shape (n_features,)
+            The input signal to broaden.
+        Returns
+        -------
+        broadened_signal : ndarray of shape (n_features,)
+            The broadened signal.
+        """
+        # Randomly choose sigma between 0 and max sigma
+        sigma = self._rng.uniform(0, self.sigma)
+        # Apply Gaussian filter
+        return gaussian_filter1d(
+            x, sigma=sigma, mode=self.mode, cval=self.pad_value, truncate=self.truncate
+        )

chemotools 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

chemotools 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl