chemotools 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chemotools/augmentation/__init__.py +2 -0
- chemotools/augmentation/_gaussian_broadening.py +136 -0
- chemotools/feature_selection/__init__.py +3 -1
- chemotools/feature_selection/_base.py +88 -0
- chemotools/feature_selection/_sr_selector.py +137 -0
- chemotools/feature_selection/_vip_selector.py +129 -0
- chemotools/outliers/__init__.py +7 -0
- chemotools/outliers/_base.py +188 -0
- chemotools/outliers/dmodx.py +164 -0
- chemotools/outliers/hotelling_t2.py +156 -0
- chemotools/outliers/leverage.py +151 -0
- chemotools/outliers/q_residuals.py +228 -0
- chemotools/outliers/studentized_residuals.py +198 -0
- chemotools/outliers/utils.py +51 -0
- {chemotools-0.1.9.dist-info → chemotools-0.1.11.dist-info}/METADATA +2 -2
- {chemotools-0.1.9.dist-info → chemotools-0.1.11.dist-info}/RECORD +18 -6
- {chemotools-0.1.9.dist-info → chemotools-0.1.11.dist-info}/WHEEL +1 -1
- {chemotools-0.1.9.dist-info → chemotools-0.1.11.dist-info}/LICENSE +0 -0
@@ -1,6 +1,7 @@
|
|
1
1
|
from ._add_noise import AddNoise
|
2
2
|
from ._baseline_shift import BaselineShift
|
3
3
|
from ._fractional_shift import FractionalShift
|
4
|
+
from ._gaussian_broadening import GaussianBroadening
|
4
5
|
from ._index_shift import IndexShift
|
5
6
|
from ._spectrum_scale import SpectrumScale
|
6
7
|
|
@@ -9,6 +10,7 @@ __all__ = [
|
|
9
10
|
"AddNoise",
|
10
11
|
"BaselineShift",
|
11
12
|
"FractionalShift",
|
13
|
+
"GaussianBroadening",
|
12
14
|
"IndexShift",
|
13
15
|
"SpectrumScale",
|
14
16
|
]
|
@@ -0,0 +1,136 @@
|
|
1
|
+
from typing import Literal, Optional
|
2
|
+
import numpy as np
|
3
|
+
from scipy.ndimage import gaussian_filter1d
|
4
|
+
from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
|
5
|
+
from sklearn.utils.validation import check_is_fitted, validate_data
|
6
|
+
|
7
|
+
|
8
|
+
class GaussianBroadening(TransformerMixin, OneToOneFeatureMixin, BaseEstimator):
    """
    Transform spectral data by broadening peaks using Gaussian convolution.

    Every signal (row) is smoothed with a 1-D Gaussian filter. The width of
    the filter is drawn independently for each signal from a uniform
    distribution between 0 and ``sigma``.

    Parameters
    ----------
    sigma : float, default=1.0
        Maximum standard deviation for the Gaussian kernel.
        The actual sigma used is randomly chosen between 0 and this value.
        Python and NumPy integer/floating scalars are accepted. A value of 0
        leaves the signals unchanged.

    mode : {'reflect', 'constant', 'nearest', 'mirror', 'wrap'}, default='reflect'
        Determines how the input array is extended when the filter overlaps
        a border. Forwarded to ``scipy.ndimage.gaussian_filter1d``.

    pad_value : float, default=0.0
        Value to fill past edges of input if mode is 'constant'
        (forwarded as ``cval``).

    random_state : int, optional, default=None
        Seed passed to ``numpy.random.default_rng`` for reproducible sigma
        selection.

    truncate : float, default=4.0
        Truncate the filter at this many standard deviations.
        Larger values increase computation time but improve accuracy.
    """

    def __init__(
        self,
        sigma: float = 1.0,
        mode: Literal["reflect", "constant", "nearest", "mirror", "wrap"] = "reflect",
        pad_value: float = 0.0,
        random_state: Optional[int] = None,
        truncate: float = 4.0,
    ):
        self.sigma = sigma
        self.mode = mode
        self.pad_value = pad_value
        self.random_state = random_state
        self.truncate = truncate

    def fit(self, X: np.ndarray, y=None) -> "GaussianBroadening":
        """
        Fit the transformer to the data (in this case, only validates input).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data to validate.

        y : None
            Ignored.

        Returns
        -------
        self : GaussianBroadening
            The fitted transformer.

        Raises
        ------
        ValueError
            If ``sigma`` is not a real number or is negative.
        """
        # Called for its side effect: with reset=True it records the input
        # metadata (e.g. n_features_in_) that transform() later relies on.
        validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        # Validate sigma parameter. NumPy scalar types are listed explicitly
        # because e.g. np.float32 and np.int64 are not subclasses of the
        # builtin float/int and would otherwise be rejected.
        if not isinstance(self.sigma, (int, float, np.integer, np.floating)):
            raise ValueError("sigma must be a number")
        if self.sigma < 0:
            raise ValueError("sigma must be non-negative")

        # Initialize random number generator
        self._rng = np.random.default_rng(self.random_state)

        return self

    def transform(self, X: np.ndarray, y=None) -> np.ndarray:
        """
        Apply Gaussian broadening to the input data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to transform.

        y : None
            Ignored.

        Returns
        -------
        X_transformed : ndarray of shape (n_samples, n_features)
            The transformed data with broadened peaks.
        """
        check_is_fitted(self, "n_features_in_")
        # copy=True so the caller's array is never modified in place.
        X_ = validate_data(
            self,
            X,
            y="no_validation",
            ensure_2d=True,
            copy=True,
            reset=False,
            dtype=np.float64,
        )

        # Transform each sample; every row gets its own random sigma.
        for i, x in enumerate(X_):
            X_[i] = self._broaden_signal(x)

        return X_

    def _broaden_signal(self, x: np.ndarray) -> np.ndarray:
        """
        Apply Gaussian broadening to a single signal.

        Parameters
        ----------
        x : ndarray of shape (n_features,)
            The input signal to broaden.

        Returns
        -------
        broadened_signal : ndarray of shape (n_features,)
            The broadened signal. If the drawn sigma is not positive, an
            unmodified copy of ``x`` is returned.
        """
        # Randomly choose sigma between 0 and max sigma. The draw always
        # happens so the random stream advances once per signal.
        sigma = self._rng.uniform(0, self.sigma)

        # A zero-width kernel is the identity. It must not be passed on to
        # gaussian_filter1d, whose kernel computation divides by sigma**2.
        if sigma <= 0:
            return x.copy()

        # Apply Gaussian filter
        return gaussian_filter1d(
            x, sigma=sigma, mode=self.mode, cval=self.pad_value, truncate=self.truncate
        )
|
@@ -1,4 +1,6 @@
|
|
1
1
|
from ._index_selector import IndexSelector
|
2
2
|
from ._range_cut import RangeCut
|
3
|
+
from ._sr_selector import SRSelector
|
4
|
+
from ._vip_selector import VIPSelector
|
3
5
|
|
4
|
-
__all__ = ["IndexSelector", "RangeCut"]
|
6
|
+
__all__ = ["IndexSelector", "RangeCut", "SRSelector", "VIPSelector"]
|
@@ -0,0 +1,88 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
from typing import Union
|
3
|
+
|
4
|
+
import numpy as np
|
5
|
+
|
6
|
+
from sklearn.base import BaseEstimator
|
7
|
+
from sklearn.cross_decomposition._pls import _PLS
|
8
|
+
from sklearn.feature_selection._base import SelectorMixin
|
9
|
+
from sklearn.pipeline import Pipeline
|
10
|
+
from sklearn.utils.validation import check_is_fitted
|
11
|
+
|
12
|
+
ModelTypes = Union[_PLS, Pipeline]
|
13
|
+
|
14
|
+
|
15
|
+
class _PLSFeatureSelectorBase(ABC, BaseEstimator, SelectorMixin):
    """Abstract base class for feature selectors built on a fitted PLS model.

    The constructor validates the supplied model and keeps a reference to the
    underlying ``_PLS`` estimator. Concrete subclasses implement
    ``_calculate_features`` to turn that estimator into feature scores.

    Parameters
    ----------
    model : Union[_PLS, Pipeline]
        A fitted ``_PLS`` model, or a fitted Pipeline whose last step is such
        a model.

    Attributes
    ----------
    estimator_ : _PLS
        The fitted ``_PLS`` estimator extracted from ``model``. It is set in
        ``__init__``, not in ``fit``.

    Notes
    -----
    The selection threshold is not handled here; the concrete selectors in
    this package (``SRSelector``, ``VIPSelector``) accept and store it
    themselves.
    """

    def __init__(
        self,
        model: Union[_PLS, Pipeline],
    ) -> None:
        # Validation happens eagerly: an unsupported or unfitted model makes
        # the helper raise right here at construction time.
        fitted_estimator = _validate_and_extract_model(model)
        self.estimator_ = fitted_estimator

    @abstractmethod
    def _calculate_features(self, X: np.ndarray) -> np.ndarray:
        """Calculate the feature scores used for selection.

        Parameters
        ----------
        X : ndarray
            The validated input data handed over by the subclass ``fit``.

        Returns
        -------
        ndarray
            The feature scores that the subclass compares against its
            threshold.
        """
|
55
|
+
|
56
|
+
|
57
|
+
def _validate_and_extract_model(
    model: Union[_PLS, Pipeline],
) -> _PLS:
    """Check that ``model`` is a fitted PLS model and return the estimator.

    Parameters
    ----------
    model : Union[_PLS, Pipeline]
        A fitted _PLS model or a Pipeline ending with such a model.

    Returns
    -------
    _PLS
        ``model`` itself, or the last step of ``model`` when it is a Pipeline.

    Raises
    ------
    TypeError
        If the model (or the last step of the Pipeline) is not an instance
        of _PLS.

    Notes
    -----
    The fitted state is verified with ``check_is_fitted`` on the object that
    was passed in (the whole Pipeline, when one is given); that call raises
    its own error for an unfitted model.
    """
    # A Pipeline is represented by its final step; anything else is used as is.
    candidate = model[-1] if isinstance(model, Pipeline) else model

    if isinstance(candidate, _PLS):
        check_is_fitted(model)
        return candidate

    raise TypeError(
        "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
    )
|
@@ -0,0 +1,137 @@
|
|
1
|
+
import numpy as np
|
2
|
+
from sklearn.utils.validation import validate_data
|
3
|
+
|
4
|
+
from ._base import _PLSFeatureSelectorBase
|
5
|
+
|
6
|
+
|
7
|
+
class SRSelector(_PLSFeatureSelectorBase):
    """
    Select features that contribute significantly to the latent variables of
    a PLS regression model using the Selectivity Ratio (SR) method.

    Parameters
    ----------
    model : Union[_PLS, Pipeline]
        The fitted PLS regression model or a fitted pipeline with a PLS
        regression model as last step.

    threshold : float, default=1.0
        The threshold for feature selection. Features with a score strictly
        above this threshold are selected.

    Attributes
    ----------
    estimator_ : _PLS
        The fitted PLS estimator extracted from ``model`` (set by the base
        class constructor).

    feature_scores_ : np.ndarray
        The Selectivity Ratio of each feature, computed in ``fit``.

    support_mask_ : np.ndarray
        The boolean mask indicating which features are selected.

    Methods
    -------
    fit(X, y=None)
        Calculate the feature scores and the support mask from ``X``.
    """

    def __init__(
        self,
        model,
        threshold: float = 1.0,
    ):
        self.model = model
        self.threshold = threshold
        # The base constructor validates the model and stores ``estimator_``.
        super().__init__(self.model)

    def fit(self, X: np.ndarray, y=None) -> "SRSelector":
        """
        Fit the transformer to calculate the feature scores and the support mask.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data to fit the transformer to.

        y : None
            Ignored.

        Returns
        -------
        self : SRSelector
            The fitted transformer.
        """
        # Check that X is a 2D array and convert it to float64
        X = validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        # Calculate the SR scores
        self.feature_scores_ = self._calculate_features(X)

        # Calculate the support mask
        self.support_mask_ = self._get_support_mask()

        return self

    def _get_support_mask(self) -> np.ndarray:
        """
        Get the support mask based on the feature scores and threshold.

        Features with scores above the threshold are selected. The mask is
        recomputed from ``feature_scores_`` and the current ``threshold`` on
        every call.

        Returns
        -------
        support_mask_ : np.ndarray
            The boolean mask indicating which features are selected.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        from sklearn.utils.validation import check_is_fitted

        # ``estimator_`` already exists after __init__, so the attribute to
        # look for has to be named explicitly; a bare check_is_fitted(self)
        # would pass on an unfitted selector.
        check_is_fitted(self, "feature_scores_")
        return self.feature_scores_ > self.threshold

    def _calculate_features(self, X: np.ndarray) -> np.ndarray:
        """
        Vectorized Selectivity Ratio calculation from a fitted _PLS
        like model.

        ``X`` is projected onto the normalised regression vector, the part of
        ``X`` reconstructed from that projection is computed, and each
        feature's explained sum of squares is divided by its residual sum of
        squares.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            The input training data to calculate the feature scores from.
            NOTE(review): X is used exactly as passed (no centering, scaling
            or pipeline preprocessing is applied here) - confirm callers
            supply data prepared the same way as for the model.

        Returns
        -------
        feature_scores_ : np.ndarray
            The Selectivity Ratio of each feature.
        """
        bpls = self.estimator_.coef_
        bpls_norm = bpls.T / np.linalg.norm(bpls)

        # Handle 1D case correctly: make the regression vector a column
        if bpls.ndim == 1:
            bpls_norm = bpls_norm.reshape(-1, 1)

        # Project X onto the regression vector (target-projected scores)
        ttp = X @ bpls_norm
        # Loadings obtained by regressing X on those scores
        ptp = X.T @ np.linalg.pinv(ttp).T

        # Predicted part of X
        X_hat = ttp @ ptp.T

        # Compute squared norms directly (per feature, i.e. per column)
        total_ss = np.linalg.norm(X, axis=0) ** 2
        explained_ss = np.linalg.norm(X_hat, axis=0) ** 2

        # Calculate residual sum of squares
        residual_ss = total_ss - explained_ss

        # Stability: avoid division by zero
        epsilon = 1e-12

        # Calculate Selectivity Ratio
        return explained_ss / (residual_ss + epsilon)
|
@@ -0,0 +1,129 @@
|
|
1
|
+
import numpy as np
|
2
|
+
from sklearn.utils.validation import validate_data
|
3
|
+
|
4
|
+
from ._base import _PLSFeatureSelectorBase
|
5
|
+
|
6
|
+
|
7
|
+
class VIPSelector(_PLSFeatureSelectorBase):
    """
    Select features that contribute significantly to the latent variables of
    a PLS regression model using the Variables Importance in Projection (VIP)
    method.

    Parameters
    ----------
    model : Union[_PLS, Pipeline]
        The fitted PLS regression model or a fitted pipeline with a PLS
        regression model as last step.

    threshold : float, default=1.0
        The threshold for feature selection. Features with a score strictly
        above this threshold are selected.

    Attributes
    ----------
    estimator_ : _PLS
        The fitted PLS estimator extracted from ``model`` (set by the base
        class constructor).

    feature_scores_ : np.ndarray
        The VIP score of each feature, computed in ``fit``.

    support_mask_ : np.ndarray
        The boolean mask indicating which features are selected.

    Methods
    -------
    fit(X, y=None)
        Calculate the feature scores and the support mask.
    """

    def __init__(
        self,
        model,
        threshold: float = 1.0,
    ):
        self.model = model
        self.threshold = threshold
        # The base constructor validates the model and stores ``estimator_``.
        super().__init__(self.model)

    def fit(self, X: np.ndarray, y=None) -> "VIPSelector":
        """
        Fit the transformer to calculate the feature scores and the support mask.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data to fit the transformer to. It is validated here,
            but the VIP scores themselves are derived from the fitted model
            only.

        y : None
            Ignored.

        Returns
        -------
        self : VIPSelector
            The fitted transformer.
        """
        # Check that X is a 2D array and convert it to float64
        X = validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        # Calculate the VIP scores
        self.feature_scores_ = self._calculate_features(X)

        # Calculate the support mask
        self.support_mask_ = self._get_support_mask()

        return self

    def _get_support_mask(self) -> np.ndarray:
        """
        Get the support mask based on the feature scores and threshold.

        Features with scores above the threshold are selected. The mask is
        recomputed from ``feature_scores_`` and the current ``threshold`` on
        every call.

        Returns
        -------
        support_mask_ : np.ndarray
            The boolean mask indicating which features are selected.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        from sklearn.utils.validation import check_is_fitted

        # ``estimator_`` already exists after __init__, so the attribute to
        # look for has to be named explicitly; a bare check_is_fitted(self)
        # would pass on an unfitted selector.
        check_is_fitted(self, "feature_scores_")
        return self.feature_scores_ > self.threshold

    def _calculate_features(self, X: np.ndarray) -> np.ndarray:
        """
        Calculate the VIP scores based on the fitted model.

        For every component the squared norm of the y-loadings is multiplied
        by the squared norm of the x-scores. The score of a feature is the
        square root of ``n_features_in_`` times the sum of its squared
        x-weights weighted by those per-component terms, divided by the sum
        of the terms over all components.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Unused; present to satisfy the base-class signature.

        Returns
        -------
        feature_scores_ : np.ndarray
            The VIP score of each feature.
        """
        # Calculate sum of squares of y_loadings and x_scores (per component)
        sum_of_squares_y_loadings = (
            np.linalg.norm(self.estimator_.y_loadings_, ord=2, axis=0) ** 2
        )
        sum_of_squares_x_scores = (
            np.linalg.norm(self.estimator_.x_scores_, ord=2, axis=0) ** 2
        )

        # Calculate the sum of squares
        sum_of_squares = sum_of_squares_y_loadings * sum_of_squares_x_scores

        # Calculate the numerator: squared weights of each feature, weighted
        # by the per-component sums of squares and summed over the components.
        # NOTE(review): the weights are not normalised here - presumably the
        # columns of x_weights_ are already unit-norm; confirm.
        numerator = self.estimator_.n_features_in_ * np.sum(
            sum_of_squares * self.estimator_.x_weights_**2,
            axis=1,
        )

        # Calculate the denominator
        denominator = np.sum(sum_of_squares, axis=0)

        # Calculate the VIP scores
        return np.sqrt(numerator / denominator)
|
@@ -0,0 +1,7 @@
|
|
1
|
+
from .dmodx import DModX
|
2
|
+
from .hotelling_t2 import HotellingT2
|
3
|
+
from .q_residuals import QResiduals
|
4
|
+
from .leverage import Leverage
|
5
|
+
from .studentized_residuals import StudentizedResiduals
|
6
|
+
|
7
|
+
__all__ = ["DModX", "HotellingT2", "QResiduals", "Leverage", "StudentizedResiduals"]
|