chemotools 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chemotools/feature_selection/__init__.py +3 -1
- chemotools/feature_selection/_base.py +88 -0
- chemotools/feature_selection/_sr_selector.py +137 -0
- chemotools/feature_selection/_vip_selector.py +129 -0
- chemotools/outliers/_base.py +75 -67
- chemotools/outliers/dmodx.py +26 -8
- chemotools/outliers/hotelling_t2.py +11 -10
- chemotools/outliers/leverage.py +15 -14
- chemotools/outliers/q_residuals.py +19 -16
- chemotools/outliers/studentized_residuals.py +17 -16
- chemotools/outliers/utils.py +51 -0
- {chemotools-0.1.10.dist-info → chemotools-0.1.11.dist-info}/METADATA +2 -2
- {chemotools-0.1.10.dist-info → chemotools-0.1.11.dist-info}/RECORD +15 -12
- {chemotools-0.1.10.dist-info → chemotools-0.1.11.dist-info}/WHEEL +1 -1
- chemotools/outliers/_utils.py +0 -91
- {chemotools-0.1.10.dist-info → chemotools-0.1.11.dist-info}/LICENSE +0 -0
@@ -1,4 +1,6 @@
|
|
1
1
|
from ._index_selector import IndexSelector
|
2
2
|
from ._range_cut import RangeCut
|
3
|
+
from ._sr_selector import SRSelector
|
4
|
+
from ._vip_selector import VIPSelector
|
3
5
|
|
4
|
-
__all__ = ["IndexSelector", "RangeCut"]
|
6
|
+
__all__ = ["IndexSelector", "RangeCut", "SRSelector", "VIPSelector"]
|
@@ -0,0 +1,88 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
from typing import Union
|
3
|
+
|
4
|
+
import numpy as np
|
5
|
+
|
6
|
+
from sklearn.base import BaseEstimator
|
7
|
+
from sklearn.cross_decomposition._pls import _PLS
|
8
|
+
from sklearn.feature_selection._base import SelectorMixin
|
9
|
+
from sklearn.pipeline import Pipeline
|
10
|
+
from sklearn.utils.validation import check_is_fitted
|
11
|
+
|
12
|
+
ModelTypes = Union[_PLS, Pipeline]
|
13
|
+
|
14
|
+
|
15
|
+
class _PLSFeatureSelectorBase(ABC, BaseEstimator, SelectorMixin):
|
16
|
+
"""Feature selection base class for _PLS-like models.
|
17
|
+
|
18
|
+
|
19
|
+
Parameters
|
20
|
+
----------
|
21
|
+
model : Union[_PLS, Pipeline]
|
22
|
+
A fitted _PLS model or a Pipeline ending with such a model
|
23
|
+
|
24
|
+
threshold : float
|
25
|
+
The threshold for feature selection. Features with importance
|
26
|
+
above this threshold will be selected.
|
27
|
+
|
28
|
+
Attributes
|
29
|
+
----------
|
30
|
+
estimator_ : ModelTypes
|
31
|
+
The fitted model of type _PLS
|
32
|
+
|
33
|
+
feature_scores_ : np.ndarray
|
34
|
+
The calculated feature scores based on the selected method.
|
35
|
+
|
36
|
+
support_mask_ : np.ndarray
|
37
|
+
The boolean mask indicating which features are selected.
|
38
|
+
"""
|
39
|
+
|
40
|
+
def __init__(
|
41
|
+
self,
|
42
|
+
model: Union[_PLS, Pipeline],
|
43
|
+
) -> None:
|
44
|
+
self.estimator_ = _validate_and_extract_model(model)
|
45
|
+
|
46
|
+
@abstractmethod
|
47
|
+
def _calculate_features(self, X: np.ndarray) -> np.ndarray:
|
48
|
+
"""Calculate the feature scores of the model.
|
49
|
+
|
50
|
+
Returns
|
51
|
+
-------
|
52
|
+
ndarray of shape (n_features,)
|
53
|
+
The feature scores of the model
|
54
|
+
"""
|
55
|
+
|
56
|
+
|
57
|
+
def _validate_and_extract_model(
|
58
|
+
model: Union[_PLS, Pipeline],
|
59
|
+
) -> _PLS:
|
60
|
+
"""Validate and extract the model.
|
61
|
+
|
62
|
+
Parameters
|
63
|
+
----------
|
64
|
+
model : Union[_PLS, Pipeline]
|
65
|
+
A fitted _PLS model or Pipeline ending with such a model
|
66
|
+
|
67
|
+
Returns
|
68
|
+
-------
|
69
|
+
_PLS
|
70
|
+
The extracted estimator
|
71
|
+
|
72
|
+
Raises
|
73
|
+
------
|
74
|
+
TypeError
|
75
|
+
If the model is not of type _PLS or a Pipeline ending with a _PLS model. An unfitted model is rejected by check_is_fitted, which raises scikit-learn's NotFittedError rather than TypeError.
|
76
|
+
"""
|
77
|
+
if isinstance(model, Pipeline):
|
78
|
+
estimator = model[-1]
|
79
|
+
else:
|
80
|
+
estimator = model
|
81
|
+
|
82
|
+
if not isinstance(estimator, _PLS):
|
83
|
+
raise TypeError(
|
84
|
+
"Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
|
85
|
+
)
|
86
|
+
|
87
|
+
check_is_fitted(model)
|
88
|
+
return estimator
|
@@ -0,0 +1,137 @@
|
|
1
|
+
import numpy as np
|
2
|
+
from sklearn.utils.validation import validate_data
|
3
|
+
|
4
|
+
from ._base import _PLSFeatureSelectorBase
|
5
|
+
|
6
|
+
|
7
|
+
class SRSelector(_PLSFeatureSelectorBase):
|
8
|
+
"""
|
9
|
+
This selector is used to select features that contribute significantly
|
10
|
+
to the latent variables in a PLS regression model using the Selectivity
|
11
|
+
Ratio (SR) method.
|
12
|
+
|
13
|
+
Parameters
|
14
|
+
----------
|
15
|
+
- model: Union[_PLS, Pipeline]
|
16
|
+
The PLS regression model or a pipeline with a PLS regression model as last step.
|
17
|
+
|
18
|
+
- threshold: float, default=1.0
|
19
|
+
The threshold for feature selection. Features with importance
|
20
|
+
above this threshold will be selected.
|
21
|
+
|
22
|
+
Attributes
|
23
|
+
----------
|
24
|
+
estimator_ : ModelTypes
|
25
|
+
The fitted model of type _PLS
|
26
|
+
|
27
|
+
feature_scores_ : np.ndarray
|
28
|
+
The calculated feature scores based on the selected method.
|
29
|
+
|
30
|
+
support_mask_ : np.ndarray
|
31
|
+
The boolean mask indicating which features are selected.
|
32
|
+
|
33
|
+
Methods
|
34
|
+
-------
|
35
|
+
fit(X, y=None)
|
36
|
+
Fit the transformer to the input data. It calculates the feature scores and the support mask.
|
37
|
+
"""
|
38
|
+
|
39
|
+
def __init__(
|
40
|
+
self,
|
41
|
+
model,
|
42
|
+
threshold: float = 1.0,
|
43
|
+
):
|
44
|
+
self.model = model
|
45
|
+
self.threshold = threshold
|
46
|
+
super().__init__(self.model)
|
47
|
+
|
48
|
+
def fit(self, X: np.ndarray, y=None) -> "SRSelector":
|
49
|
+
"""
|
50
|
+
Fit the transformer to calculate the feature scores and the support mask.
|
51
|
+
|
52
|
+
Parameters
|
53
|
+
----------
|
54
|
+
X : array-like of shape (n_samples, n_features)
|
55
|
+
The input data to fit the transformer to.
|
56
|
+
|
57
|
+
y : None
|
58
|
+
Ignored.
|
59
|
+
|
60
|
+
Returns
|
61
|
+
-------
|
62
|
+
self : SRSelector
|
63
|
+
The fitted transformer.
|
64
|
+
"""
|
65
|
+
# Check that X is a 2D array and has only finite values
|
66
|
+
X = validate_data(
|
67
|
+
self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
|
68
|
+
)
|
69
|
+
|
70
|
+
# Calculate the SR scores
|
71
|
+
self.feature_scores_ = self._calculate_features(X)
|
72
|
+
|
73
|
+
# Calculate the support mask
|
74
|
+
self.support_mask_ = self._get_support_mask()
|
75
|
+
|
76
|
+
return self
|
77
|
+
|
78
|
+
def _get_support_mask(self) -> np.ndarray:
|
79
|
+
"""
|
80
|
+
Get the support mask based on the feature scores and threshold.
|
81
|
+
Features with scores above the threshold are selected.
|
82
|
+
Parameters
|
83
|
+
----------
|
84
|
+
self : SRSelector
|
85
|
+
The fitted transformer.
|
86
|
+
|
87
|
+
Returns
|
88
|
+
-------
|
89
|
+
support_mask_ : np.ndarray
|
90
|
+
The boolean mask indicating which features are selected.
|
91
|
+
"""
|
92
|
+
return self.feature_scores_ > self.threshold
|
93
|
+
|
94
|
+
def _calculate_features(self, X: np.ndarray) -> np.ndarray:
|
95
|
+
"""
|
96
|
+
Vectorized Selectivity Ratio calculation from a fitted _PLS
|
97
|
+
like model.
|
98
|
+
|
99
|
+
Parameters
|
100
|
+
----------
|
101
|
+
- self: SRSelector
|
102
|
+
The fitted transformer.
|
103
|
+
|
104
|
+
- X: array-like of shape (n_samples, n_features)
|
105
|
+
The input training data to calculate the feature scores from.
|
106
|
+
|
107
|
+
Returns
|
108
|
+
-------
|
109
|
+
feature_scores_ : np.ndarray
|
110
|
+
The calculated feature scores based on the selected method.
|
111
|
+
"""
|
112
|
+
bpls = self.estimator_.coef_
|
113
|
+
bpls_norm = bpls.T / np.linalg.norm(bpls)
|
114
|
+
|
115
|
+
# Handle 1D case correctly
|
116
|
+
if bpls.ndim == 1:
|
117
|
+
bpls_norm = bpls_norm.reshape(-1, 1)
|
118
|
+
|
119
|
+
# Project X onto the regression vector
|
120
|
+
ttp = X @ bpls_norm
|
121
|
+
ptp = X.T @ np.linalg.pinv(ttp).T
|
122
|
+
|
123
|
+
# Predicted part of X
|
124
|
+
X_hat = ttp @ ptp.T
|
125
|
+
|
126
|
+
# Compute squared norms directly
|
127
|
+
total_ss = np.linalg.norm(X, axis=0) ** 2
|
128
|
+
explained_ss = np.linalg.norm(X_hat, axis=0) ** 2
|
129
|
+
|
130
|
+
# Calculate residual sum of squares
|
131
|
+
residual_ss = total_ss - explained_ss
|
132
|
+
|
133
|
+
# Stability: avoid division by zero
|
134
|
+
epsilon = 1e-12
|
135
|
+
|
136
|
+
# Calculate Selectivity Ratio
|
137
|
+
return explained_ss / (residual_ss + epsilon)
|
@@ -0,0 +1,129 @@
|
|
1
|
+
import numpy as np
|
2
|
+
from sklearn.utils.validation import validate_data
|
3
|
+
|
4
|
+
from ._base import _PLSFeatureSelectorBase
|
5
|
+
|
6
|
+
|
7
|
+
class VIPSelector(_PLSFeatureSelectorBase):
|
8
|
+
"""
|
9
|
+
This selector is used to select features that contribute significantly
|
10
|
+
to the latent variables in a PLS regression model using the Variable
|
11
|
+
Importance in Projection (VIP) method.
|
12
|
+
|
13
|
+
Parameters
|
14
|
+
----------
|
15
|
+
- model: Union[_PLS, Pipeline]
|
16
|
+
The PLS regression model or a pipeline with a PLS regression model as last step.
|
17
|
+
|
18
|
+
- threshold: float, default=1.0
|
19
|
+
The threshold for feature selection. Features with importance
|
20
|
+
above this threshold will be selected.
|
21
|
+
|
22
|
+
Attributes
|
23
|
+
----------
|
24
|
+
estimator_ : ModelTypes
|
25
|
+
The fitted model of type _PLS
|
26
|
+
|
27
|
+
feature_scores_ : np.ndarray
|
28
|
+
The calculated feature scores based on the selected method.
|
29
|
+
|
30
|
+
support_mask_ : np.ndarray
|
31
|
+
The boolean mask indicating which features are selected.
|
32
|
+
|
33
|
+
Methods
|
34
|
+
-------
|
35
|
+
fit(X, y=None)
|
36
|
+
Fit the transformer to the input data. It calculates the feature scores and the support mask.
|
37
|
+
"""
|
38
|
+
|
39
|
+
def __init__(
|
40
|
+
self,
|
41
|
+
model,
|
42
|
+
threshold: float = 1.0,
|
43
|
+
):
|
44
|
+
self.model = model
|
45
|
+
self.threshold = threshold
|
46
|
+
super().__init__(self.model)
|
47
|
+
|
48
|
+
def fit(self, X: np.ndarray, y=None) -> "VIPSelector":
|
49
|
+
"""
|
50
|
+
Fit the transformer to calculate the feature scores and the support mask.
|
51
|
+
|
52
|
+
Parameters
|
53
|
+
----------
|
54
|
+
X : array-like of shape (n_samples, n_features)
|
55
|
+
The input data to fit the transformer to.
|
56
|
+
|
57
|
+
y : None
|
58
|
+
Ignored.
|
59
|
+
|
60
|
+
Returns
|
61
|
+
-------
|
62
|
+
self : VIPSelector
|
63
|
+
The fitted transformer.
|
64
|
+
"""
|
65
|
+
# Check that X is a 2D array and has only finite values
|
66
|
+
X = validate_data(
|
67
|
+
self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
|
68
|
+
)
|
69
|
+
|
70
|
+
# Calculate the VIP scores
|
71
|
+
self.feature_scores_ = self._calculate_features(X)
|
72
|
+
|
73
|
+
# Calculate the support mask
|
74
|
+
self.support_mask_ = self._get_support_mask()
|
75
|
+
|
76
|
+
return self
|
77
|
+
|
78
|
+
def _get_support_mask(self) -> np.ndarray:
|
79
|
+
"""
|
80
|
+
Get the support mask based on the feature scores and threshold.
|
81
|
+
Features with scores above the threshold are selected.
|
82
|
+
Parameters
|
83
|
+
----------
|
84
|
+
self : VIPSelector
|
85
|
+
The fitted transformer.
|
86
|
+
|
87
|
+
Returns
|
88
|
+
-------
|
89
|
+
support_mask_ : np.ndarray
|
90
|
+
The boolean mask indicating which features are selected.
|
91
|
+
"""
|
92
|
+
return self.feature_scores_ > self.threshold
|
93
|
+
|
94
|
+
def _calculate_features(self, X: np.ndarray) -> np.ndarray:
|
95
|
+
"""
|
96
|
+
Calculate the VIP scores based on the fitted model.
|
97
|
+
|
98
|
+
Parameters
|
99
|
+
----------
|
100
|
+
self : VIPSelector
|
101
|
+
The fitted transformer.
|
102
|
+
|
103
|
+
Returns
|
104
|
+
-------
|
105
|
+
feature_scores_ : np.ndarray
|
106
|
+
The calculated feature scores based on the selected method.
|
107
|
+
"""
|
108
|
+
# Calculate sum of squares of y_loadings and x_scores
|
109
|
+
sum_of_squares_y_loadings = (
|
110
|
+
np.linalg.norm(self.estimator_.y_loadings_, ord=2, axis=0) ** 2
|
111
|
+
)
|
112
|
+
sum_of_squares_x_scores = (
|
113
|
+
np.linalg.norm(self.estimator_.x_scores_, ord=2, axis=0) ** 2
|
114
|
+
)
|
115
|
+
|
116
|
+
# Calculate the sum of squares
|
117
|
+
sum_of_squares = sum_of_squares_y_loadings * sum_of_squares_x_scores
|
118
|
+
|
119
|
+
# Calculate the numerator
|
120
|
+
numerator = self.estimator_.n_features_in_ * np.sum(
|
121
|
+
sum_of_squares * self.estimator_.x_weights_**2,
|
122
|
+
axis=1,
|
123
|
+
)
|
124
|
+
|
125
|
+
# Calculate the denominator
|
126
|
+
denominator = np.sum(sum_of_squares, axis=0)
|
127
|
+
|
128
|
+
# Calculate the VIP scores
|
129
|
+
return np.sqrt(numerator / denominator)
|
chemotools/outliers/_base.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
from abc import ABC, abstractmethod
|
2
|
-
from typing import
|
2
|
+
from typing import Optional, Tuple, Union
|
3
3
|
|
4
4
|
import numpy as np
|
5
5
|
|
@@ -9,7 +9,6 @@ from sklearn.cross_decomposition._pls import _PLS
|
|
9
9
|
from sklearn.pipeline import Pipeline
|
10
10
|
from sklearn.utils.validation import check_is_fitted
|
11
11
|
|
12
|
-
from ._utils import validate_confidence, validate_and_extract_model
|
13
12
|
|
14
13
|
ModelTypes = Union[_BasePCA, _PLS]
|
15
14
|
|
@@ -29,10 +28,10 @@ class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
|
|
29
28
|
|
30
29
|
Attributes
|
31
30
|
----------
|
32
|
-
|
31
|
+
estimator_ : ModelTypes
|
33
32
|
The fitted model of type _BasePCA or _PLS
|
34
33
|
|
35
|
-
|
34
|
+
transformer_ : Optional[Pipeline]
|
36
35
|
Preprocessing steps before the model
|
37
36
|
|
38
37
|
n_features_in_ : int
|
@@ -54,13 +53,13 @@ class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
|
|
54
53
|
confidence: float,
|
55
54
|
) -> None:
|
56
55
|
(
|
57
|
-
self.
|
58
|
-
self.
|
56
|
+
self.estimator_,
|
57
|
+
self.transformer_,
|
59
58
|
self.n_features_in_,
|
60
59
|
self.n_components_,
|
61
60
|
self.n_samples_,
|
62
|
-
) =
|
63
|
-
self.confidence =
|
61
|
+
) = _validate_and_extract_model(model)
|
62
|
+
self.confidence = _validate_confidence(confidence)
|
64
63
|
|
65
64
|
def fit_predict_residuals(
|
66
65
|
self, X: np.ndarray, y: Optional[np.ndarray] = None
|
@@ -96,7 +95,7 @@ class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
|
|
96
95
|
"""
|
97
96
|
|
98
97
|
@abstractmethod
|
99
|
-
def _calculate_critical_value(self, X:
|
98
|
+
def _calculate_critical_value(self, X: np.ndarray) -> float:
|
100
99
|
"""Calculate the critical value for outlier detection.
|
101
100
|
|
102
101
|
Returns
|
@@ -106,75 +105,84 @@ class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
|
|
106
105
|
"""
|
107
106
|
|
108
107
|
|
109
|
-
|
110
|
-
"""
|
111
|
-
|
108
|
+
def _get_model_parameters(model: ModelTypes) -> Tuple[int, int, int]:
|
109
|
+
"""
|
110
|
+
Get the number of features, components and samples from a fitted PCA or PLS model.
|
112
111
|
|
113
112
|
Parameters
|
114
113
|
----------
|
115
|
-
model :
|
116
|
-
A fitted
|
117
|
-
|
118
|
-
Attributes
|
119
|
-
----------
|
120
|
-
model_ : ModelTypes
|
121
|
-
The fitted model of type _BasePCA or _PLS
|
122
|
-
|
123
|
-
preprocessing_ : Optional[Pipeline]
|
124
|
-
Preprocessing steps before the model
|
114
|
+
model : ModelTypes
|
115
|
+
A fitted model of type _BasePCA or _PLS
|
125
116
|
|
117
|
+
Returns
|
118
|
+
-------
|
119
|
+
Tuple[int, int, int]
|
120
|
+
The number of features, components and samples in the model
|
126
121
|
"""
|
122
|
+
if isinstance(model, _BasePCA):
|
123
|
+
return model.n_features_in_, model.n_components_, model.n_samples_
|
124
|
+
elif isinstance(model, _PLS):
|
125
|
+
return model.n_features_in_, model.n_components, len(model.x_scores_)
|
126
|
+
else:
|
127
|
+
raise ValueError(
|
128
|
+
"Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
|
129
|
+
)
|
127
130
|
|
128
|
-
def __init__(self, model: Union[ModelTypes, Pipeline]):
|
129
|
-
self.model_, self.preprocessing_ = self._validate_and_extract_model(model)
|
130
131
|
|
131
|
-
|
132
|
-
|
132
|
+
def _validate_confidence(confidence: float) -> float:
|
133
|
+
"""Validate parameters using sklearn conventions.
|
133
134
|
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
135
|
+
Parameters
|
136
|
+
----------
|
137
|
+
confidence : float
|
138
|
+
Confidence level for statistical calculations (between 0 and 1)
|
138
139
|
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
140
|
+
Returns
|
141
|
+
-------
|
142
|
+
float
|
143
|
+
The validated confidence level
|
143
144
|
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
else:
|
153
|
-
preprocessing = None
|
154
|
-
|
155
|
-
if isinstance(model, (_BasePCA, _PLS)):
|
156
|
-
check_is_fitted(model)
|
157
|
-
else:
|
158
|
-
raise ValueError(
|
159
|
-
"Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
|
160
|
-
)
|
161
|
-
check_is_fitted(model)
|
162
|
-
return model, preprocessing
|
145
|
+
Raises
|
146
|
+
------
|
147
|
+
ValueError
|
148
|
+
If confidence is not between 0 and 1
|
149
|
+
"""
|
150
|
+
if not 0 < confidence < 1:
|
151
|
+
raise ValueError("Confidence must be between 0 and 1")
|
152
|
+
return confidence
|
163
153
|
|
164
|
-
@abstractmethod
|
165
|
-
def predict(self, X: np.ndarray, y: Optional[np.ndarray]) -> np.ndarray:
|
166
|
-
"""Predict the output of the model.
|
167
154
|
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
155
|
+
def _validate_and_extract_model(
|
156
|
+
model: Union[ModelTypes, Pipeline],
|
157
|
+
) -> Tuple[ModelTypes, Optional[Pipeline], int, int, int]:
|
158
|
+
"""Validate and extract the model and preprocessing steps.
|
172
159
|
|
173
|
-
|
174
|
-
|
160
|
+
Parameters
|
161
|
+
----------
|
162
|
+
model : Union[ModelTypes, Pipeline]
|
163
|
+
A fitted PCA/PLS model or Pipeline ending with such a model
|
175
164
|
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
165
|
+
Returns
|
166
|
+
-------
|
167
|
+
Tuple[ModelTypes, Optional[Pipeline], int, int, int]
|
168
|
+
The extracted model, the preprocessing steps, and the number of features, components and samples
|
169
|
+
|
170
|
+
Raises
|
171
|
+
------
|
172
|
+
ValueError
|
173
|
+
If the model is not of type _BasePCA or _PLS or a Pipeline ending with one of these types or if the model is not fitted
|
174
|
+
"""
|
175
|
+
if isinstance(model, Pipeline):
|
176
|
+
preprocessing = model[:-1]
|
177
|
+
model = model[-1]
|
178
|
+
else:
|
179
|
+
preprocessing = None
|
180
|
+
|
181
|
+
if not isinstance(model, (_BasePCA, _PLS)):
|
182
|
+
raise ValueError(
|
183
|
+
"Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
|
184
|
+
)
|
185
|
+
|
186
|
+
check_is_fitted(model)
|
187
|
+
n_features_in, n_components, n_samples = _get_model_parameters(model)
|
188
|
+
return model, preprocessing, n_features_in, n_components, n_samples
|
chemotools/outliers/dmodx.py
CHANGED
@@ -7,6 +7,7 @@ from scipy.stats import f as f_distribution
|
|
7
7
|
|
8
8
|
|
9
9
|
from ._base import _ModelResidualsBase, ModelTypes
|
10
|
+
from .utils import calculate_residual_spectrum
|
10
11
|
|
11
12
|
|
12
13
|
class DModX(_ModelResidualsBase):
|
@@ -25,10 +26,10 @@ class DModX(_ModelResidualsBase):
|
|
25
26
|
|
26
27
|
Attributes
|
27
28
|
----------
|
28
|
-
|
29
|
+
estimator_ : ModelTypes
|
29
30
|
The fitted model of type _BasePCA or _PLS
|
30
31
|
|
31
|
-
|
32
|
+
transformer_ : Optional[Pipeline]
|
32
33
|
Preprocessing steps before the model
|
33
34
|
|
34
35
|
n_features_in_ : int
|
@@ -42,6 +43,9 @@ class DModX(_ModelResidualsBase):
|
|
42
43
|
|
43
44
|
critical_value_ : float
|
44
45
|
The calculated critical value for outlier detection
|
46
|
+
|
47
|
+
train_spe_: float
|
48
|
+
The training sum of squared errors (SSE) for the model normalized by degrees of freedom
|
45
49
|
"""
|
46
50
|
|
47
51
|
def __init__(
|
@@ -49,6 +53,7 @@ class DModX(_ModelResidualsBase):
|
|
49
53
|
model: Union[ModelTypes, Pipeline],
|
50
54
|
confidence: float = 0.95,
|
51
55
|
) -> None:
|
56
|
+
model, confidence = model, confidence
|
52
57
|
super().__init__(model, confidence)
|
53
58
|
|
54
59
|
def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "DModX":
|
@@ -62,7 +67,18 @@ class DModX(_ModelResidualsBase):
|
|
62
67
|
self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
|
63
68
|
)
|
64
69
|
|
70
|
+
# Calculate the critical value
|
65
71
|
self.critical_value_ = self._calculate_critical_value()
|
72
|
+
|
73
|
+
# Calculate the degrees of freedom normalized SPE of the training set
|
74
|
+
residuals = calculate_residual_spectrum(X, self.estimator_)
|
75
|
+
squared_errors = np.sum((residuals) ** 2, axis=1)
|
76
|
+
self.train_spe_ = np.sqrt(
|
77
|
+
squared_errors
|
78
|
+
/ (self.n_samples_ - self.n_components_ - 1)
|
79
|
+
* (self.n_features_in_ - self.n_components_)
|
80
|
+
)
|
81
|
+
|
66
82
|
return self
|
67
83
|
|
68
84
|
def predict(self, X: np.ndarray) -> np.ndarray:
|
@@ -118,15 +134,17 @@ class DModX(_ModelResidualsBase):
|
|
118
134
|
)
|
119
135
|
|
120
136
|
# Apply preprocessing if available
|
121
|
-
if self.
|
122
|
-
X = self.
|
137
|
+
if self.transformer_:
|
138
|
+
X = self.transformer_.transform(X)
|
123
139
|
|
124
140
|
# Calculate the DModX statistics
|
125
|
-
|
126
|
-
|
127
|
-
squared_errors = np.sum((X - X_reconstructed) ** 2, axis=1)
|
141
|
+
residual = calculate_residual_spectrum(X, self.estimator_)
|
142
|
+
squared_errors = np.sum((residual) ** 2, axis=1)
|
128
143
|
|
129
|
-
return
|
144
|
+
return (
|
145
|
+
np.sqrt(squared_errors / (self.n_features_in_ - self.n_components_))
|
146
|
+
/ self.train_spe_
|
147
|
+
)
|
130
148
|
|
131
149
|
def _calculate_critical_value(self, X: Optional[np.ndarray] = None) -> float:
|
132
150
|
"""Calculate F-distribution based critical value.
|
@@ -24,10 +24,10 @@ class HotellingT2(_ModelResidualsBase):
|
|
24
24
|
|
25
25
|
Attributes
|
26
26
|
----------
|
27
|
-
|
27
|
+
estimator_ : ModelTypes
|
28
28
|
The fitted model of type _BasePCA or _PLS
|
29
29
|
|
30
|
-
|
30
|
+
transformer_ : Optional[Pipeline]
|
31
31
|
Preprocessing steps before the model
|
32
32
|
|
33
33
|
n_features_in_ : int
|
@@ -51,6 +51,7 @@ class HotellingT2(_ModelResidualsBase):
|
|
51
51
|
def __init__(
|
52
52
|
self, model: Union[ModelTypes, Pipeline], confidence: float = 0.95
|
53
53
|
) -> None:
|
54
|
+
self.model, self.confidence = model, confidence
|
54
55
|
super().__init__(model, confidence)
|
55
56
|
|
56
57
|
def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "HotellingT2":
|
@@ -93,7 +94,7 @@ class HotellingT2(_ModelResidualsBase):
|
|
93
94
|
return np.where(hotelling_t2_values > self.critical_value_, -1, 1)
|
94
95
|
|
95
96
|
def predict_residuals(
|
96
|
-
self, X: np.ndarray, y: Optional[np.ndarray], validate: bool = True
|
97
|
+
self, X: np.ndarray, y: Optional[np.ndarray] = None, validate: bool = True
|
97
98
|
) -> np.ndarray:
|
98
99
|
"""Calculate Hotelling's T-squared statistics for input data.
|
99
100
|
|
@@ -117,20 +118,20 @@ class HotellingT2(_ModelResidualsBase):
|
|
117
118
|
)
|
118
119
|
|
119
120
|
# Apply preprocessing steps
|
120
|
-
if self.
|
121
|
-
X = self.
|
121
|
+
if self.transformer_:
|
122
|
+
X = self.transformer_.transform(X)
|
122
123
|
|
123
124
|
# Calculate the Hotelling's T-squared statistics
|
124
|
-
if isinstance(self.
|
125
|
+
if isinstance(self.estimator_, _BasePCA):
|
125
126
|
# For PCA-like models
|
126
|
-
variances = self.
|
127
|
+
variances = self.estimator_.explained_variance_
|
127
128
|
|
128
|
-
if isinstance(self.
|
129
|
+
if isinstance(self.estimator_, _PLS):
|
129
130
|
# For PLS-like models
|
130
|
-
variances = np.var(self.
|
131
|
+
variances = np.var(self.estimator_.x_scores_, axis=0)
|
131
132
|
|
132
133
|
# Equivalent to X @ model.components_.T for _BasePCA and X @ model.x_rotations_ for _PLS
|
133
|
-
X_transformed = self.
|
134
|
+
X_transformed = self.estimator_.transform(X)
|
134
135
|
|
135
136
|
return np.sum((X_transformed**2) / variances, axis=1)
|
136
137
|
|
chemotools/outliers/leverage.py
CHANGED
@@ -20,10 +20,10 @@ class Leverage(_ModelResidualsBase):
|
|
20
20
|
|
21
21
|
Attributes
|
22
22
|
----------
|
23
|
-
|
23
|
+
estimator_ : ModelTypes
|
24
24
|
The fitted model of type _BasePCA or _PLS
|
25
25
|
|
26
|
-
|
26
|
+
transformer_ : Optional[Pipeline]
|
27
27
|
Preprocessing steps before the model
|
28
28
|
|
29
29
|
References
|
@@ -34,6 +34,7 @@ class Leverage(_ModelResidualsBase):
|
|
34
34
|
def __init__(
|
35
35
|
self, model: Union[ModelTypes, Pipeline], confidence: float = 0.95
|
36
36
|
) -> None:
|
37
|
+
model, confidence = model, confidence
|
37
38
|
super().__init__(model, confidence)
|
38
39
|
|
39
40
|
def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "Leverage":
|
@@ -47,8 +48,8 @@ class Leverage(_ModelResidualsBase):
|
|
47
48
|
self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
|
48
49
|
)
|
49
50
|
|
50
|
-
if self.
|
51
|
-
X = self.
|
51
|
+
if self.transformer_:
|
52
|
+
X = self.transformer_.fit_transform(X)
|
52
53
|
|
53
54
|
# Compute the critical threshold
|
54
55
|
self.critical_value_ = self._calculate_critical_value(X)
|
@@ -77,15 +78,15 @@ class Leverage(_ModelResidualsBase):
|
|
77
78
|
)
|
78
79
|
|
79
80
|
# Preprocess the data
|
80
|
-
if self.
|
81
|
-
X = self.
|
81
|
+
if self.transformer_:
|
82
|
+
X = self.transformer_.transform(X)
|
82
83
|
|
83
84
|
# Calculate outliers based on samples with too high leverage
|
84
|
-
leverage = calculate_leverage(self.
|
85
|
+
leverage = calculate_leverage(X, self.estimator_)
|
85
86
|
return np.where(leverage > self.critical_value_, -1, 1)
|
86
87
|
|
87
88
|
def predict_residuals(
|
88
|
-
self, X: np.ndarray, y: Optional[np.ndarray], validate: bool = True
|
89
|
+
self, X: np.ndarray, y: Optional[np.ndarray] = None, validate: bool = True
|
89
90
|
) -> np.ndarray:
|
90
91
|
"""Calculate the leverage of the samples.
|
91
92
|
|
@@ -107,23 +108,23 @@ class Leverage(_ModelResidualsBase):
|
|
107
108
|
X = validate_data(self, X, ensure_2d=True, dtype=np.float64)
|
108
109
|
|
109
110
|
# Apply preprocessing if available
|
110
|
-
if self.
|
111
|
-
X = self.
|
111
|
+
if self.transformer_:
|
112
|
+
X = self.transformer_.transform(X)
|
112
113
|
|
113
114
|
# Calculate the leverage
|
114
|
-
return calculate_leverage(self.
|
115
|
+
return calculate_leverage(X, self.estimator_)
|
115
116
|
|
116
|
-
def _calculate_critical_value(self, X:
|
117
|
+
def _calculate_critical_value(self, X: np.ndarray) -> float:
|
117
118
|
"""Calculate the critical value for outlier detection using the percentile outlier method."""
|
118
119
|
|
119
120
|
# Calculate the leverage of the samples
|
120
|
-
leverage = calculate_leverage(self.
|
121
|
+
leverage = calculate_leverage(X, self.estimator_)
|
121
122
|
|
122
123
|
# Calculate the critical value
|
123
124
|
return np.percentile(leverage, self.confidence * 100)
|
124
125
|
|
125
126
|
|
126
|
-
def calculate_leverage(
|
127
|
+
def calculate_leverage(X: np.ndarray, model: ModelTypes) -> np.ndarray:
|
127
128
|
"""
|
128
129
|
Calculate the leverage of the training samples in a PLS/PCA-like model.
|
129
130
|
|
@@ -7,6 +7,7 @@ from sklearn.pipeline import Pipeline
|
|
7
7
|
from sklearn.utils.validation import validate_data, check_is_fitted
|
8
8
|
|
9
9
|
from ._base import _ModelResidualsBase, ModelTypes
|
10
|
+
from .utils import calculate_residual_spectrum
|
10
11
|
|
11
12
|
|
12
13
|
class QResiduals(_ModelResidualsBase):
|
@@ -21,7 +22,7 @@ class QResiduals(_ModelResidualsBase):
|
|
21
22
|
confidence : float, default=0.95
|
22
23
|
Confidence level for statistical calculations (between 0 and 1).
|
23
24
|
|
24
|
-
method : str, default="
|
25
|
+
method : str, default="jackson-mudholkar"
|
25
26
|
The method used to compute the confidence threshold for Q residuals.
|
26
27
|
Options:
|
27
28
|
- "chi-square" : Uses mean and standard deviation to approximate Q residuals threshold.
|
@@ -30,10 +31,10 @@ class QResiduals(_ModelResidualsBase):
|
|
30
31
|
|
31
32
|
Attributes
|
32
33
|
----------
|
33
|
-
|
34
|
+
estimator_ : ModelTypes
|
34
35
|
The fitted model of type _BasePCA or _PLS.
|
35
36
|
|
36
|
-
|
37
|
+
transformer_ : Optional[Pipeline]
|
37
38
|
Preprocessing steps before the model.
|
38
39
|
|
39
40
|
n_features_in_ : int
|
@@ -58,9 +59,11 @@ class QResiduals(_ModelResidualsBase):
|
|
58
59
|
self,
|
59
60
|
model: Union[ModelTypes, Pipeline],
|
60
61
|
confidence: float = 0.95,
|
61
|
-
method: Literal[
|
62
|
+
method: Literal[
|
63
|
+
"chi-square", "jackson-mudholkar", "percentile"
|
64
|
+
] = "jackson-mudholkar",
|
62
65
|
) -> None:
|
63
|
-
self.method = method
|
66
|
+
self.model, self.confidence, self.method = model, confidence, method
|
64
67
|
super().__init__(model, confidence)
|
65
68
|
|
66
69
|
def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "QResiduals":
|
@@ -79,8 +82,8 @@ class QResiduals(_ModelResidualsBase):
|
|
79
82
|
"""
|
80
83
|
X = validate_data(self, X, ensure_2d=True, dtype=np.float64)
|
81
84
|
|
82
|
-
if self.
|
83
|
-
X = self.
|
85
|
+
if self.transformer_:
|
86
|
+
X = self.transformer_.fit_transform(X)
|
84
87
|
|
85
88
|
# Compute the critical threshold using the chosen method
|
86
89
|
self.critical_value_ = self._calculate_critical_value(X)
|
@@ -138,19 +141,18 @@ class QResiduals(_ModelResidualsBase):
|
|
138
141
|
X = validate_data(self, X, ensure_2d=True, dtype=np.float64)
|
139
142
|
|
140
143
|
# Apply preprocessing if available
|
141
|
-
if self.
|
142
|
-
X = self.
|
144
|
+
if self.transformer_:
|
145
|
+
X = self.transformer_.transform(X)
|
143
146
|
|
144
147
|
# Compute reconstruction error (Q residuals)
|
145
|
-
|
146
|
-
|
147
|
-
Q_residuals = np.sum((X - X_reconstructed) ** 2, axis=1)
|
148
|
+
residual = calculate_residual_spectrum(X, self.estimator_)
|
149
|
+
Q_residuals = np.sum(residual**2, axis=1)
|
148
150
|
|
149
151
|
return Q_residuals
|
150
152
|
|
151
153
|
def _calculate_critical_value(
|
152
154
|
self,
|
153
|
-
X:
|
155
|
+
X: np.ndarray,
|
154
156
|
) -> float:
|
155
157
|
"""Calculate the critical value for outlier detection.
|
156
158
|
|
@@ -172,17 +174,18 @@ class QResiduals(_ModelResidualsBase):
|
|
172
174
|
|
173
175
|
"""
|
174
176
|
# Compute Q residuals for training data
|
175
|
-
|
176
|
-
X_reconstructed = self.model_.inverse_transform(X_transformed)
|
177
|
-
residuals = X - X_reconstructed
|
177
|
+
residuals = calculate_residual_spectrum(X, self.estimator_)
|
178
178
|
|
179
179
|
if self.method == "chi-square":
|
180
180
|
return self._chi_square_threshold(residuals)
|
181
|
+
|
181
182
|
elif self.method == "jackson-mudholkar":
|
182
183
|
return self._jackson_mudholkar_threshold(residuals)
|
184
|
+
|
183
185
|
elif self.method == "percentile":
|
184
186
|
Q_residuals = np.sum((residuals) ** 2, axis=1)
|
185
187
|
return self._percentile_threshold(Q_residuals)
|
188
|
+
|
186
189
|
else:
|
187
190
|
raise ValueError(
|
188
191
|
"Invalid method. Choose from 'chi-square', 'jackson-mudholkar', or 'percentile'."
|
@@ -21,10 +21,10 @@ class StudentizedResiduals(_ModelResidualsBase):
|
|
21
21
|
|
22
22
|
Attributes
|
23
23
|
----------
|
24
|
-
|
24
|
+
estimator_ : ModelTypes
|
25
25
|
The fitted model of type _BasePCA or _PLS
|
26
26
|
|
27
|
-
|
27
|
+
transformer_ : Optional[Pipeline]
|
28
28
|
Preprocessing steps before the model
|
29
29
|
|
30
30
|
References
|
@@ -33,6 +33,7 @@ class StudentizedResiduals(_ModelResidualsBase):
|
|
33
33
|
"""
|
34
34
|
|
35
35
|
def __init__(self, model: Union[_PLS, Pipeline], confidence=0.95) -> None:
|
36
|
+
self.model, self.confidence = model, confidence
|
36
37
|
super().__init__(model, confidence)
|
37
38
|
|
38
39
|
def fit(self, X: np.ndarray, y: Optional[np.ndarray]) -> "StudentizedResiduals":
|
@@ -53,18 +54,18 @@ class StudentizedResiduals(_ModelResidualsBase):
|
|
53
54
|
)
|
54
55
|
|
55
56
|
# Preprocess the data
|
56
|
-
if self.
|
57
|
-
X = self.
|
57
|
+
if self.transformer_:
|
58
|
+
X = self.transformer_.transform(X)
|
58
59
|
|
59
60
|
# Calculate y residuals
|
60
|
-
y_residuals = y - self.
|
61
|
+
y_residuals = y - self.estimator_.predict(X)
|
61
62
|
y_residuals = (
|
62
63
|
y_residuals.reshape(-1, 1) if len(y_residuals.shape) == 1 else y_residuals
|
63
64
|
)
|
64
65
|
|
65
66
|
# Calculate the studentized residuals
|
66
67
|
studentized_residuals = calculate_studentized_residuals(
|
67
|
-
self.
|
68
|
+
self.estimator_, X, y_residuals
|
68
69
|
)
|
69
70
|
|
70
71
|
# Calculate the critical threshold
|
@@ -97,18 +98,18 @@ class StudentizedResiduals(_ModelResidualsBase):
|
|
97
98
|
)
|
98
99
|
|
99
100
|
# Preprocess the data
|
100
|
-
if self.
|
101
|
-
X = self.
|
101
|
+
if self.transformer_:
|
102
|
+
X = self.transformer_.transform(X)
|
102
103
|
|
103
104
|
# Calculate y residuals
|
104
|
-
y_residuals = y - self.
|
105
|
+
y_residuals = y - self.estimator_.predict(X)
|
105
106
|
y_residuals = (
|
106
107
|
y_residuals.reshape(-1, 1) if len(y_residuals.shape) == 1 else y_residuals
|
107
108
|
)
|
108
109
|
|
109
110
|
# Calculate the studentized residuals
|
110
111
|
studentized_residuals = calculate_studentized_residuals(
|
111
|
-
self.
|
112
|
+
self.estimator_, X, y_residuals
|
112
113
|
)
|
113
114
|
return np.where(studentized_residuals > self.critical_value_, -1, 1)
|
114
115
|
|
@@ -138,18 +139,18 @@ class StudentizedResiduals(_ModelResidualsBase):
|
|
138
139
|
X = validate_data(self, X, ensure_2d=True, dtype=np.float64)
|
139
140
|
|
140
141
|
# Apply preprocessing if available
|
141
|
-
if self.
|
142
|
-
X = self.
|
142
|
+
if self.transformer_:
|
143
|
+
X = self.transformer_.transform(X)
|
143
144
|
|
144
145
|
# Calculate y residuals
|
145
|
-
y_residuals = y - self.
|
146
|
+
y_residuals = y - self.estimator_.predict(X)
|
146
147
|
y_residuals = (
|
147
148
|
y_residuals.reshape(-1, 1) if len(y_residuals.shape) == 1 else y_residuals
|
148
149
|
)
|
149
150
|
|
150
|
-
return calculate_studentized_residuals(self.
|
151
|
+
return calculate_studentized_residuals(self.estimator_, X, y_residuals)
|
151
152
|
|
152
|
-
def _calculate_critical_value(self, X:
|
153
|
+
def _calculate_critical_value(self, X: np.ndarray) -> float:
|
153
154
|
"""Calculate the critical value for outlier detection.
|
154
155
|
|
155
156
|
Parameters
|
@@ -189,7 +190,7 @@ def calculate_studentized_residuals(
|
|
189
190
|
"""
|
190
191
|
|
191
192
|
# Calculate the leverage of the samples
|
192
|
-
leverage = calculate_leverage(
|
193
|
+
leverage = calculate_leverage(X, model)
|
193
194
|
|
194
195
|
# Calculate the standard deviation of the residuals
|
195
196
|
std = np.sqrt(np.sum(y_residuals**2, axis=0) / (X.shape[0] - model.n_components))
|
@@ -0,0 +1,51 @@
|
|
1
|
+
import numpy as np
|
2
|
+
|
3
|
+
from ._base import ModelTypes
|
4
|
+
|
5
|
+
|
6
|
+
def calculate_decoded_spectrum(X: np.ndarray, estimator: ModelTypes):
|
7
|
+
"""
|
8
|
+
Calculate the decoded spectrum for a given transformed (preprocessed!!) spectrum and estimator from the latent space.
|
9
|
+
|
10
|
+
Parameters
|
11
|
+
----------
|
12
|
+
spectrum : np.ndarray
|
13
|
+
The transformed spectrum data.
|
14
|
+
|
15
|
+
estimator : ModelTypes
|
16
|
+
The fitted PCA or PLS model.
|
17
|
+
|
18
|
+
Returns
|
19
|
+
-------
|
20
|
+
np.ndarray
|
21
|
+
The decoded spectrum.
|
22
|
+
"""
|
23
|
+
# Project the transformed spectrum onto the latent space
|
24
|
+
X_transformed = estimator.transform(X)
|
25
|
+
|
26
|
+
# Decode the spectrum back to the original space
|
27
|
+
return estimator.inverse_transform(X_transformed)
|
28
|
+
|
29
|
+
|
30
|
+
def calculate_residual_spectrum(X: np.ndarray, estimator: ModelTypes):
|
31
|
+
"""
|
32
|
+
Calculate the residual spectrum for a given transformed (preprocessed!!) spectrum and estimator.
|
33
|
+
|
34
|
+
Parameters
|
35
|
+
----------
|
36
|
+
spectrum : np.ndarray
|
37
|
+
The transformed spectrum data.
|
38
|
+
|
39
|
+
estimator : ModelTypes
|
40
|
+
The fitted PCA or PLS model.
|
41
|
+
|
42
|
+
Returns
|
43
|
+
-------
|
44
|
+
np.ndarray
|
45
|
+
The residual spectrum.
|
46
|
+
"""
|
47
|
+
# Compute the reconstruction error (Q residuals)
|
48
|
+
decoded_spectrum = calculate_decoded_spectrum(X, estimator)
|
49
|
+
|
50
|
+
# Calculate the residual
|
51
|
+
return X - decoded_spectrum
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: chemotools
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.11
|
4
4
|
Summary: chemotools: A Python Package that Integrates Chemometrics and scikit-learn
|
5
5
|
License: MIT
|
6
6
|
Author: Pau Cabaneros
|
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.13
|
|
14
14
|
Requires-Dist: numpy (>=2.0.0,<3.0.0)
|
15
15
|
Requires-Dist: pandas (>=2.0.0,<3.0.0)
|
16
16
|
Requires-Dist: polars (>=1.17.0,<2.0.0)
|
17
|
-
Requires-Dist: pyarrow (>=18
|
17
|
+
Requires-Dist: pyarrow (>=18,<21)
|
18
18
|
Requires-Dist: scikit-learn (>=1.4.0,<2.0.0)
|
19
19
|
Description-Content-Type: text/markdown
|
20
20
|
|
@@ -27,17 +27,20 @@ chemotools/datasets/data/train_spectra.csv,sha256=iVF19W52NHlbqq8BbLomn8n47kSPT0
|
|
27
27
|
chemotools/derivative/__init__.py,sha256=FkckdzO30jrRWPGpIU3cfnaTtxPtNT5Tb2G9F9PmVTw,134
|
28
28
|
chemotools/derivative/_norris_william.py,sha256=rMY_yntpiB5fbSM1tPph4AaGmF1k-HqJp7o48ijePBs,4958
|
29
29
|
chemotools/derivative/_savitzky_golay.py,sha256=CuCrKoLmrB1YmJ4ihIykgkL3tO3frqkStMogtsVhO3A,3632
|
30
|
-
chemotools/feature_selection/__init__.py,sha256=
|
30
|
+
chemotools/feature_selection/__init__.py,sha256=e_GFVawlDNEQv3EqrGSXUr5cvDN1jckoxe2C2jRwVl8,222
|
31
|
+
chemotools/feature_selection/_base.py,sha256=SIH6kl9AePVWTByL0OvJFfc2j3idqs7lm_7Zi1YMp4Y,2311
|
31
32
|
chemotools/feature_selection/_index_selector.py,sha256=lNTP2b7P3doWl30KiAr3Xd2HOMxeUmj24MuqoXl4Voc,3556
|
32
33
|
chemotools/feature_selection/_range_cut.py,sha256=lVVVC30ZsK2z9jsDGb_z6l8Ty2I89yM05_dIDbMP73Q,3564
|
34
|
+
chemotools/feature_selection/_sr_selector.py,sha256=OaXkt3t_NvymgDy6R15ig87jhcb-vM7i63LgtsNdfZo,3969
|
35
|
+
chemotools/feature_selection/_vip_selector.py,sha256=ZK3bhdpl3nBYt6xmuHq2IvWtpgJ8ZdElH06xnCFA-Xs,3835
|
33
36
|
chemotools/outliers/__init__.py,sha256=wpdlyqU34n1Pb9kGCM4idhcok35WAakxEhzP0xeKaZw,272
|
34
|
-
chemotools/outliers/_base.py,sha256=
|
35
|
-
chemotools/outliers/
|
36
|
-
chemotools/outliers/
|
37
|
-
chemotools/outliers/
|
38
|
-
chemotools/outliers/
|
39
|
-
chemotools/outliers/
|
40
|
-
chemotools/outliers/
|
37
|
+
chemotools/outliers/_base.py,sha256=zl0LhRKjpvj5IbYc3su6zEZ7YZ0pDSR3yqNWt2qBjNA,5374
|
38
|
+
chemotools/outliers/dmodx.py,sha256=sgizal_BDlqWTZNT8y2D_ImcKAJejXt6vqvFYk4Vqi0,5152
|
39
|
+
chemotools/outliers/hotelling_t2.py,sha256=g_IOQD_rhKb3cjIJkn5OTto6bYClQtqXunG_02BSIs8,5087
|
40
|
+
chemotools/outliers/leverage.py,sha256=hNQ_x68LPPTDZvSJP_eRqu3GoeV3OBU37VC_XTFEzvw,4250
|
41
|
+
chemotools/outliers/q_residuals.py,sha256=sg7u8ockQvSSnXwNM4U-GITB-5OcbsDMX6Oig_TcONM,7598
|
42
|
+
chemotools/outliers/studentized_residuals.py,sha256=1L-GiutuO1x9s3UKMOBpmhs2Q-UuDtfG2YLELIxiiao,5890
|
43
|
+
chemotools/outliers/utils.py,sha256=SAjvtjl9oWHrQnkqGnDfYE4WWAgiL1RwnKmW-ql5TIc,1304
|
41
44
|
chemotools/scale/__init__.py,sha256=eztqcHg-TKE1Rr0N9ArfytHk8teuqVfi4SZi2DS96vc,175
|
42
45
|
chemotools/scale/_min_max_scaler.py,sha256=YvqRkV2pXu-viQrpjzWcp9KmSSCYSoubSnrZHRLqgKQ,3011
|
43
46
|
chemotools/scale/_norm_scaler.py,sha256=CHWSir2q-pL1hxzw_ZB45yi4mw-SkJ4YOa1CUL4nm2I,2568
|
@@ -53,7 +56,7 @@ chemotools/smooth/_median_filter.py,sha256=9ndTJCwrZirWlvDNldiigMddy79KIGq9OwwYN
|
|
53
56
|
chemotools/smooth/_savitzky_golay_filter.py,sha256=27iFUWxdL9_7oZabR0R5L0ZTpBmYfVUjx2XCTukihBE,3509
|
54
57
|
chemotools/smooth/_whittaker_smooth.py,sha256=lpLAyf4GdyDW4ulT1nyEoK6xQEl2cVUKquawQdGWbHU,3571
|
55
58
|
chemotools/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
56
|
-
chemotools-0.1.
|
57
|
-
chemotools-0.1.
|
58
|
-
chemotools-0.1.
|
59
|
-
chemotools-0.1.
|
59
|
+
chemotools-0.1.11.dist-info/LICENSE,sha256=qtyOy2wDQVX9hxp58h3T-6Lmfv-mSCHoSRkcLUdM9bg,1070
|
60
|
+
chemotools-0.1.11.dist-info/METADATA,sha256=Ne8xEa1cZUhbP-I4D1CFVvy8fhJANUjsY5cXRpNVV1k,5232
|
61
|
+
chemotools-0.1.11.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
62
|
+
chemotools-0.1.11.dist-info/RECORD,,
|
chemotools/outliers/_utils.py
DELETED
@@ -1,91 +0,0 @@
|
|
1
|
-
from typing import Optional, Tuple, Union
|
2
|
-
|
3
|
-
from sklearn.cross_decomposition._pls import _PLS
|
4
|
-
from sklearn.decomposition._base import _BasePCA
|
5
|
-
from sklearn.pipeline import Pipeline
|
6
|
-
from sklearn.utils.validation import check_is_fitted
|
7
|
-
|
8
|
-
ModelTypes = Union[_BasePCA, _PLS]
|
9
|
-
|
10
|
-
|
11
|
-
def get_model_parameters(model: ModelTypes) -> Tuple[int, int, int]:
|
12
|
-
"""
|
13
|
-
Get the number of features, components and samples from a model with PLS or PCA. types.
|
14
|
-
|
15
|
-
Parameters
|
16
|
-
----------
|
17
|
-
model : ModelType
|
18
|
-
A fitted model of type _BasePCA or _PLS
|
19
|
-
|
20
|
-
Returns
|
21
|
-
-------
|
22
|
-
Tuple[int, int, int]
|
23
|
-
The number of features, components and samples in the model
|
24
|
-
"""
|
25
|
-
if isinstance(model, _BasePCA):
|
26
|
-
return model.n_features_in_, model.n_components_, model.n_samples_
|
27
|
-
elif isinstance(model, _PLS):
|
28
|
-
return model.n_features_in_, model.n_components, len(model.x_scores_)
|
29
|
-
else:
|
30
|
-
raise ValueError(
|
31
|
-
"Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
|
32
|
-
)
|
33
|
-
|
34
|
-
|
35
|
-
def validate_confidence(confidence: float) -> float:
|
36
|
-
"""Validate parameters using sklearn conventions.
|
37
|
-
|
38
|
-
Parameters
|
39
|
-
----------
|
40
|
-
confidence : float
|
41
|
-
Confidence level for statistical calculations (between 0 and 1)
|
42
|
-
|
43
|
-
Returns
|
44
|
-
-------
|
45
|
-
float
|
46
|
-
The validated confidence level
|
47
|
-
|
48
|
-
Raises
|
49
|
-
------
|
50
|
-
ValueError
|
51
|
-
If confidence is not between 0 and 1
|
52
|
-
"""
|
53
|
-
if not 0 < confidence < 1:
|
54
|
-
raise ValueError("Confidence must be between 0 and 1")
|
55
|
-
return confidence
|
56
|
-
|
57
|
-
|
58
|
-
def validate_and_extract_model(
|
59
|
-
model: Union[ModelTypes, Pipeline],
|
60
|
-
) -> Tuple[ModelTypes, Optional[Pipeline], int, int, int]:
|
61
|
-
"""Validate and extract the model and preprocessing steps.
|
62
|
-
|
63
|
-
Parameters
|
64
|
-
----------
|
65
|
-
model : Union[ModelTypes, Pipeline]
|
66
|
-
A fitted PCA/PLS model or Pipeline ending with such a model
|
67
|
-
|
68
|
-
Returns
|
69
|
-
-------
|
70
|
-
Tuple[ModelTypes, Optional[Pipeline]]
|
71
|
-
The extracted model and preprocessing steps
|
72
|
-
|
73
|
-
Raises
|
74
|
-
------
|
75
|
-
ValueError
|
76
|
-
If the model is not of type _BasePCA or _PLS or a Pipeline ending with one of these types or if the model is not fitted
|
77
|
-
"""
|
78
|
-
if isinstance(model, Pipeline):
|
79
|
-
preprocessing = model[:-1]
|
80
|
-
model = model[-1]
|
81
|
-
else:
|
82
|
-
preprocessing = None
|
83
|
-
|
84
|
-
if not isinstance(model, (_BasePCA, _PLS)):
|
85
|
-
raise ValueError(
|
86
|
-
"Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
|
87
|
-
)
|
88
|
-
|
89
|
-
check_is_fitted(model)
|
90
|
-
n_features_in, n_components, n_samples = get_model_parameters(model)
|
91
|
-
return model, preprocessing, n_features_in, n_components, n_samples
|
File without changes
|