chemotools 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chemotools/augmentation/__init__.py +2 -0
- chemotools/augmentation/_gaussian_broadening.py +136 -0
- chemotools/feature_selection/__init__.py +3 -1
- chemotools/feature_selection/_base.py +88 -0
- chemotools/feature_selection/_sr_selector.py +137 -0
- chemotools/feature_selection/_vip_selector.py +129 -0
- chemotools/outliers/__init__.py +7 -0
- chemotools/outliers/_base.py +188 -0
- chemotools/outliers/dmodx.py +164 -0
- chemotools/outliers/hotelling_t2.py +156 -0
- chemotools/outliers/leverage.py +151 -0
- chemotools/outliers/q_residuals.py +228 -0
- chemotools/outliers/studentized_residuals.py +198 -0
- chemotools/outliers/utils.py +51 -0
- {chemotools-0.1.9.dist-info → chemotools-0.1.11.dist-info}/METADATA +2 -2
- {chemotools-0.1.9.dist-info → chemotools-0.1.11.dist-info}/RECORD +18 -6
- {chemotools-0.1.9.dist-info → chemotools-0.1.11.dist-info}/WHEEL +1 -1
- {chemotools-0.1.9.dist-info → chemotools-0.1.11.dist-info}/LICENSE +0 -0
chemotools/outliers/_base.py
@@ -0,0 +1,188 @@
+from abc import ABC, abstractmethod
+from typing import Optional, Tuple, Union
+
+import numpy as np
+
+from sklearn.base import BaseEstimator, OutlierMixin
+from sklearn.decomposition._base import _BasePCA
+from sklearn.cross_decomposition._pls import _PLS
+from sklearn.pipeline import Pipeline
+from sklearn.utils.validation import check_is_fitted
+
+
+ModelTypes = Union[_BasePCA, _PLS]
+
+
+class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
+    """Base class for model outlier calculations.
+
+    Implements statistical calculations for outlier detection in dimensionality
+    reduction models such as PCA and PLS.
+
+    Parameters
+    ----------
+    model : Union[ModelTypes, Pipeline]
+        A fitted _BasePCA or _PLS model, or a Pipeline ending with such a model
+
+    confidence : float
+        Confidence level for statistical calculations (between 0 and 1)
+
+    Attributes
+    ----------
+    estimator_ : ModelTypes
+        The fitted model of type _BasePCA or _PLS
+
+    transformer_ : Optional[Pipeline]
+        Preprocessing steps applied before the model
+
+    n_features_in_ : int
+        Number of features in the input data
+
+    n_components_ : int
+        Number of components in the model
+
+    n_samples_ : int
+        Number of samples used to train the model
+
+    critical_value_ : float
+        The calculated critical value for outlier detection
+    """
+
+    def __init__(
+        self,
+        model: Union[ModelTypes, Pipeline],
+        confidence: float,
+    ) -> None:
+        (
+            self.estimator_,
+            self.transformer_,
+            self.n_features_in_,
+            self.n_components_,
+            self.n_samples_,
+        ) = _validate_and_extract_model(model)
+        self.confidence = _validate_confidence(confidence)
+
+    def fit_predict_residuals(
+        self, X: np.ndarray, y: Optional[np.ndarray] = None
+    ) -> np.ndarray:
+        """Fit the model to the input data and calculate the residuals.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data
+
+        y : array-like of shape (n_samples,), default=None
+            Target values
+
+        Returns
+        -------
+        ndarray of shape (n_samples,)
+            The residuals of the model
+        """
+        self.fit(X, y)
+        return self.predict_residuals(X, y, validate=True)
+
+    @abstractmethod
+    def predict_residuals(
+        self, X: np.ndarray, y: Optional[np.ndarray], validate: bool
+    ) -> np.ndarray:
+        """Calculate the residuals of the model.
+
+        Returns
+        -------
+        ndarray of shape (n_samples,)
+            The residuals of the model
+        """
+
+    @abstractmethod
+    def _calculate_critical_value(self, X: np.ndarray) -> float:
+        """Calculate the critical value for outlier detection.
+
+        Returns
+        -------
+        float
+            The calculated critical value for outlier detection
+        """
+
+
+def _get_model_parameters(model: ModelTypes) -> Tuple[int, int, int]:
+    """
+    Get the number of features, components and samples from a PCA- or PLS-type model.
+
+    Parameters
+    ----------
+    model : ModelTypes
+        A fitted model of type _BasePCA or _PLS
+
+    Returns
+    -------
+    Tuple[int, int, int]
+        The number of features, components and samples in the model
+    """
+    if isinstance(model, _BasePCA):
+        return model.n_features_in_, model.n_components_, model.n_samples_
+    elif isinstance(model, _PLS):
+        return model.n_features_in_, model.n_components, len(model.x_scores_)
+    else:
+        raise ValueError(
+            "Model is not valid. It must be of base type _BasePCA or _PLS, or a Pipeline ending with one of these types."
+        )
+
+
+def _validate_confidence(confidence: float) -> float:
+    """Validate parameters using sklearn conventions.
+
+    Parameters
+    ----------
+    confidence : float
+        Confidence level for statistical calculations (between 0 and 1)
+
+    Returns
+    -------
+    float
+        The validated confidence level
+
+    Raises
+    ------
+    ValueError
+        If confidence is not between 0 and 1
+    """
+    if not 0 < confidence < 1:
+        raise ValueError("Confidence must be between 0 and 1")
+    return confidence
+
+
+def _validate_and_extract_model(
+    model: Union[ModelTypes, Pipeline],
+) -> Tuple[ModelTypes, Optional[Pipeline], int, int, int]:
+    """Validate and extract the model and preprocessing steps.
+
+    Parameters
+    ----------
+    model : Union[ModelTypes, Pipeline]
+        A fitted PCA/PLS model or Pipeline ending with such a model
+
+    Returns
+    -------
+    Tuple[ModelTypes, Optional[Pipeline], int, int, int]
+        The extracted model, the preprocessing steps, and the number of
+        features, components and samples
+
+    Raises
+    ------
+    ValueError
+        If the model is not of type _BasePCA or _PLS, or a Pipeline ending with
+        one of these types, or if the model is not fitted
+    """
+    if isinstance(model, Pipeline):
+        preprocessing = model[:-1]
+        model = model[-1]
+    else:
+        preprocessing = None
+
+    if not isinstance(model, (_BasePCA, _PLS)):
+        raise ValueError(
+            "Model is not valid. It must be of base type _BasePCA or _PLS, or a Pipeline ending with one of these types."
+        )
+
+    check_is_fitted(model)
+    n_features_in, n_components, n_samples = _get_model_parameters(model)
+    return model, preprocessing, n_features_in, n_components, n_samples
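
For orientation, a minimal sketch (not part of the diff) of what the private helper above extracts from a fitted pipeline. It assumes scikit-learn, synthetic data, and that the helper is importable from the new chemotools.outliers._base module; everything else here is illustrative.

import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from chemotools.outliers._base import _validate_and_extract_model

# Fit a preprocessing + PCA pipeline on synthetic data
X = np.random.default_rng(0).normal(size=(50, 20))
pipe = make_pipeline(StandardScaler(), PCA(n_components=3)).fit(X)

# The helper splits the pipeline into its terminal model and the steps before it
model, preprocessing, n_features, n_components, n_samples = _validate_and_extract_model(pipe)
print(type(model).__name__)                 # PCA (the terminal estimator)
print(len(preprocessing))                   # 1 (the StandardScaler sub-pipeline)
print(n_features, n_components, n_samples)  # 20 3 50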
chemotools/outliers/dmodx.py
@@ -0,0 +1,164 @@
+from typing import Optional, Union
+import numpy as np
+
+from sklearn.pipeline import Pipeline
+from sklearn.utils.validation import validate_data, check_is_fitted
+from scipy.stats import f as f_distribution
+
+
+from ._base import _ModelResidualsBase, ModelTypes
+from .utils import calculate_residual_spectrum
+
+
+class DModX(_ModelResidualsBase):
+    """Calculate Distance to Model in X-space (DModX) statistics.
+
+    DModX measures the distance between an observation and the model plane
+    in the X-space, which is useful for detecting outliers.
+
+    Parameters
+    ----------
+    model : Union[ModelTypes, Pipeline]
+        A fitted PCA/PLS model or Pipeline ending with such a model
+
+    confidence : float, default=0.95
+        Confidence level for statistical calculations (between 0 and 1)
+
+    Attributes
+    ----------
+    estimator_ : ModelTypes
+        The fitted model of type _BasePCA or _PLS
+
+    transformer_ : Optional[Pipeline]
+        Preprocessing steps applied before the model
+
+    n_features_in_ : int
+        Number of features in the input data
+
+    n_components_ : int
+        Number of components in the model
+
+    n_samples_ : int
+        Number of samples used to train the model
+
+    critical_value_ : float
+        The calculated critical value for outlier detection
+
+    train_spe_ : ndarray of shape (n_samples,)
+        The training sum of squared errors (SSE), normalized by the degrees of freedom
+    """
+
+    def __init__(
+        self,
+        model: Union[ModelTypes, Pipeline],
+        confidence: float = 0.95,
+    ) -> None:
+        super().__init__(model, confidence)
+
+    def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "DModX":
+        """
+        Fit the model to the input data.
+
+        This step calculates the critical value for outlier detection. In the
+        DModX method, the critical value does not depend on the input data,
+        only on the model parameters.
+        """
+        X = validate_data(
+            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+        )
+
+        # Calculate the critical value
+        self.critical_value_ = self._calculate_critical_value()
+
+        # Calculate the degrees-of-freedom-normalized SPE of the training set
+        residuals = calculate_residual_spectrum(X, self.estimator_)
+        squared_errors = np.sum(residuals**2, axis=1)
+        self.train_spe_ = np.sqrt(
+            squared_errors
+            / (self.n_samples_ - self.n_components_ - 1)
+            * (self.n_features_in_ - self.n_components_)
+        )
+
+        return self
+
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        """Identify outliers in the input data.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data
+
+        Returns
+        -------
+        ndarray of shape (n_samples,)
+            Array with -1 for outliers and 1 for inliers
+        """
+        # Check the estimator has been fitted
+        check_is_fitted(self, ["critical_value_"])
+
+        # Validate the input data
+        X = validate_data(
+            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+        )
+
+        # Flag outliers based on the DModX statistics
+        dmodx_values = self.predict_residuals(X, validate=False)
+        return np.where(dmodx_values > self.critical_value_, -1, 1)
+
+    def predict_residuals(
+        self, X: np.ndarray, y: Optional[np.ndarray] = None, validate: bool = True
+    ) -> np.ndarray:
+        """Calculate DModX statistics for input data.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data
+
+        validate : bool, default=True
+            Whether to validate the input data
+
+        Returns
+        -------
+        ndarray of shape (n_samples,)
+            DModX statistics for each sample
+        """
+        # Check the estimator has been fitted
+        check_is_fitted(self, ["critical_value_"])
+
+        # Validate the input data
+        if validate:
+            X = validate_data(
+                self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+            )
+
+        # Apply preprocessing if available
+        if self.transformer_:
+            X = self.transformer_.transform(X)
+
+        # Calculate the DModX statistics
+        residual = calculate_residual_spectrum(X, self.estimator_)
+        squared_errors = np.sum(residual**2, axis=1)
+
+        return (
+            np.sqrt(squared_errors / (self.n_features_in_ - self.n_components_))
+            / self.train_spe_
+        )
+
+    def _calculate_critical_value(self, X: Optional[np.ndarray] = None) -> float:
+        """Calculate the F-distribution-based critical value.
+
+        Returns
+        -------
+        float
+            The critical value for outlier detection
+        """
+        dof_numerator = self.n_features_in_ - self.n_components_
+        dof_denominator = self.n_features_in_ - self.n_components_ - 1
+
+        upper_control_limit = f_distribution.ppf(
+            self.confidence, dof_numerator, dof_denominator
+        )
+        return np.sqrt(upper_control_limit)
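
A usage sketch of the new detector (not part of the diff), on synthetic data. It assumes DModX is re-exported from chemotools.outliers, whose new __init__.py appears in the file list above; all other names are placeholders.

import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from chemotools.outliers import DModX

rng = np.random.default_rng(1)
X_train = rng.normal(size=(100, 30))

# DModX accepts either a bare PCA/PLS model or a pipeline ending in one
pipe = make_pipeline(StandardScaler(), PCA(n_components=5)).fit(X_train)

detector = DModX(pipe, confidence=0.95).fit(X_train)
labels = detector.predict(X_train)           # -1 = outlier, 1 = inlier
dmodx = detector.predict_residuals(X_train)  # DModX statistic per sample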
chemotools/outliers/hotelling_t2.py
@@ -0,0 +1,156 @@
+from typing import Optional, Union
+import numpy as np
+
+from sklearn.cross_decomposition._pls import _PLS
+from sklearn.decomposition._base import _BasePCA
+from sklearn.pipeline import Pipeline
+from sklearn.utils.validation import validate_data, check_is_fitted
+from scipy.stats import f as f_distribution
+
+from ._base import _ModelResidualsBase, ModelTypes
+
+
+class HotellingT2(_ModelResidualsBase):
+    """
+    Calculate Hotelling's T-squared statistics for PCA- or PLS-like models.
+
+    Parameters
+    ----------
+    model : Union[ModelTypes, Pipeline]
+        A fitted PCA/PLS model or Pipeline ending with such a model
+
+    confidence : float, default=0.95
+        Confidence level for statistical calculations (between 0 and 1)
+
+    Attributes
+    ----------
+    estimator_ : ModelTypes
+        The fitted model of type _BasePCA or _PLS
+
+    transformer_ : Optional[Pipeline]
+        Preprocessing steps applied before the model
+
+    n_features_in_ : int
+        Number of features in the input data
+
+    n_components_ : int
+        Number of components in the model
+
+    n_samples_ : int
+        Number of samples used to train the model
+
+    critical_value_ : float
+        The calculated critical value for outlier detection
+
+    References
+    ----------
+    Johan A. Westerhuis, Stephen P. Gurden, Age K. Smilde (2000). Generalized
+    contribution plots in multivariate statistical process monitoring.
+    Chemometrics and Intelligent Laboratory Systems, 51, 95–114.
+    """
+
+    def __init__(
+        self, model: Union[ModelTypes, Pipeline], confidence: float = 0.95
+    ) -> None:
+        self.model, self.confidence = model, confidence
+        super().__init__(model, confidence)
+
+    def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "HotellingT2":
+        """
+        Fit the model to the input data.
+
+        This step calculates the critical value for outlier detection. In the
+        Hotelling's T-squared method, the critical value does not depend on
+        the input data, only on the model parameters.
+        """
+        X = validate_data(
+            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+        )
+
+        self.critical_value_ = self._calculate_critical_value()
+        return self
+
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        """Identify outliers in the input data.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data
+
+        Returns
+        -------
+        ndarray of shape (n_samples,)
+            Array with -1 for outliers and 1 for inliers
+        """
+        # Check the estimator has been fitted
+        check_is_fitted(self, ["critical_value_"])
+
+        # Validate the input data
+        X = validate_data(
+            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+        )
+
+        # Flag outliers based on the Hotelling's T-squared statistics
+        hotelling_t2_values = self.predict_residuals(X, y=None, validate=False)
+        return np.where(hotelling_t2_values > self.critical_value_, -1, 1)
+
+    def predict_residuals(
+        self, X: np.ndarray, y: Optional[np.ndarray] = None, validate: bool = True
+    ) -> np.ndarray:
+        """Calculate Hotelling's T-squared statistics for input data.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data
+
+        Returns
+        -------
+        ndarray of shape (n_samples,)
+            Hotelling's T-squared statistic for each sample
+        """
+        # Check the estimator has been fitted
+        check_is_fitted(self, ["critical_value_"])
+
+        # Validate the input data
+        if validate:
+            X = validate_data(
+                self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+            )
+
+        # Apply preprocessing steps
+        if self.transformer_:
+            X = self.transformer_.transform(X)
+
+        # Component variances: explained variances for PCA-like models,
+        # score variances for PLS-like models
+        if isinstance(self.estimator_, _BasePCA):
+            variances = self.estimator_.explained_variance_
+
+        if isinstance(self.estimator_, _PLS):
+            variances = np.var(self.estimator_.x_scores_, axis=0)
+
+        # Equivalent to X @ model.components_.T for _BasePCA and
+        # X @ model.x_rotations_ for _PLS
+        X_transformed = self.estimator_.transform(X)
+
+        return np.sum((X_transformed**2) / variances, axis=1)
+
+    def _calculate_critical_value(self, X: Optional[np.ndarray] = None) -> float:
+        """
+        Calculate the critical value for the Hotelling's T-squared statistics.
+
+        Returns
+        -------
+        float
+            The critical value for the Hotelling's T-squared statistics
+        """
+        critical_value = f_distribution.ppf(
+            self.confidence, self.n_components_, self.n_samples_ - self.n_components_
+        )
+        return (
+            critical_value
+            * self.n_components_
+            * (self.n_samples_ - 1)
+            / (self.n_samples_ - self.n_components_)
+        )
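
The control limit computed above is the usual F-based form, T2_crit = A(n-1)/(n-A) * F_conf(A, n-A), with A components and n training samples. A small numeric sketch of the same calculation (the values of n, A and confidence are illustrative only):

from scipy.stats import f as f_distribution

n, A, confidence = 100, 5, 0.95          # n training samples, A components
f_crit = f_distribution.ppf(confidence, A, n - A)
t2_crit = f_crit * A * (n - 1) / (n - A)  # mirrors _calculate_critical_value
print(f_crit, t2_crit)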
chemotools/outliers/leverage.py
@@ -0,0 +1,151 @@
+from typing import Optional, Union
+import numpy as np
+
+from sklearn.pipeline import Pipeline
+from sklearn.utils.validation import validate_data, check_is_fitted
+
+
+from ._base import _ModelResidualsBase, ModelTypes
+
+
+class Leverage(_ModelResidualsBase):
+    """
+    Calculate the leverage of the training samples on the latent space of a
+    PCA or PLS model. This makes it possible to detect data points with high
+    leverage on the model.
+
+    Parameters
+    ----------
+    model : Union[ModelTypes, Pipeline]
+        A fitted PCA/PLS model or Pipeline ending with such a model
+
+    confidence : float, default=0.95
+        Confidence level for the percentile-based critical value (between 0 and 1)
+
+    Attributes
+    ----------
+    estimator_ : ModelTypes
+        The fitted model of type _BasePCA or _PLS
+
+    transformer_ : Optional[Pipeline]
+        Preprocessing steps applied before the model
+    """
+
+    def __init__(
+        self, model: Union[ModelTypes, Pipeline], confidence: float = 0.95
+    ) -> None:
+        super().__init__(model, confidence)
+
+    def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "Leverage":
+        """
+        Fit the model to the input data.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data
+        """
+        X = validate_data(
+            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+        )
+
+        if self.transformer_:
+            X = self.transformer_.fit_transform(X)
+
+        # Compute the critical threshold
+        self.critical_value_ = self._calculate_critical_value(X)
+
+        return self
+
+    def predict(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray:
+        """Identify samples whose leverage on the model exceeds the critical value.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data
+
+        Returns
+        -------
+        ndarray of shape (n_samples,)
+            Array with -1 for samples with leverage above the critical value
+            and 1 otherwise
+        """
+        # Check the estimator has been fitted
+        check_is_fitted(self, ["critical_value_"])
+
+        # Validate the input data
+        X = validate_data(
+            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+        )
+
+        # Preprocess the data
+        if self.transformer_:
+            X = self.transformer_.transform(X)
+
+        # Flag samples with too high a leverage as outliers
+        leverage = calculate_leverage(X, self.estimator_)
+        return np.where(leverage > self.critical_value_, -1, 1)
+
+    def predict_residuals(
+        self, X: np.ndarray, y: Optional[np.ndarray] = None, validate: bool = True
+    ) -> np.ndarray:
+        """Calculate the leverage of the samples.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data
+
+        Returns
+        -------
+        np.ndarray
+            Leverage of the samples
+        """
+        # Check the estimator has been fitted
+        check_is_fitted(self, ["critical_value_"])
+
+        # Validate the input data
+        if validate:
+            X = validate_data(self, X, ensure_2d=True, dtype=np.float64)
+
+        # Apply preprocessing if available
+        if self.transformer_:
+            X = self.transformer_.transform(X)
+
+        # Calculate the leverage
+        return calculate_leverage(X, self.estimator_)
+
+    def _calculate_critical_value(self, X: np.ndarray) -> float:
+        """Calculate the critical value for outlier detection using the percentile method."""
+
+        # Calculate the leverage of the samples
+        leverage = calculate_leverage(X, self.estimator_)
+
+        # Calculate the critical value
+        return np.percentile(leverage, self.confidence * 100)
+
+
+def calculate_leverage(X: np.ndarray, model: ModelTypes) -> np.ndarray:
+    """
+    Calculate the leverage of the samples in a PLS/PCA-like model.
+
+    Parameters
+    ----------
+    X : np.ndarray
+        Preprocessed input data
+
+    model : Union[_BasePCA, _PLS]
+        A fitted PCA/PLS model
+
+    Returns
+    -------
+    np.ndarray
+        Leverage of the samples
+    """
+    X_transformed = model.transform(X)
+
+    X_hat = (
+        X_transformed @ np.linalg.inv(X_transformed.T @ X_transformed) @ X_transformed.T
+    )
+
+    return np.diag(X_hat)