chemotools 0.1.9__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chemotools/augmentation/__init__.py +2 -0
- chemotools/augmentation/_gaussian_broadening.py +136 -0
- chemotools/outliers/__init__.py +7 -0
- chemotools/outliers/_base.py +180 -0
- chemotools/outliers/_utils.py +91 -0
- chemotools/outliers/dmodx.py +146 -0
- chemotools/outliers/hotelling_t2.py +155 -0
- chemotools/outliers/leverage.py +150 -0
- chemotools/outliers/q_residuals.py +225 -0
- chemotools/outliers/studentized_residuals.py +197 -0
- {chemotools-0.1.9.dist-info → chemotools-0.1.10.dist-info}/METADATA +1 -1
- {chemotools-0.1.9.dist-info → chemotools-0.1.10.dist-info}/RECORD +14 -5
- {chemotools-0.1.9.dist-info → chemotools-0.1.10.dist-info}/WHEEL +1 -1
- {chemotools-0.1.9.dist-info → chemotools-0.1.10.dist-info}/LICENSE +0 -0
chemotools/augmentation/__init__.py

```diff
@@ -1,6 +1,7 @@
 from ._add_noise import AddNoise
 from ._baseline_shift import BaselineShift
 from ._fractional_shift import FractionalShift
+from ._gaussian_broadening import GaussianBroadening
 from ._index_shift import IndexShift
 from ._spectrum_scale import SpectrumScale
 
@@ -9,6 +10,7 @@ __all__ = [
     "AddNoise",
    "BaselineShift",
     "FractionalShift",
+    "GaussianBroadening",
     "IndexShift",
     "SpectrumScale",
 ]
```
chemotools/augmentation/_gaussian_broadening.py (new file, +136)

```python
from typing import Literal, Optional
import numpy as np
from scipy.ndimage import gaussian_filter1d
from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
from sklearn.utils.validation import check_is_fitted, validate_data


class GaussianBroadening(TransformerMixin, OneToOneFeatureMixin, BaseEstimator):
    """
    Transform spectral data by broadening peaks using Gaussian convolution.

    This transformer applies Gaussian smoothing to broaden peaks in spectral data.
    For each signal, a random sigma is chosen between 0 and the specified sigma value.

    Parameters
    ----------
    sigma : float, default=1.0
        Maximum standard deviation for the Gaussian kernel.
        The actual sigma used will be randomly chosen between 0 and this value.

    mode : {'reflect', 'constant', 'nearest', 'mirror', 'wrap'}, default='reflect'
        The mode parameter determines how the input array is extended when
        the filter overlaps a border. Default is 'reflect'.

    pad_value : float, default=0.0
        Value to fill past edges of input if mode is 'constant'.

    random_state : int, optional, default=None
        Random state for reproducible sigma selection.

    truncate : float, default=4.0
        Truncate the filter at this many standard deviations.
        Larger values increase computation time but improve accuracy.
    """

    def __init__(
        self,
        sigma: float = 1.0,
        mode: Literal["reflect", "constant", "nearest", "mirror", "wrap"] = "reflect",
        pad_value: float = 0.0,
        random_state: Optional[int] = None,
        truncate: float = 4.0,
    ):
        self.sigma = sigma
        self.mode = mode
        self.pad_value = pad_value
        self.random_state = random_state
        self.truncate = truncate

    def fit(self, X: np.ndarray, y=None) -> "GaussianBroadening":
        """
        Fit the transformer to the data (in this case, only validates input).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data to validate.

        y : None
            Ignored.

        Returns
        -------
        self : GaussianBroadening
            The fitted transformer.
        """
        X = validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        # Validate sigma parameter
        if not isinstance(self.sigma, (int, float)):
            raise ValueError("sigma must be a number")
        if self.sigma < 0:
            raise ValueError("sigma must be non-negative")

        # Initialize random number generator
        self._rng = np.random.default_rng(self.random_state)

        return self

    def transform(self, X: np.ndarray, y=None) -> np.ndarray:
        """
        Apply Gaussian broadening to the input data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to transform.

        y : None
            Ignored.

        Returns
        -------
        X_transformed : ndarray of shape (n_samples, n_features)
            The transformed data with broadened peaks.
        """
        check_is_fitted(self, "n_features_in_")
        X_ = validate_data(
            self,
            X,
            y="no_validation",
            ensure_2d=True,
            copy=True,
            reset=False,
            dtype=np.float64,
        )

        # Transform each sample
        for i, x in enumerate(X_):
            X_[i] = self._broaden_signal(x)

        return X_

    def _broaden_signal(self, x: np.ndarray) -> np.ndarray:
        """
        Apply Gaussian broadening to a single signal.

        Parameters
        ----------
        x : ndarray of shape (n_features,)
            The input signal to broaden.

        Returns
        -------
        broadened_signal : ndarray of shape (n_features,)
            The broadened signal.
        """
        # Randomly choose sigma between 0 and max sigma
        sigma = self._rng.uniform(0, self.sigma)

        # Apply Gaussian filter
        return gaussian_filter1d(
            x, sigma=sigma, mode=self.mode, cval=self.pad_value, truncate=self.truncate
        )
```
chemotools/outliers/__init__.py (new file, +7)

```python
from .dmodx import DModX
from .hotelling_t2 import HotellingT2
from .q_residuals import QResiduals
from .leverage import Leverage
from .studentized_residuals import StudentizedResiduals

__all__ = ["DModX", "HotellingT2", "QResiduals", "Leverage", "StudentizedResiduals"]
```
chemotools/outliers/_base.py (new file, +180)

```python
from abc import ABC, abstractmethod
from typing import Union, Optional

import numpy as np

from sklearn.base import BaseEstimator, OutlierMixin
from sklearn.decomposition._base import _BasePCA
from sklearn.cross_decomposition._pls import _PLS
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import check_is_fitted

from ._utils import validate_confidence, validate_and_extract_model

ModelTypes = Union[_BasePCA, _PLS]


class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
    """Base class for model outlier calculations.

    Implements statistical calculations for outlier detection in dimensionality
    reduction models such as PCA and PLS.

    Parameters
    ----------
    model : Union[ModelTypes, Pipeline]
        A fitted _BasePCA or _PLS model, or a Pipeline ending with such a model

    confidence : float
        Confidence level for statistical calculations (between 0 and 1)

    Attributes
    ----------
    model_ : ModelTypes
        The fitted model of type _BasePCA or _PLS

    preprocessing_ : Optional[Pipeline]
        Preprocessing steps before the model

    n_features_in_ : int
        Number of features in the input data

    n_components_ : int
        Number of components in the model

    n_samples_ : int
        Number of samples used to train the model

    critical_value_ : float
        The calculated critical value for outlier detection
    """

    def __init__(
        self,
        model: Union[ModelTypes, Pipeline],
        confidence: float,
    ) -> None:
        (
            self.model_,
            self.preprocessing_,
            self.n_features_in_,
            self.n_components_,
            self.n_samples_,
        ) = validate_and_extract_model(model)
        self.confidence = validate_confidence(confidence)

    def fit_predict_residuals(
        self, X: np.ndarray, y: Optional[np.ndarray] = None
    ) -> np.ndarray:
        """Fit the model to the input data and calculate the residuals.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data

        y : array-like of shape (n_samples,), default=None
            Target values

        Returns
        -------
        ndarray of shape (n_samples,)
            The residuals of the model
        """
        self.fit(X, y)
        return self.predict_residuals(X, y, validate=True)

    @abstractmethod
    def predict_residuals(
        self, X: np.ndarray, y: Optional[np.ndarray], validate: bool
    ) -> np.ndarray:
        """Calculate the residuals of the model.

        Returns
        -------
        ndarray of shape (n_samples,)
            The residuals of the model
        """

    @abstractmethod
    def _calculate_critical_value(self, X: Optional[np.ndarray]) -> float:
        """Calculate the critical value for outlier detection.

        Returns
        -------
        float
            The calculated critical value for outlier detection
        """


class _ModelDiagnosticsBase(ABC):
    """Base class for model diagnostics methods. This class does not implement
    outlier detection algorithms; it implements methods used to assess trained
    models.

    Parameters
    ----------
    model : Union[ModelTypes, Pipeline]
        A fitted PCA/PLS model or Pipeline ending with such a model

    Attributes
    ----------
    model_ : ModelTypes
        The fitted model of type _BasePCA or _PLS

    preprocessing_ : Optional[Pipeline]
        Preprocessing steps before the model
    """

    def __init__(self, model: Union[ModelTypes, Pipeline]):
        self.model_, self.preprocessing_ = self._validate_and_extract_model(model)

    def _validate_and_extract_model(self, model):
        """Validate and extract the model and preprocessing steps.

        Parameters
        ----------
        model : Union[ModelTypes, Pipeline]
            A fitted PCA/PLS model or Pipeline ending with such a model

        Returns
        -------
        Tuple[ModelTypes, Optional[Pipeline]]
            The extracted model and preprocessing steps

        Raises
        ------
        ValueError
            If the model is not of type _BasePCA or _PLS, or a Pipeline ending
            with one of these types, or if the model is not fitted
        """
        if isinstance(model, Pipeline):
            preprocessing = model[:-1]
            model = model[-1]
        else:
            preprocessing = None

        if not isinstance(model, (_BasePCA, _PLS)):
            raise ValueError(
                "Model not a valid model. Must be of base type _BasePCA or _PLS "
                "or a Pipeline ending with one of these types."
            )
        check_is_fitted(model)
        return model, preprocessing

    @abstractmethod
    def predict(self, X: np.ndarray, y: Optional[np.ndarray]) -> np.ndarray:
        """Predict the output of the model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data

        y : array-like of shape (n_samples,), default=None
            Target values

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted values
        """
```
chemotools/outliers/_utils.py (new file, +91)

```python
from typing import Optional, Tuple, Union

from sklearn.cross_decomposition._pls import _PLS
from sklearn.decomposition._base import _BasePCA
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import check_is_fitted

ModelTypes = Union[_BasePCA, _PLS]


def get_model_parameters(model: ModelTypes) -> Tuple[int, int, int]:
    """
    Get the number of features, components, and samples from a fitted
    _BasePCA or _PLS model.

    Parameters
    ----------
    model : ModelTypes
        A fitted model of type _BasePCA or _PLS

    Returns
    -------
    Tuple[int, int, int]
        The number of features, components and samples in the model
    """
    if isinstance(model, _BasePCA):
        return model.n_features_in_, model.n_components_, model.n_samples_
    elif isinstance(model, _PLS):
        return model.n_features_in_, model.n_components, len(model.x_scores_)
    else:
        raise ValueError(
            "Model not a valid model. Must be of base type _BasePCA or _PLS "
            "or a Pipeline ending with one of these types."
        )


def validate_confidence(confidence: float) -> float:
    """Validate parameters using sklearn conventions.

    Parameters
    ----------
    confidence : float
        Confidence level for statistical calculations (between 0 and 1)

    Returns
    -------
    float
        The validated confidence level

    Raises
    ------
    ValueError
        If confidence is not between 0 and 1
    """
    if not 0 < confidence < 1:
        raise ValueError("Confidence must be between 0 and 1")
    return confidence


def validate_and_extract_model(
    model: Union[ModelTypes, Pipeline],
) -> Tuple[ModelTypes, Optional[Pipeline], int, int, int]:
    """Validate and extract the model and preprocessing steps.

    Parameters
    ----------
    model : Union[ModelTypes, Pipeline]
        A fitted PCA/PLS model or Pipeline ending with such a model

    Returns
    -------
    Tuple[ModelTypes, Optional[Pipeline], int, int, int]
        The extracted model and preprocessing steps, plus the number of
        features, components, and samples

    Raises
    ------
    ValueError
        If the model is not of type _BasePCA or _PLS, or a Pipeline ending
        with one of these types, or if the model is not fitted
    """
    if isinstance(model, Pipeline):
        preprocessing = model[:-1]
        model = model[-1]
    else:
        preprocessing = None

    if not isinstance(model, (_BasePCA, _PLS)):
        raise ValueError(
            "Model not a valid model. Must be of base type _BasePCA or _PLS "
            "or a Pipeline ending with one of these types."
        )

    check_is_fitted(model)
    n_features_in, n_components, n_samples = get_model_parameters(model)
    return model, preprocessing, n_features_in, n_components, n_samples
```
chemotools/outliers/dmodx.py (new file, +146)

```python
from typing import Optional, Union
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.utils.validation import validate_data, check_is_fitted
from scipy.stats import f as f_distribution


from ._base import _ModelResidualsBase, ModelTypes


class DModX(_ModelResidualsBase):
    """Calculate Distance to Model (DModX) statistics.

    DModX measures the distance between an observation and the model plane
    in the X-space, useful for detecting outliers.

    Parameters
    ----------
    model : Union[ModelTypes, Pipeline]
        A fitted PCA/PLS model or Pipeline ending with such a model

    confidence : float, default=0.95
        Confidence level for statistical calculations (between 0 and 1)

    Attributes
    ----------
    model_ : ModelTypes
        The fitted model of type _BasePCA or _PLS

    preprocessing_ : Optional[Pipeline]
        Preprocessing steps before the model

    n_features_in_ : int
        Number of features in the input data

    n_components_ : int
        Number of components in the model

    n_samples_ : int
        Number of samples used to train the model

    critical_value_ : float
        The calculated critical value for outlier detection
    """

    def __init__(
        self,
        model: Union[ModelTypes, Pipeline],
        confidence: float = 0.95,
    ) -> None:
        super().__init__(model, confidence)

    def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "DModX":
        """
        Fit the model to the input data.

        This step calculates the critical value for outlier detection. In the
        DModX method, the critical value does not depend on the input data but
        on the model parameters.
        """
        X = validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        self.critical_value_ = self._calculate_critical_value()
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Identify outliers in the input data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data

        Returns
        -------
        ndarray of shape (n_samples,)
            Array indicating outliers (-1) and inliers (1)
        """
        # Check the estimator has been fitted
        check_is_fitted(self, ["critical_value_"])

        # Validate the input data
        X = validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        # Calculate outliers based on the DModX statistics
        dmodx_values = self.predict_residuals(X, validate=False)
        return np.where(dmodx_values > self.critical_value_, -1, 1)

    def predict_residuals(
        self, X: np.ndarray, y: Optional[np.ndarray] = None, validate: bool = True
    ) -> np.ndarray:
        """Calculate DModX statistics for input data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data

        validate : bool, default=True
            Whether to validate the input data

        Returns
        -------
        ndarray of shape (n_samples,)
            DModX statistics for each sample
        """
        # Check the estimator has been fitted
        check_is_fitted(self, ["critical_value_"])

        # Validate the input data
        if validate:
            X = validate_data(
                self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
            )

        # Apply preprocessing if available
        if self.preprocessing_:
            X = self.preprocessing_.transform(X)

        # Calculate the DModX statistics
        X_transformed = self.model_.transform(X)
        X_reconstructed = self.model_.inverse_transform(X_transformed)
        squared_errors = np.sum((X - X_reconstructed) ** 2, axis=1)

        return np.sqrt(squared_errors / (self.n_features_in_ - self.n_components_))

    def _calculate_critical_value(self, X: Optional[np.ndarray] = None) -> float:
        """Calculate F-distribution based critical value.

        Returns
        -------
        float
            The critical value for outlier detection
        """
        dof_numerator = self.n_features_in_ - self.n_components_
        dof_denominator = self.n_features_in_ - self.n_components_ - 1

        upper_control_limit = f_distribution.ppf(
            self.confidence, dof_numerator, dof_denominator
        )
        return np.sqrt(upper_control_limit)
```
chemotools/outliers/hotelling_t2.py (new file, +155)

```python
from typing import Optional, Union
import numpy as np

from sklearn.cross_decomposition._pls import _PLS
from sklearn.decomposition._base import _BasePCA
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import validate_data, check_is_fitted
from scipy.stats import f as f_distribution

from ._base import _ModelResidualsBase, ModelTypes


class HotellingT2(_ModelResidualsBase):
    """
    Calculate Hotelling's T-squared statistics for PCA- or PLS-like models.

    Parameters
    ----------
    model : Union[ModelTypes, Pipeline]
        A fitted PCA/PLS model or Pipeline ending with such a model

    confidence : float, default=0.95
        Confidence level for statistical calculations (between 0 and 1)

    Attributes
    ----------
    model_ : ModelTypes
        The fitted model of type _BasePCA or _PLS

    preprocessing_ : Optional[Pipeline]
        Preprocessing steps before the model

    n_features_in_ : int
        Number of features in the input data

    n_components_ : int
        Number of components in the model

    n_samples_ : int
        Number of samples used to train the model

    critical_value_ : float
        The calculated critical value for outlier detection

    References
    ----------
    Johan A. Westerhuis, Stephen P. Gurden, Age K. Smilde (2000).
    Generalized contribution plots in multivariate statistical process
    monitoring. Chemometrics and Intelligent Laboratory Systems, 51, 95–114.
    """

    def __init__(
        self, model: Union[ModelTypes, Pipeline], confidence: float = 0.95
    ) -> None:
        super().__init__(model, confidence)

    def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "HotellingT2":
        """
        Fit the model to the input data.

        This step calculates the critical value for outlier detection. In
        Hotelling's T-squared method, the critical value does not depend on
        the input data but on the model parameters.
        """
        X = validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        self.critical_value_ = self._calculate_critical_value()
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Identify outliers in the input data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data

        Returns
        -------
        ndarray of shape (n_samples,)
            Array indicating outliers (-1) and inliers (1)
        """
        # Check the estimator has been fitted
        check_is_fitted(self, ["critical_value_"])

        # Validate the input data
        X = validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        # Calculate the Hotelling's T-squared statistics
        hotelling_t2_values = self.predict_residuals(X, y=None, validate=False)
        return np.where(hotelling_t2_values > self.critical_value_, -1, 1)

    def predict_residuals(
        self, X: np.ndarray, y: Optional[np.ndarray], validate: bool = True
    ) -> np.ndarray:
        """Calculate Hotelling's T-squared statistics for input data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data

        Returns
        -------
        ndarray of shape (n_samples,)
            Hotelling's T-squared statistics for each sample
        """
        # Check the estimator has been fitted
        check_is_fitted(self, ["critical_value_"])

        # Validate the input data
        if validate:
            X = validate_data(
                self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
            )

        # Apply preprocessing steps
        if self.preprocessing_:
            X = self.preprocessing_.transform(X)

        # Calculate the Hotelling's T-squared statistics
        if isinstance(self.model_, _BasePCA):
            # For PCA-like models
            variances = self.model_.explained_variance_

        if isinstance(self.model_, _PLS):
            # For PLS-like models
            variances = np.var(self.model_.x_scores_, axis=0)

        # Equivalent to X @ model.components_.T for _BasePCA and
        # X @ model.x_rotations_ for _PLS
        X_transformed = self.model_.transform(X)

        return np.sum((X_transformed**2) / variances, axis=1)

    def _calculate_critical_value(self, X: Optional[np.ndarray] = None) -> float:
        """
        Calculate the critical value for the Hotelling's T-squared statistics.

        Returns
        -------
        float
            The critical value for the Hotelling's T-squared statistics
        """
        critical_value = f_distribution.ppf(
            self.confidence, self.n_components_, self.n_samples_ - self.n_components_
        )
        return (
            critical_value
            * self.n_components_
            * (self.n_samples_ - 1)
            / (self.n_samples_ - self.n_components_)
        )
```
chemotools/outliers/leverage.py (new file, +150)

```python
from typing import Optional, Union
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.utils.validation import validate_data, check_is_fitted


from ._base import _ModelResidualsBase, ModelTypes


class Leverage(_ModelResidualsBase):
    """
    Calculate the leverage of the training samples in the latent space of a
    PCA or PLS model. This allows detecting data points with high leverage
    on the model.

    Parameters
    ----------
    model : Union[ModelTypes, Pipeline]
        A fitted PCA/PLS model or Pipeline ending with such a model

    Attributes
    ----------
    model_ : ModelTypes
        The fitted model of type _BasePCA or _PLS

    preprocessing_ : Optional[Pipeline]
        Preprocessing steps before the model
    """

    def __init__(
        self, model: Union[ModelTypes, Pipeline], confidence: float = 0.95
    ) -> None:
        super().__init__(model, confidence)

    def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "Leverage":
        """
        Fit the model to the input data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data
        """
        X = validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        if self.preprocessing_:
            X = self.preprocessing_.fit_transform(X)

        # Compute the critical threshold
        self.critical_value_ = self._calculate_critical_value(X)

        return self

    def predict(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray:
        """Flag samples whose leverage on the model exceeds the critical value.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data

        Returns
        -------
        ndarray of shape (n_samples,)
            Array with -1 for samples with a leverage above the critical
            value, 1 otherwise
        """
        # Check the estimator has been fitted
        check_is_fitted(self, ["critical_value_"])

        # Validate the input data
        X = validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        # Preprocess the data
        if self.preprocessing_:
            X = self.preprocessing_.transform(X)

        # Calculate outliers based on samples with too high leverage
        leverage = calculate_leverage(self.model_, X)
        return np.where(leverage > self.critical_value_, -1, 1)

    def predict_residuals(
        self, X: np.ndarray, y: Optional[np.ndarray], validate: bool = True
    ) -> np.ndarray:
        """Calculate the leverage of the samples.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data

        Returns
        -------
        np.ndarray
            Leverage of the samples
        """
        # Check the estimator has been fitted
        check_is_fitted(self, ["critical_value_"])

        # Validate the input data
        if validate:
            X = validate_data(self, X, ensure_2d=True, dtype=np.float64)

        # Apply preprocessing if available
        if self.preprocessing_:
            X = self.preprocessing_.transform(X)

        # Calculate the leverage
        return calculate_leverage(self.model_, X)

    def _calculate_critical_value(self, X: Optional[np.ndarray]) -> float:
        """Calculate the critical value for outlier detection using the
        percentile method."""

        # Calculate the leverage of the samples
        leverage = calculate_leverage(self.model_, X)

        # Calculate the critical value
        return np.percentile(leverage, self.confidence * 100)


def calculate_leverage(model: ModelTypes, X: Optional[np.ndarray]) -> np.ndarray:
    """
    Calculate the leverage of the samples in a PCA/PLS-like model.

    Parameters
    ----------
    model : Union[_BasePCA, _PLS]
        A fitted PCA/PLS model

    X : np.ndarray
        Preprocessed input data

    Returns
    -------
    np.ndarray
        Leverage of the samples
    """
    X_transformed = model.transform(X)

    X_hat = (
        X_transformed @ np.linalg.inv(X_transformed.T @ X_transformed) @ X_transformed.T
    )

    return np.diag(X_hat)
```
chemotools/outliers/q_residuals.py (new file, +225)

```python
from typing import Optional, Literal, Union

import numpy as np

from scipy.stats import norm, chi2
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import validate_data, check_is_fitted

from ._base import _ModelResidualsBase, ModelTypes


class QResiduals(_ModelResidualsBase):
    """
    Calculate Q residuals (Squared Prediction Error - SPE) for PCA or PLS models.

    Parameters
    ----------
    model : Union[ModelTypes, Pipeline]
        A fitted PCA/PLS model or Pipeline ending with such a model.

    confidence : float, default=0.95
        Confidence level for statistical calculations (between 0 and 1).

    method : str, default="percentile"
        The method used to compute the confidence threshold for Q residuals.
        Options:
        - "chi-square" : Uses mean and standard deviation to approximate the Q residuals threshold.
        - "jackson-mudholkar" : Uses an eigenvalue-based analytical approximation.
        - "percentile" : Uses the empirical percentile threshold.

    Attributes
    ----------
    model_ : ModelTypes
        The fitted model of type _BasePCA or _PLS.

    preprocessing_ : Optional[Pipeline]
        Preprocessing steps before the model.

    n_features_in_ : int
        Number of features in the input data.

    n_components_ : int
        Number of components in the model.

    n_samples_ : int
        Number of samples used to train the model.

    critical_value_ : float
        The calculated critical value for outlier detection.

    References
    ----------
    Johan A. Westerhuis, Stephen P. Gurden, Age K. Smilde (2000).
    Generalized contribution plots in multivariate statistical process
    monitoring. Chemometrics and Intelligent Laboratory Systems, 51, 95–114.
    """

    def __init__(
        self,
        model: Union[ModelTypes, Pipeline],
        confidence: float = 0.95,
        method: Literal["chi-square", "jackson-mudholkar", "percentile"] = "percentile",
    ) -> None:
        self.method = method
        super().__init__(model, confidence)

    def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "QResiduals":
        """
        Fit the Q residuals model by computing residuals from the training set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.

        Returns
        -------
        self : object
            Fitted instance of QResiduals.
        """
        X = validate_data(self, X, ensure_2d=True, dtype=np.float64)

        if self.preprocessing_:
            X = self.preprocessing_.fit_transform(X)

        # Compute the critical threshold using the chosen method
        self.critical_value_ = self._calculate_critical_value(X)

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Identify outliers in the input data based on the Q residuals threshold.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data.

        Returns
        -------
        ndarray of shape (n_samples,)
            Array indicating outliers (-1 for outliers, 1 for normal data).
        """
        # Check the estimator has been fitted
        check_is_fitted(self, ["critical_value_"])

        # Validate the input data
        X = validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        # Calculate outliers based on the Q residuals
        Q_residuals = self.predict_residuals(X, validate=False)
        return np.where(Q_residuals > self.critical_value_, -1, 1)

    def predict_residuals(
        self, X: np.ndarray, y: Optional[np.ndarray] = None, validate: bool = True
    ) -> np.ndarray:
        """Calculate Q residuals (Squared Prediction Error - SPE) for input data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data.

        validate : bool, default=True
            Whether to validate the input data.

        Returns
        -------
        ndarray of shape (n_samples,)
            Q residuals for each sample.
        """
        # Check the estimator has been fitted
        check_is_fitted(self, ["critical_value_"])

        # Validate the input data
        if validate:
            X = validate_data(self, X, ensure_2d=True, dtype=np.float64)

        # Apply preprocessing if available
        if self.preprocessing_:
            X = self.preprocessing_.transform(X)

        # Compute reconstruction error (Q residuals)
        X_transformed = self.model_.transform(X)
        X_reconstructed = self.model_.inverse_transform(X_transformed)
        Q_residuals = np.sum((X - X_reconstructed) ** 2, axis=1)

        return Q_residuals

    def _calculate_critical_value(
        self,
        X: Optional[np.ndarray] = None,
    ) -> float:
        """Calculate the critical value for outlier detection.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data.

        Returns
        -------
        float
            The calculated critical value for outlier detection.
        """
        # Compute Q residuals for training data
        X_transformed = self.model_.transform(X)
        X_reconstructed = self.model_.inverse_transform(X_transformed)
        residuals = X - X_reconstructed

        if self.method == "chi-square":
            return self._chi_square_threshold(residuals)
        elif self.method == "jackson-mudholkar":
            return self._jackson_mudholkar_threshold(residuals)
        elif self.method == "percentile":
            Q_residuals = np.sum((residuals) ** 2, axis=1)
            return self._percentile_threshold(Q_residuals)
        else:
            raise ValueError(
                "Invalid method. Choose from 'chi-square', 'jackson-mudholkar', or 'percentile'."
            )

    def _chi_square_threshold(self, residuals: np.ndarray) -> float:
        """Compute the Q residual threshold using a chi-square approximation."""
        eigenvalues = np.linalg.trace(np.cov(residuals.T))

        theta_1 = np.sum(eigenvalues)
        theta_2 = np.sum(eigenvalues**2)
        # Degrees of freedom approximation
        g = theta_2 / theta_1
        h = (2 * theta_1**2) / theta_2

        # Compute chi-square critical value at the given confidence level
        chi_critical = chi2.ppf(self.confidence, df=h)

        # Compute the final Q residual threshold
        return g * chi_critical

    def _jackson_mudholkar_threshold(self, residuals: np.ndarray) -> float:
        """Compute the Q residual threshold using Jackson & Mudholkar's analytical method."""
        eigenvalues = np.linalg.trace(np.cov(residuals.T))
        theta_1 = np.sum(eigenvalues)
        theta_2 = np.sum(eigenvalues**2)
        theta_3 = np.sum(eigenvalues**3)
        z_alpha = norm.ppf(self.confidence)

        h0 = 1 - (2 * theta_1 * theta_3) / (3 * theta_2**2)

        term1 = theta_2 * h0 * (1 - h0) / theta_1**2
        term2 = np.sqrt(z_alpha * 2 * theta_2 * h0**2) / theta_1

        return theta_1 * (1 - term1 + term2) ** (1 / h0)

    def _percentile_threshold(self, Q_residuals: np.ndarray) -> float:
        """Compute the Q residual threshold using the empirical percentile method."""
        return np.percentile(Q_residuals, self.confidence * 100)
```
chemotools/outliers/studentized_residuals.py (new file, +197)

```python
from typing import Optional, Union
import numpy as np

from sklearn.cross_decomposition._pls import _PLS
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import validate_data, check_is_fitted


from ._base import _ModelResidualsBase, ModelTypes
from .leverage import calculate_leverage


class StudentizedResiduals(_ModelResidualsBase):
    """
    Calculate the studentized residuals of a _PLS model's predictions.

    Parameters
    ----------
    model : Union[ModelTypes, Pipeline]
        A fitted _PLS model or Pipeline ending with such a model

    Attributes
    ----------
    model_ : ModelTypes
        The fitted model of type _BasePCA or _PLS

    preprocessing_ : Optional[Pipeline]
        Preprocessing steps before the model
    """

    def __init__(self, model: Union[_PLS, Pipeline], confidence=0.95) -> None:
        super().__init__(model, confidence)

    def fit(self, X: np.ndarray, y: Optional[np.ndarray]) -> "StudentizedResiduals":
        """
        Fit the model to the input data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data

        y : array-like of shape (n_samples,)
            Target data
        """
        # Validate the input data
        X = validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        # Preprocess the data
        if self.preprocessing_:
            X = self.preprocessing_.transform(X)

        # Calculate y residuals
        y_residuals = y - self.model_.predict(X)
        y_residuals = (
            y_residuals.reshape(-1, 1) if len(y_residuals.shape) == 1 else y_residuals
        )

        # Calculate the studentized residuals
        studentized_residuals = calculate_studentized_residuals(
            self.model_, X, y_residuals
        )

        # Calculate the critical threshold
        self.critical_value_ = self._calculate_critical_value(studentized_residuals)

        return self

    def predict(self, X: np.ndarray, y: Optional[np.ndarray]) -> np.ndarray:
        """Calculate studentized residuals of the model predictions and return
        an array flagging outliers.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data

        y : array-like of shape (n_samples,)
            Target data

        Returns
        -------
        ndarray of shape (n_samples,)
            Array with -1 where the studentized residual exceeds the critical
            value, 1 otherwise
        """
        # Check the estimator has been fitted
        check_is_fitted(self, ["critical_value_"])

        # Validate the input data
        X = validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        # Preprocess the data
        if self.preprocessing_:
            X = self.preprocessing_.transform(X)

        # Calculate y residuals
        y_residuals = y - self.model_.predict(X)
        y_residuals = (
            y_residuals.reshape(-1, 1) if len(y_residuals.shape) == 1 else y_residuals
        )

        # Calculate the studentized residuals
        studentized_residuals = calculate_studentized_residuals(
            self.model_, X, y_residuals
        )
        return np.where(studentized_residuals > self.critical_value_, -1, 1)

    def predict_residuals(
        self, X: np.ndarray, y: Optional[np.ndarray], validate: bool = True
    ) -> np.ndarray:
        """Calculate the studentized residuals of the model predictions.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data

        y : array-like of shape (n_samples,)
            Target values

        Returns
        -------
        ndarray of shape (n_samples,)
            Studentized residuals of the model predictions
        """
        # Check the estimator has been fitted
        check_is_fitted(self, ["critical_value_"])

        # Validate the input data
        if validate:
            X = validate_data(self, X, ensure_2d=True, dtype=np.float64)

        # Apply preprocessing if available
        if self.preprocessing_:
            X = self.preprocessing_.transform(X)

        # Calculate y residuals
        y_residuals = y - self.model_.predict(X)
        y_residuals = (
            y_residuals.reshape(-1, 1) if len(y_residuals.shape) == 1 else y_residuals
        )

        return calculate_studentized_residuals(self.model_, X, y_residuals)

    def _calculate_critical_value(self, X: Optional[np.ndarray]) -> float:
        """Calculate the critical value for outlier detection.

        Parameters
        ----------
        X : array-like of shape (n_samples,)
            Studentized residuals

        Returns
        -------
        float
            The calculated critical value for outlier detection
        """
        return np.percentile(X, self.confidence * 100) if X is not None else 0.0


def calculate_studentized_residuals(
    model: ModelTypes, X: np.ndarray, y_residuals: np.ndarray
) -> np.ndarray:
    """Calculate the studentized residuals of the model predictions.

    Parameters
    ----------
    model : ModelTypes
        A fitted model

    X : array-like of shape (n_samples, n_features)
        Input data

    y_residuals : array-like of shape (n_samples, n_targets)
        Residuals of the target values

    Returns
    -------
    ndarray of shape (n_samples,)
        Studentized residuals of the model predictions
    """
    # Calculate the leverage of the samples
    leverage = calculate_leverage(model, X)

    # Calculate the standard deviation of the residuals
    std = np.sqrt(np.sum(y_residuals**2, axis=0) / (X.shape[0] - model.n_components))

    return (y_residuals / (std * np.sqrt(1 - leverage.reshape(-1, 1)))).flatten()
```
{chemotools-0.1.9.dist-info → chemotools-0.1.10.dist-info}/RECORD

```diff
@@ -1,8 +1,9 @@
 chemotools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-chemotools/augmentation/__init__.py,sha256=
+chemotools/augmentation/__init__.py,sha256=ohlRHgRWTkvNpO3RikKtowzboqunQqx0WqtNccuWOHw,397
 chemotools/augmentation/_add_noise.py,sha256=fkTJfIYtZXezcjy6Vz8asIhpBoVp4oaIifppK9vZpM8,4362
 chemotools/augmentation/_baseline_shift.py,sha256=kIlYvmKS9pu9vh_-eZ7PSHPuH_58V9mgYbSJt6Gq3BA,3476
 chemotools/augmentation/_fractional_shift.py,sha256=dJ0Vuc-U02HhjKkOwc48qnOksZYgbHwL2ko7tWCZTQU,6916
+chemotools/augmentation/_gaussian_broadening.py,sha256=dJsPlTKqpecKaCDU3vOvedIb-t_HyCkQprxNv0DmYZQ,4236
 chemotools/augmentation/_index_shift.py,sha256=BTtadweDvvMtiF8t7ldwsE6Kl6FmKLCkVJjSzSWyIDs,6904
 chemotools/augmentation/_spectrum_scale.py,sha256=hMsmzXpssbI7tGm_YnQn9wjbByso3CgVxd3Hs8kfLS8,3442
 chemotools/baseline/__init__.py,sha256=VzoblGg8Hx_FkTc_n7a-ZjGvtKP8JE_NwJKWenGFQkM,584
@@ -29,6 +30,14 @@ chemotools/derivative/_savitzky_golay.py,sha256=CuCrKoLmrB1YmJ4ihIykgkL3tO3frqkS
 chemotools/feature_selection/__init__.py,sha256=1_i28hIxijjwhMypTy1w2fLbzXXVkKD5IYzzY8ZSuHw,117
 chemotools/feature_selection/_index_selector.py,sha256=lNTP2b7P3doWl30KiAr3Xd2HOMxeUmj24MuqoXl4Voc,3556
 chemotools/feature_selection/_range_cut.py,sha256=lVVVC30ZsK2z9jsDGb_z6l8Ty2I89yM05_dIDbMP73Q,3564
+chemotools/outliers/__init__.py,sha256=wpdlyqU34n1Pb9kGCM4idhcok35WAakxEhzP0xeKaZw,272
+chemotools/outliers/_base.py,sha256=zx9z_Snkvq5YWBoRi_-kRr3a-Q7jTz1zVlrGWycUTb4,5232
+chemotools/outliers/_utils.py,sha256=SDrrDvgEVQyPuKdh0Rw0DD4a8LShbNAQLRwSLICtiYU,2720
+chemotools/outliers/dmodx.py,sha256=R9LaQpUJeDv4GJ0hroKOlFcFbsfQRtrHWD_EI3-TX7Y,4521
+chemotools/outliers/hotelling_t2.py,sha256=Ga1qmlurF_fps9igaTUGOrnUOctIJEYqoCdb468KhY4,5006
+chemotools/outliers/leverage.py,sha256=zgxG2F7ZCf5wRVJeezHSJ2gaUDTP2CvK38Rr-hR6niA,4215
+chemotools/outliers/q_residuals.py,sha256=6_h73A1YxHBcQtjAXOAp1Rb7egHJwj0DQ0MKdnj6aBQ,7647
+chemotools/outliers/studentized_residuals.py,sha256=rF0wObKQV5DCa8THkZcuwdu7u4mBk-dbOHth5tj5cqM,5830
 chemotools/scale/__init__.py,sha256=eztqcHg-TKE1Rr0N9ArfytHk8teuqVfi4SZi2DS96vc,175
 chemotools/scale/_min_max_scaler.py,sha256=YvqRkV2pXu-viQrpjzWcp9KmSSCYSoubSnrZHRLqgKQ,3011
 chemotools/scale/_norm_scaler.py,sha256=CHWSir2q-pL1hxzw_ZB45yi4mw-SkJ4YOa1CUL4nm2I,2568
@@ -44,7 +53,7 @@ chemotools/smooth/_median_filter.py,sha256=9ndTJCwrZirWlvDNldiigMddy79KIGq9OwwYN
 chemotools/smooth/_savitzky_golay_filter.py,sha256=27iFUWxdL9_7oZabR0R5L0ZTpBmYfVUjx2XCTukihBE,3509
 chemotools/smooth/_whittaker_smooth.py,sha256=lpLAyf4GdyDW4ulT1nyEoK6xQEl2cVUKquawQdGWbHU,3571
 chemotools/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-chemotools-0.1.
-chemotools-0.1.
-chemotools-0.1.
-chemotools-0.1.
+chemotools-0.1.10.dist-info/LICENSE,sha256=qtyOy2wDQVX9hxp58h3T-6Lmfv-mSCHoSRkcLUdM9bg,1070
+chemotools-0.1.10.dist-info/METADATA,sha256=fRgOO8cS2JNtNWz_CEG0uKvncSHEJ8myfhm2IOz3y-4,5240
+chemotools-0.1.10.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
+chemotools-0.1.10.dist-info/RECORD,,
```
{chemotools-0.1.9.dist-info → chemotools-0.1.10.dist-info}/LICENSE: file without changes