chemotools 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- chemotools/augmentation/__init__.py +4 -0
- chemotools/augmentation/_add_noise.py +70 -49
- chemotools/augmentation/_fractional_shift.py +203 -0
- chemotools/augmentation/_gaussian_broadening.py +136 -0
- chemotools/augmentation/_index_shift.py +116 -101
- chemotools/outliers/__init__.py +7 -0
- chemotools/outliers/_base.py +180 -0
- chemotools/outliers/_utils.py +91 -0
- chemotools/outliers/dmodx.py +146 -0
- chemotools/outliers/hotelling_t2.py +155 -0
- chemotools/outliers/leverage.py +150 -0
- chemotools/outliers/q_residuals.py +225 -0
- chemotools/outliers/studentized_residuals.py +197 -0
- {chemotools-0.1.8.dist-info → chemotools-0.1.10.dist-info}/METADATA +1 -1
- {chemotools-0.1.8.dist-info → chemotools-0.1.10.dist-info}/RECORD +17 -7
- {chemotools-0.1.8.dist-info → chemotools-0.1.10.dist-info}/WHEEL +1 -1
- {chemotools-0.1.8.dist-info → chemotools-0.1.10.dist-info}/LICENSE +0 -0
chemotools/augmentation/_index_shift.py

```diff
@@ -1,23 +1,37 @@
 from typing import Literal, Optional
 
 import numpy as np
-from
+from scipy.signal import convolve
+from scipy import stats
 from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
 from sklearn.utils.validation import check_is_fitted, validate_data
 
 
 class IndexShift(TransformerMixin, OneToOneFeatureMixin, BaseEstimator):
     """
-    Shift the spectrum a given number of indices between -
+    Shift the spectrum a given number of indices between -shift and +shift drawn
     from a discrete uniform distribution.
 
     Parameters
     ----------
-    shift :
-
+    shift : int, default=0
+        Maximum number of indices by which the data is randomly shifted.
+        The actual shift is a random integer between -shift and shift (inclusive).
 
-
-
+    padding_mode : {'zeros', 'constant', 'wrap', 'extend', 'mirror', 'linear'}, default='linear'
+        Specifies how to handle padding when shifting the data:
+        - 'zeros': Pads with zeros.
+        - 'constant': Pads with a constant value defined by `pad_value`.
+        - 'wrap': Circular shift (wraps around).
+        - 'extend': Extends using edge values.
+        - 'mirror': Mirrors the signal.
+        - 'linear': Uses linear regression to extrapolate values.
+
+    pad_value : float, default=0.0
+        The value used for padding when `padding_mode='constant'`.
+
+    random_state : int, optional, default=None
+        The random seed for reproducibility.
 
     Attributes
     ----------
@@ -27,23 +41,22 @@ class IndexShift(TransformerMixin, OneToOneFeatureMixin, BaseEstimator):
     _is_fitted : bool
         Whether the transformer has been fitted to data.
 
-
-
-    fit(X, y=None)
-        Fit the transformer to the input data.
-
-    transform(X, y=0, copy=True)
-        Transform the input data by shifting the spectrum.
+    _rng : numpy.random.Generator
+        Random number generator instance used for shifting.
     """
 
     def __init__(
         self,
         shift: int = 0,
-
+        padding_mode: Literal[
+            "zeros", "constant", "wrap", "extend", "mirror", "linear"
+        ] = "linear",
+        pad_value: float = 0.0,
         random_state: Optional[int] = None,
     ):
         self.shift = shift
-        self.
+        self.padding_mode = padding_mode
+        self.pad_value = pad_value
         self.random_state = random_state
 
     def fit(self, X: np.ndarray, y=None) -> "IndexShift":
@@ -68,12 +81,6 @@ class IndexShift(TransformerMixin, OneToOneFeatureMixin, BaseEstimator):
             self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
         )
 
-        # Set the number of features
-        self.n_features_in_ = X.shape[1]
-
-        # Set the fitted attribute to True
-        self._is_fitted = True
-
         # Instantiate the random number generator
         self._rng = np.random.default_rng(self.random_state)
 
@@ -94,10 +101,10 @@ class IndexShift(TransformerMixin, OneToOneFeatureMixin, BaseEstimator):
         Returns
         -------
         X_ : np.ndarray of shape (n_samples, n_features)
-            The transformed data.
+            The transformed data with the applied shifts.
         """
         # Check that the estimator is fitted
-        check_is_fitted(self, "
+        check_is_fitted(self, "n_features_in_")
 
         # Check that X is a 2D array and has only finite values
         X_ = validate_data(
@@ -110,90 +117,98 @@ class IndexShift(TransformerMixin, OneToOneFeatureMixin, BaseEstimator):
             dtype=np.float64,
         )
 
-        # Check that the number of features is the same as the fitted data
-        if X_.shape[1] != self.n_features_in_:
-            raise ValueError(
-                f"Expected {self.n_features_in_} features but got {X_.shape[1]}"
-            )
-
         # Calculate the standard normal variate
         for i, x in enumerate(X_):
-            X_[i] = self.
+            X_[i] = self._shift_signal(x)
 
         return X_.reshape(-1, 1) if X_.ndim == 1 else X_
 
-    def
-        shift_amount = self._rng.integers(-self.shift, self.shift, endpoint=True)
-        return np.roll(x, shift_amount)
-
-    def _shift_vector(
-        self,
-        x: np.ndarray,
-    ) -> np.ndarray:
+    def _shift_signal(self, x: np.ndarray):
         """
-
-
-        Args:
-            arr: Input numpy array
-            shift: Number of positions to shift
-            fill_method: Method to fill missing values
-                'constant': fill with first/last value
-                'linear': fill using linear regression
-                'quadratic': fill using quadratic regression
-
-        Returns:
-            Shifted numpy array
-        """
-        shift = self._rng.integers(-self.shift, self.shift, endpoint=True)
-
-        result = np.roll(x, shift)
-
-        if self.fill_method == "constant":
-            if shift > 0:
-                result[:shift] = x[0]
-            elif shift < 0:
-                result[shift:] = x[-1]
-
-        elif self.fill_method == "linear":
-            if shift > 0:
-                x_ = np.arange(5)
-                coeffs = poly.polyfit(x_, x[:5], 1)
+        Shifts a discrete signal using convolution with a Dirac delta kernel.
 
-
-
-
-
-
-            elif shift < 0:
-                x_ = np.arange(5)
-                coeffs = poly.polyfit(x_, x[-5:], 1)
-
-                extrapolate_x = np.arange(len(x_), len(x_) - shift)
-                extrapolated_values = poly.polyval(extrapolate_x, coeffs)
-
-                result[shift:] = extrapolated_values
-
-        elif self.fill_method == "quadratic":
-            if shift > 0:
-                # Use first 3 values for quadratic regression
-                x_ = np.arange(5)
-                coeffs = poly.polyfit(x_, x[:5], 2)
-
-                # Extrapolate to fill shifted region
-                extrapolate_x = np.arange(-shift, 0)
-                extrapolated_values = poly.polyval(extrapolate_x, coeffs)
-
-                result[:shift] = extrapolated_values
-
-            elif shift < 0:
-                # Use last 3 values for quadratic regression
-                x_ = np.arange(5)
-                coeffs = poly.polyfit(x_, x[-5:], 2)
-
-                # Extrapolate to fill shifted region
-                extrapolate_x = np.arange(len(x_), len(x_) - shift)
-                extrapolated_values = poly.polyval(extrapolate_x, coeffs)
+        Parameters
+        ----------
+        x : np.ndarray of shape (n_features,)
+            The input signal to shift.
 
-
+        Returns
+        -------
+        result : np.ndarray of shape (n_features,)
+            The shifted signal.
+        """
+        shift = self._rng.integers(-self.shift, self.shift, endpoint=True)
 
-
+        if self.padding_mode == "wrap":
+            return np.roll(x, shift)
+
+        # Create Dirac delta kernel with proper dimensions
+
+        if shift >= 0:
+            kernel = np.zeros(shift + 1)
+            kernel[-1] = 1
+        else:
+            kernel = np.zeros(-shift + 1)
+            kernel[0] = 1
+
+        # Convolve signal with kernel
+        shifted = convolve(x, kernel, mode="full")
+
+        if shift >= 0:
+            result = shifted[: len(x)] if x.ndim == 1 else shifted[: x.shape[0]]
+            pad_length = shift
+            pad_left = True
+        else:
+            result = shifted[-len(x) :] if x.ndim == 1 else shifted[-x.shape[0] :]
+            pad_length = -shift
+            pad_left = False
+
+        if self.padding_mode == "zeros":
+            return result
+
+        elif self.padding_mode == "constant":
+            mask = np.abs(result) < 1e-10
+            result[mask] = self.pad_value
+            return result
+
+        elif self.padding_mode == "mirror":
+            if pad_left:
+                pad_values = x[pad_length - 1 :: -1]
+                result[:pad_length] = pad_values[-pad_length:]
+            else:
+                pad_values = x[:-1][::-1]
+                result[-pad_length:] = pad_values[:pad_length]
+
+            return result
+
+        elif self.padding_mode == "extend":
+            if pad_left:
+                result[:pad_length] = x[0]
+            else:
+                result[-pad_length:] = x[-1]
+            return result
+
+        elif self.padding_mode == "linear":
+            # Get points for linear regression
+            if pad_left:
+                points = x[: pad_length + 1]  # Take first pad_length+1 points
+                x_coords = np.arange(len(points))
+                slope, intercept, _, _, _ = stats.linregress(x_coords, points)
+
+                # Generate new points using linear regression
+                new_x = np.arange(-pad_length, 0)
+                extrapolated = slope * new_x + intercept
+                result[:pad_length] = extrapolated
+            else:
+                points = x[-pad_length - 1 :]  # Take last pad_length+1 points
+                x_coords = np.arange(len(points))
+                slope, intercept, _, _, _ = stats.linregress(x_coords, points)
+
+                # Generate new points using linear regression
+                new_x = np.arange(len(points), len(points) + pad_length)
+                extrapolated = slope * new_x + intercept
+                result[-pad_length:] = extrapolated
+            return result
+
+        else:
+            raise ValueError(f"Unknown padding mode: {self.padding_mode}")
```
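For orientation, a minimal usage sketch of the reworked transformer, assuming chemotools 0.1.10 is installed and that `IndexShift` is importable from `chemotools.augmentation` as in earlier releases; the toy two-row input is invented for illustration:

```python
import numpy as np
from chemotools.augmentation import IndexShift

# Two toy "spectra" with a smooth trend, so edge padding is visible.
X = np.vstack([np.linspace(0.0, 1.0, 50), np.linspace(1.0, 2.0, 50)])

# Each row is shifted by a random integer in [-3, 3]; vacated edge
# positions are filled with the nearest edge value ('extend').
shifter = IndexShift(shift=3, padding_mode="extend", random_state=42)
X_aug = shifter.fit_transform(X)

print(X_aug.shape)  # (2, 50): the number of features is preserved
```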
chemotools/outliers/__init__.py (new file)

```diff
@@ -0,0 +1,7 @@
+from .dmodx import DModX
+from .hotelling_t2 import HotellingT2
+from .q_residuals import QResiduals
+from .leverage import Leverage
+from .studentized_residuals import StudentizedResiduals
+
+__all__ = ["DModX", "HotellingT2", "QResiduals", "Leverage", "StudentizedResiduals"]
```
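All five detector classes are re-exported at the subpackage level, so user code does not need the private module paths. A quick import check, assuming an installed chemotools at or above 0.1.10:

```python
# The names below mirror __all__ in chemotools/outliers/__init__.py.
from chemotools.outliers import (
    DModX,
    HotellingT2,
    Leverage,
    QResiduals,
    StudentizedResiduals,
)
```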
chemotools/outliers/_base.py (new file)

```diff
@@ -0,0 +1,180 @@
+from abc import ABC, abstractmethod
+from typing import Union, Optional
+
+import numpy as np
+
+from sklearn.base import BaseEstimator, OutlierMixin
+from sklearn.decomposition._base import _BasePCA
+from sklearn.cross_decomposition._pls import _PLS
+from sklearn.pipeline import Pipeline
+from sklearn.utils.validation import check_is_fitted
+
+from ._utils import validate_confidence, validate_and_extract_model
+
+ModelTypes = Union[_BasePCA, _PLS]
+
+
+class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
+    """Base class for model outlier calculations.
+
+    Implements statistical calculations for outlier detection in dimensionality
+    reduction models like PCA and PLS.
+
+    Parameters
+    ----------
+    model : Union[ModelTypes, Pipeline]
+        A fitted _BasePCA or _PLS models or Pipeline ending with such a model
+    confidence : float
+        Confidence level for statistical calculations (between 0 and 1)
+
+    Attributes
+    ----------
+    model_ : ModelTypes
+        The fitted model of type _BasePCA or _PLS
+
+    preprocessing_ : Optional[Pipeline]
+        Preprocessing steps before the model
+
+    n_features_in_ : int
+        Number of features in the input data
+
+    n_components_ : int
+        Number of components in the model
+
+    n_samples_ : int
+        Number of samples used to train the model
+
+    critical_value_ : float
+        The calculated critical value for outlier detection
+    """
+
+    def __init__(
+        self,
+        model: Union[ModelTypes, Pipeline],
+        confidence: float,
+    ) -> None:
+        (
+            self.model_,
+            self.preprocessing_,
+            self.n_features_in_,
+            self.n_components_,
+            self.n_samples_,
+        ) = validate_and_extract_model(model)
+        self.confidence = validate_confidence(confidence)
+
+    def fit_predict_residuals(
+        self, X: np.ndarray, y: Optional[np.ndarray] = None
+    ) -> np.ndarray:
+        """Fit the model to the input data and calculate the residuals.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data
+
+        y : array-like of shape (n_samples,), default=None
+            Target values
+
+        Returns
+        -------
+        ndarray of shape (n_samples,)
+            The residuals of the model
+        """
+        self.fit(X, y)
+        return self.predict_residuals(X, y, validate=True)
+
+    @abstractmethod
+    def predict_residuals(
+        self, X: np.ndarray, y: Optional[np.ndarray], validate: bool
+    ) -> np.ndarray:
+        """Calculate the residuals of the model.
+
+        Returns
+        -------
+        ndarray of shape (n_samples,)
+            The residuals of the model
+        """
+
+    @abstractmethod
+    def _calculate_critical_value(self, X: Optional[np.ndarray]) -> float:
+        """Calculate the critical value for outlier detection.
+
+        Returns
+        -------
+        float
+            The calculated critical value for outlier detection
+        """
+
+
+class _ModelDiagnosticsBase(ABC):
+    """Base class for model diagnostics methods. This does not implement outlier detection algorithms,
+    but rather implements methods that are used to assess trained models.
+
+    Parameters
+    ----------
+    model : Union[ModelTypes, Pipeline]
+        A fitted PCA/PLS model or Pipeline ending with such a model
+
+    Attributes
+    ----------
+    model_ : ModelTypes
+        The fitted model of type _BasePCA or _PLS
+
+    preprocessing_ : Optional[Pipeline]
+        Preprocessing steps before the model
+
+    """
+
+    def __init__(self, model: Union[ModelTypes, Pipeline]):
+        self.model_, self.preprocessing_ = self._validate_and_extract_model(model)
+
+    def _validate_and_extract_model(self, model):
+        """Validate and extract the model and preprocessing steps.
+
+        Parameters
+        ----------
+        model : Union[ModelTypes, Pipeline]
+            A fitted PCA/PLS model or Pipeline ending with such a model
+
+        Returns
+        -------
+        Tuple[ModelTypes, Optional[Pipeline]]
+            The extracted model and preprocessing steps
+
+        Raises
+        ------
+        ValueError
+            If the model is not of type _BasePCA or _PLS or a Pipeline ending with one of these types or if the model is not fitted
+        """
+        if isinstance(model, Pipeline):
+            preprocessing = model[:-1]
+            model = model[-1]
+        else:
+            preprocessing = None
+
+        if isinstance(model, (_BasePCA, _PLS)):
+            check_is_fitted(model)
+        else:
+            raise ValueError(
+                "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
+            )
+        check_is_fitted(model)
+        return model, preprocessing
+
+    @abstractmethod
+    def predict(self, X: np.ndarray, y: Optional[np.ndarray]) -> np.ndarray:
+        """Predict the output of the model.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data
+
+        y : array-like of shape (n_samples,), default=None
+            Target values
+
+        Returns
+        -------
+        ndarray of shape (n_samples,)
+            Predicted values
+        """
```
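The base class wires model validation into `__init__` and leaves two hooks for subclasses, `predict_residuals` and `_calculate_critical_value`; note that `fit_predict_residuals` also assumes the subclass defines `fit`. To make that contract concrete, here is a minimal hypothetical subclass (not part of the package; the class name, the mean-squared-error residual, and the fixed threshold are all invented for illustration):

```python
import numpy as np
from sklearn.decomposition import PCA
from chemotools.outliers._base import _ModelResidualsBase


class MeanSquaredResidual(_ModelResidualsBase):
    """Hypothetical detector: mean squared reconstruction error per sample."""

    def fit(self, X, y=None):
        # fit() only has to establish the critical value; the wrapped
        # model itself is already fitted before construction.
        self.critical_value_ = self._calculate_critical_value(X)
        return self

    def predict_residuals(self, X, y=None, validate=True):
        if self.preprocessing_:
            X = self.preprocessing_.transform(X)
        X_hat = self.model_.inverse_transform(self.model_.transform(X))
        return np.mean((X - X_hat) ** 2, axis=1)

    def _calculate_critical_value(self, X=None):
        return 1.0  # placeholder threshold, illustration only


rng = np.random.default_rng(0)
X = rng.normal(size=(100, 10))
detector = MeanSquaredResidual(PCA(n_components=2).fit(X), confidence=0.95)
print(detector.fit_predict_residuals(X).shape)  # (100,)
```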
chemotools/outliers/_utils.py (new file)

```diff
@@ -0,0 +1,91 @@
+from typing import Optional, Tuple, Union
+
+from sklearn.cross_decomposition._pls import _PLS
+from sklearn.decomposition._base import _BasePCA
+from sklearn.pipeline import Pipeline
+from sklearn.utils.validation import check_is_fitted
+
+ModelTypes = Union[_BasePCA, _PLS]
+
+
+def get_model_parameters(model: ModelTypes) -> Tuple[int, int, int]:
+    """
+    Get the number of features, components and samples from a model with PLS or PCA. types.
+
+    Parameters
+    ----------
+    model : ModelType
+        A fitted model of type _BasePCA or _PLS
+
+    Returns
+    -------
+    Tuple[int, int, int]
+        The number of features, components and samples in the model
+    """
+    if isinstance(model, _BasePCA):
+        return model.n_features_in_, model.n_components_, model.n_samples_
+    elif isinstance(model, _PLS):
+        return model.n_features_in_, model.n_components, len(model.x_scores_)
+    else:
+        raise ValueError(
+            "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
+        )
+
+
+def validate_confidence(confidence: float) -> float:
+    """Validate parameters using sklearn conventions.
+
+    Parameters
+    ----------
+    confidence : float
+        Confidence level for statistical calculations (between 0 and 1)
+
+    Returns
+    -------
+    float
+        The validated confidence level
+
+    Raises
+    ------
+    ValueError
+        If confidence is not between 0 and 1
+    """
+    if not 0 < confidence < 1:
+        raise ValueError("Confidence must be between 0 and 1")
+    return confidence
+
+
+def validate_and_extract_model(
+    model: Union[ModelTypes, Pipeline],
+) -> Tuple[ModelTypes, Optional[Pipeline], int, int, int]:
+    """Validate and extract the model and preprocessing steps.
+
+    Parameters
+    ----------
+    model : Union[ModelTypes, Pipeline]
+        A fitted PCA/PLS model or Pipeline ending with such a model
+
+    Returns
+    -------
+    Tuple[ModelTypes, Optional[Pipeline]]
+        The extracted model and preprocessing steps
+
+    Raises
+    ------
+    ValueError
+        If the model is not of type _BasePCA or _PLS or a Pipeline ending with one of these types or if the model is not fitted
+    """
+    if isinstance(model, Pipeline):
+        preprocessing = model[:-1]
+        model = model[-1]
+    else:
+        preprocessing = None
+
+    if not isinstance(model, (_BasePCA, _PLS)):
+        raise ValueError(
+            "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
+        )
+
+    check_is_fitted(model)
+    n_features_in, n_components, n_samples = get_model_parameters(model)
+    return model, preprocessing, n_features_in, n_components, n_samples
```
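A small sketch of what the helper returns when handed a preprocessing pipeline; the scaler/PCA pipeline and the random data are invented for illustration, while `validate_and_extract_model` and its five-tuple return are as defined above:

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from chemotools.outliers._utils import validate_and_extract_model

X = np.random.default_rng(1).normal(size=(60, 8))
pipe = make_pipeline(StandardScaler(), PCA(n_components=3)).fit(X)

# The last step is extracted as the model; everything before it is
# returned as the preprocessing sub-pipeline.
model, preprocessing, n_features, n_components, n_samples = validate_and_extract_model(pipe)
print(type(model).__name__, n_features, n_components, n_samples)  # PCA 8 3 60
```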
chemotools/outliers/dmodx.py (new file)

```diff
@@ -0,0 +1,146 @@
+from typing import Optional, Union
+import numpy as np
+
+from sklearn.pipeline import Pipeline
+from sklearn.utils.validation import validate_data, check_is_fitted
+from scipy.stats import f as f_distribution
+
+
+from ._base import _ModelResidualsBase, ModelTypes
+
+
+class DModX(_ModelResidualsBase):
+    """Calculate Distance to Model (DModX) statistics.
+
+    DModX measures the distance between an observation and the model plane
+    in the X-space, useful for detecting outliers.
+
+    Parameters
+    ----------
+    model : Union[ModelType, Pipeline]
+        A fitted PCA/PLS model or Pipeline ending with such a model
+
+    confidence : float, default=0.95
+        Confidence level for statistical calculations (between 0 and 1)
+
+    Attributes
+    ----------
+    model_ : ModelType
+        The fitted model of type _BasePCA or _PLS
+
+    preprocessing_ : Optional[Pipeline]
+        Preprocessing steps before the model
+
+    n_features_in_ : int
+        Number of features in the input data
+
+    n_components_ : int
+        Number of components in the model
+
+    n_samples_ : int
+        Number of samples used to train the model
+
+    critical_value_ : float
+        The calculated critical value for outlier detection
+    """
+
+    def __init__(
+        self,
+        model: Union[ModelTypes, Pipeline],
+        confidence: float = 0.95,
+    ) -> None:
+        super().__init__(model, confidence)
+
+    def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "DModX":
+        """
+        Fit the model to the input data.
+
+        This step calculates the critical value for the outlier detection. In the DmodX method,
+        the critical value is not depend on the input data but on the model parameters.
+        """
+        X = validate_data(
+            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+        )
+
+        self.critical_value_ = self._calculate_critical_value()
+        return self
+
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        """Identify outliers in the input data.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data
+
+        Returns
+        -------
+        ndarray of shape (n_samples,)
+            Boolean array indicating outliers
+        """
+        # Check the estimator has been fitted
+        check_is_fitted(self, ["critical_value_"])
+
+        # Validate the input data
+        X = validate_data(
+            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+        )
+
+        # Calculate outliers based on the DModX statistics
+        dmodx_values = self.predict_residuals(X, validate=False)
+        return np.where(dmodx_values > self.critical_value_, -1, 1)
+
+    def predict_residuals(
+        self, X: np.ndarray, y: Optional[np.ndarray] = None, validate: bool = True
+    ) -> np.ndarray:
+        """Calculate DModX statistics for input data.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data
+
+        validate : bool, default=True
+            Whether to validate the input data
+
+        Returns
+        -------
+        ndarray of shape (n_samples,)
+            DModX statistics for each sample
+        """
+        # Check the estimator has been fitted
+        check_is_fitted(self, ["critical_value_"])
+
+        # Validate the input data
+        if validate:
+            X = validate_data(
+                self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+            )
+
+        # Apply preprocessing if available
+        if self.preprocessing_:
+            X = self.preprocessing_.transform(X)
+
+        # Calculate the DModX statistics
+        X_transformed = self.model_.transform(X)
+        X_reconstructed = self.model_.inverse_transform(X_transformed)
+        squared_errors = np.sum((X - X_reconstructed) ** 2, axis=1)
+
+        return np.sqrt(squared_errors / (self.n_features_in_ - self.n_components_))
+
+    def _calculate_critical_value(self, X: Optional[np.ndarray] = None) -> float:
+        """Calculate F-distribution based critical value.
+
+        Returns
+        -------
+        float
+            The critical value for outlier detection
+        """
+
+        dof_numerator = self.n_features_in_ - self.n_components_
+        dof_denominator = self.n_features_in_ - self.n_components_ - 1
+
+        upper_control_limit = f_distribution.ppf(
+            self.confidence, dof_numerator, dof_denominator
+        )
+        return np.sqrt(upper_control_limit)
```
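A minimal end-to-end sketch of the new detector, assuming chemotools 0.1.10 and scikit-learn are installed; the random training data and the injected outlier are invented for illustration:

```python
import numpy as np
from sklearn.decomposition import PCA
from chemotools.outliers import DModX

rng = np.random.default_rng(0)
X_train = rng.normal(size=(100, 20))

# DModX wraps an already-fitted model; fit() only computes the critical value.
detector = DModX(PCA(n_components=3).fit(X_train), confidence=0.95).fit(X_train)

# Append one sample far from the model plane.
X_test = np.vstack([X_train[:5], 10.0 * rng.normal(size=(1, 20))])
print(detector.predict(X_test))            # -1 flags outliers, 1 flags inliers
print(detector.predict_residuals(X_test))  # per-sample DModX statistics
```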