chemotools-0.1.9.tar.gz → chemotools-0.1.11rc0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/PKG-INFO +1 -1
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/augmentation/__init__.py +2 -0
- chemotools-0.1.11rc0/chemotools/augmentation/_gaussian_broadening.py +136 -0
- chemotools-0.1.11rc0/chemotools/outliers/__init__.py +7 -0
- chemotools-0.1.11rc0/chemotools/outliers/_base.py +188 -0
- chemotools-0.1.11rc0/chemotools/outliers/dmodx.py +164 -0
- chemotools-0.1.11rc0/chemotools/outliers/hotelling_t2.py +156 -0
- chemotools-0.1.11rc0/chemotools/outliers/leverage.py +151 -0
- chemotools-0.1.11rc0/chemotools/outliers/q_residuals.py +228 -0
- chemotools-0.1.11rc0/chemotools/outliers/studentized_residuals.py +198 -0
- chemotools-0.1.11rc0/chemotools/outliers/utils.py +51 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/pyproject.toml +1 -1
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/LICENSE +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/README.md +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/__init__.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/augmentation/_add_noise.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/augmentation/_baseline_shift.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/augmentation/_fractional_shift.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/augmentation/_index_shift.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/augmentation/_spectrum_scale.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/baseline/__init__.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/baseline/_air_pls.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/baseline/_ar_pls.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/baseline/_constant_baseline_correction.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/baseline/_cubic_spline_correction.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/baseline/_linear_correction.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/baseline/_non_negative.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/baseline/_polynomial_correction.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/baseline/_subtract_reference.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/datasets/__init__.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/datasets/_base.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/datasets/data/__init__.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/datasets/data/coffee_labels.csv +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/datasets/data/coffee_spectra.csv +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/datasets/data/fermentation_hplc.csv +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/datasets/data/fermentation_spectra.csv +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/datasets/data/train_hplc.csv +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/datasets/data/train_spectra.csv +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/derivative/__init__.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/derivative/_norris_william.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/derivative/_savitzky_golay.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/feature_selection/__init__.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/feature_selection/_index_selector.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/feature_selection/_range_cut.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/scale/__init__.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/scale/_min_max_scaler.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/scale/_norm_scaler.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/scale/_point_scaler.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/scatter/__init__.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/scatter/_extended_multiplicative_scatter_correction.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/scatter/_multiplicative_scatter_correction.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/scatter/_robust_normal_variate.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/scatter/_standard_normal_variate.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/smooth/__init__.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/smooth/_mean_filter.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/smooth/_median_filter.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/smooth/_savitzky_golay_filter.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/smooth/_whittaker_smooth.py +0 -0
- {chemotools-0.1.9 → chemotools-0.1.11rc0}/chemotools/utils/__init__.py +0 -0
**chemotools/augmentation/__init__.py**

```diff
@@ -1,6 +1,7 @@
 from ._add_noise import AddNoise
 from ._baseline_shift import BaselineShift
 from ._fractional_shift import FractionalShift
+from ._gaussian_broadening import GaussianBroadening
 from ._index_shift import IndexShift
 from ._spectrum_scale import SpectrumScale
 
@@ -9,6 +10,7 @@ __all__ = [
     "AddNoise",
     "BaselineShift",
     "FractionalShift",
+    "GaussianBroadening",
     "IndexShift",
     "SpectrumScale",
 ]
```
**chemotools-0.1.11rc0/chemotools/augmentation/_gaussian_broadening.py** (new file, +136 lines)

```python
from typing import Literal, Optional

import numpy as np
from scipy.ndimage import gaussian_filter1d
from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
from sklearn.utils.validation import check_is_fitted, validate_data


class GaussianBroadening(TransformerMixin, OneToOneFeatureMixin, BaseEstimator):
    """
    Transform spectral data by broadening peaks using Gaussian convolution.

    This transformer applies Gaussian smoothing to broaden peaks in spectral data.
    For each signal, a random sigma is chosen between 0 and the specified sigma value.

    Parameters
    ----------
    sigma : float, default=1.0
        Maximum standard deviation for the Gaussian kernel.
        The actual sigma used will be randomly chosen between 0 and this value.

    mode : {'reflect', 'constant', 'nearest', 'mirror', 'wrap'}, default='reflect'
        The mode parameter determines how the input array is extended when
        the filter overlaps a border. Default is 'reflect'.

    pad_value : float, default=0.0
        Value to fill past edges of input if mode is 'constant'.

    random_state : int, optional, default=None
        Random state for reproducible sigma selection.

    truncate : float, default=4.0
        Truncate the filter at this many standard deviations.
        Larger values increase computation time but improve accuracy.
    """

    def __init__(
        self,
        sigma: float = 1.0,
        mode: Literal["reflect", "constant", "nearest", "mirror", "wrap"] = "reflect",
        pad_value: float = 0.0,
        random_state: Optional[int] = None,
        truncate: float = 4.0,
    ):
        self.sigma = sigma
        self.mode = mode
        self.pad_value = pad_value
        self.random_state = random_state
        self.truncate = truncate

    def fit(self, X: np.ndarray, y=None) -> "GaussianBroadening":
        """
        Fit the transformer to the data (in this case, only validates input).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data to validate.

        y : None
            Ignored.

        Returns
        -------
        self : GaussianBroadening
            The fitted transformer.
        """
        X = validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        # Validate sigma parameter
        if not isinstance(self.sigma, (int, float)):
            raise ValueError("sigma must be a number")
        if self.sigma < 0:
            raise ValueError("sigma must be non-negative")

        # Initialize random number generator
        self._rng = np.random.default_rng(self.random_state)

        return self

    def transform(self, X: np.ndarray, y=None) -> np.ndarray:
        """
        Apply Gaussian broadening to the input data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to transform.

        y : None
            Ignored.

        Returns
        -------
        X_transformed : ndarray of shape (n_samples, n_features)
            The transformed data with broadened peaks.
        """
        check_is_fitted(self, "n_features_in_")
        X_ = validate_data(
            self,
            X,
            y="no_validation",
            ensure_2d=True,
            copy=True,
            reset=False,
            dtype=np.float64,
        )

        # Transform each sample
        for i, x in enumerate(X_):
            X_[i] = self._broaden_signal(x)

        return X_

    def _broaden_signal(self, x: np.ndarray) -> np.ndarray:
        """
        Apply Gaussian broadening to a single signal.

        Parameters
        ----------
        x : ndarray of shape (n_features,)
            The input signal to broaden.

        Returns
        -------
        broadened_signal : ndarray of shape (n_features,)
            The broadened signal.
        """
        # Randomly choose sigma between 0 and max sigma
        sigma = self._rng.uniform(0, self.sigma)

        # Apply Gaussian filter
        return gaussian_filter1d(
            x, sigma=sigma, mode=self.mode, cval=self.pad_value, truncate=self.truncate
        )
```
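A minimal usage sketch for the new transformer (not part of the diff; the data and sigma value are illustrative, and the fit/transform pattern follows the class above):

```python
import numpy as np
from chemotools.augmentation import GaussianBroadening

X = np.random.default_rng(0).random((4, 100))  # 4 spectra, 100 points each

# Each spectrum is smoothed with its own sigma drawn uniformly from [0, 2.0],
# so the augmentation differs per sample; random_state makes it reproducible.
broadener = GaussianBroadening(sigma=2.0, random_state=42)
X_augmented = broadener.fit(X).transform(X)
print(X_augmented.shape)  # (4, 100)
```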
**chemotools-0.1.11rc0/chemotools/outliers/__init__.py** (new file, +7 lines)

```python
from .dmodx import DModX
from .hotelling_t2 import HotellingT2
from .q_residuals import QResiduals
from .leverage import Leverage
from .studentized_residuals import StudentizedResiduals

__all__ = ["DModX", "HotellingT2", "QResiduals", "Leverage", "StudentizedResiduals"]
```
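Unlike most scikit-learn outlier detectors, these classes are constructed around an already-fitted PCA/PLS model (or a Pipeline ending in one), as `_base.py` below makes explicit. A hedged sketch of the shared pattern, using DModX (the dataset is illustrative):

```python
import numpy as np
from sklearn.decomposition import PCA
from chemotools.outliers import DModX

rng = np.random.default_rng(0)
X_train = rng.normal(size=(50, 20))

# The detector wraps a model that has already been fitted to the training data.
pca = PCA(n_components=3).fit(X_train)
detector = DModX(pca, confidence=0.95)

# fit() computes the critical value and the training SPE baseline;
# predict() follows the OutlierMixin convention: -1 = outlier, 1 = inlier.
labels = detector.fit(X_train).predict(X_train)
statistics = detector.predict_residuals(X_train)
```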
**chemotools-0.1.11rc0/chemotools/outliers/_base.py** (new file, +188 lines)

```python
from abc import ABC, abstractmethod
from typing import Optional, Tuple, Union

import numpy as np

from sklearn.base import BaseEstimator, OutlierMixin
from sklearn.decomposition._base import _BasePCA
from sklearn.cross_decomposition._pls import _PLS
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import check_is_fitted


ModelTypes = Union[_BasePCA, _PLS]


class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
    """Base class for model outlier calculations.

    Implements statistical calculations for outlier detection in dimensionality
    reduction models like PCA and PLS.

    Parameters
    ----------
    model : Union[ModelTypes, Pipeline]
        A fitted _BasePCA or _PLS model, or a Pipeline ending with such a model

    confidence : float
        Confidence level for statistical calculations (between 0 and 1)

    Attributes
    ----------
    estimator_ : ModelTypes
        The fitted model of type _BasePCA or _PLS

    transformer_ : Optional[Pipeline]
        Preprocessing steps before the model

    n_features_in_ : int
        Number of features in the input data

    n_components_ : int
        Number of components in the model

    n_samples_ : int
        Number of samples used to train the model

    critical_value_ : float
        The calculated critical value for outlier detection
    """

    def __init__(
        self,
        model: Union[ModelTypes, Pipeline],
        confidence: float,
    ) -> None:
        (
            self.estimator_,
            self.transformer_,
            self.n_features_in_,
            self.n_components_,
            self.n_samples_,
        ) = _validate_and_extract_model(model)
        self.confidence = _validate_confidence(confidence)

    def fit_predict_residuals(
        self, X: np.ndarray, y: Optional[np.ndarray] = None
    ) -> np.ndarray:
        """Fit the model to the input data and calculate the residuals.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data

        y : array-like of shape (n_samples,), default=None
            Target values

        Returns
        -------
        ndarray of shape (n_samples,)
            The residuals of the model
        """
        self.fit(X, y)
        return self.predict_residuals(X, y, validate=True)

    @abstractmethod
    def predict_residuals(
        self, X: np.ndarray, y: Optional[np.ndarray], validate: bool
    ) -> np.ndarray:
        """Calculate the residuals of the model.

        Returns
        -------
        ndarray of shape (n_samples,)
            The residuals of the model
        """

    @abstractmethod
    def _calculate_critical_value(self, X: np.ndarray) -> float:
        """Calculate the critical value for outlier detection.

        Returns
        -------
        float
            The calculated critical value for outlier detection
        """


def _get_model_parameters(model: ModelTypes) -> Tuple[int, int, int]:
    """
    Get the number of features, components, and samples from a PCA- or PLS-type model.

    Parameters
    ----------
    model : ModelTypes
        A fitted model of type _BasePCA or _PLS

    Returns
    -------
    Tuple[int, int, int]
        The number of features, components and samples in the model
    """
    if isinstance(model, _BasePCA):
        return model.n_features_in_, model.n_components_, model.n_samples_
    elif isinstance(model, _PLS):
        return model.n_features_in_, model.n_components, len(model.x_scores_)
    else:
        raise ValueError(
            "Model must be of base type _BasePCA or _PLS, or a Pipeline ending with one of these types."
        )


def _validate_confidence(confidence: float) -> float:
    """Validate the confidence level using sklearn conventions.

    Parameters
    ----------
    confidence : float
        Confidence level for statistical calculations (between 0 and 1)

    Returns
    -------
    float
        The validated confidence level

    Raises
    ------
    ValueError
        If confidence is not between 0 and 1
    """
    if not 0 < confidence < 1:
        raise ValueError("Confidence must be between 0 and 1")
    return confidence


def _validate_and_extract_model(
    model: Union[ModelTypes, Pipeline],
) -> Tuple[ModelTypes, Optional[Pipeline], int, int, int]:
    """Validate and extract the model and preprocessing steps.

    Parameters
    ----------
    model : Union[ModelTypes, Pipeline]
        A fitted PCA/PLS model or Pipeline ending with such a model

    Returns
    -------
    Tuple[ModelTypes, Optional[Pipeline], int, int, int]
        The extracted model, the preprocessing steps, and the number of
        features, components, and samples

    Raises
    ------
    ValueError
        If the model is not of type _BasePCA or _PLS (or a Pipeline ending
        with one of these types), or if the model is not fitted
    """
    if isinstance(model, Pipeline):
        preprocessing = model[:-1]
        model = model[-1]
    else:
        preprocessing = None

    if not isinstance(model, (_BasePCA, _PLS)):
        raise ValueError(
            "Model must be of base type _BasePCA or _PLS, or a Pipeline ending with one of these types."
        )

    check_is_fitted(model)
    n_features_in, n_components, n_samples = _get_model_parameters(model)
    return model, preprocessing, n_features_in, n_components, n_samples
```
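To illustrate the Pipeline handling in `_validate_and_extract_model`: everything before the last step becomes the `transformer_`, and the final step must be the PCA/PLS model itself. A small sketch (data and steps are illustrative):

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X = np.random.default_rng(1).normal(size=(30, 10))
pipe = make_pipeline(StandardScaler(), PCA(n_components=2)).fit(X)

# Mirrors the slicing in _validate_and_extract_model:
preprocessing, model = pipe[:-1], pipe[-1]
print(type(model).__name__)  # PCA
print(model.n_features_in_, model.n_components_, model.n_samples_)  # 10 2 30
```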
**chemotools-0.1.11rc0/chemotools/outliers/dmodx.py** (new file, +164 lines)

```python
from typing import Optional, Union
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.utils.validation import validate_data, check_is_fitted
from scipy.stats import f as f_distribution


from ._base import _ModelResidualsBase, ModelTypes
from .utils import calculate_residual_spectrum


class DModX(_ModelResidualsBase):
    """Calculate Distance to Model (DModX) statistics.

    DModX measures the distance between an observation and the model plane
    in the X-space, useful for detecting outliers.

    Parameters
    ----------
    model : Union[ModelTypes, Pipeline]
        A fitted PCA/PLS model or Pipeline ending with such a model

    confidence : float, default=0.95
        Confidence level for statistical calculations (between 0 and 1)

    Attributes
    ----------
    estimator_ : ModelTypes
        The fitted model of type _BasePCA or _PLS

    transformer_ : Optional[Pipeline]
        Preprocessing steps before the model

    n_features_in_ : int
        Number of features in the input data

    n_components_ : int
        Number of components in the model

    n_samples_ : int
        Number of samples used to train the model

    critical_value_ : float
        The calculated critical value for outlier detection

    train_spe_ : float
        The training sum of squared errors (SSE) for the model, normalized by
        degrees of freedom
    """

    def __init__(
        self,
        model: Union[ModelTypes, Pipeline],
        confidence: float = 0.95,
    ) -> None:
        super().__init__(model, confidence)

    def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "DModX":
        """
        Fit the model to the input data.

        This step calculates the critical value for outlier detection. In the
        DModX method, the critical value does not depend on the input data but
        on the model parameters.
        """
        X = validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        # Calculate the critical value
        self.critical_value_ = self._calculate_critical_value()

        # Calculate the degrees-of-freedom-normalized SPE of the training set
        residuals = calculate_residual_spectrum(X, self.estimator_)
        squared_errors = np.sum((residuals) ** 2, axis=1)
        self.train_spe_ = np.sqrt(
            squared_errors
            / (self.n_samples_ - self.n_components_ - 1)
            * (self.n_features_in_ - self.n_components_)
        )

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Identify outliers in the input data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data

        Returns
        -------
        ndarray of shape (n_samples,)
            Array with -1 for outliers and 1 for inliers
        """
        # Check the estimator has been fitted
        check_is_fitted(self, ["critical_value_"])

        # Validate the input data
        X = validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        # Calculate outliers based on the DModX statistics
        dmodx_values = self.predict_residuals(X, validate=False)
        return np.where(dmodx_values > self.critical_value_, -1, 1)

    def predict_residuals(
        self, X: np.ndarray, y: Optional[np.ndarray] = None, validate: bool = True
    ) -> np.ndarray:
        """Calculate DModX statistics for input data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data

        validate : bool, default=True
            Whether to validate the input data

        Returns
        -------
        ndarray of shape (n_samples,)
            DModX statistics for each sample
        """
        # Check the estimator has been fitted
        check_is_fitted(self, ["critical_value_"])

        # Validate the input data
        if validate:
            X = validate_data(
                self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
            )

        # Apply preprocessing if available
        if self.transformer_:
            X = self.transformer_.transform(X)

        # Calculate the DModX statistics
        residual = calculate_residual_spectrum(X, self.estimator_)
        squared_errors = np.sum((residual) ** 2, axis=1)

        return (
            np.sqrt(squared_errors / (self.n_features_in_ - self.n_components_))
            / self.train_spe_
        )

    def _calculate_critical_value(self, X: Optional[np.ndarray] = None) -> float:
        """Calculate the F-distribution-based critical value.

        Returns
        -------
        float
            The critical value for outlier detection
        """

        dof_numerator = self.n_features_in_ - self.n_components_
        dof_denominator = self.n_features_in_ - self.n_components_ - 1

        upper_control_limit = f_distribution.ppf(
            self.confidence, dof_numerator, dof_denominator
        )
        return np.sqrt(upper_control_limit)
```
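Reading `fit`, `predict_residuals`, and `_calculate_critical_value` together, the statistic and cutoff implemented above are (with $K$ = `n_features_in_`, $A$ = `n_components_`, $N$ = `n_samples_`, $\alpha$ = `confidence`):

$$
\mathrm{DModX}_i = \frac{\sqrt{\mathrm{SSE}_i/(K-A)}}{s_0},
\qquad
\mathrm{DModX}_{\mathrm{crit}} = \sqrt{F_{\alpha}(K-A,\; K-A-1)},
$$

where $\mathrm{SSE}_i$ is the squared residual norm of sample $i$ and $s_0$ is the `train_spe_` normalization $\sqrt{\mathrm{SSE}\,(K-A)/(N-A-1)}$ computed from the training residuals in `fit`.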
**chemotools-0.1.11rc0/chemotools/outliers/hotelling_t2.py** (new file, +156 lines)

```python
from typing import Optional, Union
import numpy as np

from sklearn.cross_decomposition._pls import _PLS
from sklearn.decomposition._base import _BasePCA
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import validate_data, check_is_fitted
from scipy.stats import f as f_distribution

from ._base import _ModelResidualsBase, ModelTypes


class HotellingT2(_ModelResidualsBase):
    """
    Calculate Hotelling's T-squared statistics for PCA- or PLS-like models.

    Parameters
    ----------
    model : Union[ModelTypes, Pipeline]
        A fitted PCA/PLS model or Pipeline ending with such a model

    confidence : float, default=0.95
        Confidence level for statistical calculations (between 0 and 1)

    Attributes
    ----------
    estimator_ : ModelTypes
        The fitted model of type _BasePCA or _PLS

    transformer_ : Optional[Pipeline]
        Preprocessing steps before the model

    n_features_in_ : int
        Number of features in the input data

    n_components_ : int
        Number of components in the model

    n_samples_ : int
        Number of samples used to train the model

    critical_value_ : float
        The calculated critical value for outlier detection

    References
    ----------
    Johan A. Westerhuis, Stephen P. Gurden, Age K. Smilde (2000). Generalized
    contribution plots in multivariate statistical process monitoring.
    Chemometrics and Intelligent Laboratory Systems, 51, 95-114.
    """

    def __init__(
        self, model: Union[ModelTypes, Pipeline], confidence: float = 0.95
    ) -> None:
        self.model, self.confidence = model, confidence
        super().__init__(model, confidence)

    def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "HotellingT2":
        """
        Fit the model to the input data.

        This step calculates the critical value for outlier detection. As in the
        DModX method, the critical value does not depend on the input data but
        on the model parameters.
        """
        X = validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        self.critical_value_ = self._calculate_critical_value()
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Identify outliers in the input data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data

        Returns
        -------
        ndarray of shape (n_samples,)
            Array with -1 for outliers and 1 for inliers
        """
        # Check the estimator has been fitted
        check_is_fitted(self, ["critical_value_"])

        # Validate the input data
        X = validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        # Calculate the Hotelling's T-squared statistics
        hotelling_t2_values = self.predict_residuals(X, y=None, validate=False)
        return np.where(hotelling_t2_values > self.critical_value_, -1, 1)

    def predict_residuals(
        self, X: np.ndarray, y: Optional[np.ndarray] = None, validate: bool = True
    ) -> np.ndarray:
        """Calculate Hotelling's T-squared statistics for input data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data

        Returns
        -------
        ndarray of shape (n_samples,)
            Hotelling's T-squared statistics for each sample
        """
        # Check the estimator has been fitted
        check_is_fitted(self, ["critical_value_"])

        # Validate the input data
        if validate:
            X = validate_data(
                self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
            )

        # Apply preprocessing steps
        if self.transformer_:
            X = self.transformer_.transform(X)

        # Calculate the Hotelling's T-squared statistics
        if isinstance(self.estimator_, _BasePCA):
            # For PCA-like models
            variances = self.estimator_.explained_variance_

        if isinstance(self.estimator_, _PLS):
            # For PLS-like models
            variances = np.var(self.estimator_.x_scores_, axis=0)

        # Equivalent to X @ model.components_.T for _BasePCA and
        # X @ model.x_rotations_ for _PLS
        X_transformed = self.estimator_.transform(X)

        return np.sum((X_transformed**2) / variances, axis=1)

    def _calculate_critical_value(self, X: Optional[np.ndarray] = None) -> float:
        """
        Calculate the critical value for the Hotelling's T-squared statistics.

        Returns
        -------
        float
            The critical value for the Hotelling's T-squared statistics
        """

        critical_value = f_distribution.ppf(
            self.confidence, self.n_components_, self.n_samples_ - self.n_components_
        )
        return (
            critical_value
            * self.n_components_
            * (self.n_samples_ - 1)
            / (self.n_samples_ - self.n_components_)
        )
```
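From `_calculate_critical_value` above, the cutoff is the familiar F-based Hotelling $T^2$ limit, $T^2_{\mathrm{crit}} = \frac{A(N-1)}{N-A}\,F_{\alpha}(A,\,N-A)$. A minimal usage sketch (the data and PCA settings are illustrative, not from the package):

```python
import numpy as np
from sklearn.decomposition import PCA
from chemotools.outliers import HotellingT2

rng = np.random.default_rng(0)
X = rng.normal(size=(40, 12))  # 40 training spectra, 12 features

# Wrap an already-fitted PCA model; fit() only computes the critical value.
detector = HotellingT2(PCA(n_components=2).fit(X), confidence=0.95).fit(X)

scores = detector.predict_residuals(X)  # T² statistic per sample
labels = detector.predict(X)            # -1 = outlier, 1 = inlier
print(scores.shape, detector.critical_value_)
```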