chemotools 0.1.10__tar.gz → 0.1.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chemotools-0.1.10 → chemotools-0.1.11}/PKG-INFO +2 -2
- chemotools-0.1.11/chemotools/feature_selection/__init__.py +6 -0
- chemotools-0.1.11/chemotools/feature_selection/_base.py +88 -0
- chemotools-0.1.11/chemotools/feature_selection/_sr_selector.py +137 -0
- chemotools-0.1.11/chemotools/feature_selection/_vip_selector.py +129 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/_base.py +75 -67
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/dmodx.py +26 -8
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/hotelling_t2.py +11 -10
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/leverage.py +15 -14
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/q_residuals.py +19 -16
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/studentized_residuals.py +17 -16
- chemotools-0.1.11/chemotools/outliers/utils.py +51 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/pyproject.toml +2 -2
- chemotools-0.1.10/chemotools/feature_selection/__init__.py +0 -4
- chemotools-0.1.10/chemotools/outliers/_utils.py +0 -91
- {chemotools-0.1.10 → chemotools-0.1.11}/LICENSE +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/README.md +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/__init__.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/augmentation/__init__.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/augmentation/_add_noise.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/augmentation/_baseline_shift.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/augmentation/_fractional_shift.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/augmentation/_gaussian_broadening.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/augmentation/_index_shift.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/augmentation/_spectrum_scale.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/__init__.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_air_pls.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_ar_pls.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_constant_baseline_correction.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_cubic_spline_correction.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_linear_correction.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_non_negative.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_polynomial_correction.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_subtract_reference.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/__init__.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/_base.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/data/__init__.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/data/coffee_labels.csv +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/data/coffee_spectra.csv +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/data/fermentation_hplc.csv +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/data/fermentation_spectra.csv +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/data/train_hplc.csv +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/data/train_spectra.csv +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/derivative/__init__.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/derivative/_norris_william.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/derivative/_savitzky_golay.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/feature_selection/_index_selector.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/feature_selection/_range_cut.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/__init__.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scale/__init__.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scale/_min_max_scaler.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scale/_norm_scaler.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scale/_point_scaler.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scatter/__init__.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scatter/_extended_multiplicative_scatter_correction.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scatter/_multiplicative_scatter_correction.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scatter/_robust_normal_variate.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scatter/_standard_normal_variate.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/smooth/__init__.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/smooth/_mean_filter.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/smooth/_median_filter.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/smooth/_savitzky_golay_filter.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/smooth/_whittaker_smooth.py +0 -0
- {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/utils/__init__.py +0 -0
{chemotools-0.1.10 → chemotools-0.1.11}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: chemotools
-Version: 0.1.10
+Version: 0.1.11
 Summary: chemotools: A Python Package that Integrates Chemometrics and scikit-learn
 License: MIT
 Author: Pau Cabaneros
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: numpy (>=2.0.0,<3.0.0)
 Requires-Dist: pandas (>=2.0.0,<3.0.0)
 Requires-Dist: polars (>=1.17.0,<2.0.0)
-Requires-Dist: pyarrow (>=18
+Requires-Dist: pyarrow (>=18,<21)
 Requires-Dist: scikit-learn (>=1.4.0,<2.0.0)
 Description-Content-Type: text/markdown
 
chemotools-0.1.11/chemotools/feature_selection/_base.py (new file)
@@ -0,0 +1,88 @@
+from abc import ABC, abstractmethod
+from typing import Union
+
+import numpy as np
+
+from sklearn.base import BaseEstimator
+from sklearn.cross_decomposition._pls import _PLS
+from sklearn.feature_selection._base import SelectorMixin
+from sklearn.pipeline import Pipeline
+from sklearn.utils.validation import check_is_fitted
+
+ModelTypes = Union[_PLS, Pipeline]
+
+
+class _PLSFeatureSelectorBase(ABC, BaseEstimator, SelectorMixin):
+    """Feature selection base class for _PLS-like models.
+
+
+    Parameters
+    ----------
+    model : Union[_PLS, Pipeline]
+        A fitted _PLS model or Pipeline ending with such a model
+
+    threshold : float
+        The threshold for feature selection. Features with importance
+        above this threshold will be selected.
+
+    Attributes
+    ----------
+    estimator_ : ModelTypes
+        The fitted model of type _PLS
+
+    feature_scores_ : np.ndarray
+        The calculated feature scores based on the selected method.
+
+    support_mask_ : np.ndarray
+        The boolean mask indicating which features are selected.
+    """
+
+    def __init__(
+        self,
+        model: Union[_PLS, Pipeline],
+    ) -> None:
+        self.estimator_ = _validate_and_extract_model(model)
+
+    @abstractmethod
+    def _calculate_features(self, X: np.ndarray) -> np.ndarray:
+        """Calculate the feature scores of the model.
+
+        Returns
+        -------
+        ndarray of shape (n_features,)
+            The feature scores of the model
+        """
+
+
+def _validate_and_extract_model(
+    model: Union[_PLS, Pipeline],
+) -> _PLS:
+    """Validate and extract the model.
+
+    Parameters
+    ----------
+    model : Union[_PLS, Pipeline]
+        A fitted _PLS model or Pipeline ending with such a model
+
+    Returns
+    -------
+    _PLS
+        The extracted estimator
+
+    Raises
+    ------
+    TypeError
+        If the model is not of type _PLS or a Pipeline ending with a _PLS model, or if the model is not fitted
+    """
+    if isinstance(model, Pipeline):
+        estimator = model[-1]
+    else:
+        estimator = model
+
+    if not isinstance(estimator, _PLS):
+        raise TypeError(
+            "Model not a valid model. Must be of base type _PLS or a Pipeline ending with a _PLS model."
+        )
+
+    check_is_fitted(model)
+    return estimator
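To make the base-class contract above concrete, here is a minimal sketch of a custom selector built on it. `AbsCoefSelector`, its scoring rule, and the manual `n_features_in_` bookkeeping are illustrative assumptions, not part of the release:

import numpy as np
from chemotools.feature_selection._base import _PLSFeatureSelectorBase

class AbsCoefSelector(_PLSFeatureSelectorBase):
    """Hypothetical selector: score each feature by |PLS regression coefficient|."""

    def __init__(self, model, threshold: float = 0.0):
        self.model = model
        self.threshold = threshold
        super().__init__(self.model)  # validates the model and sets self.estimator_

    def fit(self, X, y=None):
        X = np.asarray(X, dtype=float)
        self.n_features_in_ = X.shape[1]  # lets SelectorMixin.transform validate widths
        self.feature_scores_ = self._calculate_features(X)
        return self

    def _get_support_mask(self) -> np.ndarray:
        # SelectorMixin.transform() and get_support() rely on this mask
        return self.feature_scores_ > self.threshold

    def _calculate_features(self, X: np.ndarray) -> np.ndarray:
        # coef_ has shape (n_targets, n_features) in recent scikit-learn releases
        return np.abs(self.estimator_.coef_).max(axis=0)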
chemotools-0.1.11/chemotools/feature_selection/_sr_selector.py (new file)
@@ -0,0 +1,137 @@
+import numpy as np
+from sklearn.utils.validation import validate_data
+
+from ._base import _PLSFeatureSelectorBase
+
+
+class SRSelector(_PLSFeatureSelectorBase):
+    """
+    Selects features that contribute significantly to the latent variables
+    in a PLS regression model, using the Selectivity Ratio (SR) method.
+
+    Parameters
+    ----------
+    model : Union[_PLS, Pipeline]
+        The PLS regression model or a pipeline whose last step is a PLS regression model.
+
+    threshold : float, default=1.0
+        The threshold for feature selection. Features with importance
+        above this threshold will be selected.
+
+    Attributes
+    ----------
+    estimator_ : ModelTypes
+        The fitted model of type _PLS
+
+    feature_scores_ : np.ndarray
+        The calculated feature scores based on the selected method.
+
+    support_mask_ : np.ndarray
+        The boolean mask indicating which features are selected.
+
+    Methods
+    -------
+    fit(X, y=None)
+        Fit the transformer to the input data. It calculates the feature scores and the support mask.
+    """
+
+    def __init__(
+        self,
+        model,
+        threshold: float = 1.0,
+    ):
+        self.model = model
+        self.threshold = threshold
+        super().__init__(self.model)
+
+    def fit(self, X: np.ndarray, y=None) -> "SRSelector":
+        """
+        Fit the transformer to calculate the feature scores and the support mask.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The input data to fit the transformer to.
+
+        y : None
+            Ignored.
+
+        Returns
+        -------
+        self : SRSelector
+            The fitted transformer.
+        """
+        # Check that X is a 2D array and has only finite values
+        X = validate_data(
+            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+        )
+
+        # Calculate the SR scores
+        self.feature_scores_ = self._calculate_features(X)
+
+        # Calculate the support mask
+        self.support_mask_ = self._get_support_mask()
+
+        return self
+
+    def _get_support_mask(self) -> np.ndarray:
+        """
+        Get the support mask based on the feature scores and threshold.
+        Features with scores above the threshold are selected.
+
+        Returns
+        -------
+        support_mask_ : np.ndarray
+            The boolean mask indicating which features are selected.
+        """
+        return self.feature_scores_ > self.threshold
+
+    def _calculate_features(self, X: np.ndarray) -> np.ndarray:
+        """
+        Vectorized Selectivity Ratio calculation from a fitted _PLS-like model.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The input training data to calculate the feature scores from.
+
+        Returns
+        -------
+        feature_scores_ : np.ndarray
+            The calculated feature scores based on the selected method.
+        """
+        bpls = self.estimator_.coef_
+        bpls_norm = bpls.T / np.linalg.norm(bpls)
+
+        # Handle 1D case correctly
+        if bpls.ndim == 1:
+            bpls_norm = bpls_norm.reshape(-1, 1)
+
+        # Project X onto the regression vector
+        ttp = X @ bpls_norm
+        ptp = X.T @ np.linalg.pinv(ttp).T
+
+        # Predicted part of X
+        X_hat = ttp @ ptp.T
+
+        # Compute squared norms directly
+        total_ss = np.linalg.norm(X, axis=0) ** 2
+        explained_ss = np.linalg.norm(X_hat, axis=0) ** 2
+
+        # Calculate residual sum of squares
+        residual_ss = total_ss - explained_ss
+
+        # Stability: avoid division by zero
+        epsilon = 1e-12
+
+        # Calculate Selectivity Ratio
+        return explained_ss / (residual_ss + epsilon)
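In short, the score for feature j is SR_j = explained_ss_j / residual_ss_j: the variance of column j explained by the target-projected component, relative to its residual variance. A minimal usage sketch on synthetic data (assuming the new `chemotools.feature_selection` __init__ re-exports `SRSelector`):

import numpy as np
from sklearn.cross_decomposition import PLSRegression
from chemotools.feature_selection import SRSelector  # import path is an assumption

# Synthetic spectra: 50 samples x 100 wavelengths (illustrative only)
rng = np.random.default_rng(0)
X = rng.normal(size=(50, 100))
y = X[:, 10] + 0.5 * X[:, 40] + rng.normal(scale=0.1, size=50)

# The selector expects an already fitted PLS model (or a Pipeline ending in one)
pls = PLSRegression(n_components=2).fit(X, y)

selector = SRSelector(pls, threshold=1.0).fit(X)
X_selected = selector.transform(X)             # keeps columns with SR > threshold
print(selector.feature_scores_.shape)          # (100,)
print(int(selector.get_support().sum()), "features selected")

Because `_PLSFeatureSelectorBase` validates the model at construction time, the PLS step must be trained before the selector is built.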
chemotools-0.1.11/chemotools/feature_selection/_vip_selector.py (new file)
@@ -0,0 +1,129 @@
+import numpy as np
+from sklearn.utils.validation import validate_data
+
+from ._base import _PLSFeatureSelectorBase
+
+
+class VIPSelector(_PLSFeatureSelectorBase):
+    """
+    Selects features that contribute significantly to the latent variables
+    in a PLS regression model, using the Variable Importance in Projection
+    (VIP) method.
+
+    Parameters
+    ----------
+    model : Union[_PLS, Pipeline]
+        The PLS regression model or a pipeline whose last step is a PLS regression model.
+
+    threshold : float, default=1.0
+        The threshold for feature selection. Features with importance
+        above this threshold will be selected.
+
+    Attributes
+    ----------
+    estimator_ : ModelTypes
+        The fitted model of type _PLS
+
+    feature_scores_ : np.ndarray
+        The calculated feature scores based on the selected method.
+
+    support_mask_ : np.ndarray
+        The boolean mask indicating which features are selected.
+
+    Methods
+    -------
+    fit(X, y=None)
+        Fit the transformer to the input data. It calculates the feature scores and the support mask.
+    """
+
+    def __init__(
+        self,
+        model,
+        threshold: float = 1.0,
+    ):
+        self.model = model
+        self.threshold = threshold
+        super().__init__(self.model)
+
+    def fit(self, X: np.ndarray, y=None) -> "VIPSelector":
+        """
+        Fit the transformer to calculate the feature scores and the support mask.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The input data to fit the transformer to.
+
+        y : None
+            Ignored.
+
+        Returns
+        -------
+        self : VIPSelector
+            The fitted transformer.
+        """
+        # Check that X is a 2D array and has only finite values
+        X = validate_data(
+            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+        )
+
+        # Calculate the VIP scores
+        self.feature_scores_ = self._calculate_features(X)
+
+        # Calculate the support mask
+        self.support_mask_ = self._get_support_mask()
+
+        return self
+
+    def _get_support_mask(self) -> np.ndarray:
+        """
+        Get the support mask based on the feature scores and threshold.
+        Features with scores above the threshold are selected.
+
+        Returns
+        -------
+        support_mask_ : np.ndarray
+            The boolean mask indicating which features are selected.
+        """
+        return self.feature_scores_ > self.threshold
+
+    def _calculate_features(self, X: np.ndarray) -> np.ndarray:
+        """
+        Calculate the VIP scores based on the fitted model.
+
+        Returns
+        -------
+        feature_scores_ : np.ndarray
+            The calculated feature scores based on the selected method.
+        """
+        # Calculate sum of squares of y_loadings and x_scores
+        sum_of_squares_y_loadings = (
+            np.linalg.norm(self.estimator_.y_loadings_, ord=2, axis=0) ** 2
+        )
+        sum_of_squares_x_scores = (
+            np.linalg.norm(self.estimator_.x_scores_, ord=2, axis=0) ** 2
+        )
+
+        # Calculate the sum of squares
+        sum_of_squares = sum_of_squares_y_loadings * sum_of_squares_x_scores
+
+        # Calculate the numerator
+        numerator = self.estimator_.n_features_in_ * np.sum(
+            sum_of_squares * self.estimator_.x_weights_**2,
+            axis=1,
+        )
+
+        # Calculate the denominator
+        denominator = np.sum(sum_of_squares, axis=0)
+
+        # Calculate the VIP scores
+        return np.sqrt(numerator / denominator)
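For reference, the quantity `_calculate_features` computes above is the usual VIP score. With p = `n_features_in_`, A components, x-weights w_a, x-scores t_a, and y-loadings q_a:

\mathrm{VIP}_j = \sqrt{\frac{p \sum_{a=1}^{A} \mathrm{SS}_a \, w_{ja}^{2}}{\sum_{a=1}^{A} \mathrm{SS}_a}},
\qquad
\mathrm{SS}_a = \lVert q_a \rVert^{2} \, \lVert t_a \rVert^{2}

This coincides with the textbook definition (which divides each w_{ja}^2 by \lVert w_a \rVert^2) because scikit-learn's PLS produces unit-norm weight vectors; the default threshold of 1.0 follows the common "VIP greater than one" selection rule.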
{chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/_base.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import
+from typing import Optional, Tuple, Union
 
 import numpy as np
 
@@ -9,7 +9,6 @@ from sklearn.cross_decomposition._pls import _PLS
 from sklearn.pipeline import Pipeline
 from sklearn.utils.validation import check_is_fitted
 
-from ._utils import validate_confidence, validate_and_extract_model
 
 ModelTypes = Union[_BasePCA, _PLS]
 
@@ -29,10 +28,10 @@ class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
 
     Attributes
     ----------
-    model_ : ModelTypes
+    estimator_ : ModelTypes
        The fitted model of type _BasePCA or _PLS
 
-    preprocessing_ : Optional[Pipeline]
+    transformer_ : Optional[Pipeline]
        Preprocessing steps before the model
 
    n_features_in_ : int
@@ -54,13 +53,13 @@ class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
        confidence: float,
    ) -> None:
        (
-            self.model_,
-            self.preprocessing_,
+            self.estimator_,
+            self.transformer_,
            self.n_features_in_,
            self.n_components_,
            self.n_samples_,
-        ) = validate_and_extract_model(model)
-        self.confidence = validate_confidence(confidence)
+        ) = _validate_and_extract_model(model)
+        self.confidence = _validate_confidence(confidence)
 
    def fit_predict_residuals(
        self, X: np.ndarray, y: Optional[np.ndarray] = None
@@ -96,7 +95,7 @@ class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
        """
 
    @abstractmethod
-    def _calculate_critical_value(self, X:
+    def _calculate_critical_value(self, X: np.ndarray) -> float:
        """Calculate the critical value for outlier detection.
 
        Returns
@@ -106,75 +105,84 @@ class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
        """
 
 
-    """
-
+def _get_model_parameters(model: ModelTypes) -> Tuple[int, int, int]:
+    """
+    Get the number of features, components and samples from a PLS- or PCA-type model.
 
    Parameters
    ----------
-    model :
-        A fitted
-
-    Attributes
-    ----------
-    model_ : ModelTypes
-        The fitted model of type _BasePCA or _PLS
-
-    preprocessing_ : Optional[Pipeline]
-        Preprocessing steps before the model
+    model : ModelTypes
+        A fitted model of type _BasePCA or _PLS
 
+    Returns
+    -------
+    Tuple[int, int, int]
+        The number of features, components and samples in the model
    """
+    if isinstance(model, _BasePCA):
+        return model.n_features_in_, model.n_components_, model.n_samples_
+    elif isinstance(model, _PLS):
+        return model.n_features_in_, model.n_components, len(model.x_scores_)
+    else:
+        raise ValueError(
+            "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
+        )
 
-    def __init__(self, model: Union[ModelTypes, Pipeline]):
-        self.model_, self.preprocessing_ = self._validate_and_extract_model(model)
 
+def _validate_confidence(confidence: float) -> float:
+    """Validate parameters using sklearn conventions.
 
+    Parameters
+    ----------
+    confidence : float
+        Confidence level for statistical calculations (between 0 and 1)
 
+    Returns
+    -------
+    float
+        The validated confidence level
 
+    Raises
+    ------
+    ValueError
+        If confidence is not between 0 and 1
+    """
+    if not 0 < confidence < 1:
+        raise ValueError("Confidence must be between 0 and 1")
+    return confidence
 
-    else:
-        preprocessing = None
-
-    if isinstance(model, (_BasePCA, _PLS)):
-        check_is_fitted(model)
-    else:
-        raise ValueError(
-            "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
-        )
-    check_is_fitted(model)
-    return model, preprocessing
 
-    @abstractmethod
-    def predict(self, X: np.ndarray, y: Optional[np.ndarray]) -> np.ndarray:
-        """Predict the output of the model.
 
+def _validate_and_extract_model(
+    model: Union[ModelTypes, Pipeline],
+) -> Tuple[ModelTypes, Optional[Pipeline], int, int, int]:
+    """Validate and extract the model and preprocessing steps.
 
+    Parameters
+    ----------
+    model : Union[ModelTypes, Pipeline]
+        A fitted PCA/PLS model or Pipeline ending with such a model
 
+    Returns
+    -------
+    Tuple[ModelTypes, Optional[Pipeline], int, int, int]
+        The extracted model and preprocessing steps, and the number of features, components and samples
+
+    Raises
+    ------
+    ValueError
+        If the model is not of type _BasePCA or _PLS or a Pipeline ending with one of these types, or if the model is not fitted
+    """
+    if isinstance(model, Pipeline):
+        preprocessing = model[:-1]
+        model = model[-1]
+    else:
+        preprocessing = None
+
+    if not isinstance(model, (_BasePCA, _PLS)):
+        raise ValueError(
+            "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
+        )
+
+    check_is_fitted(model)
+    n_features_in, n_components, n_samples = _get_model_parameters(model)
+    return model, preprocessing, n_features_in, n_components, n_samples
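A quick sketch of what the refactored module-level helpers return. They are private helpers, so this is purely illustrative and assumes the layout shown above:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from chemotools.outliers._base import _validate_and_extract_model, _validate_confidence

rng = np.random.default_rng(1)
X = rng.normal(size=(30, 8))

# A fitted pipeline ending in a PCA model is accepted; the last step is extracted
pipe = make_pipeline(StandardScaler(), PCA(n_components=3)).fit(X)

model, transformer, n_features, n_components, n_samples = _validate_and_extract_model(pipe)
print(n_features, n_components, n_samples)  # 8 3 30

confidence = _validate_confidence(0.95)     # raises ValueError outside (0, 1)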
{chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/dmodx.py
@@ -7,6 +7,7 @@ from scipy.stats import f as f_distribution
 
 
 from ._base import _ModelResidualsBase, ModelTypes
+from .utils import calculate_residual_spectrum
 
 
 class DModX(_ModelResidualsBase):
@@ -25,10 +26,10 @@ class DModX(_ModelResidualsBase):
 
    Attributes
    ----------
-    model_ : ModelTypes
+    estimator_ : ModelTypes
        The fitted model of type _BasePCA or _PLS
 
-    preprocessing_ : Optional[Pipeline]
+    transformer_ : Optional[Pipeline]
        Preprocessing steps before the model
 
    n_features_in_ : int
@@ -42,6 +43,9 @@ class DModX(_ModelResidualsBase):
 
    critical_value_ : float
        The calculated critical value for outlier detection
+
+    train_spe_ : float
+        The training sum of squared errors (SSE) for the model, normalized by degrees of freedom
    """
 
    def __init__(
@@ -49,6 +53,7 @@ class DModX(_ModelResidualsBase):
        model: Union[ModelTypes, Pipeline],
        confidence: float = 0.95,
    ) -> None:
+        model, confidence = model, confidence
        super().__init__(model, confidence)
 
    def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "DModX":
@@ -62,7 +67,18 @@ class DModX(_ModelResidualsBase):
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )
 
+        # Calculate the critical value
        self.critical_value_ = self._calculate_critical_value()
+
+        # Calculate the degrees-of-freedom normalized SPE of the training set
+        residuals = calculate_residual_spectrum(X, self.estimator_)
+        squared_errors = np.sum((residuals) ** 2, axis=1)
+        self.train_spe_ = np.sqrt(
+            squared_errors
+            / (self.n_samples_ - self.n_components_ - 1)
+            * (self.n_features_in_ - self.n_components_)
+        )
+
        return self
 
    def predict(self, X: np.ndarray) -> np.ndarray:
@@ -118,15 +134,17 @@ class DModX(_ModelResidualsBase):
        )
 
        # Apply preprocessing if available
-        if self.preprocessing_:
-            X = self.preprocessing_.transform(X)
+        if self.transformer_:
+            X = self.transformer_.transform(X)
 
        # Calculate the DModX statistics
-        squared_errors = np.sum((X - X_reconstructed) ** 2, axis=1)
+        residual = calculate_residual_spectrum(X, self.estimator_)
+        squared_errors = np.sum((residual) ** 2, axis=1)
 
-        return
+        return (
+            np.sqrt(squared_errors / (self.n_features_in_ - self.n_components_))
+            / self.train_spe_
+        )
 
    def _calculate_critical_value(self, X: Optional[np.ndarray] = None) -> float:
        """Calculate F-distribution based critical value.
|