chemotools 0.1.10__tar.gz → 0.1.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. {chemotools-0.1.10 → chemotools-0.1.11}/PKG-INFO +2 -2
  2. chemotools-0.1.11/chemotools/feature_selection/__init__.py +6 -0
  3. chemotools-0.1.11/chemotools/feature_selection/_base.py +88 -0
  4. chemotools-0.1.11/chemotools/feature_selection/_sr_selector.py +137 -0
  5. chemotools-0.1.11/chemotools/feature_selection/_vip_selector.py +129 -0
  6. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/_base.py +75 -67
  7. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/dmodx.py +26 -8
  8. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/hotelling_t2.py +11 -10
  9. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/leverage.py +15 -14
  10. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/q_residuals.py +19 -16
  11. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/studentized_residuals.py +17 -16
  12. chemotools-0.1.11/chemotools/outliers/utils.py +51 -0
  13. {chemotools-0.1.10 → chemotools-0.1.11}/pyproject.toml +2 -2
  14. chemotools-0.1.10/chemotools/feature_selection/__init__.py +0 -4
  15. chemotools-0.1.10/chemotools/outliers/_utils.py +0 -91
  16. {chemotools-0.1.10 → chemotools-0.1.11}/LICENSE +0 -0
  17. {chemotools-0.1.10 → chemotools-0.1.11}/README.md +0 -0
  18. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/__init__.py +0 -0
  19. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/augmentation/__init__.py +0 -0
  20. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/augmentation/_add_noise.py +0 -0
  21. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/augmentation/_baseline_shift.py +0 -0
  22. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/augmentation/_fractional_shift.py +0 -0
  23. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/augmentation/_gaussian_broadening.py +0 -0
  24. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/augmentation/_index_shift.py +0 -0
  25. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/augmentation/_spectrum_scale.py +0 -0
  26. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/__init__.py +0 -0
  27. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_air_pls.py +0 -0
  28. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_ar_pls.py +0 -0
  29. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_constant_baseline_correction.py +0 -0
  30. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_cubic_spline_correction.py +0 -0
  31. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_linear_correction.py +0 -0
  32. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_non_negative.py +0 -0
  33. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_polynomial_correction.py +0 -0
  34. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/baseline/_subtract_reference.py +0 -0
  35. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/__init__.py +0 -0
  36. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/_base.py +0 -0
  37. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/data/__init__.py +0 -0
  38. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/data/coffee_labels.csv +0 -0
  39. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/data/coffee_spectra.csv +0 -0
  40. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/data/fermentation_hplc.csv +0 -0
  41. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/data/fermentation_spectra.csv +0 -0
  42. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/data/train_hplc.csv +0 -0
  43. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/datasets/data/train_spectra.csv +0 -0
  44. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/derivative/__init__.py +0 -0
  45. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/derivative/_norris_william.py +0 -0
  46. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/derivative/_savitzky_golay.py +0 -0
  47. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/feature_selection/_index_selector.py +0 -0
  48. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/feature_selection/_range_cut.py +0 -0
  49. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/outliers/__init__.py +0 -0
  50. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scale/__init__.py +0 -0
  51. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scale/_min_max_scaler.py +0 -0
  52. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scale/_norm_scaler.py +0 -0
  53. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scale/_point_scaler.py +0 -0
  54. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scatter/__init__.py +0 -0
  55. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scatter/_extended_multiplicative_scatter_correction.py +0 -0
  56. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scatter/_multiplicative_scatter_correction.py +0 -0
  57. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scatter/_robust_normal_variate.py +0 -0
  58. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/scatter/_standard_normal_variate.py +0 -0
  59. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/smooth/__init__.py +0 -0
  60. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/smooth/_mean_filter.py +0 -0
  61. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/smooth/_median_filter.py +0 -0
  62. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/smooth/_savitzky_golay_filter.py +0 -0
  63. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/smooth/_whittaker_smooth.py +0 -0
  64. {chemotools-0.1.10 → chemotools-0.1.11}/chemotools/utils/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: chemotools
3
- Version: 0.1.10
3
+ Version: 0.1.11
4
4
  Summary: chemotools: A Python Package that Integrates Chemometrics and scikit-learn
5
5
  License: MIT
6
6
  Author: Pau Cabaneros
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.13
14
14
  Requires-Dist: numpy (>=2.0.0,<3.0.0)
15
15
  Requires-Dist: pandas (>=2.0.0,<3.0.0)
16
16
  Requires-Dist: polars (>=1.17.0,<2.0.0)
17
- Requires-Dist: pyarrow (>=18.0.0,<19.0.0)
17
+ Requires-Dist: pyarrow (>=18,<21)
18
18
  Requires-Dist: scikit-learn (>=1.4.0,<2.0.0)
19
19
  Description-Content-Type: text/markdown
20
20
 
@@ -0,0 +1,6 @@
1
+ from ._index_selector import IndexSelector
2
+ from ._range_cut import RangeCut
3
+ from ._sr_selector import SRSelector
4
+ from ._vip_selector import VIPSelector
5
+
6
+ __all__ = ["IndexSelector", "RangeCut", "SRSelector", "VIPSelector"]
@@ -0,0 +1,88 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Union
3
+
4
+ import numpy as np
5
+
6
+ from sklearn.base import BaseEstimator
7
+ from sklearn.cross_decomposition._pls import _PLS
8
+ from sklearn.feature_selection._base import SelectorMixin
9
+ from sklearn.pipeline import Pipeline
10
+ from sklearn.utils.validation import check_is_fitted
11
+
12
+ ModelTypes = Union[_PLS, Pipeline]
13
+
14
+
15
+ class _PLSFeatureSelectorBase(ABC, BaseEstimator, SelectorMixin):
16
+ """Feature selection base class for _PLS-like models.
17
+
18
+
19
+ Parameters
20
+ ----------
21
+ model : Union[_PLS, Pipeline]
22
+ A fitted _PLS model or a Pipeline ending with such a model
23
+
24
+ threshold : float
25
+ The threshold for feature selection. Features with importance
26
+ above this threshold will be selected.
27
+
28
+ Attributes
29
+ ----------
30
+ estimator_ : ModelTypes
31
+ The fitted model of type _PLS
32
+
33
+ feature_scores_ : np.ndarray
34
+ The calculated feature scores based on the selected method.
35
+
36
+ support_mask_ : np.ndarray
37
+ The boolean mask indicating which features are selected.
38
+ """
39
+
40
+ def __init__(
41
+ self,
42
+ model: Union[_PLS, Pipeline],
43
+ ) -> None:
44
+ self.estimator_ = _validate_and_extract_model(model)
45
+
46
+ @abstractmethod
47
+ def _calculate_features(self, X: np.ndarray) -> np.ndarray:
48
+ """Calculate the feature scores of the model.
49
+
50
+ Returns
51
+ -------
52
+ ndarray of shape (n_features,)
53
+ The feature scores of the model
54
+ """
55
+
56
+
57
+ def _validate_and_extract_model(
58
+ model: Union[_PLS, Pipeline],
59
+ ) -> _PLS:
60
+ """Validate and extract the model.
61
+
62
+ Parameters
63
+ ----------
64
+ model : Union[_PLS, Pipeline]
65
+ A fitted _PLS model or Pipeline ending with such a model
66
+
67
+ Returns
68
+ -------
69
+ _PLS
70
+ The extracted estimator
71
+
72
+ Raises
73
+ ------
74
+ TypeError
75
+ If the model is not of type _PLS or a Pipeline ending with such a model, or if the model is not fitted
76
+ """
77
+ if isinstance(model, Pipeline):
78
+ estimator = model[-1]
79
+ else:
80
+ estimator = model
81
+
82
+ if not isinstance(estimator, _PLS):
83
+ raise TypeError(
84
+ "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
85
+ )
86
+
87
+ check_is_fitted(model)
88
+ return estimator
@@ -0,0 +1,137 @@
1
+ import numpy as np
2
+ from sklearn.utils.validation import validate_data
3
+
4
+ from ._base import _PLSFeatureSelectorBase
5
+
6
+
7
+ class SRSelector(_PLSFeatureSelectorBase):
8
+ """
9
+ This selector is used to select features that contribute significantly
10
+ to the latent variables in a PLS regression model using the Selectivity
11
+ Ratio (SR) method.
12
+
13
+ Parameters
14
+ ----------
15
+ - model: Union[_PLS, Pipeline]
16
+ The PLS regression model or a pipeline with a PLS regression model as last step.
17
+
18
+ - threshold: float, default=1.0
19
+ The threshold for feature selection. Features with importance
20
+ above this threshold will be selected.
21
+
22
+ Attributes
23
+ ----------
24
+ estimator_ : ModelTypes
25
+ The fitted model of type _PLS
26
+
27
+ feature_scores_ : np.ndarray
28
+ The calculated feature scores based on the selected method.
29
+
30
+ support_mask_ : np.ndarray
31
+ The boolean mask indicating which features are selected.
32
+
33
+ Methods
34
+ -------
35
+ fit(X, y=None)
36
+ Fit the transformer to the input data. It calculates the feature scores and the support mask.
37
+ """
38
+
39
+ def __init__(
40
+ self,
41
+ model,
42
+ threshold: float = 1.0,
43
+ ):
44
+ self.model = model
45
+ self.threshold = threshold
46
+ super().__init__(self.model)
47
+
48
+ def fit(self, X: np.ndarray, y=None) -> "SRSelector":
49
+ """
50
+ Fit the transformer to calculate the feature scores and the support mask.
51
+
52
+ Parameters
53
+ ----------
54
+ X : array-like of shape (n_samples, n_features)
55
+ The input data to fit the transformer to.
56
+
57
+ y : None
58
+ Ignored.
59
+
60
+ Returns
61
+ -------
62
+ self : SRSelector
63
+ The fitted transformer.
64
+ """
65
+ # Check that X is a 2D array and has only finite values
66
+ X = validate_data(
67
+ self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
68
+ )
69
+
70
+ # Calculate the SR scores
71
+ self.feature_scores_ = self._calculate_features(X)
72
+
73
+ # Calculate the support mask
74
+ self.support_mask_ = self._get_support_mask()
75
+
76
+ return self
77
+
78
+ def _get_support_mask(self) -> np.ndarray:
79
+ """
80
+ Get the support mask based on the feature scores and threshold.
81
+ Features with scores above the threshold are selected.
82
+ Parameters
83
+ ----------
84
+ self : SRSelector
85
+ The fitted transformer.
86
+
87
+ Returns
88
+ -------
89
+ support_mask_ : np.ndarray
90
+ The boolean mask indicating which features are selected.
91
+ """
92
+ return self.feature_scores_ > self.threshold
93
+
94
+ def _calculate_features(self, X: np.ndarray) -> np.ndarray:
95
+ """
96
+ Vectorized Selectivity Ratio calculation from a fitted _PLS
97
+ like model.
98
+
99
+ Parameters
100
+ ----------
101
+ - self: SRSelector
102
+ The fitted transformer.
103
+
104
+ - X: array-like of shape (n_samples, n_features)
105
+ The input training data to calculate the feature scores from.
106
+
107
+ Returns
108
+ -------
109
+ feature_scores_ : np.ndarray
110
+ The calculated feature scores based on the selected method.
111
+ """
112
+ bpls = self.estimator_.coef_
113
+ bpls_norm = bpls.T / np.linalg.norm(bpls)
114
+
115
+ # Handle 1D case correctly
116
+ if bpls.ndim == 1:
117
+ bpls_norm = bpls_norm.reshape(-1, 1)
118
+
119
+ # Project X onto the regression vector
120
+ ttp = X @ bpls_norm
121
+ ptp = X.T @ np.linalg.pinv(ttp).T
122
+
123
+ # Predicted part of X
124
+ X_hat = ttp @ ptp.T
125
+
126
+ # Compute squared norms directly
127
+ total_ss = np.linalg.norm(X, axis=0) ** 2
128
+ explained_ss = np.linalg.norm(X_hat, axis=0) ** 2
129
+
130
+ # Calculate residual sum of squares
131
+ residual_ss = total_ss - explained_ss
132
+
133
+ # Stability: avoid division by zero
134
+ epsilon = 1e-12
135
+
136
+ # Calculate Selectivity Ratio
137
+ return explained_ss / (residual_ss + epsilon)
@@ -0,0 +1,129 @@
1
+ import numpy as np
2
+ from sklearn.utils.validation import validate_data
3
+
4
+ from ._base import _PLSFeatureSelectorBase
5
+
6
+
7
+ class VIPSelector(_PLSFeatureSelectorBase):
8
+ """
9
+ This selector is used to select features that contribute significantly
10
+ to the latent variables in a PLS regression model using the Variables
11
+ Importance in Projection (VIP) method.
12
+
13
+ Parameters
14
+ ----------
15
+ - model: Union[_PLS, Pipeline]
16
+ The PLS regression model or a pipeline with a PLS regression model as last step.
17
+
18
+ - threshold: float, default=1.0
19
+ The threshold for feature selection. Features with importance
20
+ above this threshold will be selected.
21
+
22
+ Attributes
23
+ ----------
24
+ estimator_ : ModelTypes
25
+ The fitted model of type _PLS
26
+
27
+ feature_scores_ : np.ndarray
28
+ The calculated feature scores based on the selected method.
29
+
30
+ support_mask_ : np.ndarray
31
+ The boolean mask indicating which features are selected.
32
+
33
+ Methods
34
+ -------
35
+ fit(X, y=None)
36
+ Fit the transformer to the input data. It calculates the feature scores and the support mask.
37
+ """
38
+
39
+ def __init__(
40
+ self,
41
+ model,
42
+ threshold: float = 1.0,
43
+ ):
44
+ self.model = model
45
+ self.threshold = threshold
46
+ super().__init__(self.model)
47
+
48
+ def fit(self, X: np.ndarray, y=None) -> "VIPSelector":
49
+ """
50
+ Fit the transformer to calculate the feature scores and the support mask.
51
+
52
+ Parameters
53
+ ----------
54
+ X : array-like of shape (n_samples, n_features)
55
+ The input data to fit the transformer to.
56
+
57
+ y : None
58
+ Ignored.
59
+
60
+ Returns
61
+ -------
62
+ self : VIPSelector
63
+ The fitted transformer.
64
+ """
65
+ # Check that X is a 2D array and has only finite values
66
+ X = validate_data(
67
+ self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
68
+ )
69
+
70
+ # Calculate the VIP scores
71
+ self.feature_scores_ = self._calculate_features(X)
72
+
73
+ # Calculate the support mask
74
+ self.support_mask_ = self._get_support_mask()
75
+
76
+ return self
77
+
78
+ def _get_support_mask(self) -> np.ndarray:
79
+ """
80
+ Get the support mask based on the feature scores and threshold.
81
+ Features with scores above the threshold are selected.
82
+ Parameters
83
+ ----------
84
+ self : VIPSelector
85
+ The fitted transformer.
86
+
87
+ Returns
88
+ -------
89
+ support_mask_ : np.ndarray
90
+ The boolean mask indicating which features are selected.
91
+ """
92
+ return self.feature_scores_ > self.threshold
93
+
94
+ def _calculate_features(self, X: np.ndarray) -> np.ndarray:
95
+ """
96
+ Calculate the VIP scores based on the fitted model.
97
+
98
+ Parameters
99
+ ----------
100
+ self : VIPSelector
101
+ The fitted transformer.
102
+
103
+ Returns
104
+ -------
105
+ feature_scores_ : np.ndarray
106
+ The calculated feature scores based on the selected method.
107
+ """
108
+ # Calculate sum of squares of y_loadings and x_scores
109
+ sum_of_squares_y_loadings = (
110
+ np.linalg.norm(self.estimator_.y_loadings_, ord=2, axis=0) ** 2
111
+ )
112
+ sum_of_squares_x_scores = (
113
+ np.linalg.norm(self.estimator_.x_scores_, ord=2, axis=0) ** 2
114
+ )
115
+
116
+ # Calculate the sum of squares
117
+ sum_of_squares = sum_of_squares_y_loadings * sum_of_squares_x_scores
118
+
119
+ # Calculate the numerator
120
+ numerator = self.estimator_.n_features_in_ * np.sum(
121
+ sum_of_squares * self.estimator_.x_weights_**2,
122
+ axis=1,
123
+ )
124
+
125
+ # Calculate the denominator
126
+ denominator = np.sum(sum_of_squares, axis=0)
127
+
128
+ # Calculate the VIP scores
129
+ return np.sqrt(numerator / denominator)
@@ -1,5 +1,5 @@
1
1
  from abc import ABC, abstractmethod
2
- from typing import Union, Optional
2
+ from typing import Optional, Tuple, Union
3
3
 
4
4
  import numpy as np
5
5
 
@@ -9,7 +9,6 @@ from sklearn.cross_decomposition._pls import _PLS
9
9
  from sklearn.pipeline import Pipeline
10
10
  from sklearn.utils.validation import check_is_fitted
11
11
 
12
- from ._utils import validate_confidence, validate_and_extract_model
13
12
 
14
13
  ModelTypes = Union[_BasePCA, _PLS]
15
14
 
@@ -29,10 +28,10 @@ class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
29
28
 
30
29
  Attributes
31
30
  ----------
32
- model_ : ModelTypes
31
+ estimator_ : ModelTypes
33
32
  The fitted model of type _BasePCA or _PLS
34
33
 
35
- preprocessing_ : Optional[Pipeline]
34
+ transformer_ : Optional[Pipeline]
36
35
  Preprocessing steps before the model
37
36
 
38
37
  n_features_in_ : int
@@ -54,13 +53,13 @@ class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
54
53
  confidence: float,
55
54
  ) -> None:
56
55
  (
57
- self.model_,
58
- self.preprocessing_,
56
+ self.estimator_,
57
+ self.transformer_,
59
58
  self.n_features_in_,
60
59
  self.n_components_,
61
60
  self.n_samples_,
62
- ) = validate_and_extract_model(model)
63
- self.confidence = validate_confidence(confidence)
61
+ ) = _validate_and_extract_model(model)
62
+ self.confidence = _validate_confidence(confidence)
64
63
 
65
64
  def fit_predict_residuals(
66
65
  self, X: np.ndarray, y: Optional[np.ndarray] = None
@@ -96,7 +95,7 @@ class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
96
95
  """
97
96
 
98
97
  @abstractmethod
99
- def _calculate_critical_value(self, X: Optional[np.ndarray]) -> float:
98
+ def _calculate_critical_value(self, X: np.ndarray) -> float:
100
99
  """Calculate the critical value for outlier detection.
101
100
 
102
101
  Returns
@@ -106,75 +105,84 @@ class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
106
105
  """
107
106
 
108
107
 
109
- class _ModelDiagnosticsBase(ABC):
110
- """Base class for model diagnostics methods. This does not implement outlier detection algorithms,
111
- but rather implements methods that are used to assess trained models.
108
+ def _get_model_parameters(model: ModelTypes) -> Tuple[int, int, int]:
109
+ """
110
+ Get the number of features, components and samples from a model of PCA or PLS type.
112
111
 
113
112
  Parameters
114
113
  ----------
115
- model : Union[ModelTypes, Pipeline]
116
- A fitted PCA/PLS model or Pipeline ending with such a model
117
-
118
- Attributes
119
- ----------
120
- model_ : ModelTypes
121
- The fitted model of type _BasePCA or _PLS
122
-
123
- preprocessing_ : Optional[Pipeline]
124
- Preprocessing steps before the model
114
+ model : ModelTypes
115
+ A fitted model of type _BasePCA or _PLS
125
116
 
117
+ Returns
118
+ -------
119
+ Tuple[int, int, int]
120
+ The number of features, components and samples in the model
126
121
  """
122
+ if isinstance(model, _BasePCA):
123
+ return model.n_features_in_, model.n_components_, model.n_samples_
124
+ elif isinstance(model, _PLS):
125
+ return model.n_features_in_, model.n_components, len(model.x_scores_)
126
+ else:
127
+ raise ValueError(
128
+ "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
129
+ )
127
130
 
128
- def __init__(self, model: Union[ModelTypes, Pipeline]):
129
- self.model_, self.preprocessing_ = self._validate_and_extract_model(model)
130
131
 
131
- def _validate_and_extract_model(self, model):
132
- """Validate and extract the model and preprocessing steps.
132
+ def _validate_confidence(confidence: float) -> float:
133
+ """Validate the confidence level.
133
134
 
134
- Parameters
135
- ----------
136
- model : Union[ModelTypes, Pipeline]
137
- A fitted PCA/PLS model or Pipeline ending with such a model
135
+ Parameters
136
+ ----------
137
+ confidence : float
138
+ Confidence level for statistical calculations (between 0 and 1)
138
139
 
139
- Returns
140
- -------
141
- Tuple[ModelTypes, Optional[Pipeline]]
142
- The extracted model and preprocessing steps
140
+ Returns
141
+ -------
142
+ float
143
+ The validated confidence level
143
144
 
144
- Raises
145
- ------
146
- ValueError
147
- If the model is not of type _BasePCA or _PLS or a Pipeline ending with one of these types or if the model is not fitted
148
- """
149
- if isinstance(model, Pipeline):
150
- preprocessing = model[:-1]
151
- model = model[-1]
152
- else:
153
- preprocessing = None
154
-
155
- if isinstance(model, (_BasePCA, _PLS)):
156
- check_is_fitted(model)
157
- else:
158
- raise ValueError(
159
- "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
160
- )
161
- check_is_fitted(model)
162
- return model, preprocessing
145
+ Raises
146
+ ------
147
+ ValueError
148
+ If confidence is not between 0 and 1
149
+ """
150
+ if not 0 < confidence < 1:
151
+ raise ValueError("Confidence must be between 0 and 1")
152
+ return confidence
163
153
 
164
- @abstractmethod
165
- def predict(self, X: np.ndarray, y: Optional[np.ndarray]) -> np.ndarray:
166
- """Predict the output of the model.
167
154
 
168
- Parameters
169
- ----------
170
- X : array-like of shape (n_samples, n_features)
171
- Input data
155
+ def _validate_and_extract_model(
156
+ model: Union[ModelTypes, Pipeline],
157
+ ) -> Tuple[ModelTypes, Optional[Pipeline], int, int, int]:
158
+ """Validate and extract the model and preprocessing steps.
172
159
 
173
- y : array-like of shape (n_samples,), default=None
174
- Target values
160
+ Parameters
161
+ ----------
162
+ model : Union[ModelTypes, Pipeline]
163
+ A fitted PCA/PLS model or Pipeline ending with such a model
175
164
 
176
- Returns
177
- -------
178
- ndarray of shape (n_samples,)
179
- Predicted values
180
- """
165
+ Returns
166
+ -------
167
+ Tuple[ModelTypes, Optional[Pipeline], int, int, int]
168
+ The extracted model, the preprocessing steps, and the number of features, components and samples
169
+
170
+ Raises
171
+ ------
172
+ ValueError
173
+ If the model is not of type _BasePCA or _PLS or a Pipeline ending with one of these types or if the model is not fitted
174
+ """
175
+ if isinstance(model, Pipeline):
176
+ preprocessing = model[:-1]
177
+ model = model[-1]
178
+ else:
179
+ preprocessing = None
180
+
181
+ if not isinstance(model, (_BasePCA, _PLS)):
182
+ raise ValueError(
183
+ "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
184
+ )
185
+
186
+ check_is_fitted(model)
187
+ n_features_in, n_components, n_samples = _get_model_parameters(model)
188
+ return model, preprocessing, n_features_in, n_components, n_samples
@@ -7,6 +7,7 @@ from scipy.stats import f as f_distribution
7
7
 
8
8
 
9
9
  from ._base import _ModelResidualsBase, ModelTypes
10
+ from .utils import calculate_residual_spectrum
10
11
 
11
12
 
12
13
  class DModX(_ModelResidualsBase):
@@ -25,10 +26,10 @@ class DModX(_ModelResidualsBase):
25
26
 
26
27
  Attributes
27
28
  ----------
28
- model_ : ModelType
29
+ estimator_ : ModelTypes
29
30
  The fitted model of type _BasePCA or _PLS
30
31
 
31
- preprocessing_ : Optional[Pipeline]
32
+ transformer_ : Optional[Pipeline]
32
33
  Preprocessing steps before the model
33
34
 
34
35
  n_features_in_ : int
@@ -42,6 +43,9 @@ class DModX(_ModelResidualsBase):
42
43
 
43
44
  critical_value_ : float
44
45
  The calculated critical value for outlier detection
46
+
47
+ train_spe_: float
48
+ The training sum of squared errors (SSE) for the model normalized by degrees of freedom
45
49
  """
46
50
 
47
51
  def __init__(
@@ -49,6 +53,7 @@ class DModX(_ModelResidualsBase):
49
53
  model: Union[ModelTypes, Pipeline],
50
54
  confidence: float = 0.95,
51
55
  ) -> None:
56
+ model, confidence = model, confidence
52
57
  super().__init__(model, confidence)
53
58
 
54
59
  def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "DModX":
@@ -62,7 +67,18 @@ class DModX(_ModelResidualsBase):
62
67
  self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
63
68
  )
64
69
 
70
+ # Calculate the critical value
65
71
  self.critical_value_ = self._calculate_critical_value()
72
+
73
+ # Calculate the degrees of freedom normalized SPE of the training set
74
+ residuals = calculate_residual_spectrum(X, self.estimator_)
75
+ squared_errors = np.sum((residuals) ** 2, axis=1)
76
+ self.train_spe_ = np.sqrt(
77
+ squared_errors
78
+ / (self.n_samples_ - self.n_components_ - 1)
79
+ * (self.n_features_in_ - self.n_components_)
80
+ )
81
+
66
82
  return self
67
83
 
68
84
  def predict(self, X: np.ndarray) -> np.ndarray:
@@ -118,15 +134,17 @@ class DModX(_ModelResidualsBase):
118
134
  )
119
135
 
120
136
  # Apply preprocessing if available
121
- if self.preprocessing_:
122
- X = self.preprocessing_.transform(X)
137
+ if self.transformer_:
138
+ X = self.transformer_.transform(X)
123
139
 
124
140
  # Calculate the DModX statistics
125
- X_transformed = self.model_.transform(X)
126
- X_reconstructed = self.model_.inverse_transform(X_transformed)
127
- squared_errors = np.sum((X - X_reconstructed) ** 2, axis=1)
141
+ residual = calculate_residual_spectrum(X, self.estimator_)
142
+ squared_errors = np.sum((residual) ** 2, axis=1)
128
143
 
129
- return np.sqrt(squared_errors / (self.n_features_in_ - self.n_components_))
144
+ return (
145
+ np.sqrt(squared_errors / (self.n_features_in_ - self.n_components_))
146
+ / self.train_spe_
147
+ )
130
148
 
131
149
  def _calculate_critical_value(self, X: Optional[np.ndarray] = None) -> float:
132
150
  """Calculate F-distribution based critical value.