chemotools 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -1,23 +1,37 @@
  from typing import Literal, Optional

  import numpy as np
- from numpy.polynomial import polynomial as poly
+ from scipy.signal import convolve
+ from scipy import stats
  from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
  from sklearn.utils.validation import check_is_fitted, validate_data


  class IndexShift(TransformerMixin, OneToOneFeatureMixin, BaseEstimator):
      """
-     Shift the spectrum a given number of indices between - shift and + shift drawn
+     Shift the spectrum a given number of indices between -shift and +shift drawn
      from a discrete uniform distribution.

      Parameters
      ----------
-     shift : float, default=0.0
-         Shifts the data by a random integer between -shift and shift.
+     shift : int, default=0
+         Maximum number of indices by which the data is randomly shifted.
+         The actual shift is a random integer between -shift and shift (inclusive).

-     random_state : int, default=None
-         The random state to use for the random number generator.
+     padding_mode : {'zeros', 'constant', 'wrap', 'extend', 'mirror', 'linear'}, default='linear'
+         Specifies how to handle padding when shifting the data:
+         - 'zeros': Pads with zeros.
+         - 'constant': Pads with a constant value defined by `pad_value`.
+         - 'wrap': Circular shift (wraps around).
+         - 'extend': Extends using edge values.
+         - 'mirror': Mirrors the signal.
+         - 'linear': Uses linear regression to extrapolate values.
+
+     pad_value : float, default=0.0
+         The value used for padding when `padding_mode='constant'`.
+
+     random_state : int, optional, default=None
+         The random seed for reproducibility.

      Attributes
      ----------
@@ -27,23 +41,22 @@ class IndexShift(TransformerMixin, OneToOneFeatureMixin, BaseEstimator):
      _is_fitted : bool
          Whether the transformer has been fitted to data.

-     Methods
-     -------
-     fit(X, y=None)
-         Fit the transformer to the input data.
-
-     transform(X, y=0, copy=True)
-         Transform the input data by shifting the spectrum.
+     _rng : numpy.random.Generator
+         Random number generator instance used for shifting.
      """

      def __init__(
          self,
          shift: int = 0,
-         fill_method: Literal["constant", "linear", "quadratic"] = "constant",
+         padding_mode: Literal[
+             "zeros", "constant", "wrap", "extend", "mirror", "linear"
+         ] = "linear",
+         pad_value: float = 0.0,
          random_state: Optional[int] = None,
      ):
          self.shift = shift
-         self.fill_method = fill_method
+         self.padding_mode = padding_mode
+         self.pad_value = pad_value
          self.random_state = random_state

      def fit(self, X: np.ndarray, y=None) -> "IndexShift":
@@ -68,12 +81,6 @@ class IndexShift(TransformerMixin, OneToOneFeatureMixin, BaseEstimator):
              self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
          )

-         # Set the number of features
-         self.n_features_in_ = X.shape[1]
-
-         # Set the fitted attribute to True
-         self._is_fitted = True
-
          # Instantiate the random number generator
          self._rng = np.random.default_rng(self.random_state)

@@ -94,10 +101,10 @@ class IndexShift(TransformerMixin, OneToOneFeatureMixin, BaseEstimator):
          Returns
          -------
          X_ : np.ndarray of shape (n_samples, n_features)
-             The transformed data.
+             The transformed data with the applied shifts.
          """
          # Check that the estimator is fitted
-         check_is_fitted(self, "_is_fitted")
+         check_is_fitted(self, "n_features_in_")

          # Check that X is a 2D array and has only finite values
          X_ = validate_data(
@@ -110,90 +117,98 @@ class IndexShift(TransformerMixin, OneToOneFeatureMixin, BaseEstimator):
              dtype=np.float64,
          )

-         # Check that the number of features is the same as the fitted data
-         if X_.shape[1] != self.n_features_in_:
-             raise ValueError(
-                 f"Expected {self.n_features_in_} features but got {X_.shape[1]}"
-             )
-
          # Calculate the standard normal variate
          for i, x in enumerate(X_):
-             X_[i] = self._shift_vector(x)
+             X_[i] = self._shift_signal(x)

          return X_.reshape(-1, 1) if X_.ndim == 1 else X_

-     def _shift_spectrum(self, x) -> np.ndarray:
-         shift_amount = self._rng.integers(-self.shift, self.shift, endpoint=True)
-         return np.roll(x, shift_amount)
-
-     def _shift_vector(
-         self,
-         x: np.ndarray,
-     ) -> np.ndarray:
+     def _shift_signal(self, x: np.ndarray):
          """
-         Shift vector with option to fill missing values.
-
-         Args:
-             arr: Input numpy array
-             shift: Number of positions to shift
-             fill_method: Method to fill missing values
-                 'constant': fill with first/last value
-                 'linear': fill using linear regression
-                 'quadratic': fill using quadratic regression
-
-         Returns:
-             Shifted numpy array
-         """
-         shift = self._rng.integers(-self.shift, self.shift, endpoint=True)
-
-         result = np.roll(x, shift)
-
-         if self.fill_method == "constant":
-             if shift > 0:
-                 result[:shift] = x[0]
-             elif shift < 0:
-                 result[shift:] = x[-1]
-
-         elif self.fill_method == "linear":
-             if shift > 0:
-                 x_ = np.arange(5)
-                 coeffs = poly.polyfit(x_, x[:5], 1)
+         Shifts a discrete signal using convolution with a Dirac delta kernel.

-                 extrapolate_x = np.arange(-shift, 0)
-                 extrapolated_values = poly.polyval(extrapolate_x, coeffs)
-
-                 result[:shift] = extrapolated_values
-
-             elif shift < 0:
-                 x_ = np.arange(5)
-                 coeffs = poly.polyfit(x_, x[-5:], 1)
-
-                 extrapolate_x = np.arange(len(x_), len(x_) - shift)
-                 extrapolated_values = poly.polyval(extrapolate_x, coeffs)
-
-                 result[shift:] = extrapolated_values
-
-         elif self.fill_method == "quadratic":
-             if shift > 0:
-                 # Use first 3 values for quadratic regression
-                 x_ = np.arange(5)
-                 coeffs = poly.polyfit(x_, x[:5], 2)
-
-                 # Extrapolate to fill shifted region
-                 extrapolate_x = np.arange(-shift, 0)
-                 extrapolated_values = poly.polyval(extrapolate_x, coeffs)
-
-                 result[:shift] = extrapolated_values
-
-             elif shift < 0:
-                 # Use last 3 values for quadratic regression
-                 x_ = np.arange(5)
-                 coeffs = poly.polyfit(x_, x[-5:], 2)
-
-                 # Extrapolate to fill shifted region
-                 extrapolate_x = np.arange(len(x_), len(x_) - shift)
-                 extrapolated_values = poly.polyval(extrapolate_x, coeffs)
+         Parameters
+         ----------
+         x : np.ndarray of shape (n_features,)
+             The input signal to shift.

-                 result[shift:] = extrapolated_values
+         Returns
+         -------
+         result : np.ndarray of shape (n_features,)
+             The shifted signal.
+         """
+         shift = self._rng.integers(-self.shift, self.shift, endpoint=True)

-         return result
+         if self.padding_mode == "wrap":
+             return np.roll(x, shift)
+
+         # Create Dirac delta kernel with proper dimensions
+
+         if shift >= 0:
+             kernel = np.zeros(shift + 1)
+             kernel[-1] = 1
+         else:
+             kernel = np.zeros(-shift + 1)
+             kernel[0] = 1
+
+         # Convolve signal with kernel
+         shifted = convolve(x, kernel, mode="full")
+
+         if shift >= 0:
+             result = shifted[: len(x)] if x.ndim == 1 else shifted[: x.shape[0]]
+             pad_length = shift
+             pad_left = True
+         else:
+             result = shifted[-len(x) :] if x.ndim == 1 else shifted[-x.shape[0] :]
+             pad_length = -shift
+             pad_left = False
+
+         if self.padding_mode == "zeros":
+             return result
+
+         elif self.padding_mode == "constant":
+             mask = np.abs(result) < 1e-10
+             result[mask] = self.pad_value
+             return result
+
+         elif self.padding_mode == "mirror":
+             if pad_left:
+                 pad_values = x[pad_length - 1 :: -1]
+                 result[:pad_length] = pad_values[-pad_length:]
+             else:
+                 pad_values = x[:-1][::-1]
+                 result[-pad_length:] = pad_values[:pad_length]
+
+             return result
+
+         elif self.padding_mode == "extend":
+             if pad_left:
+                 result[:pad_length] = x[0]
+             else:
+                 result[-pad_length:] = x[-1]
+             return result
+
+         elif self.padding_mode == "linear":
+             # Get points for linear regression
+             if pad_left:
+                 points = x[: pad_length + 1]  # Take first pad_length+1 points
+                 x_coords = np.arange(len(points))
+                 slope, intercept, _, _, _ = stats.linregress(x_coords, points)
+
+                 # Generate new points using linear regression
+                 new_x = np.arange(-pad_length, 0)
+                 extrapolated = slope * new_x + intercept
+                 result[:pad_length] = extrapolated
+             else:
+                 points = x[-pad_length - 1 :]  # Take last pad_length+1 points
+                 x_coords = np.arange(len(points))
+                 slope, intercept, _, _, _ = stats.linregress(x_coords, points)
+
+                 # Generate new points using linear regression
+                 new_x = np.arange(len(points), len(points) + pad_length)
+                 extrapolated = slope * new_x + intercept
+                 result[-pad_length:] = extrapolated
+             return result
+
+         else:
+             raise ValueError(f"Unknown padding mode: {self.padding_mode}")
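A minimal usage sketch of the reworked IndexShift augmentation shown above, assuming it is exported as chemotools.augmentation.IndexShift (the import path, data, and parameter values are illustrative, not taken from the diff):

# Hedged sketch: exercise the new padding_mode / pad_value parameters.
import numpy as np
from chemotools.augmentation import IndexShift  # assumed import path

# Three toy "spectra" of 50 points each
X = np.tile(np.linspace(0.0, 1.0, 50), (3, 1))

# Each row is shifted by a random integer in [-5, 5]; vacated edge values are
# extrapolated with a linear fit (the new default padding_mode="linear").
shifter = IndexShift(shift=5, padding_mode="linear", random_state=42)
X_shifted = shifter.fit_transform(X)
print(X_shifted.shape)  # (3, 50)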
@@ -0,0 +1,7 @@
+ from .dmodx import DModX
+ from .hotelling_t2 import HotellingT2
+ from .q_residuals import QResiduals
+ from .leverage import Leverage
+ from .studentized_residuals import StudentizedResiduals
+
+ __all__ = ["DModX", "HotellingT2", "QResiduals", "Leverage", "StudentizedResiduals"]
@@ -0,0 +1,180 @@
+ from abc import ABC, abstractmethod
+ from typing import Union, Optional
+
+ import numpy as np
+
+ from sklearn.base import BaseEstimator, OutlierMixin
+ from sklearn.decomposition._base import _BasePCA
+ from sklearn.cross_decomposition._pls import _PLS
+ from sklearn.pipeline import Pipeline
+ from sklearn.utils.validation import check_is_fitted
+
+ from ._utils import validate_confidence, validate_and_extract_model
+
+ ModelTypes = Union[_BasePCA, _PLS]
+
+
+ class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
+     """Base class for model outlier calculations.
+
+     Implements statistical calculations for outlier detection in dimensionality
+     reduction models like PCA and PLS.
+
+     Parameters
+     ----------
+     model : Union[ModelTypes, Pipeline]
+         A fitted _BasePCA or _PLS model, or a Pipeline ending with such a model
+     confidence : float
+         Confidence level for statistical calculations (between 0 and 1)
+
+     Attributes
+     ----------
+     model_ : ModelTypes
+         The fitted model of type _BasePCA or _PLS
+
+     preprocessing_ : Optional[Pipeline]
+         Preprocessing steps before the model
+
+     n_features_in_ : int
+         Number of features in the input data
+
+     n_components_ : int
+         Number of components in the model
+
+     n_samples_ : int
+         Number of samples used to train the model
+
+     critical_value_ : float
+         The calculated critical value for outlier detection
+     """
+
+     def __init__(
+         self,
+         model: Union[ModelTypes, Pipeline],
+         confidence: float,
+     ) -> None:
+         (
+             self.model_,
+             self.preprocessing_,
+             self.n_features_in_,
+             self.n_components_,
+             self.n_samples_,
+         ) = validate_and_extract_model(model)
+         self.confidence = validate_confidence(confidence)
+
+     def fit_predict_residuals(
+         self, X: np.ndarray, y: Optional[np.ndarray] = None
+     ) -> np.ndarray:
+         """Fit the model to the input data and calculate the residuals.
+
+         Parameters
+         ----------
+         X : array-like of shape (n_samples, n_features)
+             Input data
+
+         y : array-like of shape (n_samples,), default=None
+             Target values
+
+         Returns
+         -------
+         ndarray of shape (n_samples,)
+             The residuals of the model
+         """
+         self.fit(X, y)
+         return self.predict_residuals(X, y, validate=True)
+
+     @abstractmethod
+     def predict_residuals(
+         self, X: np.ndarray, y: Optional[np.ndarray], validate: bool
+     ) -> np.ndarray:
+         """Calculate the residuals of the model.
+
+         Returns
+         -------
+         ndarray of shape (n_samples,)
+             The residuals of the model
+         """
+
+     @abstractmethod
+     def _calculate_critical_value(self, X: Optional[np.ndarray]) -> float:
+         """Calculate the critical value for outlier detection.
+
+         Returns
+         -------
+         float
+             The calculated critical value for outlier detection
+         """
+
+
+ class _ModelDiagnosticsBase(ABC):
+     """Base class for model diagnostics methods. This does not implement outlier detection
+     algorithms, but rather methods that are used to assess trained models.
+
+     Parameters
+     ----------
+     model : Union[ModelTypes, Pipeline]
+         A fitted PCA/PLS model or Pipeline ending with such a model
+
+     Attributes
+     ----------
+     model_ : ModelTypes
+         The fitted model of type _BasePCA or _PLS
+
+     preprocessing_ : Optional[Pipeline]
+         Preprocessing steps before the model
+
+     """
+
+     def __init__(self, model: Union[ModelTypes, Pipeline]):
+         self.model_, self.preprocessing_ = self._validate_and_extract_model(model)
+
+     def _validate_and_extract_model(self, model):
+         """Validate and extract the model and preprocessing steps.
+
+         Parameters
+         ----------
+         model : Union[ModelTypes, Pipeline]
+             A fitted PCA/PLS model or Pipeline ending with such a model
+
+         Returns
+         -------
+         Tuple[ModelTypes, Optional[Pipeline]]
+             The extracted model and preprocessing steps
+
+         Raises
+         ------
+         ValueError
+             If the model is not of type _BasePCA or _PLS, is not a Pipeline ending with one of these types, or is not fitted
+         """
+         if isinstance(model, Pipeline):
+             preprocessing = model[:-1]
+             model = model[-1]
+         else:
+             preprocessing = None
+
+         if isinstance(model, (_BasePCA, _PLS)):
+             check_is_fitted(model)
+         else:
+             raise ValueError(
+                 "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
+             )
+         check_is_fitted(model)
+         return model, preprocessing
+
+     @abstractmethod
+     def predict(self, X: np.ndarray, y: Optional[np.ndarray]) -> np.ndarray:
+         """Predict the output of the model.
+
+         Parameters
+         ----------
+         X : array-like of shape (n_samples, n_features)
+             Input data
+
+         y : array-like of shape (n_samples,), default=None
+             Target values
+
+         Returns
+         -------
+         ndarray of shape (n_samples,)
+             Predicted values
+         """
@@ -0,0 +1,91 @@
+ from typing import Optional, Tuple, Union
+
+ from sklearn.cross_decomposition._pls import _PLS
+ from sklearn.decomposition._base import _BasePCA
+ from sklearn.pipeline import Pipeline
+ from sklearn.utils.validation import check_is_fitted
+
+ ModelTypes = Union[_BasePCA, _PLS]
+
+
+ def get_model_parameters(model: ModelTypes) -> Tuple[int, int, int]:
+     """
+     Get the number of features, components and samples from a fitted _BasePCA or _PLS model.
+
+     Parameters
+     ----------
+     model : ModelType
+         A fitted model of type _BasePCA or _PLS
+
+     Returns
+     -------
+     Tuple[int, int, int]
+         The number of features, components and samples in the model
+     """
+     if isinstance(model, _BasePCA):
+         return model.n_features_in_, model.n_components_, model.n_samples_
+     elif isinstance(model, _PLS):
+         return model.n_features_in_, model.n_components, len(model.x_scores_)
+     else:
+         raise ValueError(
+             "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
+         )
+
+
+ def validate_confidence(confidence: float) -> float:
+     """Validate the confidence level using sklearn conventions.
+
+     Parameters
+     ----------
+     confidence : float
+         Confidence level for statistical calculations (between 0 and 1)
+
+     Returns
+     -------
+     float
+         The validated confidence level
+
+     Raises
+     ------
+     ValueError
+         If confidence is not between 0 and 1
+     """
+     if not 0 < confidence < 1:
+         raise ValueError("Confidence must be between 0 and 1")
+     return confidence
+
+
+ def validate_and_extract_model(
+     model: Union[ModelTypes, Pipeline],
+ ) -> Tuple[ModelTypes, Optional[Pipeline], int, int, int]:
+     """Validate and extract the model and preprocessing steps.
+
+     Parameters
+     ----------
+     model : Union[ModelTypes, Pipeline]
+         A fitted PCA/PLS model or Pipeline ending with such a model
+
+     Returns
+     -------
+     Tuple[ModelTypes, Optional[Pipeline], int, int, int]
+         The extracted model, preprocessing steps, and model dimensions
+
+     Raises
+     ------
+     ValueError
+         If the model is not of type _BasePCA or _PLS, is not a Pipeline ending with one of these types, or is not fitted
+     """
+     if isinstance(model, Pipeline):
+         preprocessing = model[:-1]
+         model = model[-1]
+     else:
+         preprocessing = None
+
+     if not isinstance(model, (_BasePCA, _PLS)):
+         raise ValueError(
+             "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
+         )
+
+     check_is_fitted(model)
+     n_features_in, n_components, n_samples = get_model_parameters(model)
+     return model, preprocessing, n_features_in, n_components, n_samples
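A hedged sketch of how the helpers above behave for a fitted scikit-learn Pipeline ending in PCA, assuming validate_and_extract_model from the new _utils module is in scope (the data and names are illustrative):

# Hedged sketch: what validate_and_extract_model returns for a Pipeline ending in PCA.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X = np.random.default_rng(0).normal(size=(20, 6))
pipe = make_pipeline(StandardScaler(), PCA(n_components=3)).fit(X)

# The pipeline is split into its preprocessing steps and the final PCA model,
# and the model dimensions are read off the fitted estimator.
model, preprocessing, n_features, n_components, n_samples = validate_and_extract_model(pipe)
print(n_features, n_components, n_samples)  # 6 3 20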
@@ -0,0 +1,146 @@
+ from typing import Optional, Union
+ import numpy as np
+
+ from sklearn.pipeline import Pipeline
+ from sklearn.utils.validation import validate_data, check_is_fitted
+ from scipy.stats import f as f_distribution
+
+
+ from ._base import _ModelResidualsBase, ModelTypes
+
+
+ class DModX(_ModelResidualsBase):
+     """Calculate Distance to Model (DModX) statistics.
+
+     DModX measures the distance between an observation and the model plane
+     in the X-space, useful for detecting outliers.
+
+     Parameters
+     ----------
+     model : Union[ModelType, Pipeline]
+         A fitted PCA/PLS model or Pipeline ending with such a model
+
+     confidence : float, default=0.95
+         Confidence level for statistical calculations (between 0 and 1)
+
+     Attributes
+     ----------
+     model_ : ModelType
+         The fitted model of type _BasePCA or _PLS
+
+     preprocessing_ : Optional[Pipeline]
+         Preprocessing steps before the model
+
+     n_features_in_ : int
+         Number of features in the input data
+
+     n_components_ : int
+         Number of components in the model
+
+     n_samples_ : int
+         Number of samples used to train the model
+
+     critical_value_ : float
+         The calculated critical value for outlier detection
+     """
+
+     def __init__(
+         self,
+         model: Union[ModelTypes, Pipeline],
+         confidence: float = 0.95,
+     ) -> None:
+         super().__init__(model, confidence)
+
+     def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "DModX":
+         """
+         Fit the model to the input data.
+
+         This step calculates the critical value for outlier detection. In the DModX method,
+         the critical value does not depend on the input data, only on the model parameters.
+         """
+         X = validate_data(
+             self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+         )
+
+         self.critical_value_ = self._calculate_critical_value()
+         return self
+
+     def predict(self, X: np.ndarray) -> np.ndarray:
+         """Identify outliers in the input data.
+
+         Parameters
+         ----------
+         X : array-like of shape (n_samples, n_features)
+             Input data
+
+         Returns
+         -------
+         ndarray of shape (n_samples,)
+             Array with -1 for outliers and 1 for inliers
+         """
+         # Check the estimator has been fitted
+         check_is_fitted(self, ["critical_value_"])
+
+         # Validate the input data
+         X = validate_data(
+             self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+         )
+
+         # Calculate outliers based on the DModX statistics
+         dmodx_values = self.predict_residuals(X, validate=False)
+         return np.where(dmodx_values > self.critical_value_, -1, 1)
+
+     def predict_residuals(
+         self, X: np.ndarray, y: Optional[np.ndarray] = None, validate: bool = True
+     ) -> np.ndarray:
+         """Calculate DModX statistics for input data.
+
+         Parameters
+         ----------
+         X : array-like of shape (n_samples, n_features)
+             Input data
+
+         validate : bool, default=True
+             Whether to validate the input data
+
+         Returns
+         -------
+         ndarray of shape (n_samples,)
+             DModX statistics for each sample
+         """
+         # Check the estimator has been fitted
+         check_is_fitted(self, ["critical_value_"])
+
+         # Validate the input data
+         if validate:
+             X = validate_data(
+                 self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
+             )
+
+         # Apply preprocessing if available
+         if self.preprocessing_:
+             X = self.preprocessing_.transform(X)
+
+         # Calculate the DModX statistics
+         X_transformed = self.model_.transform(X)
+         X_reconstructed = self.model_.inverse_transform(X_transformed)
+         squared_errors = np.sum((X - X_reconstructed) ** 2, axis=1)
+
+         return np.sqrt(squared_errors / (self.n_features_in_ - self.n_components_))
+
+     def _calculate_critical_value(self, X: Optional[np.ndarray] = None) -> float:
+         """Calculate F-distribution based critical value.
+
+         Returns
+         -------
+         float
+             The critical value for outlier detection
+         """
+
+         dof_numerator = self.n_features_in_ - self.n_components_
+         dof_denominator = self.n_features_in_ - self.n_components_ - 1
+
+         upper_control_limit = f_distribution.ppf(
+             self.confidence, dof_numerator, dof_denominator
+         )
+         return np.sqrt(upper_control_limit)
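A hedged end-to-end sketch of DModX-based outlier flagging with a fitted PCA pipeline, assuming the class is exported as chemotools.outliers.DModX (import path and data are illustrative, not taken from the diff):

import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from chemotools.outliers import DModX  # assumed import path

rng = np.random.default_rng(0)
X_train = rng.normal(size=(100, 10))

# Fit a preprocessing + PCA pipeline, then wrap it in the DModX detector.
pipe = make_pipeline(StandardScaler(), PCA(n_components=3)).fit(X_train)
dmodx = DModX(pipe, confidence=0.95).fit(X_train)

distances = dmodx.predict_residuals(X_train)  # DModX statistic per sample
labels = dmodx.predict(X_train)               # 1 = inlier, -1 = outlier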