msreport 0.0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
msreport/normalize.py ADDED
@@ -0,0 +1,496 @@
1
+ from __future__ import annotations
2
+ import abc
3
+ import itertools
4
+ from typing import Callable, Iterable, Optional
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ import statsmodels.nonparametric.smoothers_lowess
9
+
10
+ import msreport.helper
11
+ import msreport.helper.maxlfq as MAXLFQ
12
+ from msreport.errors import NotFittedError
13
+
14
+
15
class BaseSampleNormalizer(abc.ABC):
    """Base class for all sample normalizers."""

    @abc.abstractmethod
    def fit(self, table: pd.DataFrame) -> BaseSampleNormalizer:
        """Fits the normalizer on `table` and returns the instance itself."""
        ...

    @abc.abstractmethod
    def is_fitted(self) -> bool:
        """Returns True if the normalizer has been fitted."""
        ...

    @abc.abstractmethod
    def get_fits(self) -> dict:
        """Returns the fitted parameters; the mapping content depends on the subclass."""
        ...

    @abc.abstractmethod
    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Applies the fitted normalization to `table` and returns a new dataframe."""
        ...
33
+
34
+
35
class FixedValueNormalizer(BaseSampleNormalizer):
    """Normalization by a constant normalization factor for each sample.

    Expects log transformed intensity values.
    """

    def __init__(self, center_function: Callable, comparison: str):
        """Initializes the FixedValueNormalizer.

        Args:
            center_function: A function that accepts a sequence of values and
                returns a center value such as the median.
            comparison: Must be "paired" or "reference". When "paired" is specified
                the normalization values are first calculated for each column pair. Then
                an optimal normalization value for each column is calculated by solving
                a matrix of linear equations of the column pair values with least
                squares. When "reference" is selected, a pseudo-reference sample is
                generated by calculating the mean value for each row. Only rows with
                valid values in all columns are used. Normalization values are then
                calculated by comparing each column to the pseudo-reference sample.

        Raises:
            ValueError: If `comparison` is neither "paired" nor "reference".
        """
        if comparison not in ["paired", "reference"]:
            raise ValueError(
                f'"comparison" = {comparison} not allowed. '
                'Must be either "paired" or "reference".'
            )
        self._comparison_mode = comparison
        self._fit_function = center_function
        # Mapping of column name -> fitted center value; None until `fit` is called.
        self._sample_fits = None

    def fit(self, table: pd.DataFrame) -> BaseSampleNormalizer:
        """Fits the FixedValueNormalizer.

        Args:
            table: Dataframe used to calculate normalization values for each column.
                The normalization values are stored with the column names.

        Returns:
            Returns the instance itself.
        """
        if self._comparison_mode == "paired":
            self._fit_with_paired_samples(table)
        elif self._comparison_mode == "reference":
            self._fit_with_pseudo_reference(table)
        return self

    def is_fitted(self) -> bool:
        """Returns True if the FixedValueNormalizer has been fitted."""
        return self._sample_fits is not None

    def get_fits(self) -> dict[str, float]:
        """Returns a dictionary containing the fitted center values per sample.

        Raises:
            NotFittedError: If the FixedValueNormalizer has not been fitted yet.
        """
        confirm_is_fitted(self)
        return self._sample_fits.copy()

    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Applies a fixed value normalization to each column of the table.

        Args:
            table: The data to normalize. Each column name must correspond to a column
                name from the table that was used for the fitting.

        Returns:
            Transformed dataframe.

        Raises:
            NotFittedError: If the FixedValueNormalizer has not been fitted yet.
        """
        confirm_is_fitted(self)

        normalized = table.copy()
        for sample in normalized.columns:
            values = np.array(normalized[sample], dtype=float)
            # Subtract the fitted center only from finite entries; NaN/inf are kept.
            finite = np.isfinite(values)
            values[finite] -= self._sample_fits[sample]
            normalized[sample] = values
        return normalized

    def _fit_with_paired_samples(self, table: pd.DataFrame) -> None:
        """Fits the FixedValueNormalizer by doing pair-wise column comparisons.

        Normalization values are first calculated for each column pair. Then an optimal
        normalization value for each column is calculated by solving a matrix of linear
        equations of the column pair values with least squares. The individual
        normalization values are stored in a dictionary with the column names as keys.

        Args:
            table: Dataframe used to calculate normalization values for each column.
        """
        column_names = table.columns.tolist()
        values = table.to_numpy()

        log_ratio_matrix = MAXLFQ._calculate_pairwise_centered_log_ratio_matrix(
            values, self._fit_function, log_transformed=True
        )
        coefficients, ratios, _ = MAXLFQ.prepare_coefficient_matrix(log_ratio_matrix)
        optimal_shifts = MAXLFQ.log_profiles_by_lstsq(coefficients, ratios)
        self._sample_fits = dict(zip(column_names, optimal_shifts))

    def _fit_with_pseudo_reference(self, table: pd.DataFrame) -> None:
        """Fits the FixedValueNormalizer by comparing columns to a pseudo-reference.

        First, a pseudo-reference sample is generated by calculating the mean value for
        each row. Only rows with valid values in all columns are used. Normalization
        values are then calculated by comparing each column to the pseudo-reference
        sample. The individual normalization values are stored in a dictionary with the
        column names as keys.

        Args:
            table: Dataframe used to calculate normalization values for each column.
        """
        # Only rows without any missing value contribute to the pseudo-reference.
        complete_rows = table.isna().sum(axis=1) == 0
        pseudo_reference = table[complete_rows].mean(axis=1)

        self._sample_fits = {
            sample: self._fit_function(
                table.loc[complete_rows, sample] - pseudo_reference
            )
            for sample in table.columns.tolist()
        }
160
+
161
+
162
class ValueDependentNormalizer(BaseSampleNormalizer):
    """Normalization with a value dependent fit for each sample.

    Expects log transformed intensity values.
    """

    def __init__(self, fit_function: Callable):
        """Initializes the ValueDependentNormalizer.

        Args:
            fit_function: A function that accepts two sequences of values with equal
                length, with the first sequence being the observed samples values and
                the second the reference values. The function must return a numpy array
                with two columns. The first column contains the values and the second
                column the fitted deviations.
        """
        # Mapping of column name -> fit array; None until `fit` is called.
        self._sample_fits = None
        self._fit_function = fit_function

    def fit(self, table: pd.DataFrame) -> BaseSampleNormalizer:
        """Fits the ValueDependentNormalizer.

        Args:
            table: Dataframe used to calculate normalization arrays for each column.

        Returns:
            Returns the instance itself.
        """
        self._fit_with_pseudo_reference(table)
        return self

    def is_fitted(self) -> bool:
        """Returns True if the ValueDependentNormalizer has been fitted."""
        return self._sample_fits is not None

    def get_fits(self) -> dict[str, np.ndarray]:
        """Returns a dictionary containing fitting data per sample.

        Returns:
            A dictionary mapping sample names to fitting data. Fitting data is a
            sequence of [intensity, deviation at this intensity] pairs, as returned
            by the fit function.

        Raises:
            NotFittedError: If the ValueDependentNormalizer has not been fitted yet.
        """
        confirm_is_fitted(self)
        return self._sample_fits.copy()

    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Applies a value dependent normalization to each column of the table.

        Args:
            table: The data to normalize. Each column name must correspond to a column
                name from the table that was used for the fitting.

        Returns:
            Transformed dataframe.

        Raises:
            NotFittedError: If the ValueDependentNormalizer has not been fitted yet.
        """
        confirm_is_fitted(self)

        _table = table.copy()
        for column in _table.columns:
            column_data = np.array(_table[column], dtype=float)
            mask = np.isfinite(column_data)

            sample_fit = self._sample_fits[column]
            fit_values, fit_deviations = [np.array(i) for i in zip(*sample_fit)]
            # Interpolate the fitted deviation at each observed intensity and
            # subtract it; only finite entries are normalized.
            column_data[mask] = column_data[mask] - np.interp(
                column_data[mask], fit_values, fit_deviations
            )

            _table[column] = column_data
        return _table

    def _fit_with_pseudo_reference(self, table: pd.DataFrame) -> None:
        """Fits the ValueDependentNormalizer by comparing columns to a pseudo-reference.

        First, a pseudo-reference sample is generated by calculating the mean value for
        each row. Only rows with valid values in all columns are used. Normalization
        arrays are then calculated by comparing each column to the pseudo-reference
        sample. The individual normalization arrays are stored in a dictionary with the
        column names as keys.

        Args:
            table: Dataframe used to calculate normalization values for each column.
        """
        ref_mask = table.isna().sum(axis=1) == 0
        ref_values = table[ref_mask].mean(axis=1)
        samples = table.columns.tolist()

        self._sample_fits = {}
        for sample in samples:
            sample_values = table.loc[ref_mask, sample]
            sample_fit = self._fit_function(sample_values, ref_values)
            self._sample_fits[sample] = sample_fit
260
+
261
+
262
class MedianNormalizer(FixedValueNormalizer):
    """A FixedValueNormalizer that uses the median as the fitting function.

    Use MedianNormalizer.fit(table: pd.DataFrame) to fit the normalizer, and then
    MedianNormalizer.transform(table: pd.DataFrame) with the fitted normalizer to apply
    the normalization.
    """

    def __init__(self):
        """Initializes the MedianNormalizer."""
        # Zero-argument super() replaces the legacy two-argument form.
        super().__init__(center_function=np.median, comparison="paired")
275
+
276
+
277
class ModeNormalizer(FixedValueNormalizer):
    """A FixedValueNormalizer that uses the mode as the fitting function.

    Use ModeNormalizer.fit(table: pd.DataFrame) to fit the normalizer, and then
    ModeNormalizer.transform(table: pd.DataFrame) with the fitted normalizer to apply
    the normalization.
    """

    def __init__(self):
        """Initializes the ModeNormalizer."""
        # Zero-argument super() replaces the legacy two-argument form.
        super().__init__(center_function=msreport.helper.mode, comparison="paired")
290
+
291
+
292
class LowessNormalizer(ValueDependentNormalizer):
    """A ValueDependentNormalizer that uses lowess as the fitting function.

    Use LowessNormalizer.fit(table: pd.DataFrame) to fit the normalizer, and then
    LowessNormalizer.transform(table: pd.DataFrame) with the fitted normalizer to apply
    the normalization.
    """

    def __init__(self):
        """Initializes the LowessNormalizer."""
        # Zero-argument super() replaces the legacy two-argument form.
        super().__init__(fit_function=_value_dependent_fit_lowess)
303
+
304
+
305
class CategoricalNormalizer:
    """Normalize samples based on category-dependent reference values.

    Values from the reference table are used for normalization of the corresponding
    categories in the table that will be transformed. The normalization is applied to
    each column of the input table based on the category of each row.

    The reference table must not contain NaN values and values in the sample columns
    must be log-transformed. The table to be transformed must contain the same
    `category_column` as the reference table and only include sample columns that were
    used for fitting. Values from categories not present in the reference table will be
    set to NaN. The table sample columns must also be log-transformed.
    """

    def __init__(self, category_column: str):
        """Initializes a new instance of the CategoricalNormalizer class.

        Args:
            category_column: The name of the column containing the categories. This
                column must be present in the reference table and the table to be
                transformed.
        """
        # Reference table indexed by category; None until `fit` is called.
        self._fitted_table = None
        self._category_column = category_column

    def is_fitted(self) -> bool:
        """Returns True if the CategoricalNormalizer has been fitted."""
        return self._fitted_table is not None

    def fit(self, reference_table: pd.DataFrame) -> BaseSampleNormalizer:
        """Fits the CategoricalNormalizer to a reference table.

        Args:
            reference_table: The reference table used for fitting.

        Returns:
            Returns the instance itself.

        Raises:
            ValueError: If the reference table contains NaN values.
        """
        if reference_table.isna().values.any():
            raise ValueError("Input table contains NaN values")
        reference_table = reference_table.set_index(self.get_category_column())
        self._fitted_table = reference_table
        return self

    def get_fits(self) -> pd.DataFrame:
        """Returns a copy of the reference table used for fitting.

        Raises:
            NotFittedError: If the CategoricalNormalizer has not been fitted yet.
        """
        confirm_is_fitted(self)
        return self._fitted_table.copy()

    def get_category_column(self) -> str:
        """Returns the name of the category column."""
        return self._category_column

    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Applies a category dependent normalization to the table.

        Args:
            table: The table to normalize.

        Returns:
            The normalized table.

        Raises:
            KeyError: If the input table contains columns not present in the reference
                table.
            NotFittedError: If the CategoricalNormalizer has not been fitted yet.
        """
        confirm_is_fitted(self)

        original_index = table.index
        table = table.set_index(self.get_category_column(), drop=True, inplace=False)

        # Compare explicitly against the reference table's columns. (Passing the
        # DataFrame itself to `isin` only worked because iterating a DataFrame
        # yields its column labels.)
        if not table.columns.isin(self._fitted_table.columns).all():
            raise KeyError("The `table` contains columns not present in the fits")

        valid_categories = table.index.isin(self._fitted_table.index)
        sub_table = table[valid_categories]
        values_for_fitting = self._fitted_table.loc[sub_table.index, sub_table.columns]

        # Rows with unknown categories are set to NaN; known categories are
        # normalized by subtracting the category-specific reference values.
        transformed_table = table.copy()
        transformed_table[~valid_categories] = np.nan
        transformed_table[valid_categories] = sub_table.sub(values_for_fitting, axis=1)

        # Restore the category column and the caller's original row index.
        transformed_table.reset_index(inplace=True)
        transformed_table.index = original_index
        return transformed_table
398
+
399
+
400
class ZscoreScaler(BaseSampleNormalizer):
    """Normalize samples by z-score scaling."""

    def __init__(self, with_mean: bool = True, with_std: bool = True):
        """Initializes a new instance of the ZscoreScaler class.

        Args:
            with_mean: If True, center row values by subtracting the row mean.
            with_std: If True, scale row values by dividing by the row std.
        """
        self._with_mean = with_mean
        self._with_std = with_std

    def fit(self, table: pd.DataFrame) -> BaseSampleNormalizer:
        """Returns the instance itself."""
        return self

    def is_fitted(self) -> bool:
        """Always returns True because the ZscoreScaler does not need to be fitted."""
        return True

    def get_fits(self) -> dict:
        """Returns a dictionary containing the parameters 'with_mean' and 'with_std'."""
        return {"with_mean": self._with_mean, "with_std": self._with_std}

    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Applies a z-score normalization to each column of the table.

        Args:
            table: The table used to scale row values.

        Returns:
            A copy of the table containing the scaled values.
        """
        result = table.copy()
        if self._with_mean:
            row_means = result.mean(axis=1)
            result = result.sub(row_means, axis=0)
        if self._with_std:
            # ddof=0 uses the population standard deviation.
            row_stds = result.std(axis=1, ddof=0)
            result = result.div(row_stds, axis=0)
        return result
440
+
441
+
442
def confirm_is_fitted(
    normalizer: BaseSampleNormalizer, msg: Optional[str] = None
) -> None:
    """Perform is_fitted validation for normalizer instances.

    Checks if the normalizer is fitted by verifying the presence of fitted attributes
    and otherwise raises a NotFittedError with the given message.

    Args:
        normalizer: The normalizer instance to validate. Must provide an
            ``is_fitted`` method.
        msg : str, default=None
            The default error message is, "This %(name) instance is not fitted
            yet. Call 'fit' with appropriate arguments before using this
            normalizer."

    Raises:
        TypeError: If `normalizer` does not provide an ``is_fitted`` method.
        NotFittedError: If the normalizer reports that it is not fitted.
    """
    if msg is None:
        msg = (
            "This %(name)s instance is not fitted yet. Call 'fit' with "
            "appropriate arguments before using this normalizer."
        )

    if not hasattr(normalizer, "is_fitted"):
        # Fixed grammar in the error message ("an normalizer" -> "a normalizer").
        raise TypeError(f"{normalizer} is not a normalizer instance.")

    if not normalizer.is_fitted():
        raise NotFittedError(msg % {"name": type(normalizer).__name__})
469
+
470
+
471
def _value_dependent_fit_lowess(
    values: np.ndarray,
    reference_values: np.ndarray,
    delta_span_percentage: float = 0.05,
    iterations: int = 5,
) -> np.ndarray:
    """Calculates estimated deviations between values and reference_values using lowess.

    The deviations ('values' - 'reference_values') are smoothed as a function of
    'values', i.e. the deviations are the y-values and 'values' the x-values of the
    lowess regression.

    Args:
        values: The x-values of the observed points.
        reference_values: Used to calculate the y-values of the observed points, as
            'values' - 'reference_values'.
        delta_span_percentage: Distance within which to use linear-interpolation
            instead of weighted regression, as a percentage of the data span.
        iterations: The number of residual-based reweightings to perform

    Returns:
        A numpy array with two columns. The first column contains the sorted 'values'
        and the second column the associated estimated deviation values from the
        reference.
    """
    # NOTE(review): 'delta' is derived from the span of 'reference_values', while the
    # x-values passed to lowess are 'values' — presumably both spans are comparable
    # for normalization data; confirm this is intentional.
    delta = (reference_values.max() - reference_values.min()) * delta_span_percentage
    deviations = values - reference_values
    # statsmodels lowess signature is lowess(endog, exog, ...): deviations are the
    # dependent (y) variable, values the independent (x) variable.
    return statsmodels.nonparametric.smoothers_lowess.lowess(
        deviations, values, delta=delta, it=iterations
    )