msreport 0.0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,99 @@
1
def extract_modifications(
    peptide: str,
    tag_open: str,
    tag_close: str,
) -> list[tuple[int, str]]:
    """Collects the modification tags of a peptide together with their positions.

    Tags may be nested; only the outermost tag is reported and its content keeps
    any inner tag symbols. A position counts the sequence characters that precede
    the tag once all earlier tags (including their open and close symbols) have
    been removed.

    Args:
        peptide: Peptide sequence containing modifications.
        tag_open: Single character that marks the beginning of a modification tag,
            e.g. "[".
        tag_close: Single character that marks the end of a modification tag,
            e.g. "]".

    Returns:
        A sorted list of (position, modification) tuples, where the modification
        string excludes the enclosing tag_open and tag_close symbols.
    """
    found = []
    depth = 0
    # Total length of all completed tags so far, brackets included.
    removed_length = 0
    for index, symbol in enumerate(peptide):
        if symbol == tag_open:
            depth += 1
            if depth == 1:
                opened_at = index
        elif symbol == tag_close:
            depth -= 1
            if depth == 0:
                content = peptide[opened_at + 1 : index]
                found.append((opened_at - removed_length, content))
                removed_length += index - opened_at + 1
    found.sort()
    return found
37
+
38
+
39
def modify_peptide(
    sequence: str,
    modifications: list[tuple[int, str]],
    tag_open: str = "[",
    tag_close: str = "]",
) -> str:
    """Inserts modification tags into a plain peptide sequence.

    Args:
        sequence: Peptide sequence without modifications.
        modifications: List of (position, modification) tuples. The tag of a
            modification is inserted after the first 'position' characters of
            the sequence.
        tag_open: String written in front of each modification. Default "[".
        tag_close: String written after each modification. Default "]".

    Returns:
        Modified sequence. For example "PEPT[phospho]IDE", for sequence = "PEPTIDE" and
        modifications = [(4, "phospho")]
    """
    pieces = []
    cursor = 0
    for position, name in sorted(modifications):
        pieces.append(sequence[cursor:position])
        pieces.append("".join((tag_open, name, tag_close)))
        cursor = position
    # Append whatever remains after the last modification.
    pieces.append(sequence[cursor:])
    return "".join(pieces)
59
+
60
+
61
def extract_window_around_position(
    protein_sequence: str,
    position: int,
    extension: int = 5,
    gap_filler: str = "-",
) -> str:
    """Extracts a window around the specified position in the protein sequence.

    Args:
        protein_sequence: The input protein sequence string.
        position: The position in the protein sequence to extract the window around.
            Position is one-indexed, which means that the first amino acid is at
            position 1. The position is not validated against the sequence length.
        extension: Number of characters included on each side of the position.
            Default is 5.
        gap_filler: Character used for padding when the window extends beyond the
            beginning or the end of the sequence. Default is "-".

    Returns:
        A string containing the window +/- 'extension' characters around the
        specified position. If the position is too close to the beginning or the
        end of the 'protein_sequence', the window is padded with 'gap_filler' to
        ensure there are 'extension' characters before and after the position.

    Example:
        >>> protein_sequence = "ABCDEFGHIJKLM"
        >>> extract_window_around_position(protein_sequence, 7)
        'BCDEFGHIJKL'
        >>> extract_window_around_position(protein_sequence, 1)
        '-----ABCDEF'
        >>> extract_window_around_position(protein_sequence, 13)
        'HIJKLM-----'
        >>> extract_window_around_position(protein_sequence, 7, extension=2)
        'EFGHI'
    """
    # Convert the one-indexed position into a zero-based string index.
    index = position - 1
    sequence_length = len(protein_sequence)

    residues_before = index
    residues_after = sequence_length - (index + 1)
    left_pad = max(extension - residues_before, 0)
    right_pad = max(extension - residues_after, 0)

    window_start = max(index - extension, 0)
    window_end = min(index + extension, sequence_length)
    window = protein_sequence[window_start : window_end + 1]
    return "".join([gap_filler * left_pad, window, gap_filler * right_pad])
msreport/impute.py ADDED
@@ -0,0 +1,275 @@
1
+ from __future__ import annotations
2
+ from typing import Optional
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
+ from msreport.errors import NotFittedError
8
+
9
+
10
class FixedValueImputer:
    """Imputer for completing missing values with a fixed value.

    Replace missing values using a constant value or with an integer that is smaller
    than the minimum value of each column or smaller than the minimum value of the whole
    array.
    """

    def __init__(
        self,
        strategy: str,
        fill_value: Optional[float] = None,
        column_wise: bool = True,
    ):
        """Initializes the FixedValueImputer.

        Args:
            strategy: The imputation strategy.
                - If "constant", replace missing values with 'fill_value'.
                - If "below", replace missing values with an integer that is smaller
                  than the minimal value of the fitted dataframe. Minimal values are
                  calculated per column if 'column_wise' is True, otherwise the minimal
                  value is calculated for all columns.
                The strategy is validated when 'fit' is called.
            fill_value: When strategy is "constant", 'fill_value' is used to replace all
                occurrences of missing values. The value itself is not validated.
            column_wise: If True, imputation is performed independently for each column,
                otherwise the whole dataframe is imputed together. Default True.
        """
        self.strategy = strategy
        self.fill_value = fill_value
        self.column_wise = column_wise
        # Maps each fitted column name to the value used for filling that column.
        self._sample_fill_values: dict[str, float] = {}

    def fit(self, table: pd.DataFrame) -> FixedValueImputer:
        """Fits the FixedValueImputer.

        Args:
            table: Input Dataframe for generating fill values for each column.

        Returns:
            Returns the fitted FixedValueImputer instance.

        Raises:
            ValueError: If 'strategy' is neither "constant" nor "below".
        """
        if self.strategy == "constant":
            fill_values = {column: self.fill_value for column in table.columns}
        elif self.strategy == "below":
            if self.column_wise:
                fill_values = {
                    column: _calculate_integer_below_min(table[column])
                    for column in table.columns
                }
            else:
                int_below_min = _calculate_integer_below_min(table)
                fill_values = {column: int_below_min for column in table.columns}
        else:
            # Without this branch an unknown strategy surfaced as an
            # UnboundLocalError on 'fill_values'.
            raise ValueError(
                f"Unknown imputation strategy {self.strategy!r}; "
                "expected 'constant' or 'below'."
            )
        self._sample_fill_values = fill_values
        return self

    def is_fitted(self) -> bool:
        """Returns True if the FixedValueImputer has been fitted."""
        return len(self._sample_fill_values) != 0

    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Impute all missing values in 'table'.

        All non-finite entries (NaN as well as positive or negative infinity) are
        treated as missing and replaced by the fill value of their column.

        Args:
            table: A dataframe of numeric values that will be completed. Each column
                name must correspond to a column name from the table that was used for
                the fitting.

        Returns:
            A copy of 'table' with imputed missing values.

        Raises:
            KeyError: If 'table' contains a column that was not present during
                fitting.
        """
        confirm_is_fitted(self)

        _table = table.copy()
        for column in _table.columns:
            column_data = np.array(_table[column], dtype=float)
            mask = ~np.isfinite(column_data)
            column_data[mask] = self._sample_fill_values[column]
            _table[column] = column_data
        return _table
92
+
93
+
94
class GaussianImputer:
    """Imputer for completing missing values by drawing from a gaussian distribution."""

    def __init__(self, mu: float, sigma: float, seed: Optional[int] = None):
        """Initializes the GaussianImputer.

        Args:
            mu: Mean of the gaussian distribution.
            sigma: Standard deviation of the gaussian distribution, must be positive.
            seed: Optional, allows specifying a number for initializing the random
                number generator. Using the same seed for the same input table will
                generate the same set of imputed values each time. Default is None,
                which results in different imputed values being generated each time.
        """
        self.mu = mu
        self.sigma = sigma
        self.seed = seed

    def fit(self, table: pd.DataFrame) -> GaussianImputer:
        """Fits the GaussianImputer, although this is not necessary.

        Args:
            table: Input Dataframe for fitting; it is not inspected.

        Returns:
            Returns the GaussianImputer instance.
        """
        return self

    def is_fitted(self) -> bool:
        """Returns always True, as the GaussianImputer does not need to be fitted."""
        return True

    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Impute all missing values in 'table'.

        All non-finite entries (NaN as well as positive or negative infinity) are
        treated as missing and replaced by values drawn from a normal distribution
        with mean 'mu' and standard deviation 'sigma'.

        Args:
            table: A dataframe of numeric values that will be completed.

        Returns:
            A copy of 'table' with imputed missing values.
        """
        confirm_is_fitted(self)
        # A private legacy generator yields the same stream as
        # np.random.seed(seed) + np.random.normal, without reseeding the
        # process-wide NumPy random state as a side effect.
        random_state = np.random.RandomState(self.seed)

        _table = table.copy()
        for column in _table.columns:
            column_data = np.array(_table[column], dtype=float)
            mask = ~np.isfinite(column_data)
            column_data[mask] = random_state.normal(
                loc=self.mu, scale=self.sigma, size=mask.sum()
            )
            _table[column] = column_data
        return _table
150
+
151
+
152
class PerseusImputer:
    """Imputer for completing missing values as implemented in Perseus.

    Perseus-style imputation replaces missing values by random numbers drawn from a
    normal distribution. Sigma and mu of this distribution are calculated from the
    standard deviation and median of the observed values.
    """

    def __init__(
        self,
        median_downshift: float = 1.8,
        std_width: float = 0.3,
        column_wise: bool = True,
        seed: Optional[int] = None,
    ):
        """Initializes the PerseusImputer.

        Args:
            median_downshift: Times of standard deviations the observed median is
                downshifted for calculating mu of the normal distribution. Default
                is 1.8
            std_width: Factor for adjusting the standard deviation of the observed
                values to obtain sigma of the normal distribution. Default is 0.3
            column_wise: If True, imputation is performed independently for each column,
                otherwise the whole dataframe is imputed together. Default True.
            seed: Optional, allows specifying a number for initializing the random
                number generator. Using the same seed for the same input table will
                generate the same set of imputed values each time. Default is None,
                which results in different imputed values being generated each time.
        """
        self.median_downshift = median_downshift
        self.std_width = std_width
        self.column_wise = column_wise
        self.seed = seed
        # Maps each fitted column name to its {"mu": ..., "sigma": ...} parameters.
        self._column_params: dict[str, dict] = {}

    def fit(self, table: pd.DataFrame) -> PerseusImputer:
        """Fits the PerseusImputer.

        Parameters from a previous call to 'fit' are discarded, so that only the
        columns of the most recently fitted table are known to the imputer.

        Args:
            table: Input Dataframe for calculating mu and sigma of the gaussian
                distribution.

        Returns:
            Returns the fitted PerseusImputer instance.
        """
        column_params = {}
        # Whole-table statistics are identical for every column; compute them at
        # most once instead of once per column.
        shared_stats = None
        for column in table.columns:
            if self.column_wise:
                median = np.nanmedian(table[column])
                std = np.nanstd(table[column])
            else:
                if shared_stats is None:
                    all_values = table.to_numpy()
                    shared_stats = (np.nanmedian(all_values), np.nanstd(all_values))
                median, std = shared_stats

            mu = median - (std * self.median_downshift)
            sigma = std * self.std_width
            column_params[column] = {"mu": mu, "sigma": sigma}
        self._column_params = column_params
        return self

    def is_fitted(self) -> bool:
        """Returns True if the PerseusImputer has been fitted."""
        return len(self._column_params) != 0

    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Impute all missing values in 'table'.

        All non-finite entries (NaN as well as positive or negative infinity) are
        treated as missing and replaced by values drawn from the normal distribution
        fitted for their column.

        Args:
            table: A dataframe of numeric values that will be completed. Each column
                name must correspond to a column name from the table that was used for
                the fitting.

        Returns:
            A copy of 'table' with imputed missing values.

        Raises:
            KeyError: If 'table' contains a column that was not present during
                fitting.
        """
        confirm_is_fitted(self)
        # A private legacy generator yields the same stream as
        # np.random.seed(seed) + np.random.normal, without reseeding the
        # process-wide NumPy random state as a side effect.
        random_state = np.random.RandomState(self.seed)

        _table = table.copy()
        for column in _table.columns:
            column_data = np.array(_table[column], dtype=float)
            mask = ~np.isfinite(column_data)
            params = self._column_params[column]
            column_data[mask] = random_state.normal(
                loc=params["mu"],
                scale=params["sigma"],
                size=mask.sum(),
            )
            _table[column] = column_data
        return _table
241
+
242
+
243
def confirm_is_fitted(imputer: object, msg: Optional[str] = None) -> None:
    """Perform is_fitted validation for imputer instances.

    Checks if the imputer is fitted by calling its 'is_fitted' method and otherwise
    raises a NotFittedError with the given message.

    Args:
        imputer: Instance to validate; it must provide an 'is_fitted' method.
        msg: Optional error message template. The placeholder "%(name)s" is
            replaced with the class name of 'imputer'. The default message is
            "This %(name)s instance is not fitted yet. Call 'fit' with appropriate
            arguments before using this imputer."

    Raises:
        TypeError: If 'imputer' has no 'is_fitted' attribute.
        NotFittedError: If 'imputer.is_fitted()' returns a falsy value.
    """
    if msg is None:
        msg = (
            "This %(name)s instance is not fitted yet. Call 'fit' with "
            "appropriate arguments before using this imputer."
        )

    is_fitted = getattr(imputer, "is_fitted", None)
    if is_fitted is None:
        raise TypeError(f"{imputer} is not an imputer instance.")

    if not is_fitted():
        raise NotFittedError(msg % {"name": type(imputer).__name__})
268
+
269
+
270
+ def _calculate_integer_below_min(table) -> int:
271
+ minimal_value = np.nanmin(table.to_numpy().flatten())
272
+ below_minimal = np.floor(minimal_value)
273
+ if minimal_value <= below_minimal:
274
+ below_minimal = below_minimal - 1
275
+ return int(below_minimal)
msreport/isobar.py ADDED
@@ -0,0 +1,161 @@
1
+ from __future__ import annotations
2
+ import functools
3
+ from typing import Protocol
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ import scipy
8
+
9
+ import msreport.helper
10
+ from msreport.errors import NotFittedError
11
+
12
+
13
class Transformer(Protocol):
    """Structural interface of objects that can be fitted to and transform a table."""

    def fit(self, table: pd.DataFrame) -> Transformer:
        """Fits the Transformer to 'table' and returns the fitted instance."""
        ...

    def is_fitted(self) -> bool:
        """Returns True once the Transformer has been fitted."""
        ...

    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Transforms the values in 'table'."""
        ...
22
+
23
+
24
class IsotopeImpurityCorrecter:
    """Corrects isotope impurity interference in isobaric reporter expression values."""

    def __init__(self):
        # None until 'fit' has stored an impurity matrix.
        self._impurity_matrix = None

    def fit(self, impurity_matrix: np.ndarray) -> IsotopeImpurityCorrecter:
        """Fits the isotope impurity correcter to a given impurity matrix.

        Args:
            impurity_matrix: A reporter isotope impurity matrix in a diagonal format,
                where columns describe the isotope impurity of a specific channel, and
                the values in each row indicate the percentage of signal from the
                reporter that is present in each channel. Both dimensions of the
                impurity matrix must have the same length.

        Returns:
            Returns the fitted class IsotopeImpurityCorrecter instance.

        Raises:
            ValueError: If the impurity matrix is not a square two-dimensional
                array or if it contains NaN values.
        """
        # Store a private copy so that later changes to the caller's array cannot
        # silently alter the fitted state.
        matrix = np.array(impurity_matrix)
        if matrix.ndim != 2 or matrix.shape[0] != matrix.shape[1]:
            raise ValueError("The impurity matrix must be square.")
        if np.isnan(matrix).any():
            raise ValueError("The impurity matrix contains NaN values.")
        self._impurity_matrix = matrix
        return self

    def is_fitted(self) -> bool:
        """Returns True if the IsotopeImpurityCorrecter has been fitted."""
        return self._impurity_matrix is not None

    def get_fits(self) -> np.ndarray:
        """Returns a copy of the fitted impurity matrix.

        Returns:
            A numpy array representing a diagonal impurity matrix.

        Raises:
            NotFittedError: If the IsotopeImpurityCorrecter has not been fitted.
        """
        if not self.is_fitted():
            raise NotFittedError("The IsotopeImpurityCorrecter has not been fitted.")
        return self._impurity_matrix.copy()

    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Applies isotope impurity correction to the values of the table.

        Args:
            table: The data to correct. The columns of the table must correspond to
                the channels of the impurity matrix used for fitting.

        Returns:
            A copy of the table with isotope impurity corrected values.

        Raises:
            NotFittedError: If the IsotopeImpurityCorrecter has not been fitted.
            ValueError: If the number of table columns differs from the number of
                channels in the impurity matrix.
        """
        if not self.is_fitted():
            raise NotFittedError("The IsotopeImpurityCorrecter has not been fitted.")
        if table.shape[1] != self._impurity_matrix.shape[1]:
            raise ValueError(
                "The number of columns in the table does not match the number "
                "of channels in the impurity matrix."
            )

        # Request an explicit copy: 'to_numpy()' may return a view on the data of
        # 'table', and the correction routine writes into the array it receives.
        corrected_values = correct_isobaric_reporter_impurities(
            intensity_table=table.to_numpy(copy=True),
            diagonal_impurity_matrix=self._impurity_matrix,
        )
        corrected_table = table.copy()
        corrected_table[:] = corrected_values
        return corrected_table
89
+
90
+
91
def correct_isobaric_reporter_impurities(
    intensity_table: np.ndarray,
    diagonal_impurity_matrix: np.ndarray,
) -> np.ndarray:
    """Performs isotope impurity correction on isobaric reporter expression values.

    The input array is left unmodified. NaN intensities are treated as zero before
    the correction, and corrected values that are not positive are set to zero.
    If the input is detected to be in log space, it is converted to linear space
    for the correction and the result is log2 transformed again, so zero
    intensities end up as negative infinity in that case.

    Args:
        intensity_table: A two-dimensional array with columns corresponding to isobaric
            reporter channels and rows to measured units such as PSMs, peptides or
            proteins.
        diagonal_impurity_matrix: A reporter isotope impurity matrix in a diagonal
            format, where columns describe the isotope impurity of a specific channel,
            and the values in each row indicate the percentage of signal from the
            reporter that is present in each channel.

    Returns:
        A new array with the impurity corrected intensities, in the same space
        (linear or log2) as the input.
    """
    apply_impurity_correction = functools.partial(
        _correct_impurity_contamination,
        impurity_matrix=diagonal_impurity_matrix,
    )

    data_was_in_logspace = msreport.helper.intensities_in_logspace(intensity_table)

    if data_was_in_logspace:
        intensities = np.power(2, intensity_table)
    else:
        # Work on a copy; writing the NaN replacement into the argument itself
        # would modify the caller's data.
        intensities = np.array(intensity_table, dtype=float)
    intensities[np.isnan(intensities)] = 0
    corrected_table = np.apply_along_axis(apply_impurity_correction, 1, intensities)
    corrected_table[corrected_table <= 0] = 0
    if data_was_in_logspace:
        corrected_table = np.log2(corrected_table)

    return corrected_table
122
+
123
+
124
+ def _apply_impurity_contamination(
125
+ intensities: np.array, impurity_matrix: np.array
126
+ ) -> np.array:
127
+ """Applies reporter isotope impurity interference to an intensity array.
128
+
129
+ Args:
130
+ intensities: An array containing non-contaminated isobaric reporter intensities.
131
+ impurity_matrix: A reporter isotope impurity matrix in a diagonal format, where
132
+ columns describe the isotope impurity of a specific channel, and the values
133
+ in each row indicate the percentage of signal from the reporter that is
134
+ present in each channel. Both dimensions of the impurity matrix must have
135
+ the same length as the intensity array.
136
+
137
+ Returns:
138
+ An array containing contaminated intensities.
139
+ """
140
+ return np.sum(impurity_matrix * intensities, axis=1)
141
+
142
+
143
+ def _correct_impurity_contamination(
144
+ intensities: np.array, impurity_matrix: np.array
145
+ ) -> np.array:
146
+ """Applies reporter isotope impurity interference correction to an intensity array.
147
+
148
+ Args:
149
+ intensities: An array containing isobaric reporter intensities affected by
150
+ isotope impurity interference.
151
+ impurity_matrix: A reporter isotope impurity matrix in a diagonal format, where
152
+ columns describe the isotope impurity of a specific channel, and the values
153
+ in each row indicate the percentage of signal from the reporter that is
154
+ present in each channel. Both dimensions of the impurity matrix must have
155
+ the same length as the intensity array.
156
+
157
+ Returns:
158
+ An array containing impurity corrected intensities.
159
+ """
160
+ corrected_intensities, _ = scipy.optimize.nnls(impurity_matrix, intensities)
161
+ return corrected_intensities