msreport 0.0.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- msreport/__init__.py +13 -0
- msreport/aggregate/__init__.py +0 -0
- msreport/aggregate/condense.py +163 -0
- msreport/aggregate/pivot.py +132 -0
- msreport/aggregate/summarize.py +281 -0
- msreport/analyze.py +586 -0
- msreport/errors.py +10 -0
- msreport/export.py +526 -0
- msreport/fasta.py +28 -0
- msreport/helper/__init__.py +23 -0
- msreport/helper/calc.py +120 -0
- msreport/helper/maxlfq.py +339 -0
- msreport/helper/table.py +267 -0
- msreport/helper/temp.py +99 -0
- msreport/impute.py +275 -0
- msreport/isobar.py +161 -0
- msreport/normalize.py +496 -0
- msreport/peptidoform.py +283 -0
- msreport/plot.py +1129 -0
- msreport/qtable.py +537 -0
- msreport/reader.py +2357 -0
- msreport/rinterface/__init__.py +3 -0
- msreport/rinterface/limma.py +126 -0
- msreport/rinterface/rinstaller.py +35 -0
- msreport/rinterface/rscripts/limma.R +104 -0
- msreport-0.0.24.dist-info/METADATA +128 -0
- msreport-0.0.24.dist-info/RECORD +30 -0
- msreport-0.0.24.dist-info/WHEEL +5 -0
- msreport-0.0.24.dist-info/licenses/LICENSE.txt +202 -0
- msreport-0.0.24.dist-info/top_level.txt +1 -0
msreport/helper/temp.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
def extract_modifications(
    peptide: str,
    tag_open: str,
    tag_close: str,
) -> list[tuple[int, str]]:
    """Collects the modification tags of a peptide together with their positions.

    Tags may be nested; only the outermost tag is reported and its content contains
    any inner tags verbatim. The reported position of a tag is its index in the
    peptide string after all preceding tags have been removed.

    Args:
        peptide: Peptide sequence containing modifications
        tag_open: Symbol that indicates the beginning of a modification tag, e.g. "[".
        tag_close: Symbol that indicates the end of a modification tag, e.g. "]".

    Returns:
        A sorted list of (position, modification) tuples. The modification string
        does not include the enclosing 'tag_open' and 'tag_close' symbols.
    """
    depth = 0  # current nesting level of open tags
    outer_start = 0  # index of the tag_open symbol of the current outermost tag
    removed_length = 0  # total length of all tags that were already completed
    found = []
    for index, symbol in enumerate(peptide):
        if symbol == tag_open:
            depth += 1
            if depth == 1:
                outer_start = index
        elif symbol == tag_close:
            depth -= 1
            if depth == 0:
                content = peptide[outer_start + 1 : index]
                found.append((outer_start - removed_length, content))
                removed_length += index - outer_start + 1
    found.sort()
    return found
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def modify_peptide(
    sequence: str,
    modifications: list[tuple[int, str]],
    tag_open: str = "[",
    tag_close: str = "]",
) -> str:
    """Inserts modification tags into a plain peptide sequence.

    Args:
        sequence: Peptide sequence without modifications.
        modifications: List of (position, modification) tuples. Each modification is
            inserted after the first 'position' characters of 'sequence'.
        tag_open: Symbol written before each modification string. Default "[".
        tag_close: Symbol written after each modification string. Default "]".

    Returns:
        Modified sequence. For example "PEPT[phospho]IDE", for sequence = "PEPTIDE" and
        modifications = [(4, "phospho")]
    """
    pieces = []
    cursor = 0  # index in 'sequence' up to which characters were already emitted
    for position, name in sorted(modifications):
        pieces.append(sequence[cursor:position])
        pieces.append(tag_open + name + tag_close)
        cursor = position
    pieces.append(sequence[cursor:])
    return "".join(pieces)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def extract_window_around_position(
    protein_sequence: str, position: int, extension: int = 5
) -> str:
    """Extracts a window around the specified position in the protein sequence.

    Args:
        protein_sequence: The input protein sequence string.
        position: The position in the protein sequence to extract the window around.
            Position is one-indexed, which means that the first amino acid is at
            position 1. The position is not validated; for positions outside of the
            sequence the returned string may not have the expected length.
        extension: Number of characters included on each side of the position.
            Default is 5.

    Returns:
        A string containing the window +/- 'extension' characters around the
        specified position. If the position is too close to the beginning or the end
        of the 'protein_sequence', the window is padded with '-' to ensure there are
        'extension' characters before and after the position.

    Example:
        >>> protein_sequence = "ABCDEFGHIJKLM"
        >>> extract_window_around_position(protein_sequence, 7)
        'BCDEFGHIJKL'
        >>> extract_window_around_position(protein_sequence, 1)
        '-----ABCDEF'
        >>> extract_window_around_position(protein_sequence, 13)
        'HIJKLM-----'
        >>> extract_window_around_position(protein_sequence, 7, extension=2)
        'EFGHI'
    """
    gap_filler = "-"
    index = position - 1  # convert the one-indexed position to a string index

    residues_before = index
    residues_after = len(protein_sequence) - position
    # Pad only by the amount that the sequence falls short of the extension.
    left_pad = max(extension - residues_before, 0)
    right_pad = max(extension - residues_after, 0)

    window_start = max(index - extension, 0)
    window_end = min(index + extension, len(protein_sequence))
    window = protein_sequence[window_start : window_end + 1]
    return "".join([gap_filler * left_pad, window, gap_filler * right_pad])
|
msreport/impute.py
ADDED
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from msreport.errors import NotFittedError
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class FixedValueImputer:
    """Imputer for completing missing values with a fixed value.

    Replace missing values using a constant value or with an integer that is smaller
    than the minimum value of each column or smaller than the minimum value of the whole
    array.
    """

    def __init__(
        self,
        strategy: str,
        fill_value: Optional[float] = None,
        column_wise: bool = True,
    ):
        """Initializes the FixedValueImputer.

        Args:
            strategy: The imputation strategy.
                - If "constant", replace missing values with 'fill_value'.
                - If "below", replace missing values with an integer that is smaller
                  than the minimal value of the fitted dataframe. Minimal values are
                  calculated per column if 'column_wise' is True, otherwise the minimal
                  value is calculated for all columns.
                The strategy is validated when calling 'fit'.
            fill_value: When strategy is "constant", 'fill_value' is used to replace all
                occurrences of missing values. The value itself is not validated.
            column_wise: If True, imputation is performed independently for each column,
                otherwise the whole dataframe is imputed together. Default True.
        """
        self.strategy = strategy
        self.fill_value = fill_value
        self.column_wise = column_wise
        # Maps each fitted column name to the value used for filling this column.
        self._sample_fill_values: dict[str, float] = {}

    def fit(self, table: pd.DataFrame) -> FixedValueImputer:
        """Fits the FixedValueImputer.

        Args:
            table: Input Dataframe for generating fill values for each column.

        Returns:
            Returns the fitted FixedValueImputer instance.

        Raises:
            ValueError: If 'strategy' is neither "constant" nor "below".
        """
        if self.strategy == "constant":
            # NOTE(review): 'fill_value' may still be None here; presumably missing
            # values then stay missing after 'transform' - confirm intended behavior.
            fill_values = {column: self.fill_value for column in table.columns}
        elif self.strategy == "below":
            if self.column_wise:
                fill_values = {
                    column: _calculate_integer_below_min(table[column])
                    for column in table.columns
                }
            else:
                int_below_min = _calculate_integer_below_min(table)
                fill_values = {column: int_below_min for column in table.columns}
        else:
            # Without this branch 'fill_values' would be unbound and the assignment
            # below would fail with an unrelated UnboundLocalError.
            raise ValueError(
                f"Unknown imputation strategy {self.strategy!r}, "
                "expected 'constant' or 'below'."
            )
        self._sample_fill_values = fill_values
        return self

    def is_fitted(self) -> bool:
        """Returns True if the FixedValueImputer has been fitted."""
        return len(self._sample_fill_values) != 0

    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Impute all missing values in 'table'.

        All non-finite entries (NaN and +/- infinity) are treated as missing.

        Args:
            table: A dataframe of numeric values that will be completed. Each column
                name must correspond to a column name from the table that was used for
                the fitting.

        Returns:
            A copy of 'table' with imputed missing values.
        """
        confirm_is_fitted(self)

        _table = table.copy()
        for column in _table.columns:
            column_data = np.array(_table[column], dtype=float)
            mask = ~np.isfinite(column_data)
            column_data[mask] = self._sample_fill_values[column]
            _table[column] = column_data
        return _table
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class GaussianImputer:
    """Imputer for completing missing values by drawing from a gaussian distribution."""

    def __init__(self, mu: float, sigma: float, seed: Optional[int] = None):
        """Initializes the GaussianImputer.

        Args:
            mu: Mean of the gaussian distribution.
            sigma: Standard deviation of the gaussian distribution, must be positive.
            seed: Optional, allows specifying a number for initializing the random
                number generator. Using the same seed for the same input table will
                generate the same set of imputed values each time. Default is None,
                which results in different imputed values being generated each time.
        """
        self.mu = mu
        self.sigma = sigma
        self.seed = seed

    def fit(self, table: pd.DataFrame) -> GaussianImputer:
        """Fits the GaussianImputer, although this is not necessary.

        Args:
            table: Input Dataframe for fitting. The table is not used.

        Returns:
            Returns the fitted GaussianImputer instance.
        """
        return self

    def is_fitted(self) -> bool:
        """Returns always True, as the GaussianImputer does not need to be fitted."""
        return True

    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Impute all missing values in 'table'.

        All non-finite entries (NaN and +/- infinity) are treated as missing and
        replaced with values drawn from a normal distribution with mean 'mu' and
        standard deviation 'sigma'.

        Args:
            table: A dataframe of numeric values that will be completed.

        Returns:
            A copy of 'table' with imputed missing values.
        """
        confirm_is_fitted(self)
        # Use a private generator instead of np.random.seed(), so that imputation
        # does not reseed the global NumPy random state used by other code. For a
        # given seed the legacy RandomState yields the same values as the seeded
        # global generator.
        rng = np.random.RandomState(self.seed)

        _table = table.copy()
        for column in _table.columns:
            column_data = np.array(_table[column], dtype=float)
            mask = ~np.isfinite(column_data)
            column_data[mask] = rng.normal(
                loc=self.mu, scale=self.sigma, size=mask.sum()
            )
            _table[column] = column_data
        return _table
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class PerseusImputer:
    """Imputer for completing missing values as implemented in Perseus.

    Perseus-style imputation replaces missing values by random numbers drawn from a
    normal distribution. Sigma and mu of this distribution are calculated from the
    standard deviation and median of the observed values.
    """

    def __init__(
        self,
        median_downshift: float = 1.8,
        std_width: float = 0.3,
        column_wise: bool = True,
        seed: Optional[int] = None,
    ):
        """Initializes the PerseusImputer.

        Args:
            median_downshift: Times of standard deviations the observed median is
                downshifted for calculating mu of the normal distribution. Default is
                1.8
            std_width: Factor for adjusting the standard deviation of the observed
                values to obtain sigma of the normal distribution. Default is 0.3
            column_wise: If True, imputation is performed independently for each column,
                otherwise the whole dataframe is imputed together. Default True.
            seed: Optional, allows specifying a number for initializing the random
                number generator. Using the same seed for the same input table will
                generate the same set of imputed values each time. Default is None,
                which results in different imputed values being generated each time.
        """
        self.median_downshift = median_downshift
        self.std_width = std_width
        self.column_wise = column_wise
        self.seed = seed
        # Maps each fitted column name to its {"mu": ..., "sigma": ...} parameters.
        self._column_params: dict[str, dict] = {}

    def fit(self, table: pd.DataFrame) -> PerseusImputer:
        """Fits the PerseusImputer.

        Args:
            table: Input Dataframe for calculating mu and sigma of the gaussian
                distribution. Missing values (NaN) are ignored.

        Returns:
            Returns the fitted PerseusImputer instance.
        """
        table_stats = None  # median and std of the whole table, computed only once
        for column in table.columns:
            if self.column_wise:
                median = np.nanmedian(table[column])
                std = np.nanstd(table[column])
            else:
                if table_stats is None:
                    table_stats = (np.nanmedian(table), np.nanstd(table))
                median, std = table_stats

            mu = median - (std * self.median_downshift)
            sigma = std * self.std_width

            self._column_params[column] = {"mu": mu, "sigma": sigma}
        return self

    def is_fitted(self) -> bool:
        """Returns True if the PerseusImputer has been fitted."""
        return len(self._column_params) != 0

    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Impute all missing values in 'table'.

        All non-finite entries (NaN and +/- infinity) are treated as missing.

        Args:
            table: A dataframe of numeric values that will be completed. Each column
                name must correspond to a column name from the table that was used for
                the fitting.

        Returns:
            A copy of 'table' with imputed missing values.
        """
        confirm_is_fitted(self)
        # Use a private generator instead of np.random.seed(), so that imputation
        # does not reseed the global NumPy random state used by other code. For a
        # given seed the legacy RandomState yields the same values as the seeded
        # global generator.
        rng = np.random.RandomState(self.seed)

        _table = table.copy()
        for column in _table.columns:
            column_data = np.array(_table[column], dtype=float)
            mask = ~np.isfinite(column_data)
            params = self._column_params[column]
            column_data[mask] = rng.normal(
                loc=params["mu"],
                scale=params["sigma"],
                size=mask.sum(),
            )
            _table[column] = column_data
        return _table
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def confirm_is_fitted(imputer: object, msg: Optional[str] = None) -> None:
    """Perform is_fitted validation for imputer instances.

    Checks if the imputer is fitted by calling its 'is_fitted' method and otherwise
    raises a NotFittedError with the given message.

    Args:
        imputer: Instance to validate, must provide an 'is_fitted' method.
        msg: Optional error message. May contain the placeholder "%(name)s", which
            is replaced with the class name of 'imputer'. The default error message
            is, "This %(name)s instance is not fitted yet. Call 'fit' with
            appropriate arguments before using this imputer."

    Raises:
        TypeError: If 'imputer' has no 'is_fitted' attribute.
        NotFittedError: If the imputer reports that it has not been fitted.
    """
    if msg is None:
        msg = (
            "This %(name)s instance is not fitted yet. Call 'fit' with "
            "appropriate arguments before using this imputer."
        )

    if not hasattr(imputer, "is_fitted"):
        raise TypeError(f"{imputer} is not an imputer instance.")

    if not imputer.is_fitted():
        raise NotFittedError(msg % {"name": type(imputer).__name__})
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def _calculate_integer_below_min(table) -> int:
|
|
271
|
+
minimal_value = np.nanmin(table.to_numpy().flatten())
|
|
272
|
+
below_minimal = np.floor(minimal_value)
|
|
273
|
+
if minimal_value <= below_minimal:
|
|
274
|
+
below_minimal = below_minimal - 1
|
|
275
|
+
return int(below_minimal)
|
msreport/isobar.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import functools
|
|
3
|
+
from typing import Protocol
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import scipy
|
|
8
|
+
|
|
9
|
+
import msreport.helper
|
|
10
|
+
from msreport.errors import NotFittedError
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Transformer(Protocol):
    """Structural interface of objects that can be fitted and transform tables."""

    def fit(self, table: pd.DataFrame) -> Transformer:
        """Fits the Transformer and returns a fitted Transformer instance."""
        ...

    def is_fitted(self) -> bool:
        """Returns True if the Transformer has been fitted."""
        ...

    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Transform values in 'table'."""
        ...
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class IsotopeImpurityCorrecter:
    """Corrects isotope impurity interference in isobaric reporter expression values."""

    def __init__(self):
        # Fitted impurity matrix; None until 'fit' has been called.
        self._impurity_matrix = None

    def fit(self, impurity_matrix: np.ndarray) -> IsotopeImpurityCorrecter:
        """Fits the isotope impurity correcter to a given impurity matrix.

        Args:
            impurity_matrix: A reporter isotope impurity matrix in a diagonal format,
                where columns describe the isotope impurity of a specific channel, and
                the values in each row indicate the percentage of signal from the
                reporter that is present in each channel. Both dimensions of the
                impurity matrix must have the same length.

        Returns:
            Returns the fitted class IsotopeImpurityCorrecter instance.

        Raises:
            ValueError: If the impurity matrix is not square or contains NaN values.
        """
        # Store a private copy, so that later changes to the array passed in by the
        # caller cannot silently alter the fitted state.
        matrix = np.array(impurity_matrix)
        if matrix.ndim != 2 or matrix.shape[0] != matrix.shape[1]:
            raise ValueError("The impurity matrix must be square.")
        if np.isnan(matrix).any():
            raise ValueError("The impurity matrix contains NaN values.")
        self._impurity_matrix = matrix
        return self

    def is_fitted(self) -> bool:
        """Returns True if the IsotopeImpurityCorrecter has been fitted."""
        return self._impurity_matrix is not None

    def get_fits(self) -> np.ndarray:
        """Returns a copy of the fitted impurity matrix.

        Returns:
            A numpy array representing a diagonal impurity matrix.

        Raises:
            NotFittedError: If the IsotopeImpurityCorrecter has not been fitted.
        """
        if not self.is_fitted():
            raise NotFittedError("The IsotopeImpurityCorrecter has not been fitted.")
        return self._impurity_matrix.copy()

    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Applies isotope impurity correction to the values of the table.

        Args:
            table: The data to normalize. The columns of the table must correspond to
                the channels of the impurity matrix used for fitting.

        Returns:
            A copy of the table with isotope impurity corrected values.

        Raises:
            NotFittedError: If the IsotopeImpurityCorrecter has not been fitted.
            ValueError: If the number of table columns does not match the number of
                channels in the impurity matrix.
        """
        if not self.is_fitted():
            raise NotFittedError("The IsotopeImpurityCorrecter has not been fitted.")
        if table.shape[1] != self._impurity_matrix.shape[1]:
            raise ValueError(
                "The number of columns in the table does not match the number "
                "of channels in the impurity matrix."
            )

        corrected_values = correct_isobaric_reporter_impurities(
            # to_numpy() may return a view of the table data; pass an explicit copy
            # so that the correction can never write into the input table.
            intensity_table=table.to_numpy(copy=True),
            diagonal_impurity_matrix=self._impurity_matrix,
        )
        corrected_table = table.copy()
        corrected_table[:] = corrected_values
        return corrected_table
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def correct_isobaric_reporter_impurities(
    intensity_table: np.ndarray,
    diagonal_impurity_matrix: np.ndarray,
) -> np.ndarray:
    """Performs isotope impurity correction on isobaric reporter expression values.

    The input array is not modified. Log-space intensities, as detected by
    'msreport.helper.intensities_in_logspace', are exponentiated with base 2 before
    the correction and converted back afterwards.

    Args:
        intensity_table: A two-dimensional array with columns corresponding to isobaric
            reporter channels and rows to measured units such as PSMs, peptides or
            proteins.
        diagonal_impurity_matrix: A reporter isotope impurity matrix in a diagonal
            format, where columns describe the isotope impurity of a specific channel,
            and the values in each row indicate the percentage of signal from the
            reporter that is present in each channel.

    Returns:
        A new array containing the impurity corrected intensities. NaN values are
        treated as zero intensity and corrected values are never negative. When the
        input was in log-space, entries that are zero after correction become -inf
        when converting back.
    """
    apply_impurity_correction = functools.partial(
        _correct_impurity_contamination,
        impurity_matrix=diagonal_impurity_matrix,
    )

    # Work on a private copy: NaN values are overwritten in place below, which must
    # not leak into the array (or the dataframe backing it) passed in by the caller.
    intensities = np.array(intensity_table)
    data_was_in_logspace = msreport.helper.intensities_in_logspace(intensities)

    if data_was_in_logspace:
        intensities = np.power(2, intensities)
    intensities[np.isnan(intensities)] = 0
    corrected_table = np.apply_along_axis(apply_impurity_correction, 1, intensities)
    corrected_table[corrected_table <= 0] = 0
    if data_was_in_logspace:
        corrected_table = np.log2(corrected_table)

    return corrected_table
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _apply_impurity_contamination(
|
|
125
|
+
intensities: np.array, impurity_matrix: np.array
|
|
126
|
+
) -> np.array:
|
|
127
|
+
"""Applies reporter isotope impurity interference to an intensity array.
|
|
128
|
+
|
|
129
|
+
Args:
|
|
130
|
+
intensities: An array containing non-contaminated isobaric reporter intensities.
|
|
131
|
+
impurity_matrix: A reporter isotope impurity matrix in a diagonal format, where
|
|
132
|
+
columns describe the isotope impurity of a specific channel, and the values
|
|
133
|
+
in each row indicate the percentage of signal from the reporter that is
|
|
134
|
+
present in each channel. Both dimensions of the impurity matrix must have
|
|
135
|
+
the same length as the intensity array.
|
|
136
|
+
|
|
137
|
+
Returns:
|
|
138
|
+
An array containing contaminated intensities.
|
|
139
|
+
"""
|
|
140
|
+
return np.sum(impurity_matrix * intensities, axis=1)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _correct_impurity_contamination(
|
|
144
|
+
intensities: np.array, impurity_matrix: np.array
|
|
145
|
+
) -> np.array:
|
|
146
|
+
"""Applies reporter isotope impurity interference correction to an intensity array.
|
|
147
|
+
|
|
148
|
+
Args:
|
|
149
|
+
intensities: An array containing isobaric reporter intensities affected by
|
|
150
|
+
isotope impurity interference.
|
|
151
|
+
impurity_matrix: A reporter isotope impurity matrix in a diagonal format, where
|
|
152
|
+
columns describe the isotope impurity of a specific channel, and the values
|
|
153
|
+
in each row indicate the percentage of signal from the reporter that is
|
|
154
|
+
present in each channel. Both dimensions of the impurity matrix must have
|
|
155
|
+
the same length as the intensity array.
|
|
156
|
+
|
|
157
|
+
Returns:
|
|
158
|
+
An array containing impurity corrected intensities.
|
|
159
|
+
"""
|
|
160
|
+
corrected_intensities, _ = scipy.optimize.nnls(impurity_matrix, intensities)
|
|
161
|
+
return corrected_intensities
|