msreport 0.0.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- msreport/__init__.py +13 -0
- msreport/aggregate/__init__.py +0 -0
- msreport/aggregate/condense.py +163 -0
- msreport/aggregate/pivot.py +132 -0
- msreport/aggregate/summarize.py +281 -0
- msreport/analyze.py +586 -0
- msreport/errors.py +10 -0
- msreport/export.py +526 -0
- msreport/fasta.py +28 -0
- msreport/helper/__init__.py +23 -0
- msreport/helper/calc.py +120 -0
- msreport/helper/maxlfq.py +339 -0
- msreport/helper/table.py +267 -0
- msreport/helper/temp.py +99 -0
- msreport/impute.py +275 -0
- msreport/isobar.py +161 -0
- msreport/normalize.py +496 -0
- msreport/peptidoform.py +283 -0
- msreport/plot.py +1129 -0
- msreport/qtable.py +537 -0
- msreport/reader.py +2357 -0
- msreport/rinterface/__init__.py +3 -0
- msreport/rinterface/limma.py +126 -0
- msreport/rinterface/rinstaller.py +35 -0
- msreport/rinterface/rscripts/limma.R +104 -0
- msreport-0.0.24.dist-info/METADATA +128 -0
- msreport-0.0.24.dist-info/RECORD +30 -0
- msreport-0.0.24.dist-info/WHEEL +5 -0
- msreport-0.0.24.dist-info/licenses/LICENSE.txt +202 -0
- msreport-0.0.24.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,339 @@
|
|
|
1
|
+
import itertools
|
|
2
|
+
from typing import Callable
|
|
3
|
+
import warnings
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
import msreport.helper
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def calculate_pairwise_log_ratio_matrix(
    array: np.ndarray, log_transformed: bool = False
) -> np.ndarray:
    """Computes all pairwise column log ratios for each row of an intensity array.

    Entry (n, i, j) of the result is 'log value of column i - log value of column j'
    for row n of the input.

    Args:
        array: A two-dimensional numpy array, with the first dimension corresponding to
            rows and the second dimension to columns. The input is not modified.
        log_transformed: If True, 'array' is used as is and is expected to already
            contain log transformed intensities. If False, zeros are replaced by NaN
            and the values are log2 transformed before ratios are calculated.

    Returns:
        A 3-dimensional numpy array with shape (n, i, i), where n is the number of
        rows and i the number of columns of 'array'. Non-finite ratios are set to NaN.

    Example:
        >>> array = np.array(
        ...     [
        ...         [4.0, 4.0, 8.0],
        ...         [8.0, 9.0, np.nan],
        ...     ]
        ... )
        >>> calculate_pairwise_log_ratio_matrix(array)
        array([[[ 0.      ,  0.      , -1.      ],
                [ 0.      ,  0.      , -1.      ],
                [ 1.      ,  1.      ,  0.      ]],
        <BLANKLINE>
               [[ 0.      , -0.169925,       nan],
                [ 0.169925,  0.      ,       nan],
                [      nan,       nan,       nan]]])
    """
    # Integer arrays cannot store NaN, so they are converted to a float copy.
    is_integer_input = np.issubdtype(array.dtype, np.integer)
    intensities = array.astype(float) if is_integer_input else array.copy()

    if not log_transformed:
        zero_entries = intensities == 0
        intensities[zero_entries] = np.nan
        intensities = np.log2(intensities)

    # Broadcasting (n, i, 1) against (n, 1, i) yields every column pair per row.
    minuends = intensities[:, :, np.newaxis]
    subtrahends = intensities[:, np.newaxis, :]
    pairwise_ratios = minuends - subtrahends

    not_finite = ~np.isfinite(pairwise_ratios)
    pairwise_ratios[not_finite] = np.nan
    return pairwise_ratios
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def calculate_pairwise_median_log_ratio_matrix(
    array: np.ndarray, log_transformed: bool = False
) -> np.ndarray:
    """Computes the median log ratio for every pair of columns of an intensity array.

    Delegates to '_calculate_pairwise_centered_log_ratio_matrix' with 'np.median' as
    the centering function.

    Args:
        array: A two-dimensional numpy array, with the first dimension corresponding to
            rows and the second dimension to columns.
        log_transformed: If True, 'array' is expected to contain log transformed
            intensity values. If False, 'array' is expected to contain non-transformed
            intensity values, which are log2 transformed before calculating ratios.

    Returns:
        A square 2-dimensional numpy array with shape (i, i), where i is the number of
        columns of 'array'. The entry at position (i, j) is the median of the row-wise
        log ratios 'column i - column j'.

    Example:
        >>> array = np.array(
        ...     [
        ...         [4.0, 4.0, 8.0],
        ...         [8.0, 9.0, np.nan],
        ...     ]
        ... )
        >>> calculate_pairwise_median_log_ratio_matrix(array)
        array([[ 0.       , -0.0849625, -1.       ],
               [ 0.0849625,  0.       , -1.       ],
               [ 1.       ,  1.       ,  0.       ]])
    """
    return _calculate_pairwise_centered_log_ratio_matrix(
        array, np.median, log_transformed=log_transformed
    )
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def calculate_pairwise_mode_log_ratio_matrix(
    array: np.ndarray, log_transformed: bool = False
) -> np.ndarray:
    """Computes the mode log ratio for every pair of columns of an intensity array.

    Delegates to '_calculate_pairwise_centered_log_ratio_matrix' with
    'msreport.helper.mode' as the centering function.

    Args:
        array: A two-dimensional numpy array, with the first dimension corresponding to
            rows and the second dimension to columns.
        log_transformed: If True, 'array' is expected to contain log transformed
            intensity values. If False, 'array' is expected to contain non-transformed
            intensity values, which are log2 transformed before calculating ratios.

    Returns:
        A square 2-dimensional numpy array with shape (i, i), where i is the number of
        columns of 'array'. The entry at position (i, j) is the value returned by
        'msreport.helper.mode' for the row-wise log ratios 'column i - column j'.

    Example:
        >>> array = np.array(
        ...     [
        ...         [4.0, 4.0, 8.0],
        ...         [8.0, 9.0, np.nan],
        ...     ]
        ... )
        >>> calculate_pairwise_mode_log_ratio_matrix(array)
        array([[ 0.       , -0.0849625, -1.       ],
               [ 0.0849625,  0.       , -1.       ],
               [ 1.       ,  1.       ,  0.       ]])
    """
    return _calculate_pairwise_centered_log_ratio_matrix(
        array, msreport.helper.mode, log_transformed=log_transformed
    )
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def prepare_coefficient_matrix(
    ratio_matrix: np.ndarray,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Prepares coefficients, ratios, and initial row indices from a log ratio matrix.

    Args:
        ratio_matrix: A numpy array containing one or multiple pair-wise ratio matrices.
            Each ratio matrix must be a square array, with a ratio at position (i, j)
            being calculated from an abundance table as 'column i - column j'. Only
            the upper triangular part of each ratio matrix is used to generate the
            coefficient matrix. If 'ratio_matrix' contains multiple ratio matrices,
            the shape of the array has to be (n, i, i), where n is the number of ratio
            matrices and i is the number of rows and columns per ratio matrix. If only
            one ratio matrix is provided, the shape of the array has to be (i, i).

    Returns:
        A tuple containing the following three elements:
        - Coefficient matrix: 2d integer array with one row per ratio and one column
          per column of the ratio matrix. Each row contains a 1 at position i and a
          -1 at position j of the column pair (i, j) the ratio was taken from.
        - Ratios: 1d array containing the ratios from the ratio matrix, each entry
          corresponds to a row in the coefficient matrix.
        - Ratio matrix row indices: 1d array containing row indices that refer to the
          index of the first dimension of 'ratio_matrix'. If 'ratio_matrix' has only
          2 dimensions, all values are zero.

    Example:
        >>> ratio_matrix = np.array(
        ...     [
        ...         [0.0, -0.1, -1.0],
        ...         [0.1, 0.0, -1.0],
        ...         [1.0, 1.0, 0.0],
        ...     ]
        ... )
        >>> prepare_coefficient_matrix(ratio_matrix)
        (array([[ 1, -1,  0],
               [ 1,  0, -1],
               [ 0,  1, -1]]),
         array([-0.1, -1. , -1. ]),
         array([0, 0, 0]))

    """
    # A 2d input is a single ratio matrix; anything else is handled as a stack of
    # ratio matrices along the first dimension.
    if ratio_matrix.ndim == 2:
        coef_matrix, ratio_array, initial_rows = _coefficients_from_single_row_matrix(
            ratio_matrix
        )
    else:
        coef_matrix, ratio_array, initial_rows = _coefficients_from_multi_row_matrix(
            ratio_matrix
        )
    return coef_matrix, ratio_array, initial_rows
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def log_profiles_by_lstsq(coef_matrix: np.ndarray, ratio_array: np.ndarray):
    """Estimates a log abundance profile from pairwise ratios by least squares.

    Rows with a non-finite ratio are ignored. Columns of the coefficient matrix that
    are not referenced by any remaining row are excluded from the fit and reported as
    NaN in the returned profile.

    Args:
        coef_matrix: Two-dimensional numpy array representing the coefficients.
        ratio_array: One-dimensional numpy array representing the ratios, each entry
            corresponds to a row in the coefficient matrix.

    Returns:
        One-dimensional numpy array containing the estimated least-squares profile,
        with one entry per column of 'coef_matrix'.

    Example:
        >>> coef_matrix = np.array(
        ...     [
        ...         [1, -1, 0],
        ...         [1, 0, -1],
        ...         [0, 1, -1],
        ...     ]
        ... )
        >>> ratio_array = np.array([-0.1, -1.0, -1.0])
        >>> log_profiles_by_lstsq(coef_matrix, ratio_array)
        array([-0.36666667, -0.3       ,  0.66666667])
    """
    usable_rows = np.isfinite(ratio_array)
    coefficients = coef_matrix[usable_rows]
    ratios = ratio_array[usable_rows]

    # A column whose coefficients are all zero takes no part in any usable ratio.
    unused_columns = np.abs(coefficients).sum(axis=0) == 0
    used_columns = ~unused_columns

    # Only the solution vector of the lstsq result is needed.
    estimates = np.linalg.lstsq(coefficients[:, used_columns], ratios, rcond=None)[0]

    profile = np.zeros(coefficients.shape[1])
    profile[unused_columns] = np.nan
    profile[used_columns] = estimates
    return profile
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _calculate_pairwise_centered_log_ratio_matrix(
    array: np.ndarray, center_function: Callable, log_transformed: bool = False
) -> np.ndarray:
    """Calculates a pairwise, centered log2 ratio matrix from an intensity array.

    For each pair of columns (i, j) with i < j, the row-wise log ratios
    'column i - column j' are calculated, non-finite ratios are discarded, and the
    remaining ratios are reduced to a single value with 'center_function'. The lower
    triangle of the result is the negated upper triangle and the diagonal is zero.

    Args:
        array: A two-dimensional numpy array, with the first dimension corresponding to
            rows and the second dimension to columns. The input is not modified.
        center_function: Function that is applied to the finite ratios of each
            pair-wise comparison of columns in the input array to calculate the
            centered ratio. It is called with a one-dimensional array, which is empty
            if a column pair has no finite ratios. RuntimeWarnings raised during the
            call are suppressed.
        log_transformed: If True, the 'array' is expected to contain log transformed
            intensity values. If False, the array is expected to contain non-transformed
            intensity values, which are log2 transformed for the calculation of ratios.

    Returns:
        A square 2-dimensional numpy array, containing pair-wise ratios. The shape of
        the output array is (i, i), where i is the number of columns of the input array.
    """
    # Note: Is currently tested only via the calculate_pairwise_median_log_ratio_matrix
    # and calculate_pairwise_mode_log_ratio_matrix functions.
    if np.issubdtype(array.dtype, np.integer):
        # Integer arrays cannot store NaN, so they are converted to a float copy.
        log_array = array.astype(float)
    else:
        log_array = array.copy()

    if not log_transformed:
        log_array[log_array == 0] = np.nan
        log_array = np.log2(log_array)

    num_cols = log_array.shape[1]
    # The matrix must start with zeros, not NaN: the lower triangle and diagonal stay
    # zero so that subtracting the transpose below mirrors the upper triangle.
    ratio_matrix = np.zeros((num_cols, num_cols))
    for i, j in itertools.combinations(range(num_cols), 2):
        ratios = log_array[:, i] - log_array[:, j]
        with warnings.catch_warnings():
            # E.g. np.median warns when it receives an empty array.
            warnings.simplefilter("ignore", category=RuntimeWarning)
            ratio_matrix[i, j] = center_function(ratios[np.isfinite(ratios)])

    # Generate a full, mirrored matrix where the lower triangle is the upper triangle
    # multiplied by -1. The diagonal is never written and therefore remains zero.
    return ratio_matrix - ratio_matrix.T
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _coefficients_from_single_row_matrix(ratio_matrix):
    """Builds coefficients, ratios, and row indices from one pair-wise ratio matrix.

    Args:
        ratio_matrix: A numpy array containing one single pair-wise ratio matrix. The
            ratio matrix must be a square array, with a ratio at position (i, j) being
            calculated from an abundance table as 'column i - column j'. Only the
            entries above the diagonal are read.

    Returns:
        A tuple containing the following three elements:
        - Coefficient matrix: 2d integer array with one row per column pair (i, j),
          i < j, and one column per column of 'ratio_matrix'. Each row contains a 1
          at position i and a -1 at position j.
        - Ratios: 1d float array containing the ratio at position (i, j) for each row
          of the coefficient matrix.
        - Ratio matrix row indices: 1d integer array with the same length as the
          ratios array, containing only zeros. It is returned for consistency with
          the function `_coefficients_from_multi_row_matrix`.
    """
    num_columns = ratio_matrix.shape[1]
    # Column pairs in the order (0, 1), (0, 2), ..., (1, 2), ...; the reshape keeps a
    # (0, 2) shape when there are fewer than two columns.
    column_pairs = np.array(
        list(itertools.combinations(range(num_columns), 2)), dtype=int
    ).reshape(-1, 2)
    minuend_columns = column_pairs[:, 0]
    subtrahend_columns = column_pairs[:, 1]
    num_pairs = len(column_pairs)
    pair_rows = np.arange(num_pairs)

    coefficients = np.zeros((num_pairs, num_columns), dtype=int)
    coefficients[pair_rows, minuend_columns] = 1
    coefficients[pair_rows, subtrahend_columns] = -1

    ratios = np.zeros(num_pairs)
    ratios[:] = ratio_matrix[minuend_columns, subtrahend_columns]

    source_rows = np.zeros(num_pairs, dtype=int)
    return coefficients, ratios, source_rows
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def _coefficients_from_multi_row_matrix(ratio_matrix):
    """Builds coefficients, ratios, and row indices from stacked ratio matrices.

    Args:
        ratio_matrix: A numpy array containing multiple pair-wise ratio matrices. Each
            ratio matrix must be a square array, with a ratio at position (i, j) being
            calculated from an abundance table as 'column i - column j'. Only the
            entries above the diagonal of each ratio matrix are read. The shape of
            'ratio_matrix' must be (n, i, i), where n is the number of ratio matrices
            and i is the number of rows and columns per ratio matrix.

    Returns:
        A tuple containing the following three elements:
        - Coefficient matrix: 2d integer array with one row per combination of ratio
          matrix and column pair (i, j), i < j, and one column per column of a ratio
          matrix. Each row contains a 1 at position i and a -1 at position j. Rows
          are ordered by ratio matrix first and by column pair second.
        - Ratios: 1d float array containing the ratio at position (i, j) of the
          respective ratio matrix for each row of the coefficient matrix.
        - Ratio matrix row indices: 1d integer array containing, for each row of the
          coefficient matrix, the index of the ratio matrix along the first dimension
          of 'ratio_matrix' from which the ratio was taken.
    """
    num_matrices = ratio_matrix.shape[0]
    num_columns = ratio_matrix.shape[1]
    # Column pairs in the order (0, 1), (0, 2), ..., (1, 2), ...; the reshape keeps a
    # (0, 2) shape when there are fewer than two columns.
    column_pairs = np.array(
        list(itertools.combinations(range(num_columns), 2)), dtype=int
    ).reshape(-1, 2)
    minuend_columns = column_pairs[:, 0]
    subtrahend_columns = column_pairs[:, 1]
    num_pairs = len(column_pairs)
    total_rows = num_matrices * num_pairs
    pair_rows = np.arange(num_pairs)

    # Fill one (pairs x columns) coefficient block per ratio matrix, then stack the
    # blocks row-wise so that all rows of the first matrix come first.
    stacked = np.zeros((num_matrices, num_pairs, num_columns), dtype=int)
    stacked[:, pair_rows, minuend_columns] = 1
    stacked[:, pair_rows, subtrahend_columns] = -1
    coefficients = stacked.reshape(total_rows, num_columns)

    ratios = np.zeros(total_rows)
    pair_ratios = ratio_matrix[:, minuend_columns, subtrahend_columns]
    ratios[:] = pair_ratios.reshape(total_rows)

    source_rows = np.repeat(np.arange(num_matrices), num_pairs)
    return coefficients, ratios, source_rows
|
msreport/helper/table.py
ADDED
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import Iterable, Union
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def guess_design(table: pd.DataFrame, tag: str) -> pd.DataFrame:
    """Extracts sample name, experiment, and replicate from specified sample columns.

    "Total" and "Combined" are not allowed as sample names and are ignored,
    irrespective of their capitalization.

    First a subset of columns containing a column tag are identified. Then sample names
    are extracted by removing the column tag from each column name and stripping
    surrounding whitespace. And finally, sample names are split into experiment and
    replicate at the last underscore.

    This requires that the naming of samples follows a specific convention. Sample names
    must begin with the experiment name, followed by an underscore and a unique
    identifier of the sample, for example the replicate number. The experiment name can
    also contain underscores, as it is split only by the last underscore.

    For example "ExpA_r1" would be split into experiment "ExpA" and replicate "r1",
    "Exp_A_1" would be experiment "Exp_A" and replicate "1". A sample name without an
    underscore is used as the experiment name and is assigned the replicate "-1".

    Args:
        table: Dataframe which columns are used for extracting sample names.
        tag: Column names containing the 'tag' are selected for sample extraction.
            Columns that are exactly equal to the 'tag' are not selected.

    Returns:
        A dataframe containing the columns "Sample", "Experiment", and "Replicate"
    """
    sample_entries = []
    for column in find_columns(table, tag, must_be_substring=True):
        sample = column.replace(tag, "").strip()
        if sample.lower() in ("total", "combined"):
            continue

        experiment, separator, replicate = sample.rpartition("_")
        if not separator:
            # Without an underscore there is no replicate part. This is decided by the
            # presence of the separator and not by comparing string object identity.
            replicate = "-1"
        # The experiment part is empty if the sample name has no underscore or if its
        # only underscore is the first character; fall back to the full sample name.
        experiment = experiment or sample
        sample_entries.append([sample, experiment, replicate])

    design = pd.DataFrame(sample_entries, columns=["Sample", "Experiment", "Replicate"])
    return design
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def intensities_in_logspace(data: Union[pd.DataFrame, np.ndarray, Iterable]) -> bool:
    """Evaluates whether intensities are likely to be log transformed.

    Assumes that intensities are log transformed if all finite values are smaller or
    equal to 64. Intensities values (and intensity peak areas) reported by tandem mass
    spectrometry typically range from 10^3 to 10^12. To reach log2 transformed values
    greater than 64, intensities would need to be higher than 10^19, which seems to be
    very unlikely to be ever encountered.

    Args:
        data: Dataset that contains only intensity values, can be anything that
            'numpy.array' converts to a float array, such as a sequence, a numpy.array
            or a pandas.DataFrame; multiple dimensions or columns are allowed.
            Non-finite values (NaN and infinity) are ignored.

    Returns:
        True if intensity values in 'data' appear to be log transformed. Also True if
        'data' contains no finite values.
    """
    values = np.array(data, dtype=float)
    # Boolean mask indexing already returns a flat array of the finite values.
    finite_values = values[np.isfinite(values)]
    # Convert the numpy boolean to a built-in bool to match the declared return type.
    return bool(np.all(finite_values <= 64))
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def rename_sample_columns(table: pd.DataFrame, mapping: dict[str, str]) -> pd.DataFrame:
    """Replaces sample names within column names according to a mapping.

    For each column, the mapping keys are tested from longest to shortest and only the
    first key that is contained in the column name is replaced. This allows the use of
    a 'mapping' with keys that are substrings of other keys, as well as values that are
    substrings of any of the keys.

    Importantly, if the mapping keys (sample names) are substrings of other column names
    within the table, unintended renaming of those columns will occur. For instance,
    when renaming columns ["Abundance", "Intensity A"] with the mapping
    {"A": "Sample Alpha"}, the columns will be renamed to ["Sample Alphabundance",
    "Intensity Sample Alpha"].

    Args:
        table: Dataframe which columns will be renamed.
        mapping: A mapping of old to new sample names that will be used to replace
            matching substrings in the columns from table.

    Returns:
        A copy of the table with renamed columns.
    """
    keys_longest_first = sorted(mapping, key=len, reverse=True)

    def _rename(column_name):
        # Only the first, i.e. longest, matching sample name is replaced.
        matching_key = next(
            (key for key in keys_longest_first if key in column_name), None
        )
        if matching_key is None:
            return column_name
        return column_name.replace(matching_key, mapping[matching_key])

    new_column_names = [_rename(column_name) for column_name in table.columns]
    renamed_table = table.copy()
    renamed_table.columns = new_column_names
    return renamed_table
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def rename_mq_reporter_channels(
    table: pd.DataFrame, channel_names: Iterable[str]
) -> None:
    """Renames reporter channel numbers with sample names.

    MaxQuant writes reporter channel names either in the format "Reporter intensity 1"
    or "Reporter intensity 1 Experiment Name", dependent on whether an experiment name
    was specified. Renames "Reporter intensity", "Reporter intensity count", and
    "Reporter intensity corrected" columns. The table is modified in place.

    NOTE: This might not work for the peptides.txt table, as there are columns present
    with the experiment name and also without it.

    Args:
        table: Dataframe which reporter channel columns are renamed in place.
        channel_names: New channel names, one for each column of 'table' that starts
            with "Reporter intensity " followed by a number. Names are assigned to
            these columns in the order in which the columns appear in 'table'.

    Raises:
        ValueError: If the number of 'channel_names' differs from the number of
            matching reporter intensity columns.
    """
    # Materialize once so that any iterable, including generators, can be counted.
    channel_names = list(channel_names)
    pattern = re.compile(r"Reporter intensity [0-9]+")
    reporter_columns = [column for column in table.columns if pattern.match(column)]
    if len(reporter_columns) != len(channel_names):
        raise ValueError(
            f"Expected {len(reporter_columns)} channel names, one per reporter "
            f"intensity column, but got {len(channel_names)}."
        )

    column_mapping = {}
    base_name = "Reporter intensity "
    for column, channel_name in zip(reporter_columns, channel_names):
        # Derive the corresponding "count" and "corrected" column names from the plain
        # reporter intensity column; names absent from the table are ignored by rename.
        for tag in ("", "count ", "corrected "):
            old_column = column.replace(base_name, f"{base_name}{tag}")
            column_mapping[old_column] = f"{base_name}{tag}{channel_name}"
    table.rename(columns=column_mapping, inplace=True)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def apply_intensity_cutoff(
    table: pd.DataFrame, column_tag: str, threshold: float
) -> None:
    """Sets intensity values below the threshold to NaN, modifying the table in place.

    Args:
        table: Dataframe in which intensity values are replaced.
        column_tag: Substring used to identify intensity columns from the 'table' to
            which the intensity cutoff is applied.
        threshold: Values below the threshold will be set to NaN.
    """
    for intensity_column in find_columns(table, column_tag):
        below_threshold = table[intensity_column] < threshold
        table.loc[below_threshold, intensity_column] = np.nan
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def find_columns(
    table: pd.DataFrame, substring: str, must_be_substring: bool = False
) -> list[str]:
    """Returns a list of column names containing the substring.

    Args:
        table: Columns of this dataframe are queried.
        substring: String that must be part of column names.
        must_be_substring: If True, column names that are exactly equal to the
            'substring' are not reported.

    Returns:
        A list of column names, in the order in which they appear in the table.
    """
    matched_columns = [name for name in table.columns if substring in name]
    if not must_be_substring:
        return matched_columns
    return [name for name in matched_columns if name != substring]
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def find_sample_columns(
    table: pd.DataFrame, substring: str, samples: Iterable[str]
) -> list[str]:
    """Returns column names that contain the substring and any entry of 'samples'.

    Args:
        table: Columns of this dataframe are queried.
        substring: String that must be part of column names.
        samples: Strings from which at least one must be present in matched columns.

    Returns:
        A list of sample column names, in the order in which they appear in the table.
    """
    # Materialize once: a one-shot iterable such as a generator would otherwise be
    # exhausted after the first column and no later column could match.
    sample_names = tuple(samples)
    return [
        column
        for column in find_columns(table, substring)
        if any(sample in column for sample in sample_names)
    ]
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def keep_rows_by_partial_match(
    table: pd.DataFrame, column: str, values: Iterable[str]
) -> pd.DataFrame:
    """Filter a table to keep only rows partially matching any of the specified values.

    Args:
        table: The input table that will be filtered.
        column: The name of the column in the 'table' which entries are checked for
            partial matches to the values. This column must have the datatype 'str'.
        values: An iterable of strings that are used to filter the table. Any of the
            specified values must have at least a partial match to an entry from the
            specified 'column' for a row to be kept in the filtered table. Values are
            matched literally, not as regular expressions.

    Returns:
        A new DataFrame containing only the rows that have a partial or complete match
        with any of the specified 'values'. If 'values' is empty, no row can match and
        an empty DataFrame with the columns of 'table' is returned.

    Example:
        >>> df = pd.DataFrame({"Modifications": ["phos", "acetyl;phos", "acetyl"]})
        >>> keep_rows_by_partial_match(df, "Modifications", ["phos"])
          Modifications
        0          phos
        1   acetyl;phos
    """
    value_masks = [table[column].str.contains(value, regex=False) for value in values]
    if not value_masks:
        # np.any over an empty list returns a scalar instead of a row mask, which
        # cannot be used for filtering; without values there is nothing to keep.
        return table.iloc[:0].copy()
    target_mask = np.any(value_masks, axis=0)
    filtered_table = table[target_mask].copy()
    return filtered_table
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def remove_rows_by_partial_match(
    table: pd.DataFrame, column: str, values: Iterable[str]
) -> pd.DataFrame:
    """Filter a table to remove rows partially matching any of the specified values.

    Args:
        table: The input table that will be filtered.
        column: The name of the column in the 'table' which entries are checked for
            partial matches to the values. This column must have the datatype 'str'.
        values: An iterable of strings that are used to filter the table. Any of the
            specified values must have at least a partial match to an entry from the
            specified 'column' for a row to be removed in the filtered table. Values
            are matched literally, not as regular expressions.

    Returns:
        A new DataFrame containing no rows that have a partial or complete match with
        any of the specified 'values'. If 'values' is empty, no row can match and a
        copy of the complete 'table' is returned.

    Example:
        >>> df = pd.DataFrame({"Modifications": ["phos", "acetyl;phos", "acetyl"]})
        >>> remove_rows_by_partial_match(df, "Modifications", ["phos"])
          Modifications
        2        acetyl
    """
    value_masks = [table[column].str.contains(value, regex=False) for value in values]
    if not value_masks:
        # np.any over an empty list returns a scalar instead of a row mask, which
        # cannot be used for filtering; without values there is nothing to remove.
        return table.copy()
    target_mask = ~np.any(value_masks, axis=0)
    filtered_table = table[target_mask].copy()
    return filtered_table
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def join_tables(
    tables: Iterable[pd.DataFrame], reset_index: bool = False
) -> pd.DataFrame:
    """Returns a joined dataframe.

    Dataframes are merged iteratively on their index using an outer join, beginning with
    the first entry from 'tables'. Can only join dataframes with different columns. The
    dataframes in 'tables' are not modified.

    Args:
        tables: Dataframes that will be merged together; must contain at least one
            dataframe.
        reset_index: If True, the index of the joined dataframe is reset.

    Returns:
        A merged dataframe. If 'tables' contains only one dataframe and 'reset_index'
        is False, this dataframe itself is returned.

    Raises:
        IndexError: If 'tables' is empty.
    """
    # Materialize once so that any iterable, including generators, can be indexed.
    tables = list(tables)
    merged_table = tables[0]
    for table in tables[1:]:
        merged_table = merged_table.join(table, how="outer")
    if reset_index:
        # Not done in place: with a single input table 'merged_table' is the caller's
        # dataframe, which must not be modified.
        merged_table = merged_table.reset_index()
    return merged_table
|