msreport 0.0.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- msreport/__init__.py +13 -0
- msreport/aggregate/__init__.py +0 -0
- msreport/aggregate/condense.py +163 -0
- msreport/aggregate/pivot.py +132 -0
- msreport/aggregate/summarize.py +281 -0
- msreport/analyze.py +586 -0
- msreport/errors.py +10 -0
- msreport/export.py +526 -0
- msreport/fasta.py +28 -0
- msreport/helper/__init__.py +23 -0
- msreport/helper/calc.py +120 -0
- msreport/helper/maxlfq.py +339 -0
- msreport/helper/table.py +267 -0
- msreport/helper/temp.py +99 -0
- msreport/impute.py +275 -0
- msreport/isobar.py +161 -0
- msreport/normalize.py +496 -0
- msreport/peptidoform.py +283 -0
- msreport/plot.py +1129 -0
- msreport/qtable.py +537 -0
- msreport/reader.py +2357 -0
- msreport/rinterface/__init__.py +3 -0
- msreport/rinterface/limma.py +126 -0
- msreport/rinterface/rinstaller.py +35 -0
- msreport/rinterface/rscripts/limma.R +104 -0
- msreport-0.0.24.dist-info/METADATA +128 -0
- msreport-0.0.24.dist-info/RECORD +30 -0
- msreport-0.0.24.dist-info/WHEEL +5 -0
- msreport-0.0.24.dist-info/licenses/LICENSE.txt +202 -0
- msreport-0.0.24.dist-info/top_level.txt +1 -0
msreport/analyze.py
ADDED
|
@@ -0,0 +1,586 @@
|
|
|
1
|
+
""" The analyze module contains methods for analysing quantification results. """
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
from typing import Iterable, Optional, Protocol
|
|
5
|
+
import warnings
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
import msreport.normalize
|
|
11
|
+
import msreport.rinterface
|
|
12
|
+
from msreport.qtable import Qtable
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Transformer(Protocol):
    """Structural interface of objects that can be fitted to and transform a table."""

    def fit(self, table: pd.DataFrame) -> Transformer:
        """Fits the Transformer with 'table' and returns a fitted Transformer."""
        ...

    def is_fitted(self) -> bool:
        """Returns True if the Transformer has already been fitted."""
        ...

    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Transforms the values in 'table' and returns the result."""
        ...
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class CategoryTransformer(Protocol):
    """Structural interface of transformers that operate per category.

    In addition to the `Transformer` methods, a `CategoryTransformer` exposes the name
    of the column that holds the category of each row.
    """

    def fit(self, table: pd.DataFrame) -> CategoryTransformer:
        """Fits the Transformer with 'table' and returns a fitted Transformer."""
        ...

    def is_fitted(self) -> bool:
        """Returns True if the Transformer has already been fitted."""
        ...

    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Transforms the values in 'table' and returns the result."""
        ...

    def get_category_column(self) -> str:
        """Returns the name of the category column.

        The method takes no arguments; `normalize_expression_by_category` calls it
        without any and uses the returned value as a column name.
        """
        ...
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def analyze_missingness(qtable: Qtable) -> None:
    """Counts missing and quantified expression values and adds them to the qtable.

    Two groups of columns are added as expression features:
    - "Missing total" and "Missing experiment_name" contain the number of NaN values
      per row, over all samples and per experiment. "Missing sample_name" contains a
      Boolean flag that is True where the value of that sample is NaN.
    - "Events total" and "Events experiment_name" contain the number of finite values
      per row, over all samples and per experiment.

    Requires expression columns to be set. Missing values in expression columns must be
    present as NaN, and not as zero or an empty string.

    Args:
        qtable: A Qtable instance.
    """
    # TODO: not tested #
    expression = qtable.make_expression_table(samples_as_columns=True)
    is_missing = np.isnan(expression)
    is_quantified = np.isfinite(expression)

    missing_columns = {"Missing total": is_missing.sum(axis=1)}
    event_columns = {"Events total": is_quantified.sum(axis=1)}

    for experiment in qtable.get_experiments():
        experiment_samples = qtable.get_samples(experiment)
        event_columns[f"Events {experiment}"] = is_quantified[experiment_samples].sum(
            axis=1
        )
        missing_columns[f"Missing {experiment}"] = is_missing[experiment_samples].sum(
            axis=1
        )

    for sample in qtable.get_samples():
        missing_columns[f"Missing {sample}"] = is_missing[sample]

    qtable.add_expression_features(pd.DataFrame(missing_columns))
    qtable.add_expression_features(pd.DataFrame(event_columns))
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _require_experiment_columns(qtable: Qtable, prefix: str) -> list[str]:
    """Returns the "<prefix> <experiment>" column names, raising if any is absent."""
    columns = [f"{prefix} {experiment}" for experiment in qtable.get_experiments()]
    if not pd.Series(columns).isin(qtable.data.columns).all():
        # KeyError is a subclass of the previously raised Exception, so existing
        # handlers keep working while the error matches the other column checks.
        raise KeyError(
            f"Not all columns from {columns} are present in qtable.data,"
            " analyze missingness before calling validate_proteins()."
        )
    return columns


def validate_proteins(
    qtable: Qtable,
    min_peptides: int = 0,
    remove_contaminants: bool = True,
    min_events: Optional[int] = None,
    max_missing: Optional[int] = None,
) -> None:
    """Validates protein entries (rows).

    Adds an additional column "Valid" to the qtable, containing Boolean values. An
    entry is valid only if it passes every filter that is enabled by the arguments.

    Requires expression columns to be set. Depending on the arguments requires the
    columns "Total peptides", "Potential contaminant", and the experiment columns
    "Missing experiment_name" and "Events experiment_name".

    Args:
        qtable: A Qtable instance.
        min_peptides: Minimum value required in the "Total peptides" column, default 0.
            The filter is only applied for values larger than zero.
        remove_contaminants: If true, entries flagged in the "Potential contaminant"
            column are marked as invalid, default True. Flags are interpreted by their
            truth value, entries with a missing flag are not treated as contaminants.
        min_events: If specified, at least one experiment must have the minimum number
            of quantified events for the protein entry to be valid.
        max_missing: If specified, at least one experiment must have no more than the
            maximum number of missing values.

    Raises:
        KeyError: If a column required by the chosen arguments is not present in the
            qtable. This includes a missing "Potential contaminant" column when
            'remove_contaminants' is True.
    """
    valid_entries = np.ones(qtable.data.shape[0], dtype=bool)

    if min_peptides > 0:
        if "Total peptides" not in qtable:
            raise KeyError("'Total peptides' column not present in qtable.data")
        valid_entries &= np.asarray(
            qtable["Total peptides"] >= min_peptides, dtype=bool
        )

    # TODO: not tested from here #
    if remove_contaminants:
        if "Potential contaminant" not in qtable:
            raise KeyError("'Potential contaminant' column not present in qtable.data")
        flags = qtable["Potential contaminant"]
        # np.invert is a bitwise NOT: on integer or object dtype flags it turns both
        # 0/False and 1/True into truthy values, which would keep every contaminant.
        # Convert to a real Boolean mask first; missing flags count as "not flagged".
        flag_missing = np.asarray(pd.isna(flags))
        is_contaminant = np.where(
            flag_missing, False, np.asarray(flags, dtype=object)
        ).astype(bool)
        valid_entries &= ~is_contaminant

    if max_missing is not None:
        columns = _require_experiment_columns(qtable, "Missing")
        within_limit = (qtable[columns] <= max_missing).any(axis=1)
        valid_entries &= np.asarray(within_limit, dtype=bool)

    if min_events is not None:
        columns = _require_experiment_columns(qtable, "Events")
        enough_events = (qtable[columns] >= min_events).any(axis=1)
        valid_entries &= np.asarray(enough_events, dtype=bool)

    qtable["Valid"] = valid_entries
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def normalize_expression(
    qtable: Qtable,
    normalizer: Transformer,
    exclude_invalid: bool = True,
) -> None:
    """Normalizes expression values in qtable.

    Normalizes values present in the qtable expression columns, requires that expression
    columns are defined. The normalizer will be fit with the expression values if it has
    not been fitted already.

    The "Valid" column is only required when it is actually used, i.e. when a not
    fitted normalizer is passed and 'exclude_invalid' is True.

    Args:
        qtable: A Qtable instance, which expression values will be normalized.
        normalizer: A Normalizer instance from the msreport.normalize module. Note that
            if an already fitted normalizer is passed, it has to be fitted with a
            dataframe which column names correspond to the sample names present in
            qtable.design. A not fitted normalizer is fitted with the expression values
            present in the qtable.
        exclude_invalid: If true, the column "Valid" is used to filter which expression
            rows are used for fitting a not fitted normalizer; default True. Independent
            of if exclude_invalid is True or False, all expression values will be
            normalized.
    """
    must_fit = not normalizer.is_fitted()
    filter_rows = must_fit and exclude_invalid

    if filter_rows:
        table = qtable.make_expression_table(
            samples_as_columns=True, features=["Valid"]
        )
        sample_columns = table.columns.drop("Valid")
    else:
        # "Valid" is not consulted on this path, so it is not requested either.
        table = qtable.make_expression_table(samples_as_columns=True)
        sample_columns = table.columns
    expression_columns = [qtable.get_expression_column(s) for s in sample_columns]

    raw_data = table[sample_columns]
    if must_fit:
        if filter_rows:
            fit_data = raw_data[table["Valid"]]
        else:
            # Hand the normalizer its own copy, as the masked selection does above.
            fit_data = raw_data.copy()
        normalizer = normalizer.fit(fit_data)

    transformed_data = normalizer.transform(raw_data)
    # Columns are assigned by position: sample column i goes to expression column i.
    qtable[expression_columns] = transformed_data[sample_columns]
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def create_site_to_protein_normalizer(
    qtable: Qtable, category_column: str = "Representative protein"
) -> msreport.normalize.CategoricalNormalizer:
    """Creates a fitted `CategoricalNormalizer` for site-to-protein normalization.

    The `CategoricalNormalizer` is fitted to protein expression profiles of the provided
    `qtable`. The protein expression profiles are calculated by subtracting the mean
    expression value of each protein from the protein expression values. Only entries
    that are quantified in all samples are used. Expression values must be log
    transformed. The generated `CategoricalNormalizer` can be used to normalize ion,
    peptide or site qtables based on protein categories.

    Args:
        qtable: Qtable instance containing protein values for fitting the normalizer.
        category_column: The name of the column containing the protein categories.

    Returns:
        A fitted `CategoricalNormalizer` object.
    """
    sample_columns = qtable.get_samples()
    reference_expression = qtable.make_expression_table(
        samples_as_columns=True,
        features=[category_column],
    )

    # Keep only rows without any missing value in the sample columns.
    completely_quantified = ~reference_expression[sample_columns].isna().any(axis=1)
    reference_expression = reference_expression[completely_quantified]

    # Center every row on its own mean to obtain the expression profile.
    sample_values = reference_expression[sample_columns]
    reference_profiles = sample_values.sub(sample_values.mean(axis=1), axis=0)
    reference_profiles[category_column] = reference_expression[category_column]

    normalizer = msreport.normalize.CategoricalNormalizer(category_column)
    normalizer = normalizer.fit(reference_profiles)

    return normalizer
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def create_ibaq_transformer(
    qtable: Qtable,
    category_column: str = "Representative protein",
    ibaq_column: str = "iBAQ peptides",
) -> msreport.normalize.CategoricalNormalizer:
    """Creates a fitted `CategoricalNormalizer` for iBAQ transformation.

    The `CategoricalNormalizer` is fitted to iBAQ peptide counts of the provided
    `qtable`, and can be used to transform protein intensities by dividing them by the
    corresponding iBAQ peptide counts. Missing iBAQ peptide counts are replaced by 1 and
    values smaller than 1 are replaced by 1. iBAQ peptide counts are then log2
    transformed because the `CategoryTransformer` expects log2 transformed values.

    Args:
        qtable: Qtable instance containing iBAQ peptide counts for fitting the
            normalizer.
        category_column: The name of the column containing the protein categories.
        ibaq_column: The name of the column containing the iBAQ peptide counts.

    Returns:
        A fitted `CategoricalNormalizer` object.
    """
    category_values = qtable[category_column].copy()
    ibaq_factor_values = qtable[ibaq_column].copy()
    sample_columns = qtable.get_samples()

    # A factor of 1 becomes 0 after log2, which leaves the intensities unchanged.
    ibaq_factor_values = ibaq_factor_values.fillna(1)
    ibaq_factor_values[ibaq_factor_values < 1] = 1
    ibaq_factor_values = np.log2(ibaq_factor_values)

    # The same factor is used for every sample of an entry.
    reference_table = pd.DataFrame({c: ibaq_factor_values for c in sample_columns})
    reference_table[category_column] = category_values

    normalizer = msreport.normalize.CategoricalNormalizer(category_column)
    normalizer = normalizer.fit(reference_table)

    return normalizer
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def normalize_expression_by_category(
    qtable: Qtable, normalizer: CategoryTransformer
) -> None:
    """Normalizes the expression values of a Qtable per category.

    The category column named by the normalizer is passed to the normalizer together
    with the expression values; the transformed expression values are written back to
    the expression columns of `qtable.data`.

    Args:
        qtable: A Qtable instance, which expression values will be normalized.
        normalizer: A `CategoryTransformer` object used for normalization.

    Raises:
        KeyError: If the category column of the `CategoryTransformer` object is not
            found in the `qtable.data`.
    """
    category_column = normalizer.get_category_column()
    if category_column not in qtable.data.columns:
        message = (
            f'The category column "{category_column}" in the normalizer '
            "is not found in `qtable.data`."
        )
        raise KeyError(message)

    table = qtable.make_expression_table(
        samples_as_columns=True, features=[category_column]
    )
    sample_columns = table.columns.drop(category_column)
    expression_columns = list(map(qtable.get_expression_column, sample_columns))

    # The normalizer receives the sample columns followed by the category column.
    normalizer_input = table[[*sample_columns, category_column]]
    normalized = normalizer.transform(normalizer_input)
    qtable.data[expression_columns] = normalized[sample_columns]
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def impute_missing_values(
    qtable: Qtable,
    imputer: Transformer,
    exclude_invalid: bool = True,
) -> None:
    """Imputes missing expression values in qtable.

    Imputes missing values (nan) present in the qtable expression columns, requires
    that the qtable has defined expression columns. If the passed imputer object is not
    yet fitted, it will be fit with the expression values. If 'exclude_invalid' is True,
    only valid rows are used for fitting the imputer and only valid rows are imputed.

    The "Valid" column is only required when 'exclude_invalid' is True.

    Args:
        qtable: A Qtable instance, which missing expression values will be imputed.
        imputer: An Imputer instance from the msreport.impute module. Note that if an
            already fitted imputer is passed, it has to be fitted with a dataframe which
            column names correspond to the sample names present in qtable.design. A not
            fitted imputer is fitted with the expression values present in the qtable.
        exclude_invalid: If true, the column "Valid" is used to determine for which rows
            imputation is performed. Default True.
    """
    if exclude_invalid:
        table = qtable.make_expression_table(
            samples_as_columns=True, features=["Valid"]
        )
        sample_columns = table.columns.drop("Valid")
        valid_mask = table["Valid"]
    else:
        # Every row is used, so the "Valid" column is not requested.
        table = qtable.make_expression_table(samples_as_columns=True)
        sample_columns = table.columns
        valid_mask = np.ones(table.shape[0], dtype=bool)
    expression_columns = [qtable.get_expression_column(s) for s in sample_columns]

    raw_data = table.loc[valid_mask, sample_columns]
    if not imputer.is_fitted():
        imputer = imputer.fit(raw_data)

    # Rename into a new frame instead of in place, so that the object returned by the
    # imputer is never modified here.
    sample_to_expression = dict(zip(sample_columns, expression_columns))
    imputed_data = imputer.transform(raw_data).rename(columns=sample_to_expression)
    qtable.data.loc[valid_mask, expression_columns] = imputed_data
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def calculate_experiment_means(qtable: Qtable) -> None:
    """Calculates the mean expression of each experiment and adds it to the qtable.

    For every experiment a column "Expression experiment_name" is added, holding the
    row-wise mean of the expression columns of the experiment's samples. NaN values
    are ignored when calculating the mean.

    Args:
        qtable: A Qtable instance, which mean experiment expression values will be
            calculated.
    """
    columns_per_experiment = {
        experiment: [
            qtable.get_expression_column(sample)
            for sample in qtable.get_samples(experiment)
        ]
        for experiment in qtable.get_experiments()
    }

    mean_columns = {}
    for experiment, expression_columns in columns_per_experiment.items():
        # Rows without any value trigger a RuntimeWarning in nanmean; silence it.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            mean_columns[f"Expression {experiment}"] = np.nanmean(
                qtable[expression_columns], axis=1
            )

    qtable.add_expression_features(pd.DataFrame(mean_columns))
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def calculate_multi_group_comparison(
    qtable: Qtable,
    experiment_pairs: Iterable[Iterable[str]],
    exclude_invalid: bool = True,
) -> None:
    """Calculates average expression and ratios for multiple comparison groups.

    For each experiment pair, adds new columns
    "Average expression Experiment_1 vs Experiment_2" and
    "Ratio [log2] Experiment_1 vs Experiment_2" to the qtable. Expression values must be
    log transformed.

    The ratio is the mean expression of the first experiment minus the mean expression
    of the second experiment. The average expression is the mean of the experiment
    means. NaN values are ignored when calculating means.

    The "Valid" column is only required when 'exclude_invalid' is True.

    Args:
        qtable: Qtable instance that contains expression values for calculating group
            comparisons.
        experiment_pairs: A list containing one or multiple experiment pairs for which
            the group comparison should be calculated. The specified experiments must
            correspond to entries from qtable.design["Experiment"].
        exclude_invalid: If true, the column "Valid" is used to determine which rows are
            used for calculating the group comparisons; default True. Rows that are not
            valid are set to NaN in the added columns.
    """
    comparison_tag = " vs "

    if exclude_invalid:
        table = qtable.make_expression_table(
            samples_as_columns=True, features=["Valid"]
        )
        # Convert to a Boolean array before negating, a bitwise NOT on integer or
        # object dtype flags would not produce a usable mask.
        invalid = ~table["Valid"].to_numpy(dtype=bool)
    else:
        # No row is excluded, so the "Valid" column is not requested.
        table = qtable.make_expression_table(samples_as_columns=True)
        invalid = np.zeros(table.shape[0], dtype=bool)

    for experiment_pair in experiment_pairs:
        # The pair is consumed twice (name and loop); materialize it so that one-shot
        # iterables such as generators work as well.
        experiment_pair = list(experiment_pair)
        comparison_group = comparison_tag.join(experiment_pair)
        with warnings.catch_warnings():
            # Rows without any value trigger a RuntimeWarning in nanmean.
            warnings.simplefilter("ignore", category=RuntimeWarning)
            group_expressions = [
                np.nanmean(table[qtable.get_samples(experiment)], axis=1)
                for experiment in experiment_pair
            ]
            ratios = group_expressions[0] - group_expressions[1]
            average_expressions = np.nanmean(group_expressions, axis=0)

        comparison_table = pd.DataFrame(
            {
                f"Average expression {comparison_group}": average_expressions,
                f"Ratio [log2] {comparison_group}": ratios,
            }
        )
        comparison_table[invalid] = np.nan
        qtable.add_expression_features(comparison_table)
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def two_group_comparison(
    qtable: Qtable, experiment_pair: Iterable[str], exclude_invalid: bool = True
) -> None:
    """Calculates average expression and ratio for a single pair of experiments.

    Convenience wrapper that calls `calculate_multi_group_comparison` with one
    experiment pair. Adds new columns
    "Average expression Experiment_1 vs Experiment_2" and
    "Ratio [log2] Experiment_1 vs Experiment_2" to the qtable. Expects that expression
    values are log2 transformed.

    Args:
        qtable: A Qtable instance, containing expression values.
        experiment_pair: The two experiments that will be compared, experiments must be
            present in qtable.design
        exclude_invalid: If true, the column "Valid" is used to determine for which rows
            comparison values are calculated.
    """
    single_pair = [experiment_pair]
    calculate_multi_group_comparison(
        qtable,
        experiment_pairs=single_pair,
        exclude_invalid=exclude_invalid,
    )
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
def calculate_multi_group_limma(
    qtable: Qtable,
    experiment_pairs: list[list[str]],
    exclude_invalid: bool = True,
    batch: bool = False,
    limma_trend: bool = True,
) -> None:
    """Uses limma to perform a differential expression analysis of multiple experiments.

    For each experiment pair specified in 'experiment_pairs' the following new columns
    are added to the qtable:
    - "P-value Experiment_1 vs Experiment_2"
    - "Adjusted p-value Experiment_1 vs Experiment_2"
    - "Average expression Experiment_1 vs Experiment_2"
    - "Ratio [log2] Experiment_1 vs Experiment_2"

    Requires that expression columns are set, and expression values are log2
    transformed. All rows with missing values are ignored, impute missing values to
    allow differential expression analysis of all rows. The qtable.data column
    "Representative protein" is used as the index.

    Args:
        qtable: Qtable instance that contains expression values for differential
            expression analysis.
        experiment_pairs: A list containing lists of experiment pairs for which the
            results of the differential expression analysis should be reported. The
            specified experiment pairs must correspond to entries from
            qtable.design["Experiment"].
        exclude_invalid: If true, the column "Valid" is used to determine which rows are
            used for the differential expression analysis; default True.
        batch: If true batch effects are considered for the differential expression
            analysis. Batches must be specified in the design in a "Batch" column.
        limma_trend: If true, an intensity-dependent trend is fitted to the prior
            variance during calculation of the moderated t-statistics, refer to
            limma.eBayes for details; default True.

    Raises:
        KeyError: If 'batch' is True and the design has no "Batch" column.
        ValueError: If 'batch' is True and the "Batch" column contains only one
            distinct value.
    """
    # TODO: not tested #
    design = qtable.get_design()
    if batch and "Batch" not in design:
        raise KeyError(
            "When using calculate_multi_group_limma(batch=True) a"
            ' "Batch" column must be present in qtable.design'
        )
    if batch and design["Batch"].nunique() == 1:
        raise ValueError(
            "When using calculate_multi_group_limma(batch=True), not all values from"
            ' qtable.design["Batch"] are allowed to be identical.'
        )

    # The pairs are iterated twice below; materialize them once.
    experiment_pairs = [list(pair) for pair in experiment_pairs]

    table = qtable.make_expression_table(
        samples_as_columns=True, features=["Representative protein"]
    )
    table = table.set_index("Representative protein")
    comparison_tag = " vs "

    if exclude_invalid:
        valid = qtable["Valid"]
    else:
        valid = np.full(table.shape[0], True)
    not_nan = table.isna().sum(axis=1) == 0
    # Rows are combined by position, 'valid' and 'not_nan' use different indexes.
    mask = np.all([valid, not_nan], axis=0)

    # Exchange experiment names with names that are guaranteed to be valid in R
    experiment_to_r = {
        experiment: f".EXPERIMENT__{i:04d}"
        for i, experiment in enumerate(design["Experiment"].unique())
    }
    r_to_experiment = {v: k for k, v in experiment_to_r.items()}

    r_experiment_pairs = [
        f"{experiment_to_r[exp1]}-{experiment_to_r[exp2]}"
        for exp1, exp2 in experiment_pairs
    ]

    # Replace into a new frame rather than in place, so that the object handed out by
    # qtable.get_design() is never modified, whether or not it is a copy.
    r_design = design.replace({"Experiment": experiment_to_r})

    # Run limma and join results for all comparison groups
    limma_results = msreport.rinterface.multi_group_limma(
        table[mask], r_design, r_experiment_pairs, batch, limma_trend
    )
    renamed_results = []
    for r_comparison_group, limma_result in limma_results.items():
        # Translate the R-safe names back to the original experiment names.
        experiment_pair = [r_to_experiment[s] for s in r_comparison_group.split("-")]
        comparison_group = comparison_tag.join(experiment_pair)
        mapping = {col: f"{col} {comparison_group}" for col in limma_result.columns}
        renamed_results.append(limma_result.rename(columns=mapping))

    # Joining on the full index restores the rows that were excluded by 'mask'.
    limma_table = pd.DataFrame(index=table.index)
    limma_table = limma_table.join(renamed_results)
    limma_table.fillna(np.nan, inplace=True)
    qtable.add_expression_features(limma_table)

    # Average expression from limma is the whole row mean, overwrite with the average
    # expression of the experiment group
    for experiment_pair in experiment_pairs:
        two_group_comparison(qtable, experiment_pair, exclude_invalid=exclude_invalid)
|
|
515
|
+
|
|
516
|
+
|
|
517
|
+
def calculate_two_group_limma(
    qtable: Qtable,
    experiment_pair: list[str],
    exclude_invalid: bool = True,
    limma_trend: bool = True,
) -> pd.DataFrame:
    """Uses limma to perform a differential expression analysis of two experiments.

    Adds new columns "P-value Experiment_1 vs Experiment_2",
    "Adjusted p-value Experiment_1 vs Experiment_2",
    "Average expression Experiment_1 vs Experiment_2", and
    "Ratio [log2] Experiment_1 vs Experiment_2" to the qtable.

    Requires that expression columns are set, and expression values are log2
    transformed. All rows with missing values are ignored, impute missing values to
    allow differential expression analysis of all rows. The qtable.data
    column "Representative protein" is used as the index.

    Args:
        qtable: Qtable instance that contains expression values for differential
            expression analysis.
        experiment_pair: The names of the two experiments that will be compared,
            experiments must be present in qtable.design
        exclude_invalid: If true, the column "Valid" is used to determine which rows are
            used for the differential expression analysis; default True.
        limma_trend: If true, an intensity-dependent trend is fitted to the prior
            variances; default True.

    Returns:
        The table returned by `msreport.rinterface.two_group_limma` for the rows that
        were used in the analysis. Its column names do not carry the
        "Experiment_1 vs Experiment_2" suffix that is used in the qtable.
    """
    # TODO: not tested #
    expression_table = qtable.make_expression_table(
        samples_as_columns=True, features=["Representative protein"]
    )
    comparison_tag = " vs "

    if exclude_invalid:
        valid = qtable["Valid"]
    else:
        valid = np.full(expression_table.shape[0], True)

    # Maps each sample of the two experiments to the name of its experiment.
    samples_to_experiment = {}
    for experiment in experiment_pair:
        mapping = {s: experiment for s in qtable.get_samples(experiment)}
        samples_to_experiment.update(mapping)

    table_columns = ["Representative protein"]
    table_columns.extend(samples_to_experiment.keys())
    table = expression_table[table_columns]
    table = table.set_index("Representative protein")
    not_nan = table.isna().sum(axis=1) == 0

    # Rows are combined by position, 'valid' and 'not_nan' use different indexes.
    mask = np.all([valid, not_nan], axis=0)
    experiments = list(samples_to_experiment.values())

    # Note that the order of experiments for calling limma is reversed
    limma_result = msreport.rinterface.two_group_limma(
        table[mask], experiments, experiment_pair[1], experiment_pair[0], limma_trend
    )

    # For adding expression features to the qtable it is necessary that the
    # the limma_results have the same number of rows.
    limma_table = pd.DataFrame(index=table.index, columns=limma_result.columns)
    limma_table[mask] = limma_result
    limma_table.fillna(np.nan, inplace=True)

    comparison_group = comparison_tag.join(experiment_pair)
    mapping = {col: f"{col} {comparison_group}" for col in limma_table.columns}
    limma_table.rename(columns=mapping, inplace=True)
    qtable.add_expression_features(limma_table)

    return limma_result
|
msreport/errors.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
class MsreportError(Exception):
    """Exception class defined by the msreport package."""
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class NotFittedError(ValueError, AttributeError):
    """Exception to raise when a Normalizer is used before it has been fitted.

    Inherits from both ValueError and AttributeError, so it is caught by handlers for
    either of them.
    """
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ProteinsNotInFastaWarning(UserWarning):
    """Warning to issue when queried proteins are not present in a FASTA file."""
|