msreport 0.0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
msreport/analyze.py ADDED
@@ -0,0 +1,586 @@
1
+ """ The analyze module contains methods for analysing quantification results. """
2
+
3
+ from __future__ import annotations
4
+ from typing import Iterable, Optional, Protocol
5
+ import warnings
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ import msreport.normalize
11
+ import msreport.rinterface
12
+ from msreport.qtable import Qtable
13
+
14
+
15
class Transformer(Protocol):
    """Structural interface of objects that can be fitted to and transform a table."""

    def fit(self, table: pd.DataFrame) -> Transformer:
        """Fit the Transformer with 'table' and return the fitted instance."""
        ...

    def is_fitted(self) -> bool:
        """Return True when the Transformer has already been fitted."""
        ...

    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Return the transformed values of 'table'."""
        ...
24
+
25
+
26
class CategoryTransformer(Protocol):
    """Structural interface of transformers that work with a category column.

    In addition to the `Transformer` methods, a `CategoryTransformer` reports the name
    of the column that holds the categories it operates on.
    """

    def fit(self, table: pd.DataFrame) -> CategoryTransformer:
        """Fits the Transformer and returns a fitted Transformer instance."""
        ...

    def is_fitted(self) -> bool:
        """Returns True if the Transformer has been fitted."""
        ...

    def transform(self, table: pd.DataFrame) -> pd.DataFrame:
        """Transform values in 'table'."""
        ...

    def get_category_column(self) -> str:
        """Returns the name of the category column."""
        ...
38
+
39
+
40
def analyze_missingness(qtable: Qtable) -> None:
    """Counts missing and quantified expression values and adds them to the qtable.

    Two groups of columns are added as expression features. The first group holds
    missing values: "Missing total", "Missing experiment_name" for each experiment,
    and "Missing sample_name" for each sample (a per-row flag whether the value of
    that sample is NaN). The second group holds the number of quantification events:
    "Events total" and "Events experiment_name" for each experiment.

    Expression columns must be set. Missing values must be encoded as NaN, not as zero
    or an empty string.

    Args:
        qtable: A Qtable instance.
    """
    # TODO: not tested #
    expression = qtable.make_expression_table(samples_as_columns=True)

    events = pd.DataFrame()
    missing = pd.DataFrame()
    events["Events total"] = np.isfinite(expression).sum(axis=1)
    missing["Missing total"] = np.isnan(expression).sum(axis=1)

    # Per experiment counts, calculated from the samples of each experiment.
    for experiment in qtable.get_experiments():
        exp_values = expression[qtable.get_samples(experiment)]
        events[f"Events {experiment}"] = np.isfinite(exp_values).sum(axis=1)
        missing[f"Missing {experiment}"] = np.isnan(exp_values).sum(axis=1)

    # Per sample flags, appended after all experiment columns.
    for sample in qtable.get_samples():
        missing[f"Missing {sample}"] = np.isnan(expression[sample])

    qtable.add_expression_features(missing)
    qtable.add_expression_features(events)
73
+
74
+
75
def _get_experiment_columns(qtable: Qtable, tag: str) -> list[str]:
    """Returns the "<tag> experiment_name" columns, raises KeyError if any is absent."""
    columns = [f"{tag} {experiment}" for experiment in qtable.get_experiments()]
    if not pd.Series(columns).isin(qtable.data.columns).all():
        raise KeyError(
            f"Not all columns from {columns} are present in qtable.data,"
            " analyze missingness before calling validate_proteins()."
        )
    return columns


def validate_proteins(
    qtable: Qtable,
    min_peptides: int = 0,
    remove_contaminants: bool = True,
    min_events: Optional[int] = None,
    max_missing: Optional[int] = None,
) -> None:
    """Validates protein entries (rows).

    Adds an additional column "Valid" to the qtable, containing Boolean values. An
    entry is valid only if it passes all of the requested criteria.

    Requires expression columns to be set. Depending on the arguments requires the
    columns "Total peptides", "Potential contaminant", and the experiment columns
    "Missing experiment_name" and "Events experiment_name".

    Args:
        qtable: A Qtable instance.
        min_peptides: Minimum value required in the "Total peptides" column, default
            0. The column is only required if 'min_peptides' is greater than zero.
        remove_contaminants: If true, entries flagged in the "Potential contaminant"
            column are marked as invalid, default True. The column must be present
            when this option is enabled.
        min_events: If specified, at least one experiment must have the minimum number
            of quantified events for the protein entry to be valid.
        max_missing: If specified, at least one experiment must have no more than the
            maximum number of missing values.

    Raises:
        KeyError: If a column required by the chosen arguments is not present in
            qtable.data.
    """
    valid_entries = np.ones(qtable.data.shape[0], dtype=bool)

    if min_peptides > 0:
        if "Total peptides" not in qtable:
            raise KeyError("'Total peptides' column not present in qtable.data")
        valid_entries = np.all(
            [valid_entries, qtable["Total peptides"] >= min_peptides], axis=0
        )

    # TODO: not tested from here #
    if remove_contaminants:
        if "Potential contaminant" not in qtable:
            raise KeyError("'Potential contaminant' column not present in qtable.data")
        valid_entries = np.all(
            [valid_entries, np.invert(qtable["Potential contaminant"])], axis=0
        )

    if max_missing is not None:
        cols = _get_experiment_columns(qtable, "Missing")
        # A single experiment within the limit is sufficient.
        max_missing_valid = np.any(qtable[cols] <= max_missing, axis=1)
        valid_entries = max_missing_valid & valid_entries

    if min_events is not None:
        cols = _get_experiment_columns(qtable, "Events")
        # A single experiment reaching the minimum is sufficient.
        min_events_valid = np.any(qtable[cols] >= min_events, axis=1)
        valid_entries = min_events_valid & valid_entries

    qtable["Valid"] = valid_entries
139
+
140
+
141
def normalize_expression(
    qtable: Qtable,
    normalizer: Transformer,
    exclude_invalid: bool = True,
) -> None:
    """Normalizes the expression values of a qtable in place.

    The values of the qtable expression columns are transformed with 'normalizer' and
    written back to the qtable; expression columns must be defined. A normalizer that
    has not been fitted yet is first fitted with the expression values.

    Args:
        qtable: A Qtable instance, which expression values will be normalized.
        normalizer: A Normalizer instance from the msreport.normalize module. An
            already fitted normalizer must have been fitted with a dataframe which
            column names correspond to the sample names present in qtable.design. A
            not fitted normalizer is fitted with the expression values present in the
            qtable.
        exclude_invalid: If true, only rows marked in the "Valid" column are used for
            fitting a not fitted normalizer; default True. All expression values are
            normalized, regardless of this setting.
    """
    table = qtable.make_expression_table(samples_as_columns=True, features=["Valid"])
    sample_columns = table.columns.drop("Valid")
    expression_columns = [qtable.get_expression_column(s) for s in sample_columns]
    expression_values = table[sample_columns]

    if not normalizer.is_fitted():
        fit_rows = (
            table["Valid"]
            if exclude_invalid
            else np.ones_like(table["Valid"], dtype=bool)
        )
        normalizer = normalizer.fit(expression_values[fit_rows])

    normalized_values = normalizer.transform(expression_values)
    qtable[expression_columns] = normalized_values[sample_columns]
179
+
180
+
181
def create_site_to_protein_normalizer(
    qtable: Qtable, category_column: str = "Representative protein"
) -> msreport.normalize.CategoricalNormalizer:
    """Creates a fitted `CategoricalNormalizer` for site-to-protein normalization.

    The `CategoricalNormalizer` is fitted to protein expression profiles of the provided
    `qtable`. The protein expression profiles are calculated by subtracting the mean
    expression value of each protein from the protein expression values. Only rows
    without any missing sample value are used. Expression values must be log
    transformed. The generated `CategoricalNormalizer` can be used to normalize ion,
    peptide or site qtables based on protein categories.

    Args:
        qtable: Qtable instance containing protein values for fitting the normalizer.
        category_column: The name of the column containing the protein categories.

    Returns:
        A fitted `CategoricalNormalizer` object.
    """
    sample_columns = qtable.get_samples()
    reference_expression = qtable.make_expression_table(
        samples_as_columns=True,
        features=[category_column],
    )
    # Keep only rows that are quantified in every sample.
    completely_quantified = ~reference_expression[sample_columns].isna().any(axis=1)
    reference_expression = reference_expression[completely_quantified]

    # Center each row on its own mean to obtain the expression profile.
    reference_profiles = reference_expression[sample_columns].sub(
        reference_expression[sample_columns].mean(axis=1), axis=0
    )
    reference_profiles[category_column] = reference_expression[category_column]

    normalizer = msreport.normalize.CategoricalNormalizer(category_column)
    normalizer = normalizer.fit(reference_profiles)

    return normalizer
218
+
219
+
220
def create_ibaq_transformer(
    qtable: Qtable,
    category_column: str = "Representative protein",
    ibaq_column: str = "iBAQ peptides",
) -> msreport.normalize.CategoricalNormalizer:
    """Creates a fitted `CategoricalNormalizer` for iBAQ transformation.

    The `CategoricalNormalizer` is fitted to iBAQ peptide counts of the provided
    `qtable`, and can be used to transform protein intensities by dividing them by the
    corresponding iBAQ peptide counts. Missing iBAQ peptide counts are replaced by 1 and
    values smaller than 1 are replaced by 1. iBAQ peptide counts are then log2
    transformed because the `CategoricalNormalizer` expects log2 transformed values.

    Args:
        qtable: Qtable instance containing iBAQ peptide counts for fitting the
            normalizer.
        category_column: The name of the column containing the protein categories.
        ibaq_column: The name of the column containing the iBAQ peptide counts.

    Returns:
        A fitted `CategoricalNormalizer` object.
    """
    category_values = qtable[category_column].copy()
    ibaq_factor_values = qtable[ibaq_column].copy()
    sample_columns = qtable.get_samples()

    # Counts below 1 (or missing) become 1, which is 0 after the log2 transformation.
    ibaq_factor_values = ibaq_factor_values.fillna(1)
    ibaq_factor_values[ibaq_factor_values < 1] = 1
    ibaq_factor_values = np.log2(ibaq_factor_values)

    # The same factor is used for every sample column.
    reference_table = pd.DataFrame({c: ibaq_factor_values for c in sample_columns})
    reference_table[category_column] = category_values

    normalizer = msreport.normalize.CategoricalNormalizer(category_column)
    normalizer = normalizer.fit(reference_table)

    return normalizer
257
+
258
+
259
def normalize_expression_by_category(
    qtable: Qtable, normalizer: CategoryTransformer
) -> None:
    """Applies a category based normalization to the expression values of a Qtable.

    The expression values together with the category column of the normalizer are
    passed to `normalizer.transform`, and the transformed sample values are written
    back to the expression columns of `qtable.data`.

    Args:
        qtable: A Qtable instance, which expression values will be normalized.
        normalizer: A `CategoryTransformer` object used for normalization.

    Raises:
        KeyError: If the category column of the `CategoryTransformer` object is not
            found in the `qtable.data`.
    """
    category_column = normalizer.get_category_column()
    if category_column not in qtable.data.columns:
        message = (
            f'The category column "{category_column}" in the normalizer '
            f"is not found in `qtable.data`."
        )
        raise KeyError(message)

    table = qtable.make_expression_table(
        samples_as_columns=True, features=[category_column]
    )
    sample_columns = table.columns.drop(category_column)
    expression_columns = [qtable.get_expression_column(s) for s in sample_columns]

    # Sample columns first, category column last.
    input_columns = sample_columns.append(pd.Index([category_column]))
    normalized = normalizer.transform(table[input_columns])
    qtable.data[expression_columns] = normalized[sample_columns]
288
+
289
+
290
def impute_missing_values(
    qtable: Qtable,
    imputer: Transformer,
    exclude_invalid: bool = True,
) -> None:
    """Imputes missing expression values of a qtable in place.

    Missing values (nan) in the qtable expression columns are imputed with 'imputer';
    the qtable must have defined expression columns. An imputer that has not been
    fitted yet is first fitted with the expression values. If 'exclude_invalid' is
    True, only rows marked as valid are used for fitting and only those rows are
    imputed.

    Args:
        qtable: A Qtable instance, which missing expression values will be imputed.
        imputer: An Imputer instance from the msreport.impute module. An already
            fitted imputer must have been fitted with a dataframe which column names
            correspond to the sample names present in qtable.design. A not fitted
            imputer is fitted with the expression values present in the qtable.
        exclude_invalid: If true, the column "Valid" is used to determine for which rows
            imputation is performed. Default True.
    """
    table = qtable.make_expression_table(samples_as_columns=True, features=["Valid"])
    sample_columns = table.columns.drop("Valid")
    expression_columns = [qtable.get_expression_column(s) for s in sample_columns]
    sample_to_expression_column = dict(zip(sample_columns, expression_columns))

    row_mask = (
        table["Valid"] if exclude_invalid else np.ones_like(table["Valid"], dtype=bool)
    )
    selected_values = table.loc[row_mask, sample_columns]

    if not imputer.is_fitted():
        imputer = imputer.fit(selected_values)

    imputed_values = imputer.transform(selected_values)
    # Rename sample names to expression column names before writing back.
    imputed_values.rename(columns=sample_to_expression_column, inplace=True)
    qtable.data.loc[row_mask, expression_columns] = imputed_values
328
+
329
+
330
def calculate_experiment_means(qtable: Qtable) -> None:
    """Adds the mean expression value of each experiment to the qtable.

    For every experiment a column "Expression experiment_name" is added as expression
    feature. It contains the row-wise mean of the expression columns of the samples
    belonging to that experiment, calculated with NaN values ignored.

    Args:
        qtable: A Qtable instance, which mean experiment expression values will be
            calculated.
    """
    means_by_column = {}
    for experiment in qtable.get_experiments():
        expression_columns = [
            qtable.get_expression_column(sample)
            for sample in qtable.get_samples(experiment)
        ]
        # RuntimeWarnings raised while averaging are silenced.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            experiment_mean = np.nanmean(qtable[expression_columns], axis=1)
        means_by_column[f"Expression {experiment}"] = experiment_mean

    qtable.add_expression_features(pd.DataFrame(means_by_column))
349
+
350
+
351
def calculate_multi_group_comparison(
    qtable: Qtable,
    experiment_pairs: Iterable[Iterable[str]],
    exclude_invalid: bool = True,
) -> None:
    """Calculates average expression and ratios for multiple comparison groups.

    For each experiment pair, adds new columns
    "Average expression Experiment_1 vs Experiment_2" and
    "Ratio [log2] Experiment_1 vs Experiment_2" to the qtable. Expression values must be
    log transformed. The ratio is the mean expression of the first experiment minus the
    mean expression of the second experiment.

    Args:
        qtable: Qtable instance that contains expression values for calculating group
            comparisons.
        experiment_pairs: A list containing one or multiple experiment pairs for which
            the group comparison should be calculated. The specified experiments must
            correspond to entries from qtable.design["Experiment"].
        exclude_invalid: If true, the column "Valid" is used to determine which rows are
            used for calculating the group comparisons; default True. Rows that are
            not used are set to NaN in the added columns.
    """
    table = qtable.make_expression_table(samples_as_columns=True, features=["Valid"])
    comparison_tag = " vs "

    if exclude_invalid:
        invalid = np.invert(table["Valid"].to_numpy())
    else:
        invalid = np.zeros(table.shape[0], dtype=bool)

    for experiment_pair in experiment_pairs:
        # The pair is iterated twice below; materialize it so that one-shot
        # iterables (e.g. generators) are not exhausted by the first pass.
        experiment_pair = list(experiment_pair)
        comparison_group = comparison_tag.join(experiment_pair)
        with warnings.catch_warnings():
            # RuntimeWarnings raised while averaging are silenced.
            warnings.simplefilter("ignore", category=RuntimeWarning)
            group_expressions = []
            for experiment in experiment_pair:
                samples = qtable.get_samples(experiment)
                group_expressions.append(np.nanmean(table[samples], axis=1))
            ratios = group_expressions[0] - group_expressions[1]
            average_expressions = np.nanmean(group_expressions, axis=0)

        comparison_table = pd.DataFrame(
            {
                f"Average expression {comparison_group}": average_expressions,
                f"Ratio [log2] {comparison_group}": ratios,
            }
        )
        comparison_table[invalid] = np.nan
        qtable.add_expression_features(comparison_table)
399
+
400
+
401
def two_group_comparison(
    qtable: Qtable, experiment_pair: Iterable[str], exclude_invalid: bool = True
) -> None:
    """Calculates comparison values for a single pair of experiments.

    Delegates to `calculate_multi_group_comparison` with 'experiment_pair' as the only
    pair, which adds the columns "Average expression Experiment_1 vs Experiment_2" and
    "Ratio [log2] Experiment_1 vs Experiment_2" to the qtable. Expects that expression
    values are log2 transformed.

    Args:
        qtable: A Qtable instance, containing expression values.
        experiment_pair: The two experiments that will be compared, experiments must be
            present in qtable.design
        exclude_invalid: If true, the column "Valid" is used to determine for which rows
            comparison values are calculated.
    """
    single_pair = [experiment_pair]
    calculate_multi_group_comparison(
        qtable, experiment_pairs=single_pair, exclude_invalid=exclude_invalid
    )
420
+
421
+
422
def calculate_multi_group_limma(
    qtable: Qtable,
    experiment_pairs: list[list[str]],
    exclude_invalid: bool = True,
    batch: bool = False,
    limma_trend: bool = True,
) -> None:
    """Uses limma to perform a differential expression analysis of multiple experiments.

    For each experiment pair specified in 'experiment_pairs' the following new columns
    are added to the qtable:
    - "P-value Experiment_1 vs Experiment_2"
    - "Adjusted p-value Experiment_1 vs Experiment_2"
    - "Average expression Experiment_1 vs Experiment_2"
    - "Ratio [log2] Experiment_1 vs Experiment_2"

    Requires that expression columns are set, and expression values are log2
    transformed. All rows with missing values are ignored, impute missing values to
    allow differential expression analysis of all rows. The qtable.data column
    "Representative protein" is used as the index.

    Args:
        qtable: Qtable instance that contains expression values for differential
            expression analysis.
        experiment_pairs: A list containing lists of experiment pairs for which the
            results of the differential expression analysis should be reported. The
            specified experiment pairs must correspond to entries from
            qtable.design["Experiment"].
        exclude_invalid: If true, the column "Valid" is used to determine which rows are
            used for the differential expression analysis; default True.
        batch: If true batch effects are considered for the differential expression
            analysis. Batches must be specified in the design in a "Batch" column.
        limma_trend: If true, an intensity-dependent trend is fitted to the prior
            variance during calculation of the moderated t-statistics, refer to
            limma.eBayes for details; default True.

    Raises:
        KeyError: If 'batch' is true and the design has no "Batch" column.
        ValueError: If 'batch' is true and all values of the "Batch" column are
            identical.
    """
    # TODO: not tested #
    design = qtable.get_design()
    if batch and "Batch" not in design:
        raise KeyError(
            "When using calculate_multi_group_limma(batch=True) a"
            ' "Batch" column must be present in qtable.design'
        )
    if batch and design["Batch"].nunique() == 1:
        raise ValueError(
            "When using calculate_multi_group_limma(batch=True), not all values from"
            ' qtable.design["Batch"] are allowed to be identical.'
        )

    table = qtable.make_expression_table(
        samples_as_columns=True, features=["Representative protein"]
    )
    table = table.set_index("Representative protein")
    comparison_tag = " vs "

    if exclude_invalid:
        valid = qtable["Valid"]
    else:
        valid = np.full(table.shape[0], True)
    not_nan = table.isna().sum(axis=1) == 0
    # Rows are combined by position: valid and without any missing value.
    mask = np.all([valid, not_nan], axis=0)

    # Exchange experiment names with names that are guaranteed to be valid in R
    experiment_to_r = {
        experiment: f".EXPERIMENT__{i:04d}"
        for i, experiment in enumerate(design["Experiment"].unique())
    }
    r_to_experiment = {v: k for k, v in experiment_to_r.items()}

    r_experiment_pairs = [
        f"{experiment_to_r[exp1]}-{experiment_to_r[exp2]}"
        for exp1, exp2 in experiment_pairs
    ]

    # Not replaced in place, so the object returned by qtable.get_design() is never
    # modified, whether or not it is a copy of the design stored in the qtable.
    design = design.replace({"Experiment": experiment_to_r})

    # Run limma and join results for all comparison groups
    limma_results = msreport.rinterface.multi_group_limma(
        table[mask], design, r_experiment_pairs, batch, limma_trend
    )
    for r_comparison_group, limma_result in limma_results.items():
        # Translate the R-safe names back to the original experiment names.
        experiment_pair = [r_to_experiment[s] for s in r_comparison_group.split("-")]
        comparison_group = comparison_tag.join(experiment_pair)
        mapping = {col: f"{col} {comparison_group}" for col in limma_result.columns}
        limma_result.rename(columns=mapping, inplace=True)

    # Joining on the full table index restores rows that were excluded by the mask.
    limma_table = pd.DataFrame(index=table.index)
    limma_table = limma_table.join(limma_results.values())
    limma_table.fillna(np.nan, inplace=True)
    qtable.add_expression_features(limma_table)

    # Average expression from limma is the whole row mean, overwrite with the average
    # expression of the experiment group
    for experiment_pair in experiment_pairs:
        two_group_comparison(qtable, experiment_pair, exclude_invalid=exclude_invalid)
515
+
516
+
517
def calculate_two_group_limma(
    qtable: Qtable,
    experiment_pair: list[str],
    exclude_invalid: bool = True,
    limma_trend: bool = True,
) -> pd.DataFrame:
    """Uses limma to perform a differential expression analysis of two experiments.

    Adds new columns "P-value Experiment_1 vs Experiment_2",
    "Adjusted p-value Experiment_1 vs Experiment_2",
    "Average expression Experiment_1 vs Experiment_2", and
    "Ratio [log2] Experiment_1 vs Experiment_2" to the qtable.

    Requires that expression columns are set, and expression values are log2
    transformed. All rows with missing values are ignored, impute missing values to
    allow differential expression analysis of all rows. The qtable.data
    column "Representative protein" is used as the index.

    Args:
        qtable: Qtable instance that contains expression values for differential
            expression analysis.
        experiment_pair: The names of the two experiments that will be compared,
            experiments must be present in qtable.design
        exclude_invalid: If true, the column "Valid" is used to determine which rows are
            used for the differential expression analysis; default True.
        limma_trend: If true, an intensity-dependent trend is fitted to the prior
            variances; default True.

    Returns:
        The result returned by msreport.rinterface.two_group_limma for the rows that
        were passed to limma, without the comparison group added to the column names
        (presumably a DataFrame indexed by "Representative protein"; confirm against
        msreport.rinterface).
    """
    # TODO: not tested #
    expression_table = qtable.make_expression_table(
        samples_as_columns=True, features=["Representative protein"]
    )
    comparison_tag = " vs "

    if exclude_invalid:
        valid = qtable["Valid"]
    else:
        valid = np.full(expression_table.shape[0], True)

    # Map each sample of the two experiments to its experiment name.
    samples_to_experiment = {}
    for experiment in experiment_pair:
        mapping = {s: experiment for s in qtable.get_samples(experiment)}
        samples_to_experiment.update(mapping)

    table_columns = ["Representative protein"]
    table_columns.extend(samples_to_experiment.keys())
    table = expression_table[table_columns]
    table = table.set_index("Representative protein")
    not_nan = table.isna().sum(axis=1) == 0

    # Rows are combined by position: valid and without any missing value.
    mask = np.all([valid, not_nan], axis=0)
    experiments = list(samples_to_experiment.values())

    # Note that the order of experiments for calling limma is reversed
    limma_result = msreport.rinterface.two_group_limma(
        table[mask], experiments, experiment_pair[1], experiment_pair[0], limma_trend
    )

    # For adding expression features to the qtable it is necessary that the
    # the limma_results have the same number of rows.
    limma_table = pd.DataFrame(index=table.index, columns=limma_result.columns)
    limma_table[mask] = limma_result
    limma_table.fillna(np.nan, inplace=True)

    comparison_group = comparison_tag.join(experiment_pair)
    mapping = {col: f"{col} {comparison_group}" for col in limma_table.columns}
    limma_table.rename(columns=mapping, inplace=True)
    qtable.add_expression_features(limma_table)

    return limma_result
msreport/errors.py ADDED
@@ -0,0 +1,10 @@
1
class MsreportError(Exception):
    """General msreport exception; adds nothing to the built-in Exception."""
3
+
4
+
5
class NotFittedError(ValueError, AttributeError):
    """Raised when a Normalizer is used although it has not been fitted yet.

    Derives from both ValueError and AttributeError, so handlers for either of the
    two built-in exceptions also catch this error.
    """
7
+
8
+
9
class ProteinsNotInFastaWarning(UserWarning):
    """Warning category for queried proteins that are not present in a FASTA file."""