msreport 0.0.26__py3-none-any.whl → 0.0.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,490 @@
1
+ import itertools
2
+ import warnings
3
+ from collections.abc import Iterable, Sequence
4
+ from typing import Optional
5
+
6
+ import matplotlib.pyplot as plt
7
+ import numpy as np
8
+ import pandas as pd
9
+ import seaborn as sns
10
+
11
+ import msreport.helper
12
+ from msreport.qtable import Qtable
13
+
14
+ from ._partial_plots import annotated_scatter
15
+ from .style import with_active_style
16
+
17
+
18
@with_active_style
def volcano_ma(
    qtable: Qtable,
    experiment_pair: Iterable[str],
    comparison_tag: str = " vs ",
    pvalue_tag: str = "P-value",
    special_entries: Optional[list[str]] = None,
    special_proteins: Optional[list[str]] = None,
    annotation_column: str = "Gene name",
    exclude_invalid: bool = True,
) -> tuple[plt.Figure, list[plt.Axes]]:
    """Draws a volcano plot and an MA plot side by side for one comparison.

    The first subplot (volcano) shows the "Ratio [log2]" values against the -log10
    transformed p-values, the second subplot (MA) shows the "Ratio [log2]" values
    against the "Average expression" values. Both subplots share the x-axis. Entries
    listed in 'special_entries' are drawn highlighted and are labeled.

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        experiment_pair: The names of the two experiments that will be compared,
            experiments must be present in qtable.design.
        comparison_tag: String used in comparison columns to separate a pair of
            experiments; default " vs ", which corresponds to the MsReport convention.
        pvalue_tag: String used for matching the pvalue columns; default "P-value",
            which corresponds to the MsReport convention.
        special_entries: Optional, allows to specify a list of entries from the
            `qtable.id_column` column to be annotated.
        special_proteins: This argument is deprecated, use 'special_entries' instead.
            Entries passed here are appended to 'special_entries'.
        annotation_column: Column used for labeling the points of special entries in the
            scatter plot. Default "Gene name". If the 'annotation_column' is not present
            in the `qtable.data` table, the `qtable.id_column` is used instead.
        exclude_invalid: If True, rows are filtered according to the Boolean entries of
            the "Valid" column.

    Raises:
        ValueError: If the 'pvalue_tag', "Average expression" or "Ratio [log2]" column
            is missing in the Qtable for the specified experiment_pair.

    Returns:
        A matplotlib Figure object and the two Axes objects containing the volcano
        and the MA plot.
    """
    ratio_tag = "Ratio [log2]"
    expression_tag = "Average expression"
    comparison_group = comparison_tag.join(experiment_pair)

    # Fail early when one of the three required comparison columns cannot be found.
    for required_tag in (ratio_tag, expression_tag, pvalue_tag):
        matches = msreport.helper.find_sample_columns(
            qtable.data, comparison_group, [required_tag]
        )
        if matches:
            continue
        raise ValueError(
            f"Missing the required '{required_tag}' column for the comparison group "
            f"'{comparison_group}' in the Qtable."
        )

    entries_to_highlight = [] if special_entries is None else special_entries
    if special_proteins is not None:
        # The warning is emitted from this frame on purpose so that 'stacklevel'
        # keeps pointing at the same caller.
        warnings.warn(
            "The argument 'special_proteins' is deprecated, use 'special_entries' instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        entries_to_highlight = [*entries_to_highlight, *special_proteins]

    table = qtable.get_data(exclude_invalid=exclude_invalid)
    label_column = (
        annotation_column if annotation_column in table.columns else qtable.id_column
    )

    # Point size of regular entries shrinks from 2 to 0.2 between 1000 and 10000 rows.
    num_rows = table.shape[0]
    default_point_size = 2 / (max(min(num_rows, 10000), 1000) / 1000)

    is_special = table[qtable.id_column].isin(entries_to_highlight)
    highlight_style = {
        "s": 10,
        "color": "#E73C40",
        "edgecolor": "#000000",
        "lw": 0.2,
        "zorder": 3,
    }
    default_style = {"s": default_point_size, "color": "#40B7B5", "zorder": 2}

    # Replace the matched p-value columns with their -log10 transformed values.
    pvalue_columns = msreport.helper.find_sample_columns(
        table, pvalue_tag, [comparison_group]
    )
    for pvalue_column in pvalue_columns:
        table[pvalue_column] = np.log10(table[pvalue_column]) * -1

    # Figure layout, all lengths in inches.
    title_space = 0.4
    panel_height = 3.2
    panel_width = 3.2
    panel_gap = 1
    figure_height = title_space + panel_height
    figure_width = panel_width * 2 + panel_gap

    fig, axes = plt.subplots(
        1, 2, figsize=(figure_width, figure_height), sharex=True
    )
    fig.subplots_adjust(
        bottom=0,
        top=1 - (title_space / figure_height),
        left=0,
        right=1,
        wspace=panel_gap / panel_width,
    )
    fig.suptitle(f"Volcano and MA plot of: {comparison_group}", y=1)

    # Both panels use the ratio on the x-axis; only the y variable differs.
    x_values = table[f"{ratio_tag} {comparison_group}"]
    point_labels = table[label_column]
    for ax, y_variable in zip(axes, (pvalue_tag, expression_tag)):
        y_values = table[f"{y_variable} {comparison_group}"]

        # Only points with finite coordinates on both axes can be drawn.
        is_plottable = np.isfinite(x_values) & np.isfinite(y_values)
        regular = ~is_special & is_plottable
        special = is_special & is_plottable

        ax.scatter(x_values[regular], y_values[regular], **default_style)
        annotated_scatter(
            x_values=x_values[special],
            y_values=y_values[special],
            labels=point_labels[special],
            ax=ax,
            scatter_kws=highlight_style,
        )

        ax.set_xlabel(ratio_tag)
        y_unit = "[-log10]" if y_variable == pvalue_tag else "[log2]"
        ax.set_ylabel(f"{y_variable} {y_unit}")
        ax.grid(axis="both", linestyle="dotted")

    return fig, axes
155
+
156
+
157
@with_active_style
def expression_comparison(
    qtable: Qtable,
    experiment_pair: list[str],
    comparison_tag: str = " vs ",
    plot_average_expression: bool = False,
    special_entries: Optional[list[str]] = None,
    special_proteins: Optional[list[str]] = None,
    annotation_column: str = "Gene name",
    exclude_invalid: bool = True,
) -> tuple[plt.Figure, list[plt.Axes]]:
    """Generates an expression comparison plot for two experiments.

    The subplot in the middle displays the maximum (or average) expression of the two
    experiments on the y-axis and the log fold change on the x-axis. The subplots on
    the left and right display entries with zero "Events" in one of the two
    experiments. Entries with zero "Events" in both experiments are not plotted.

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        experiment_pair: The names of the two experiments that will be compared,
            experiments must be present in qtable.design.
        comparison_tag: String used in comparison columns to separate a pair of
            experiments; default " vs ", which corresponds to the MsReport convention.
        plot_average_expression: If True plot average expression instead of maximum
            expression. Default False.
        special_entries: Optional, allows to specify a list of entries from the
            `qtable.id_column` column to be annotated.
        special_proteins: This argument is deprecated, use 'special_entries' instead.
            Entries passed here are appended to 'special_entries'.
        annotation_column: Column used for labeling the points of special entries in the
            scatter plot. Default "Gene name". If the 'annotation_column' is not present
            in the `qtable.data` table, the `qtable.id_column` is used instead.
        exclude_invalid: If True, rows are filtered according to the Boolean entries of
            the "Valid" column.

    Raises:
        ValueError: If the "Expression" and "Events" columns for the specified
            experiments are missing in the Qtable.
        KeyError: If the "Ratio [log2]" column for the specified comparison is missing
            in the Qtable; this column is looked up without prior validation.

    Returns:
        A matplotlib Figure object and the three Axes objects containing the
        expression comparison plots.
    """
    missing_columns = sorted(
        f"{tag} {exp}"
        for exp in experiment_pair
        for tag in ["Expression", "Events"]
        if f"{tag} {exp}" not in qtable.data.columns
    )
    if missing_columns:
        raise ValueError(
            f"Missing the required columns in the Qtable: {missing_columns}."
        )

    if special_entries is None:
        special_entries = []
    if special_proteins is not None:
        warnings.warn(
            "The argument 'special_proteins' is deprecated, use 'special_entries' instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        special_entries = list(special_entries) + list(special_proteins)

    exp_1, exp_2 = experiment_pair
    comparison_group = comparison_tag.join(experiment_pair)

    qtable_data = qtable.get_data(exclude_invalid=exclude_invalid)
    if annotation_column not in qtable_data.columns:
        annotation_column = qtable.id_column
    total_scatter_area = 5000
    params = {
        "highlight": {
            "s": 10,
            "color": "#E73C40",
            "edgecolor": "#000000",
            "lw": 0.2,
            "zorder": 3,
        },
        "default": {"alpha": 0.75, "color": "#40B7B5", "zorder": 2},
    }

    # Keep only entries with at least one event in any of the two experiments. The
    # explicit copy makes the filtered table independent of the table it was taken
    # from, so the column assignments below do not trigger pandas'
    # chained-assignment warning.
    mask = (qtable_data[f"Events {exp_1}"] + qtable_data[f"Events {exp_2}"]) > 0
    qtable_data = qtable_data[mask].copy()

    only_exp_1 = qtable_data[f"Events {exp_2}"] == 0
    only_exp_2 = qtable_data[f"Events {exp_1}"] == 0
    mask_both = np.invert(np.any([only_exp_1, only_exp_2], axis=0))

    # Both summary columns are always added; 'plot_average_expression' selects
    # which one is displayed on the y-axis of the middle subplot.
    expression_pair = [
        qtable_data[f"Expression {exp_2}"],
        qtable_data[f"Expression {exp_1}"],
    ]
    qtable_data[f"Maximum expression {comparison_group}"] = np.max(
        expression_pair, axis=0
    )
    qtable_data[f"Average expression {comparison_group}"] = np.nanmean(
        expression_pair, axis=0
    )

    def scattersize(df: pd.DataFrame, total_area: float) -> float:
        """Returns a marker size, clipped to [0.5, 4], based on the row count of df."""
        # Only the passed table is inspected; an empty table gets a fixed size, which
        # also avoids a division by zero.
        num_rows = df.shape[0]
        if num_rows > 0:
            size = min(max(np.sqrt(total_area / num_rows), 0.5), 4)
        else:
            size = 1
        return size

    # Figure layout, all lengths in inches.
    suptitle_space_inch = 0.4
    ax_height_inch = 3.2
    main_ax_width_inch = 3.2
    side_ax_width_inch = 0.65
    ax_wspace_inch = 0.6
    width_ratios = [side_ax_width_inch, main_ax_width_inch, side_ax_width_inch]

    fig_height = suptitle_space_inch + ax_height_inch
    fig_width = main_ax_width_inch + (side_ax_width_inch + ax_wspace_inch) * 2
    fig_size = (fig_width, fig_height)

    subplot_top = 1 - (suptitle_space_inch / fig_height)
    subplot_wspace = ax_wspace_inch / np.mean(width_ratios)

    fig, axes = plt.subplots(
        1,
        3,
        figsize=fig_size,
        sharey=True,
        gridspec_kw={
            "bottom": 0,
            "top": subplot_top,
            "left": 0,
            "right": 1,
            "wspace": subplot_wspace,
            "width_ratios": width_ratios,
        },
    )
    fig.suptitle(f'Comparison of "Expression": {comparison_group}', y=1)

    # Plot values quantified in both experiments
    ax = axes[1]
    values = qtable_data[mask_both]
    s = scattersize(values, total_scatter_area)
    x_variable = "Ratio [log2]"
    y_variable = (
        "Average expression" if plot_average_expression else "Maximum expression"
    )
    x_col = " ".join([x_variable, comparison_group])
    y_col = " ".join([y_variable, comparison_group])
    x_values = values[x_col]
    y_values = values[y_col]
    ax.grid(axis="both", linestyle="dotted")
    ax.scatter(x_values, y_values, s=s, **params["default"])
    highlight_mask = values[qtable.id_column].isin(special_entries)
    annotated_scatter(
        x_values=x_values[highlight_mask],
        y_values=y_values[highlight_mask],
        labels=values[annotation_column][highlight_mask],
        ax=ax,
        scatter_kws=params["highlight"],
    )

    ax.set_xlabel(x_variable)
    ax.set_ylabel(y_variable)

    # Plot values quantified only in one experiment
    for ax, mask, exp in [(axes[2], only_exp_1, exp_1), (axes[0], only_exp_2, exp_2)]:
        y_variable = f"Expression {exp}"
        values = qtable_data[mask]
        highlight_mask = values[qtable.id_column].isin(special_entries)
        s = scattersize(values, total_scatter_area)

        ax.grid(axis="y", linestyle="dotted")
        ax.set_ylabel(y_variable)

        if len(values) == 0:
            continue

        sns.stripplot(
            y=values[y_variable],
            jitter=True,
            size=np.sqrt(s * 2),
            marker="o",
            edgecolor="none",
            ax=ax,
            **params["default"],
        )

        xlim = -0.2, 0.2
        ax.set_xlim(xlim)
        # Reuse the jittered positions of the strip plot so that the highlighted
        # points are drawn exactly on top of their regular counterparts.
        offsets = ax.collections[0].get_offsets()[highlight_mask]
        annotated_scatter(
            x_values=offsets[:, 0],
            y_values=offsets[:, 1],
            labels=values[annotation_column][highlight_mask],
            ax=ax,
            scatter_kws=params["highlight"],
        )
        # The x-limits are set again after the highlighted points were added.
        ax.set_xlim(xlim)

    # Important to reverse the order here which experiment is on the left and right:
    # the left subplot shows entries without events in exp_1, the right subplot
    # shows entries without events in exp_2.
    axes[0].set_xlabel(f"Absent in\n{exp_1}")
    axes[2].set_xlabel(f"Absent in\n{exp_2}")

    return fig, axes
356
+
357
+
358
@with_active_style
def pvalue_histogram(
    qtable: Qtable,
    pvalue_tag: str = "P-value",
    comparison_tag: str = " vs ",
    experiment_pairs: Optional[Sequence[Iterable[str]]] = None,
    exclude_invalid: bool = True,
) -> tuple[plt.Figure, list[plt.Axes]]:
    """Generates p-value histograms for one or multiple experiment comparisons.

    Histograms are generated with 20 bins of size 0.05. The p-value distribution of each
    experiment comparison is shown with a separate subplot.

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        pvalue_tag: String used for matching the pvalue columns; default "P-value",
            which corresponds to the MsReport convention.
        comparison_tag: String used in comparison columns to separate a pair of
            experiments; default " vs ", which corresponds to the MsReport convention.
        experiment_pairs: Optional, list of experiment pairs that will be used for
            plotting. Each pair can be any iterable of experiment names, for example
            a list or a tuple. For each experiment pair a p-value column must exist
            that follows the format
            f"{pvalue_tag} {experiment_1}{comparison_tag}{experiment_2}".
            If None, all experiment comparisons that are found in qtable.data are used.
        exclude_invalid: If True, rows are filtered according to the Boolean entries of
            the "Valid" column.

    Raises:
        ValueError: If no experiment pairs are found in the Qtable for the provided
            p-value tag and comparison tag, if an empty list of experiment pairs is
            provided, or if any of the provided experiment pairs does not exist in
            the Qtable.

    Returns:
        A matplotlib Figure and an array of Axes objects, containing the p-value plots.
    """
    data = qtable.get_data(exclude_invalid=exclude_invalid)

    def _get_valid_experiment_pairs(
        pairs: Iterable[tuple[str, ...]],
    ) -> list[tuple[str, ...]]:
        """Returns the pairs for which a p-value comparison column is present."""
        valid_pairs = []
        for pair in pairs:
            comparison_group = comparison_tag.join(pair)
            comparison_column = f"{pvalue_tag} {comparison_group}"
            if comparison_column in data.columns:
                valid_pairs.append(pair)
        return valid_pairs

    # Find all experiment pairs
    if experiment_pairs is not None:
        # Normalize every pair to a tuple: lists are not hashable and cannot be put
        # into a set, and one-shot iterables would be exhausted after the first join.
        experiment_pairs = [tuple(pair) for pair in experiment_pairs]
        if not experiment_pairs:
            raise ValueError("No experiment pairs were provided.")
        valid_pairs = set(_get_valid_experiment_pairs(experiment_pairs))
        # dict.fromkeys removes duplicates while keeping the order of the input.
        invalid_pairs = [
            pair for pair in dict.fromkeys(experiment_pairs) if pair not in valid_pairs
        ]
        if invalid_pairs:
            raise ValueError(
                "The following provided experiment pairs do not exist in the Qtable: "
                f"{invalid_pairs}"
            )
    else:
        experiment_pairs = _get_valid_experiment_pairs(
            itertools.permutations(qtable.get_experiments(), 2)
        )
        if not experiment_pairs:
            raise ValueError(
                "No experiment pairs found in the Qtable for p-value tag "
                f"'{pvalue_tag}' and comparison tag '{comparison_tag}'."
            )

    num_plots = len(experiment_pairs)

    # Figure layout, all lengths in inches.
    suptitle_space_inch = 0.4
    ax_height_inch = 1.8
    ax_width_inch = 1
    ax_wspace_inch = 0.6

    fig_width = num_plots * ax_width_inch + (num_plots - 1) * ax_wspace_inch
    fig_height = ax_height_inch + suptitle_space_inch
    fig_size = (fig_width, fig_height)

    subplot_top = 1 - (suptitle_space_inch / fig_height)
    subplot_wspace = ax_wspace_inch / ax_width_inch

    fig, axes = plt.subplots(1, num_plots, figsize=fig_size, sharex=True, sharey=True)
    # With a single subplot a bare Axes object is returned; always use an array.
    if num_plots == 1:
        axes = np.array([axes])
    else:
        axes = np.array(axes)
    fig.subplots_adjust(
        bottom=0, top=subplot_top, left=0, right=1, wspace=subplot_wspace
    )
    fig.suptitle("P-value histogram of pair-wise experiment comparisons", y=1)

    bins = np.arange(0, 1.01, 0.05)
    for ax, experiment_pair in zip(axes, experiment_pairs):
        comparison_group = comparison_tag.join(experiment_pair)
        comparison_column = f"{pvalue_tag} {comparison_group}"
        comparison_label = f"{comparison_tag}\n".join(experiment_pair)
        p_values = data[comparison_column]

        # The twin axis is only used to display the comparison label on the right.
        ax2 = ax.twinx()
        ax2.set_yticks([])
        ax2.set_ylabel(comparison_label)

        # The histogram is drawn twice: first the outline, then the fill on top.
        ax.hist(
            p_values,
            bins=bins,
            color=None,
            edgecolor="#215e5d",
            linewidth=1.5,
            zorder=2,
        )
        ax.hist(
            p_values,
            bins=bins,
            color="#40B7B5",
            edgecolor=None,
            linewidth=0,
            zorder=2.1,
        )

        ax.set_xticks([0, 0.5, 1])
        # Need to remove the ticks manually because creating the twin axis somehow
        # overrides the rcParams settings.
        ax.tick_params(
            left=plt.rcParams["ytick.left"], right=plt.rcParams["ytick.right"]
        )
        ax.set_xlabel(pvalue_tag)
        ax.grid(False, axis="x")

    # Remove the top and right spines of all axes of this figure, including the
    # twin axes, once after all subplots have been created.
    sns.despine(fig=fig, top=True, right=True)

    axes[0].set_ylabel(f"{pvalue_tag} count")
    # The x-axis is shared, so setting the limits once applies to all subplots.
    axes[0].set_xlim(-0.05, 1.05)

    return fig, axes