msreport 0.0.27__py3-none-any.whl → 0.0.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
msreport/plot.py DELETED
@@ -1,1134 +0,0 @@
1
- from collections.abc import Iterable
2
- from collections import UserDict
3
- import itertools
4
- from typing import Optional
5
- import re
6
-
7
- import adjustText
8
- import numpy as np
9
- from matplotlib import pyplot as plt
10
- import pandas as pd
11
- import seaborn as sns
12
- import sklearn.preprocessing
13
- import sklearn.decomposition
14
-
15
- from msreport.errors import MsreportError
16
- from msreport.qtable import Qtable
17
- import msreport.helper
18
-
19
-
20
def set_dpi(dpi: int) -> None:
    """Set the default resolution (dots per inch) used for matplotlib figures.

    Raising or lowering the value scales whole figures up or down; the relative
    sizes of the elements inside a figure are not affected.

    Args:
        dpi: Dots per inch to use as the new matplotlib default.
    """
    plt.rcParams.update({"figure.dpi": dpi})
30
-
31
-
32
class ColorWheelDict(UserDict):
    """Lookup dictionary that maps keys to hex colors by using a color wheel.

    When a key is looked up that is not present, the first color of the color wheel
    is stored as its value and the color is moved from the beginning to the end of
    the color wheel, so colors are reused cyclically. If no list of colors is
    specified, a default list of ten colors is used for the color wheel. Key and
    color pairs can also be set manually with the regular dictionary syntax; only
    hex color strings of the form "#RGB" or "#RRGGBB" are accepted.
    """

    # Matched with fullmatch(), so no anchors are needed and a trailing newline is
    # rejected (a "$" anchor would silently accept "#fff\n").
    _HEXCOLOR_PATTERN = re.compile(r"#(?:[0-9a-fA-F]{3}){1,2}")

    def __init__(self, colors: Optional[list[str]] = None):
        """Initializes a ColorWheelDict.

        Args:
            colors: Optional, a list of hex colors used for the color wheel. By default
                a list with ten colors is used.
        """
        super().__init__()

        if colors is not None:
            self.colors = colors
        else:
            self.colors = [
                "#80b1d3",
                "#fdb462",
                "#8dd3c7",
                "#bebada",
                "#fb8072",
                "#b3de69",
                "#fccde5",
                "#d9d9d9",
                "#bc80bd",
                "#ccebc5",
            ]
        # Work on a copy so that rotating the wheel never reorders `self.colors`.
        self._color_wheel = list(self.colors)

    def _next_color(self) -> str:
        """Returns the first color of the wheel and moves it to the end."""
        color = self._color_wheel.pop(0)
        self._color_wheel.append(color)
        return color

    def __setitem__(self, key, value):
        """Stores a hex color for a key.

        Raises:
            ValueError: If the value is not a string of the form "#RGB" or "#RRGGBB".
        """
        is_hexcolor = isinstance(value, str) and self._HEXCOLOR_PATTERN.fullmatch(value)
        if not is_hexcolor:
            raise ValueError(f"the specified value {value} is not a hexcolor.")
        self.data[key] = value

    def __getitem__(self, key):
        """Returns the color of a key, assigning the next wheel color if it is new."""
        if key not in self.data:
            self.data[key] = self._next_color()
        return self.data[key]
84
-
85
-
86
def missing_values_vertical(
    qtable: Qtable,
    exclude_invalid: bool = True,
) -> tuple[plt.Figure, list[plt.Axes]]:
    """Vertical bar plot to analyze the completeness of quantification.

    Requires the columns "Missing experiment_name" and "Events experiment_name", which
    are added by calling msreport.analyze.analyze_missingness(qtable: Qtable).

    For each experiment one subplot with three bars is drawn: rows with no missing
    values, rows with some missing values but at least one event, and rows without
    any event.

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        exclude_invalid: If True, rows are filtered according to the Boolean entries of
            the "Valid" column.

    Returns:
        A matplotlib Figure and a one-dimensional sequence of Axes objects (one per
        experiment) containing the missing values plots.
    """
    experiments = qtable.get_experiments()
    num_experiments = len(experiments)
    qtable_data = qtable.get_data(exclude_invalid=exclude_invalid)

    barwidth = 0.8
    barcolors = ["#31A590", "#FAB74E", "#EB3952"]
    figwidth = (num_experiments * 1.2) + 0.5
    figsize = (figwidth, 3.5)
    xtick_labels = ["No missing", "Some missing", "All missing"]

    sns.set_style("whitegrid")
    # squeeze=False always yields a 2D array, so a single experiment no longer
    # results in a bare Axes object that cannot be indexed.
    fig, axes = plt.subplots(
        1, num_experiments, figsize=figsize, sharey=True, squeeze=False
    )
    axes = axes[0]
    for exp_num, exp in enumerate(experiments):
        ax = axes[exp_num]

        exp_missing = qtable_data[f"Missing {exp}"]
        exp_values = qtable_data[f"Events {exp}"]
        missing_none = (exp_missing == 0).sum()
        missing_some = ((exp_missing > 0) & (exp_values > 0)).sum()
        missing_all = (exp_values == 0).sum()

        y = [missing_none, missing_some, missing_all]
        x = range(len(y))
        ax.bar(x, y, width=barwidth, color=barcolors)
        if exp_num == 0:
            ax.set_ylabel("# Proteins")
        ax.set_title(exp)
        ax.set_xticks(np.array([0, 1, 2]) + 0.4)
        ax.set_xticklabels(xtick_labels, rotation=45, va="top", ha="right")
        for spine in ["bottom", "left"]:
            ax.spines[spine].set_color("#000000")
            ax.spines[spine].set_linewidth(1)
        ax.grid(False, axis="x")
        ax.grid(axis="y", linestyle="dashed", linewidth=1)
    sns.despine(top=True, right=True)
    fig.tight_layout()
    return fig, axes
141
-
142
-
143
def missing_values_horizontal(
    qtable: Qtable,
    exclude_invalid: bool = True,
) -> tuple[plt.Figure, plt.Axes]:
    """Horizontal bar plot to analyze the completeness of quantification.

    Requires the columns "Missing experiment_name" and "Events experiment_name", which
    are added by calling msreport.analyze.analyze_missingness(qtable: Qtable).

    Three bars are overlaid per experiment: the total number of rows, the number of
    rows that are not missing in all replicates, and the number of rows without any
    missing value.

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        exclude_invalid: If True, rows are filtered according to the Boolean entries of
            the "Valid" column.

    Returns:
        A matplotlib Figure and Axes object, containing the missing values plot.
    """
    experiments = qtable.get_experiments()
    num_experiments = len(experiments)
    qtable_data = qtable.get_data(exclude_invalid=exclude_invalid)
    # The row count is identical for every experiment column, so it is determined
    # once instead of relying on a variable leaking out of the loop below.
    total = len(qtable_data)

    data = {"exp": [], "max": [], "some": [], "min": []}
    for exp in experiments:
        exp_missing = qtable_data[f"Missing {exp}"]
        num_replicates = len(qtable.get_samples(exp))
        missing_all = (exp_missing == num_replicates).sum()
        missing_none = (exp_missing == 0).sum()
        with_missing_some = total - missing_all

        data["exp"].append(exp)
        data["max"].append(total)
        data["some"].append(with_missing_some)
        data["min"].append(missing_none)

    plotheight = (num_experiments * 0.5) + 0.5
    legendheight = 1.5
    figheight = plotheight + legendheight
    figsize = (5, figheight)

    sns.set_style("whitegrid")
    fig, ax = plt.subplots(figsize=figsize)
    # Bars are drawn from longest to shortest so that the shorter ones stay visible.
    sns.barplot(y="exp", x="max", data=data, label="All missing", color="#EB3952")
    sns.barplot(y="exp", x="some", data=data, label="Some missing", color="#FAB74E")
    sns.barplot(y="exp", x="min", data=data, label="None missing", color="#31A590")
    # Manually remove axis labels and axis legend required for seaborn > 0.13
    ax.set_ylabel(None)
    ax.set_xlabel(None)
    ax.legend().remove()

    ax.set_xlim(0, total)
    ax.set_title("Completeness of protein quantification per experiment")
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels, bbox_to_anchor=(1, 0), ncol=3)
    # Reserve the upper part of the figure for the axes and keep the remaining
    # space free for the figure legend.
    figure_space_for_legend = 1 - (legendheight / figheight)
    fig.tight_layout(rect=[0, 0, 1, figure_space_for_legend])
    return fig, ax
200
-
201
-
202
def contaminants(qtable: Qtable, tag: str = "iBAQ intensity") -> (plt.Figure, plt.Axes):
    """Bar plot of the relative contaminant amount (iBAQ) in each sample.

    The qtable must contain one "iBAQ intensity" column per sample and a
    "Potential contaminant" column that marks the potential contaminant entries.

    Per sample the plotted value is calculated as:
        sum of contaminant iBAQ intensities / sum of all iBAQ intensities * 100

    Intensity columns may be log-transformed or not. The values are inspected
    automatically and converted out of log-space when required.

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        tag: A string that is used to extract iBAQ intensity containing columns.
            Default "iBAQ intensity".

    Returns:
        A matplotlib Figure and Axes object, containing the contaminants plot.
    """
    intensities = qtable.make_sample_table(tag, samples_as_columns=True)
    if msreport.helper.intensities_in_logspace(intensities):
        intensities = np.power(2, intensities)

    percent_of_sample = intensities / intensities.sum() * 100
    is_contaminant = qtable["Potential contaminant"]
    sample_names = intensities.columns.to_list()
    sample_count = len(sample_names)

    bar_positions = range(percent_of_sample.shape[1])
    bar_heights = percent_of_sample[is_contaminant].sum(axis=0)

    # Samples of the same experiment share one color.
    color_lookup = ColorWheelDict()
    bar_colors = [
        color_lookup[experiment] for experiment in qtable.get_experiments(sample_names)
    ]
    bar_width = 0.8
    x_padding = 0.5
    x_limits = (
        (bar_width / 2 + x_padding) * -1,
        (sample_count - 1 + bar_width / 2 + x_padding),
    )
    smallest_upper_ylim = 5

    fig, ax = plt.subplots(figsize=((sample_count * 0.25) + 1.05, 3))
    ax.bar(
        bar_positions,
        bar_heights,
        width=bar_width,
        color=bar_colors,
        edgecolor="#000000",
        zorder=3,
    )
    ax.set_xticks(bar_positions)
    ax.set_xticklabels(sample_names, rotation=90)
    ax.set_ylabel(f"Sum relative\n{tag} [%]")

    # Never let the y-axis end below the minimal upper limit.
    current_upper_ylim = ax.get_ylim()[1]
    ax.set_ylim(0, max(smallest_upper_ylim, current_upper_ylim))
    ax.set_xlim(x_limits)
    sns.despine(top=True, right=True)
    for side in ("bottom", "left"):
        visible_spine = ax.spines[side]
        visible_spine.set_color("#000000")
        visible_spine.set_linewidth(1)
    ax.grid(False, axis="x")
    ax.grid(axis="y", linestyle="dashed", linewidth=1, color="#cccccc")

    ax.set_title("Relative amount of contaminants")

    fig.tight_layout()
    return fig, ax
265
-
266
-
267
def sample_intensities(
    qtable: Qtable, tag: str = "Intensity", exclude_invalid: bool = True
) -> (plt.Figure, list[plt.Axes]):
    """Figure with two subplots for comparing overall sample intensities.

    Top subplot: the average of all samples serves as a pseudo reference sample.
    Using only rows without missing values, the log2 ratio of each sample to the
    pseudo reference is calculated and shown as one box plot per sample.

    Bottom subplot: the intensity summed over all rows is shown as one bar per sample.

    Intensity columns may be log-transformed or not. The values are inspected
    automatically and transformed as required.

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        tag: A string that is used to extract intensity containing columns.
            Default "Intensity".
        exclude_invalid: If True, rows are filtered according to the Boolean entries of
            the "Valid" column.

    Returns:
        A matplotlib Figure and a list of Axes objects, containing the intensity plots.
    """
    sample_table = qtable.make_sample_table(
        tag, samples_as_columns=True, exclude_invalid=exclude_invalid
    )

    # Zero intensities are treated as missing values.
    sample_table = sample_table.replace({0: np.nan})
    if msreport.helper.intensities_in_logspace(sample_table):
        log2_table, sample_table = sample_table, np.power(2, sample_table)
    else:
        log2_table = np.log2(sample_table)
    sample_names = sample_table.columns.tolist()

    # Only rows quantified in every sample contribute to the ratio box plots.
    is_complete_row = log2_table.isna().sum(axis=1) == 0
    complete_rows = log2_table[is_complete_row]
    reference = np.nanmean(complete_rows, axis=1)
    ratios_to_reference = complete_rows.subtract(reference, axis=0)

    summed_intensities = sample_table.sum()
    ratios_per_sample = [
        ratios_to_reference[column] for column in ratios_to_reference.columns
    ]
    color_lookup = ColorWheelDict()
    sample_colors = [
        color_lookup[experiment] for experiment in qtable.get_experiments(sample_names)
    ]
    fig, axes = box_and_bars(
        ratios_per_sample, summed_intensities, sample_names, colors=sample_colors
    )
    box_ax, bar_ax = axes[0], axes[1]
    box_ax.set_title(f'Comparison of "{tag}" columns', pad=15)
    box_ax.set_ylabel("Protein ratios [log2]\nto pseudo reference")
    bar_ax.set_ylabel("Total protein intensity")
    fig.tight_layout()
    return fig, axes
319
-
320
-
321
def replicate_ratios(
    qtable: Qtable,
    exclude_invalid: bool = True,
    xlim: Iterable[float] = (-2, 2),
) -> tuple[plt.Figure, np.ndarray]:
    """Figure to compare the similarity of expression values between replicates.

    Intended to evaluate the bulk distribution of expression values within experiments,
    after normalization. Plots from one experiment are placed in the same row. For each
    experiment, samples are compared pair-wise and for each sample comparison the
    distribution of the log2 ratios is shown as a density plot. Experiments with
    less than two samples are skipped.

    Requires log2 transformed expression values.

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        exclude_invalid: If True, rows are filtered according to the Boolean entries of
            the "Valid" column.
        xlim: Specifies the displayed range for the log2 ratios on the x-axis. Default
            is from -2 to 2.

    Returns:
        A matplotlib Figure and a two-dimensional array of Axes objects containing the
        comparison plots; rows correspond to experiments and columns to sample pairs.

    Raises:
        MsreportError: If no experiment has at least two samples.
    """
    tag: str = "Expression"
    table = qtable.make_sample_table(
        tag, samples_as_columns=True, exclude_invalid=exclude_invalid
    )
    design = qtable.get_design()

    # Look up the samples of each experiment once and keep only experiments for
    # which at least one pair of samples can be compared.
    samples_by_experiment = {}
    for experiment in design["Experiment"].unique():
        experiment_samples = qtable.get_samples(experiment)
        if len(experiment_samples) >= 2:
            samples_by_experiment[experiment] = experiment_samples

    if not samples_by_experiment:
        raise MsreportError(
            "This plot cannot be generated without at least one experiment that has "
            "two or more replicates in the qtable.design"
        )

    experiments = list(samples_by_experiment)
    num_experiments = len(experiments)
    max_replicates = max(len(samples) for samples in samples_by_experiment.values())
    # Number of unordered pairs that can be formed from `max_replicates` samples.
    max_combinations = max_replicates * (max_replicates - 1) // 2

    figheight = (num_experiments * 0.85) + 0.64
    figwidth = (max_combinations * 1.5) + 0.75
    figsize = (figwidth, figheight)

    sns.set_style("whitegrid")
    # squeeze=False guarantees a 2D array of Axes, also when there is only a single
    # row (one experiment) or a single column (two replicates per experiment).
    fig, axes = plt.subplots(
        num_experiments,
        max_combinations,
        figsize=figsize,
        sharex=True,
        squeeze=False,
    )

    # Assign the colors in experiment order before plotting.
    color_wheel = ColorWheelDict()
    _ = [color_wheel[exp] for exp in experiments]

    for row, experiment in enumerate(experiments):
        sample_combinations = itertools.combinations(
            samples_by_experiment[experiment], 2
        )
        for col, (s1, s2) in enumerate(sample_combinations):
            s1_label = design.loc[(design["Sample"] == s1), "Replicate"].tolist()[0]
            s2_label = design.loc[(design["Sample"] == s2), "Replicate"].tolist()[0]
            ax = axes[row, col]
            ratios = table[s1] - table[s2]
            ratios = ratios[np.isfinite(ratios)]
            ylabel = experiment if col == 0 else ""
            title = f"{s1_label} vs {s2_label}"
            color = color_wheel[experiment]

            sns.kdeplot(x=ratios, fill=True, ax=ax, zorder=3, color=color, alpha=0.5)
            ax.set_title(title, fontsize=10)
            ax.set_yticklabels("")
            ax.set_ylabel(ylabel, rotation=90, fontsize=10, va="center")
            ax.set_xlabel("")
            ax.tick_params(axis="both", labelsize=8)
            ax.locator_params(axis="x", nbins=5)

    # The x-axis is shared, setting the limits once applies them to all subplots.
    axes[0, 0].set_xlim(xlim)
    for ax in axes.flatten():
        for spine in ["bottom", "left"]:
            ax.spines[spine].set_color("#000000")
            ax.spines[spine].set_linewidth(0.5)
        # Vertical reference line at a log2 ratio of zero.
        ax.plot((0, 0), ax.get_ylim(), color="#999999", lw=1, zorder=2)
        ax.grid(False, axis="y")
        ax.grid(axis="x", linestyle="dashed", linewidth=1, color="#cccccc")
    sns.despine(top=True, right=True)
    fig.suptitle("Protein ratios [log2] between replicates", fontsize=10)
    fig.tight_layout()

    return fig, axes
406
-
407
-
408
def experiment_ratios(
    qtable: Qtable,
    experiments: Optional[list[str]] = None,
    exclude_invalid: bool = True,
    ylim: Iterable[float] = (-2, 2),
) -> tuple[plt.Figure, list[plt.Axes]]:
    """Figure to compare the similarity of expression values between experiments.

    Intended to evaluate the bulk distribution of expression values after normalization.
    For each experiment a subplot is generated, which displays the distribution of log2
    ratios to a pseudo reference experiment as a density plot. The pseudo reference
    values are calculated as the average intensity values of all experiments. Only rows
    with quantitative values in all experiment are considered.

    Requires "Events experiment" columns and that average experiment expression values
    are calculated. This can be achieved by calling
    `msreport.analyze.analyze_missingness(qtable: Qtable)` and
    `msreport.analyze.calculate_experiment_means(qtable: Qtable)`.

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        experiments: Optional, list of experiments that will be displayed. If None, all
            experiments from `qtable.design` will be used.
        exclude_invalid: If True, rows are filtered according to the Boolean entries of
            the "Valid" column.
        ylim: Specifies the displayed range for the log2 ratios on the y-axis. Default
            is from -2 to 2. Must support indexing, as the upper limit is used to
            position the "n=" annotation.

    Returns:
        A matplotlib Figure and a list of Axes objects, containing the comparison plots.

    Raises:
        MsreportError: If less than two experiments are available for plotting.
    """
    tag: str = "Expression"
    qtable_data = qtable.get_data(exclude_invalid=exclude_invalid)
    if experiments is None:
        experiments = qtable.design["Experiment"].unique().tolist()

    if len(experiments) < 2:
        raise MsreportError(
            "This plot cannot be generated with less than two experiments present in "
            "the qtable.design"
        )

    column_mapping = {f"{tag} {exp}": exp for exp in experiments}
    exp_data = qtable_data[list(column_mapping)]
    exp_data = exp_data.rename(columns=column_mapping)

    # Keep only rows that were quantified in every selected experiment.
    quant_mask = np.all(
        [(qtable_data[f"Events {exp}"] > 0) for exp in experiments], axis=0
    )
    exp_data = exp_data[quant_mask]
    pseudo_ref = np.nanmean(exp_data, axis=1)
    exp_ratios = exp_data.subtract(pseudo_ref, axis=0)

    color_wheel = ColorWheelDict()
    num_experiments = len(experiments)
    figwidth = (num_experiments * 0.75) + 0.82
    figheight = 2.5
    figsize = (figwidth, figheight)

    sns.set_style("whitegrid")
    fig, axes = plt.subplots(1, num_experiments, figsize=figsize, sharey=True)

    for exp_pos, experiment in enumerate(experiments):
        ax = axes[exp_pos]
        values = exp_ratios[experiment]
        color = color_wheel[experiment]
        sns.kdeplot(y=values, fill=True, ax=ax, zorder=3, color=color, alpha=0.5)
        if exp_pos == 0:
            # The number of rows is identical for all experiments and therefore
            # only annotated in the first subplot.
            ax.text(
                x=ax.get_xlim()[1] / 20,
                y=ylim[1] * 0.95,
                s=f"n={str(len(values))}",
                va="top",
                ha="left",
                fontsize=8,
            )
        ax.set_xticklabels("")
        ax.tick_params(axis="both", labelsize=8)
        ax.set_xlabel(experiment, rotation=90)

    axes[0].set_ylabel("Protein ratios [log2]\nto pseudo reference")
    # The y-axis is shared, setting the limits once applies them to all subplots.
    axes[0].set_ylim(ylim)
    for ax in axes:
        for spine in ["bottom", "left"]:
            ax.spines[spine].set_color("#000000")
            ax.spines[spine].set_linewidth(0.5)
        # Horizontal reference line at a log2 ratio of zero.
        ax.plot(ax.get_xlim(), (0, 0), color="#999999", lw=1, zorder=2)
        ax.grid(False, axis="x")
        ax.grid(axis="y", linestyle="dashed", linewidth=1, color="#cccccc")
    sns.despine(top=True, right=True)
    fig.tight_layout()
    return fig, axes
500
-
501
-
502
def sample_pca(
    qtable: Qtable,
    tag: str = "Expression",
    pc_x: str = "PC1",
    pc_y: str = "PC2",
    exclude_invalid: bool = True,
) -> tuple[plt.Figure, list[plt.Axes]]:
    """Figure to compare sample similarities with a principal component analysis.

    On the left subplot two PCA components of log2 transformed, mean centered intensity
    values are shown. On the right subplot the explained variance of the principal
    components is displayed as bar plots.

    It is possible to use intensity columns that are either log-transformed or not. The
    intensity values undergo an automatic evaluation to determine if they are already
    in log-space, and if necessary, they are transformed accordingly. Missing values
    are replaced with zero before the PCA is calculated.

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        tag: A string that is used to extract intensity containing columns.
            Default "Expression".
        pc_x: Principal component to plot on x-axis of the scatter plot, default "PC1".
            The number of calculated principal components is equal to the number of
            samples, but is limited by the number of rows used for the PCA and never
            exceeds nine.
        pc_y: Principal component to plot on y-axis of the scatter plot, default "PC2".
            The same limits as for `pc_x` apply.
        exclude_invalid: If True, rows are filtered according to the Boolean entries of
            the "Valid" column.

    Returns:
        A matplotlib Figure and a list of Axes objects, containing the PCA plots.

    Raises:
        KeyError: If `pc_x` or `pc_y` is not one of the calculated components.
    """
    table = qtable.make_sample_table(
        tag, samples_as_columns=True, exclude_invalid=exclude_invalid
    )
    design = qtable.get_design()

    # Treat zeros as missing and drop rows without any finite value.
    table = table.replace({0: np.nan})
    table = table[np.isfinite(table).sum(axis=1) > 0]
    if not msreport.helper.intensities_in_logspace(table):
        table = np.log2(table)
    table = table.fillna(0)

    # Samples become the observations (rows) and table rows the features.
    table = table.transpose()
    sample_index = table.index.tolist()
    table = sklearn.preprocessing.scale(table, with_std=False)

    # PCA cannot calculate more components than there are samples or features.
    n_components = min(len(sample_index), table.shape[1], 9)
    pca = sklearn.decomposition.PCA(n_components=n_components)
    components = pca.fit_transform(table)
    component_labels = ["PC{}".format(i + 1) for i in range(components.shape[1])]
    components_table = pd.DataFrame(
        data=components, columns=component_labels, index=sample_index
    )
    variance = pca.explained_variance_ratio_ * 100
    variance_lookup = dict(zip(component_labels, variance))

    # Fail before a figure is created if a requested component is not available.
    for requested_component in (pc_x, pc_y):
        if requested_component not in variance_lookup:
            raise KeyError(
                f"{requested_component} is not an available principal component, "
                f"choose one of {component_labels}"
            )

    # Prepare colors
    color_wheel = ColorWheelDict()
    experiments = qtable.get_experiments()
    _ = [color_wheel[exp] for exp in experiments]

    # Prepare figure
    num_legend_cols = 3
    legendheight = 0.2 + 0.2 * np.ceil(len(experiments) / num_legend_cols)
    plotheight = 3.7
    figheight = plotheight + legendheight
    figwidth = 4.3 + n_components * 0.2
    width_ratios = [4, 0.2 + n_components * 0.25]
    figsize = (figwidth, figheight)

    sns.set_style("white")
    fig, axes = plt.subplots(
        1, 2, figsize=figsize, gridspec_kw={"width_ratios": width_ratios}
    )

    # Comparison of two principal components
    ax = axes[0]
    texts = []
    for sample, data in components_table.iterrows():
        experiment = qtable.get_experiment(sample)
        label = design.loc[(design["Sample"] == sample), "Replicate"].tolist()[0]
        color = color_wheel[experiment]
        ax.scatter(
            data[pc_x],
            data[pc_y],
            color=color,
            edgecolor="#999999",
            lw=1,
            s=50,
            label=experiment,
        )
        texts.append(ax.text(data[pc_x], data[pc_y], label, fontdict={"fontsize": 9}))
    adjustText.adjust_text(
        texts,
        force_text=0.15,
        arrowprops=dict(arrowstyle="-", color="#ebae34", lw=0.5),
        lim=20,
        ax=ax,
    )
    ax.tick_params(axis="both", labelsize=9)
    ax.set_xlabel(f"{pc_x} ({variance_lookup[pc_x]:.2f}%)", size=12)
    ax.set_ylabel(f"{pc_y} ({variance_lookup[pc_y]:.2f}%)", size=12)
    ax.grid(axis="both", linestyle="dotted", linewidth=1)

    # Explained variance bar plot
    ax = axes[1]
    xpos = range(len(variance))
    ax.bar(xpos, variance, color="#D0D0D0", edgecolor="#000000")
    ax.set_xticks(xpos)
    ax.set_xticklabels(component_labels, rotation="vertical", ha="center")
    ax.tick_params(axis="both", labelsize=9)
    ax.set_ylabel("Explained variance", size=12)
    ax.grid(axis="y", linestyle="dashed", linewidth=1)

    # Add legend; every sample adds a handle, so only one handle per experiment
    # label is kept.
    handles, labels = axes[0].get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    handles, labels = by_label.values(), by_label.keys()
    fig.legend(
        handles,
        labels,
        bbox_to_anchor=(0.5, 0.0),
        ncol=num_legend_cols,
        fontsize=9,
        loc="lower center",
    )
    legend_space = legendheight / figheight
    fig.suptitle(f'PCA of "{tag}" columns')
    fig.tight_layout(rect=[0, legend_space, 1, 1])

    return fig, axes
635
-
636
-
637
def volcano_ma(
    qtable: Qtable,
    experiment_pair: list[str],
    comparison_tag: str = " vs ",
    pvalue_tag: str = "P-value",
    special_proteins: Optional[list[str]] = None,
    exclude_invalid: bool = True,
) -> (plt.Figure, list[plt.Axes]):
    """Draws a volcano plot and an MA plot for a pair of experiments.

    The left subplot shows the log2 ratio against the -log10 transformed p-value,
    the right subplot shows the log2 ratio against the average expression.

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        experiment_pair: The names of the two experiments that will be compared,
            experiments must be present in qtable.design.
        comparison_tag: String used in comparison columns to separate a pair of
            experiments; default " vs ", which corresponds to the MsReport convention.
        pvalue_tag: String used for matching the pvalue columns; default "P-value",
            which corresponds to the MsReport convention.
        special_proteins: Optional, a list of entries from the
            "Representative protein" column that will be highlighted and annotated.
            Annotation labels are taken from the "Gene name" column if present,
            otherwise from the "Representative protein" column.
        exclude_invalid: If True, rows are filtered according to the Boolean entries of
            the "Valid" column.

    Returns:
        A matplotlib Figure object and a list of two Axes objects containing the volcano
        and the MA plot.
    """
    comparison_group = comparison_tag.join(experiment_pair)

    if special_proteins is None:
        special_proteins = []
    data = qtable.get_data(exclude_invalid=exclude_invalid)
    if "Gene name" in data.columns:
        annotation_column = "Gene name"
    else:
        annotation_column = "Representative protein"
    # Marker size shrinks with the number of rows, bounded to 1000 - 10000 rows.
    bounded_row_count = max(min(data.shape[0], 10000), 1000)
    scatter_size = 2 / (bounded_row_count / 1000)

    is_special = data["Representative protein"].isin(special_proteins)
    highlight_style = {
        "s": 10,
        "color": "#E73C40",
        "edgecolor": "#000000",
        "lw": 0.2,
        "zorder": 3,
    }
    default_style = {"s": scatter_size, "color": "#40B7B5", "zorder": 2}

    # Convert the p-values of this comparison to -log10 for plotting.
    pvalue_columns = msreport.helper.find_sample_columns(
        data, pvalue_tag, [comparison_group]
    )
    for pvalue_column in pvalue_columns:
        data[pvalue_column] = np.log10(data[pvalue_column]) * -1

    fig, axes = plt.subplots(1, 2, figsize=[8, 4], sharex=True)
    fig.suptitle(comparison_group)

    x_variable = "Ratio [log2]"
    subplot_layout = ((axes[0], pvalue_tag), (axes[1], "Average expression"))
    for ax, y_variable in subplot_layout:
        x_values = data[" ".join([x_variable, comparison_group])]
        y_values = data[" ".join([y_variable, comparison_group])]
        point_labels = data[annotation_column]

        # Only points with finite coordinates can be drawn.
        is_plottable = np.isfinite(x_values) & np.isfinite(y_values)
        regular_points = ~is_special & is_plottable
        special_points = is_special & is_plottable

        ax.grid(axis="both", linestyle="dotted", linewidth=1)
        ax.scatter(x_values[regular_points], y_values[regular_points], **default_style)
        _annotated_scatter(
            x_values=x_values[special_points],
            y_values=y_values[special_points],
            labels=point_labels[special_points],
            ax=ax,
            scatter_kws=highlight_style,
        )

        ax.set_xlabel(x_variable)
        unit = "[-log10]" if y_variable == pvalue_tag else "[log2]"
        ax.set_ylabel(f"{y_variable} {unit}")

    fig.tight_layout()
    return fig, axes
730
-
731
-
732
- def expression_comparison(
733
- qtable: Qtable,
734
- experiment_pair: list[str],
735
- comparison_tag: str = " vs ",
736
- plot_average_expression: bool = False,
737
- special_proteins: Optional[list[str]] = None,
738
- exclude_invalid: bool = True,
739
- ) -> (plt.Figure, list[plt.Axes]):
740
- """Generates an expression comparison plot for two experiments.
741
-
742
- The subplot in the middle displays the average expression of the two experiments on
743
- the y-axis and the log fold change on the x-axis. The subplots on the left and right
744
- display entries with only missing values in one of the two experiments.
745
-
746
- Args:
747
- qtable: A `Qtable` instance, which data is used for plotting.
748
- experiment_pair: The names of the two experiments that will be compared,
749
- experiments must be present in qtable.design.
750
- comparison_tag: String used in comparison columns to separate a pair of
751
- experiments; default " vs ", which corresponds to the MsReport convention.
752
- plot_average_expression: If True plot average expression instead of maxium
753
- expression. Default False.
754
- special_proteins: Optional, allows to specify a list of entries from the
755
- "Representative Protein" column to be annotated. Entries are annotated with
756
- values from the "Gene Name" column if present, otherwise from the
757
- "Representative Protein" column.
758
- exclude_invalid: If True, rows are filtered according to the Boolean entries of
759
- the "Valid" column.
760
-
761
- Returns:
762
- A matplotlib Figure objects and a list of three Axes objects containing the
763
- expression comparison plots.
764
- """
765
- exp_1, exp_2 = experiment_pair
766
- comparison_group = comparison_tag.join(experiment_pair)
767
- special_proteins = special_proteins if special_proteins is not None else []
768
- qtable_data = qtable.get_data(exclude_invalid=exclude_invalid)
769
- annotation_column = (
770
- "Gene name" if "Gene name" in qtable_data.columns else "Representative protein"
771
- )
772
- total_scatter_area = 5000
773
- params = {
774
- "highlight": {
775
- "s": 10,
776
- "color": "#E73C40",
777
- "edgecolor": "#000000",
778
- "lw": 0.2,
779
- "zorder": 3,
780
- },
781
- "default": {"alpha": 0.75, "color": "#40B7B5", "zorder": 2},
782
- }
783
-
784
- mask = (qtable_data[f"Events {exp_1}"] + qtable_data[f"Events {exp_2}"]) > 0
785
- qtable_data = qtable_data[mask]
786
-
787
- only_exp_1 = qtable_data[f"Events {exp_2}"] == 0
788
- only_exp_2 = qtable_data[f"Events {exp_1}"] == 0
789
- mask_both = np.invert(np.any([only_exp_1, only_exp_2], axis=0))
790
-
791
- # Test if plotting maximum intensity is better than average
792
- qtable_data[f"Maximum expression {comparison_group}"] = np.max(
793
- [qtable_data[f"Expression {exp_2}"], qtable_data[f"Expression {exp_1}"]], axis=0
794
- )
795
- qtable_data[f"Average expression {comparison_group}"] = np.nanmean(
796
- [qtable_data[f"Expression {exp_2}"], qtable_data[f"Expression {exp_1}"]], axis=0
797
- )
798
-
799
- def scattersize(df: pd.DataFrame, total_area) -> float:
800
- if len(values) > 0:
801
- size = min(max(np.sqrt(total_area / df.shape[0]), 0.5), 4)
802
- else:
803
- size = 1
804
- return size
805
-
806
- width_ratios = [1, 5, 1]
807
- fig, axes = plt.subplots(
808
- 1, 3, figsize=[6, 4], sharey=True, gridspec_kw={"width_ratios": width_ratios}
809
- )
810
-
811
- # Plot values quantified in both experiments
812
- ax = axes[1]
813
- values = qtable_data[mask_both]
814
- s = scattersize(values, total_scatter_area)
815
- x_variable = f"Ratio [log2]"
816
- y_variable = (
817
- f"Average expression" if plot_average_expression else f"Maximum expression"
818
- )
819
- x_col = " ".join([x_variable, comparison_group])
820
- y_col = " ".join([y_variable, comparison_group])
821
- x_values = values[x_col]
822
- y_values = values[y_col]
823
- ax.grid(axis="both", linestyle="dotted", linewidth=1)
824
- ax.scatter(x_values, y_values, s=s, **params["default"])
825
- highlight_mask = values["Representative protein"].isin(special_proteins)
826
- _annotated_scatter(
827
- x_values=x_values[highlight_mask],
828
- y_values=y_values[highlight_mask],
829
- labels=values[annotation_column][highlight_mask],
830
- ax=ax,
831
- scatter_kws=params["highlight"],
832
- )
833
-
834
- ax.set_xlabel(x_variable, fontsize=9)
835
- ax.set_title(comparison_group, fontsize=12)
836
- ax.set_ylabel(y_variable)
837
-
838
- # Plot values quantified only in one experiment
839
- for ax, mask, exp in [(axes[2], only_exp_1, exp_1), (axes[0], only_exp_2, exp_2)]:
840
- y_variable = f"Expression {exp}"
841
- values = qtable_data[mask]
842
- highlight_mask = values["Representative protein"].isin(special_proteins)
843
- s = scattersize(values, total_scatter_area)
844
-
845
- ax.grid(axis="y", linestyle="dotted", linewidth=1)
846
- ax.set_ylabel(y_variable)
847
- ax.tick_params(axis="x", bottom=False, labelbottom=False)
848
-
849
- if len(values) == 0:
850
- continue
851
-
852
- sns.stripplot(
853
- y=values[y_variable],
854
- jitter=True,
855
- size=np.sqrt(s * 2),
856
- marker="o",
857
- edgecolor="none",
858
- ax=ax,
859
- **params["default"],
860
- )
861
-
862
- xlim = -0.2, 0.2
863
- ax.set_xlim(xlim)
864
- offsets = ax.collections[0].get_offsets()[highlight_mask]
865
- _annotated_scatter(
866
- x_values=offsets[:, 0],
867
- y_values=offsets[:, 1],
868
- labels=values[annotation_column][highlight_mask],
869
- ax=ax,
870
- scatter_kws=params["highlight"],
871
- )
872
- ax.set_xlim(xlim)
873
-
874
- axes[0].set_title(f"Absent in\n{exp_1}", fontsize=9)
875
- axes[2].set_title(f"Absent in\n{exp_2}", fontsize=9)
876
-
877
- fig.tight_layout()
878
- return fig, axes
879
-
880
-
881
def box_and_bars(
    box_values: Iterable[Iterable[float]],
    bar_values: Iterable[float],
    group_names: list[str],
    colors: Optional[list[str]] = None,
) -> tuple[plt.Figure, list[plt.Axes]]:
    """Generates a figure with a box plot subplot above a bar plot subplot.

    In the top subplot the 'box_values' are displayed as box plots, in the lower
    subplot the 'bar_values' are displayed as bar plots. The figure width is
    automatically adjusted to the number of groups that will be plotted. The length
    of 'group_names' must be the same as the length of 'bar_values' and the number
    of iterables from 'box_values'. Each group from 'box_values' and 'bar_values' is
    horizontally aligned between the two subplots, which share their x-axis.

    Note that this function calls `sns.set_style("whitegrid")`, which changes the
    seaborn style for subsequently created figures as well.

    Args:
        box_values: A sequence of sequences that each contain y values for generating
            a box plot.
        bar_values: A sequence of y values for generating bar plots.
        group_names: Used to label groups from box and bar plots.
        colors: Sequence of hex color codes for each group that is used for the boxes
            of the box and bar plots. Must be the same length as group names. If
            'colors' is None, boxes are colored in light grey.

    Returns:
        A matplotlib Figure and the two Axes objects containing the box and bar
        plots, in this order.
    """
    assert len(box_values) == len(bar_values) == len(group_names)
    assert colors is None or len(colors) == len(group_names)
    if colors is None:
        colors = ["#D0D0D0" for _ in group_names]

    num_groups = len(group_names)
    x_positions = list(range(num_groups))
    width = 0.8
    xlim = (-1 + 0.15, num_groups - 0.15)
    figwidth = (num_groups * 0.25) + 1.2
    figsize = (figwidth, 6)

    sns.set_style("whitegrid")
    fig, axes = plt.subplots(2, figsize=figsize, sharex=True)
    box_ax, bar_ax = axes

    # Plot boxplots using the box_values; the grey line marks y = 0.
    box_ax.plot(xlim, (0, 0), color="#999999", lw=1, zorder=2)
    boxplots = box_ax.boxplot(
        box_values,
        positions=x_positions,
        showfliers=False,
        patch_artist=True,
        widths=width,
        medianprops={"color": "#000000"},
    )
    for color, box in zip(colors, boxplots["boxes"]):
        box.set(facecolor=color)
    # Make sure the y-axis always spans at least the range from -0.4 to 0.4.
    ylim = box_ax.get_ylim()
    box_ax.set_ylim(min(-0.4, ylim[0]), max(0.401, ylim[1]))

    # Plot barplots using the bar_values
    bar_ax.bar(x_positions, bar_values, width=width, color=colors, edgecolor="#000000")
    # Pin the tick positions before assigning labels, so that each label is
    # guaranteed to belong to exactly one group position.
    bar_ax.set_xticks(x_positions)
    bar_ax.set_xticklabels(group_names, rotation=90)

    for ax in axes:
        for spine in ["bottom", "left"]:
            ax.spines[spine].set_color("#000000")
            ax.spines[spine].set_linewidth(1)
        ax.grid(False, axis="x")
        ax.grid(axis="y", linestyle="dashed", linewidth=1, color="#cccccc")
    sns.despine(fig=fig, top=True, right=True)

    # The x-axis is shared, setting the limits on one subplot affects both.
    bar_ax.set_xlim(xlim)
    fig.tight_layout()
    return fig, axes
955
-
956
-
957
def expression_clustermap(
    qtable: Qtable,
    exclude_invalid: bool = True,
    cluster_method: str = "average",
) -> sns.matrix.ClusterGrid:
    """Draws a hierarchically clustered heatmap of the sample expression values.

    Entries that are flagged in the "Missing {sample}" columns are set to 0 before
    the clustering is calculated. The same entries are then passed as a mask to the
    heatmap, so that they are not drawn with a color from the color map. Sample
    columns are annotated with one color per experiment and the row dendrogram is
    hidden.

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        exclude_invalid: If True, rows are filtered according to the Boolean entries
            of the "Valid" column.
        cluster_method: Linkage method to use for calculating clusters. See
            `scipy.cluster.hierarchy.linkage` documentation for more information.

    Returns:
        A seaborn ClusterGrid instance. Note that ClusterGrid has a `savefig` method
        that can be used for saving the figure.
    """
    samples = qtable.get_samples()
    experiments = qtable.get_experiments()

    expression = qtable.make_expression_table(samples_as_columns=True)
    expression = expression[samples]
    # Missing entries get an expression value of 0 for the clustering.
    for sample_name in samples:
        is_missing = qtable.data[f"Missing {sample_name}"]
        expression.loc[is_missing, sample_name] = 0
    missing_columns = [f"Missing {sample_name}" for sample_name in samples]
    missing_mask = qtable.data[missing_columns].to_numpy()

    if exclude_invalid:
        expression = expression[qtable.data["Valid"]]
        missing_mask = missing_mask[qtable.data["Valid"]]

    # Look up all experiments first, so that colors are assigned in the order of
    # the experiments and not in the order of the samples.
    palette = ColorWheelDict()
    for experiment in experiments:
        palette[experiment]
    column_colors = []
    for sample_name in samples:
        column_colors.append(palette[qtable.get_experiment(sample_name)])

    fig_width = 0.3 + len(samples) * 0.4
    fig_height = 5

    # Generate the plot
    grid = sns.clustermap(
        expression,
        col_colors=column_colors,
        cmap="magma",
        yticklabels=False,
        mask=missing_mask,
        figsize=(fig_width, fig_height),
        metric="euclidean",
        method=cluster_method,
    )
    grid.ax_row_dendrogram.set_visible(False)

    # Add background color and spines
    heatmap_ax = grid.ax_heatmap
    heatmap_ax.set_facecolor("#F9F9F9")
    for spine in heatmap_ax.spines.values():
        spine.set_visible(True)
        spine.set_linewidth(0.75)
    return grid
1016
-
1017
-
1018
def pvalue_histogram(
    qtable: Qtable,
    pvalue_tag: str = "P-value",
    comparison_tag: str = " vs ",
    experiment_pairs: Optional[Iterable[Iterable[str]]] = None,
    exclude_invalid: bool = True,
) -> tuple[plt.Figure, list[plt.Axes]]:
    """Generates p-value histograms for one or multiple experiment comparisons.

    Histograms are generated with 20 bins of size 0.05. The p-value distribution of
    each experiment comparison is shown with a separate subplot.

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        pvalue_tag: String used for matching the pvalue columns; default "P-value",
            which corresponds to the MsReport convention.
        comparison_tag: String used in comparison columns to separate a pair of
            experiments; default " vs ", which corresponds to the MsReport convention.
        experiment_pairs: Optional, experiment pairs that will be used for plotting.
            For each experiment pair a p-value column must exist that follows the
            format f"{pvalue_tag} {experiment_1}{comparison_tag}{experiment_2}".
            If None, all experiment comparisons that are found in the qtable data
            are used.
        exclude_invalid: If True, rows are filtered according to the Boolean entries
            of the "Valid" column.

    Returns:
        A matplotlib Figure and the Axes objects containing the p-value plots, one
        per experiment pair.

    Raises:
        ValueError: If there is no experiment pair to plot.
    """
    data = qtable.get_data(exclude_invalid=exclude_invalid)

    if experiment_pairs is None:
        # Find all experiment pairs that have a p-value column
        experiment_pairs = [
            pair
            for pair in itertools.permutations(qtable.get_experiments(), 2)
            if f"{pvalue_tag} {comparison_tag.join(pair)}" in data.columns
        ]
    else:
        # Materialize the pairs, so that one-shot iterables such as generators
        # can be counted and iterated over without being exhausted.
        experiment_pairs = [tuple(pair) for pair in experiment_pairs]

    num_plots = len(experiment_pairs)
    if num_plots == 0:
        # Fail before a figure is created; a grid with zero columns is rejected
        # by matplotlib anyway, but with a less helpful message.
        raise ValueError(
            f"No experiment pairs with a '{pvalue_tag}' column available for plotting."
        )

    figwidth = (num_plots * 1.8) - 0.6
    figheight = 2.5
    figsize = (figwidth, figheight)

    fig, axes = plt.subplots(1, num_plots, figsize=figsize, sharex=True, sharey=True)
    # With a single subplot a bare Axes object is returned instead of an array.
    axes = axes if isinstance(axes, Iterable) else (axes,)
    fig.subplots_adjust(wspace=0.5)

    bins = np.arange(0, 1.01, 0.05)
    for plot_number, (ax, experiment_pair) in enumerate(zip(axes, experiment_pairs)):
        comparison_group = comparison_tag.join(experiment_pair)
        comparison_column = f"{pvalue_tag} {comparison_group}"
        p_values = data[comparison_column]
        ax.hist(
            p_values,
            bins=bins,
            zorder=2,
            color="#fbc97a",
            edgecolor="#FFFFFF",
            linewidth=0.7,
        )

        # Adjust x- and y-axis
        ax.set_xticks(np.arange(0, 1.01, 0.5))
        ax.tick_params(labelsize=9)
        if plot_number > 0:
            ax.tick_params(axis="y", color="none")

        # Add x-label and use a twin axis to show the comparison as second y-label
        ax.set_xlabel(pvalue_tag, fontsize=9)
        ax2 = ax.twinx()
        ax2.set_yticks([])
        ax2.set_ylabel(comparison_group, fontsize=9)

        # Adjust spines
        for spine in ["bottom", "left"]:
            ax.spines[spine].set_color("#000000")
            ax.spines[spine].set_linewidth(1)

        # Adjust grid
        ax.grid(False, axis="x")
        ax.grid(axis="y", linestyle="dashed", linewidth=1, color="#cccccc", zorder=1)

    # Remove top and right spines of all subplots and twin axes at once.
    sns.despine(fig=fig, top=True, right=True)

    axes[0].set_ylabel(f"{pvalue_tag} count")
    # The x-axis is shared, setting the limits on one subplot affects all.
    axes[0].set_xlim(-0.05, 1.05)

    return fig, axes
1108
-
1109
-
1110
def _annotated_scatter(x_values, y_values, labels, ax=None, scatter_kws=None) -> None:
    """Draws a scatter plot and adds a text label to each point.

    The text labels are created first and, if there are any, repositioned with
    `adjustText.adjust_text`. The scatter points are drawn afterwards.

    Args:
        x_values: X coordinates of the points.
        y_values: Y coordinates of the points.
        labels: Text labels, paired by position with 'x_values' and 'y_values'.
        ax: Axes to draw on. If None, the current Axes from `plt.gca()` is used.
        scatter_kws: Keyword arguments passed to `ax.scatter`. Must contain a
            "color" entry, which is also used for the label arrows. If None, a
            default style is used.
    """
    if ax is None:
        ax = plt.gca()
    if scatter_kws is None:
        scatter_kws = dict(
            s=10, color="#FAB74E", edgecolor="#000000", lw=0.2, zorder=3
        )
    # Looked up before anything is drawn; the arrows use the marker color.
    arrow_style = {
        "arrowstyle": "-",
        "color": scatter_kws["color"],
        "lw": 0.75,
        "alpha": 0.5,
    }

    annotations = [
        ax.text(x_pos, y_pos, label, fontdict={"fontsize": 9})
        for x_pos, y_pos, label in zip(x_values, y_values, labels)
    ]
    if len(annotations) > 0:
        adjustText.adjust_text(
            annotations, ax=ax, force_text=0.15, arrowprops=arrow_style, lim=100
        )

    ax.scatter(x_values, y_values, **scatter_kws)