msreport 0.0.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- msreport/__init__.py +13 -0
- msreport/aggregate/__init__.py +0 -0
- msreport/aggregate/condense.py +163 -0
- msreport/aggregate/pivot.py +132 -0
- msreport/aggregate/summarize.py +281 -0
- msreport/analyze.py +586 -0
- msreport/errors.py +10 -0
- msreport/export.py +526 -0
- msreport/fasta.py +28 -0
- msreport/helper/__init__.py +23 -0
- msreport/helper/calc.py +120 -0
- msreport/helper/maxlfq.py +339 -0
- msreport/helper/table.py +267 -0
- msreport/helper/temp.py +99 -0
- msreport/impute.py +275 -0
- msreport/isobar.py +161 -0
- msreport/normalize.py +496 -0
- msreport/peptidoform.py +283 -0
- msreport/plot.py +1129 -0
- msreport/qtable.py +537 -0
- msreport/reader.py +2357 -0
- msreport/rinterface/__init__.py +3 -0
- msreport/rinterface/limma.py +126 -0
- msreport/rinterface/rinstaller.py +35 -0
- msreport/rinterface/rscripts/limma.R +104 -0
- msreport-0.0.24.dist-info/METADATA +128 -0
- msreport-0.0.24.dist-info/RECORD +30 -0
- msreport-0.0.24.dist-info/WHEEL +5 -0
- msreport-0.0.24.dist-info/licenses/LICENSE.txt +202 -0
- msreport-0.0.24.dist-info/top_level.txt +1 -0
msreport/plot.py
ADDED
|
@@ -0,0 +1,1129 @@
|
|
|
1
|
+
from collections.abc import Iterable
|
|
2
|
+
from collections import UserDict
|
|
3
|
+
import itertools
|
|
4
|
+
from typing import Optional
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
import adjustText
|
|
8
|
+
import numpy as np
|
|
9
|
+
from matplotlib import pyplot as plt
|
|
10
|
+
import pandas as pd
|
|
11
|
+
import seaborn as sns
|
|
12
|
+
import sklearn.preprocessing
|
|
13
|
+
import sklearn.decomposition
|
|
14
|
+
|
|
15
|
+
from msreport.errors import MsreportError
|
|
16
|
+
from msreport.qtable import Qtable
|
|
17
|
+
import msreport.helper
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def set_dpi(dpi: int) -> None:
    """Sets the default resolution (dots per inch) of matplotlib figures.

    A different dpi value makes rendered figures larger or smaller, while the
    relative sizes of the elements inside a figure stay the same.

    Args:
        dpi: New default dots per inch.
    """
    plt.rcParams.update({"figure.dpi": dpi})
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ColorWheelDict(UserDict):
    """Lookup dictionary that maps keys to hex colors by using a color wheel.

    When a key is not present the first color of the color wheel is added as the value,
    and the color is moved from the beginning to the end of the color wheel. If no list
    of colors is specified, a default list of ten colors is added to the color wheel.
    It is also possible to manually set key and color pairs by using the same syntax as
    for a regular dictionary. Manually assigned values must be hex color strings of the
    form "#RGB" or "#RRGGBB", otherwise a ValueError is raised.
    """

    def __init__(self, colors: Optional[list[str]] = None):
        """Initializes a ColorWheelDict.

        Args:
            colors: Optional, a list of hex colors used for the color wheel. By default
                a list with ten colors is used.
        """
        super().__init__()

        if colors is not None:
            self.colors = colors
        else:
            self.colors = [
                "#80b1d3",
                "#fdb462",
                "#8dd3c7",
                "#bebada",
                "#fb8072",
                "#b3de69",
                "#fccde5",
                "#d9d9d9",
                "#bc80bd",
                "#ccebc5",
            ]
        # Work on a copy so that rotating the wheel never reorders `self.colors`.
        self._color_wheel = self.colors.copy()

    def _next_color(self) -> str:
        """Returns the first color of the wheel and moves it to the end."""
        color = self._color_wheel.pop(0)
        self._color_wheel.append(color)
        return color

    def __setitem__(self, key, value):
        """Stores a key and color pair, the color must be a hex color string.

        Raises:
            ValueError: If `value` is not a hex color of the form "#RGB" or "#RRGGBB".
        """
        # `fullmatch` is required here; a "$" anchor would also accept a value with a
        # trailing newline such as "#fff\n".
        is_hexcolor = re.fullmatch(r"#(?:[0-9a-fA-F]{3}){1,2}", value)
        if is_hexcolor is None:
            raise ValueError(f"the specified value {value} is not a hexcolor.")
        self.data[key] = value

    def __getitem__(self, key):
        """Returns the color of a key, unknown keys get the next color of the wheel."""
        if key not in self.data:
            self.data[key] = self._next_color()
        return self.data[key]
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def missing_values_vertical(
    qtable: Qtable,
    exclude_invalid: bool = True,
) -> (plt.Figure, list[plt.Axes]):
    """Vertical bar plot to analyze the completeness of quantification.

    For each experiment one subplot is generated that displays three bars: the number
    of rows with no missing values, with some missing values, and with only missing
    values.

    Requires the columns "Missing experiment_name" and "Events experiment_name", which
    are added by calling msreport.analyze.analyze_missingness(qtable: Qtable).

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        exclude_invalid: If True, rows are filtered according to the Boolean entries of
            the "Valid" column.

    Returns:
        A matplotlib Figure and a list of Axes objects containing the missing values
        plots.
    """
    experiments = qtable.get_experiments()
    num_experiments = len(experiments)
    qtable_data = qtable.get_data(exclude_invalid=exclude_invalid)

    barwidth = 0.8
    barcolors = ["#31A590", "#FAB74E", "#EB3952"]
    figwidth = (num_experiments * 1.2) + 0.5
    figsize = (figwidth, 3.5)
    xtick_labels = ["No missing", "Some missing", "All missing"]

    sns.set_style("whitegrid")
    # squeeze=False guarantees an array of Axes even for a single experiment, where
    # plt.subplots would otherwise return a bare Axes object that cannot be indexed.
    fig, axes = plt.subplots(
        1, num_experiments, figsize=figsize, sharey=True, squeeze=False
    )
    axes = axes[0]
    for exp_num, exp in enumerate(experiments):
        ax = axes[exp_num]

        exp_missing = qtable_data[f"Missing {exp}"]
        exp_values = qtable_data[f"Events {exp}"]
        missing_none = (exp_missing == 0).sum()
        missing_some = ((exp_missing > 0) & (exp_values > 0)).sum()
        missing_all = (exp_values == 0).sum()

        y = [missing_none, missing_some, missing_all]
        x = range(len(y))
        ax.bar(x, y, width=barwidth, color=barcolors)
        # Only the leftmost subplot gets a y-axis label, the y-axis is shared
        if exp_num == 0:
            ax.set_ylabel("# Proteins")
        ax.set_title(exp)
        ax.set_xticks(np.array([0, 1, 2]) + 0.4)
        ax.set_xticklabels(xtick_labels, rotation=45, va="top", ha="right")
        for spine in ["bottom", "left"]:
            ax.spines[spine].set_color("#000000")
            ax.spines[spine].set_linewidth(1)
        ax.grid(False, axis="x")
        ax.grid(axis="y", linestyle="dashed", linewidth=1)
    sns.despine(top=True, right=True)
    fig.tight_layout()
    return fig, axes
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def missing_values_horizontal(
    qtable: Qtable,
    exclude_invalid: bool = True,
) -> (plt.Figure, plt.Axes):
    """Horizontal bar plot to analyze the completeness of quantification.

    For each experiment three overlaid bars are drawn: the total number of rows, the
    number of rows that are not completely missing, and the number of rows without any
    missing values.

    Requires the columns "Missing experiment_name" and "Events experiment_name", which
    are added by calling msreport.analyze.analyze_missingness(qtable: Qtable).

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        exclude_invalid: If True, rows are filtered according to the Boolean entries of
            the "Valid" column.

    Returns:
        A matplotlib Figure and Axes object, containing the missing values plot.
    """
    experiments = qtable.get_experiments()
    num_experiments = len(experiments)
    qtable_data = qtable.get_data(exclude_invalid=exclude_invalid)
    # The total is identical for all experiments, it is the number of table rows
    total = len(qtable_data)

    data = {"exp": [], "max": [], "some": [], "min": []}
    for exp in experiments:
        exp_missing = qtable_data[f"Missing {exp}"]
        num_replicates = len(qtable.get_samples(exp))
        missing_all = (exp_missing == num_replicates).sum()
        missing_none = (exp_missing == 0).sum()
        with_missing_some = total - missing_all

        data["exp"].append(exp)
        data["max"].append(total)
        data["some"].append(with_missing_some)
        data["min"].append(missing_none)

    plotheight = (num_experiments * 0.5) + 0.5
    legendheight = 1.5
    figheight = plotheight + legendheight
    figsize = (5, figheight)

    sns.set_style("whitegrid")
    fig, ax = plt.subplots(figsize=figsize)
    # The bars are drawn on top of each other from longest to shortest, so the visible
    # part of each bar corresponds to its category.
    sns.barplot(
        y="exp", x="max", data=data, label="All missing", color="#EB3952", ax=ax
    )
    sns.barplot(
        y="exp", x="some", data=data, label="Some missing", color="#FAB74E", ax=ax
    )
    sns.barplot(
        y="exp", x="min", data=data, label="None missing", color="#31A590", ax=ax
    )
    # Manually remove axis labels and axis legend required for seaborn > 0.13
    ax.set_ylabel(None)
    ax.set_xlabel(None)
    ax.legend().remove()

    ax.set_xlim(0, total)
    ax.set_title("Completeness of protein quantification per experiment")
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels, bbox_to_anchor=(1, 0), ncol=3)
    # Reserve the lower part of the figure for the legend
    figure_space_for_legend = 1 - (legendheight / figheight)
    fig.tight_layout(rect=[0, 0, 1, figure_space_for_legend])
    return fig, ax
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def contaminants(qtable: Qtable, tag: str = "iBAQ intensity") -> (plt.Figure, plt.Axes):
    """Bar plot showing the relative amount of contaminants (iBAQ) in each sample.

    Requires "iBAQ intensity" columns for each sample, and a "Potential contaminant"
    column to identify the potential contaminant entries.

    The relative iBAQ values are calculated as:
    sum of contaminant iBAQ intensities / sum of all iBAQ intensities * 100

    Intensity columns may be log-transformed or not. The values are automatically
    checked for being in log-space and are converted back to linear space if required.

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        tag: A string that is used to extract iBAQ intensity containing columns.
            Default "iBAQ intensity".

    Returns:
        A matplotlib Figure and Axes object, containing the contaminants plot.
    """
    intensities = qtable.make_sample_table(tag, samples_as_columns=True)
    if msreport.helper.intensities_in_logspace(intensities):
        # Relative amounts must be calculated from linear intensities
        intensities = np.power(2, intensities)

    sample_names = intensities.columns.to_list()
    sample_count = len(sample_names)

    percentages = intensities / intensities.sum() * 100
    is_contaminant = qtable["Potential contaminant"]
    bar_positions = range(percentages.shape[1])
    bar_heights = percentages[is_contaminant].sum(axis=0)

    color_lookup = ColorWheelDict()
    bar_colors = [color_lookup[exp] for exp in qtable.get_experiments(sample_names)]

    # Layout constants
    bar_width = 0.8
    x_padding = 0.5
    lowest_upper_ylim = 5
    x_limits = (
        (bar_width / 2 + x_padding) * -1,
        (sample_count - 1 + bar_width / 2 + x_padding),
    )
    figure_size = ((sample_count * 0.25) + 1.05, 3)

    fig, ax = plt.subplots(figsize=figure_size)
    ax.bar(
        bar_positions,
        bar_heights,
        width=bar_width,
        color=bar_colors,
        edgecolor="#000000",
        zorder=3,
    )
    ax.set_xticks(bar_positions)
    ax.set_xticklabels(sample_names, rotation=90)
    ax.set_ylabel(f"Sum relative\n{tag} [%]")

    # The y-axis always spans at least the range from 0 to `lowest_upper_ylim`
    current_upper_ylim = ax.get_ylim()[1]
    ax.set_ylim(0, max(lowest_upper_ylim, current_upper_ylim))
    ax.set_xlim(x_limits)
    sns.despine(top=True, right=True)
    for spine_name in ["bottom", "left"]:
        spine = ax.spines[spine_name]
        spine.set_color("#000000")
        spine.set_linewidth(1)
    ax.grid(False, axis="x")
    ax.grid(axis="y", linestyle="dashed", linewidth=1, color="#cccccc")

    ax.set_title("Relative amount of contaminants")

    fig.tight_layout()
    return fig, ax
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def sample_intensities(
    qtable: Qtable, tag: str = "Intensity", exclude_invalid: bool = True
) -> (plt.Figure, list[plt.Axes]):
    """Figure to compare the overall quantitative similarity of samples.

    The figure consists of two subplots. The upper subplot shows, for each sample, a
    box plot of the log2 ratios to a pseudo reference sample. The pseudo reference is
    the row-wise average of the log2 intensities of all samples, and only rows without
    missing values are used. The lower subplot shows the summed intensity of all rows
    per sample as a bar plot.

    Intensity columns may be log-transformed or not. The values are automatically
    checked for being in log-space and are transformed if required.

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        tag: A string that is used to extract intensity containing columns.
            Default "Intensity".
        exclude_invalid: If True, rows are filtered according to the Boolean entries of
            the "Valid" column.

    Returns:
        A matplotlib Figure and a list of Axes objects, containing the intensity plots.
    """
    intensities = qtable.make_sample_table(
        tag, samples_as_columns=True, exclude_invalid=exclude_invalid
    )
    # Zero intensities are treated as missing values
    intensities = intensities.replace({0: np.nan})

    # Both the linear and the log2 transformed intensities are required
    if msreport.helper.intensities_in_logspace(intensities):
        log2_intensities = intensities
        intensities = np.power(2, log2_intensities)
    else:
        log2_intensities = np.log2(intensities)
    sample_names = intensities.columns.tolist()

    # Ratios are only calculated for rows that are quantified in all samples
    is_complete = log2_intensities.notna().all(axis=1)
    complete_rows = log2_intensities[is_complete]
    reference = np.nanmean(complete_rows, axis=1)
    ratios = complete_rows.subtract(reference, axis=0)

    color_lookup = ColorWheelDict()
    sample_colors = [
        color_lookup[exp] for exp in qtable.get_experiments(sample_names)
    ]
    summed_intensities = intensities.sum()
    ratios_per_sample = [ratios[column] for column in ratios.columns]

    fig, axes = box_and_bars(
        ratios_per_sample, summed_intensities, sample_names, colors=sample_colors
    )
    upper_ax, lower_ax = axes[0], axes[1]
    upper_ax.set_title(f'Comparison of "{tag}" columns', pad=15)
    upper_ax.set_ylabel("Protein ratios [log2]\nto pseudo reference")
    lower_ax.set_ylabel("Total protein intensity")
    fig.tight_layout()
    return fig, axes
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def replicate_ratios(
    qtable: Qtable,
    exclude_invalid: bool = True,
    xlim: Iterable[float] = (-2, 2),
) -> (plt.Figure, list[plt.Axes]):
    """Figure to compare the similarity of expression values between replicates.

    Intended to evaluate the bulk distribution of expression values within experiments,
    after normalization. Plots from one experiment are placed in the same row. For each
    experiment, samples are compared pair-wise and for each sample comparison the
    distribution of the log2 ratios is shown as a density plot. Experiments with less
    than two samples are not displayed.

    Requires log2 transformed expression values.

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        exclude_invalid: If True, rows are filtered according to the Boolean entries of
            the "Valid" column.
        xlim: Specifies the displayed range for the log2 ratios on the x-axis. Default
            is from -2 to 2.

    Returns:
        A matplotlib Figure and a two dimensional array of Axes objects, containing the
        comparison plots. Rows correspond to experiments and columns to sample pairs.

    Raises:
        MsreportError: If no experiment with at least two samples is present.
    """
    tag: str = "Expression"
    table = qtable.make_sample_table(
        tag, samples_as_columns=True, exclude_invalid=exclude_invalid
    )
    design = qtable.get_design()

    # Pair-wise comparisons require at least two samples per experiment
    experiments = [
        experiment
        for experiment in design["Experiment"].unique()
        if len(qtable.get_samples(experiment)) >= 2
    ]
    if not experiments:
        raise MsreportError(
            "This plot cannot be generated without at least one experiment that "
            "contains two or more samples in the qtable.design"
        )

    num_experiments = len(experiments)
    max_replicates = max(len(qtable.get_samples(exp)) for exp in experiments)
    # Number of unique sample pairs for the experiment with the most replicates
    max_combinations = max_replicates * (max_replicates - 1) // 2

    figheight = (num_experiments * 0.85) + 0.64
    figwidth = (max_combinations * 1.5) + 0.75
    figsize = (figwidth, figheight)

    sns.set_style("whitegrid")
    # squeeze=False always returns a two dimensional array of Axes. Without it a
    # single row or a single column of subplots is collapsed to a one dimensional
    # array or a bare Axes object, which breaks the `axes[row, column]` indexing.
    fig, axes = plt.subplots(
        num_experiments,
        max_combinations,
        figsize=figsize,
        sharex=True,
        squeeze=False,
    )

    # Assign colors to experiments in the order in which they are displayed
    color_wheel = ColorWheelDict()
    _ = [color_wheel[exp] for exp in experiments]

    for row, experiment in enumerate(experiments):
        sample_combinations = itertools.combinations(qtable.get_samples(experiment), 2)
        for column, (s1, s2) in enumerate(sample_combinations):
            s1_label = design.loc[(design["Sample"] == s1), "Replicate"].tolist()[0]
            s2_label = design.loc[(design["Sample"] == s2), "Replicate"].tolist()[0]
            ax = axes[row, column]
            ratios = table[s1] - table[s2]
            ratios = ratios[np.isfinite(ratios)]
            # Only the first plot of each row is labeled with the experiment name
            ylabel = experiment if column == 0 else ""
            title = f"{s1_label} vs {s2_label}"
            color = color_wheel[experiment]

            sns.kdeplot(x=ratios, fill=True, ax=ax, zorder=3, color=color, alpha=0.5)
            ax.set_title(title, fontsize=10)
            ax.set_yticklabels("")
            ax.set_ylabel(ylabel, rotation=90, fontsize=10, va="center")
            ax.set_xlabel("")
            ax.tick_params(axis="both", labelsize=8)
            ax.locator_params(axis="x", nbins=5)

    # The x-axis is shared, setting the limits once applies them to all subplots
    axes[0, 0].set_xlim(xlim)
    for ax in axes.flatten():
        for spine in ["bottom", "left"]:
            ax.spines[spine].set_color("#000000")
            ax.spines[spine].set_linewidth(0.5)
        # Vertical reference line at a log2 ratio of zero
        ax.plot((0, 0), ax.get_ylim(), color="#999999", lw=1, zorder=2)
        ax.grid(False, axis="y")
        ax.grid(axis="x", linestyle="dashed", linewidth=1, color="#cccccc")
    sns.despine(top=True, right=True)
    fig.suptitle("Protein ratios [log2] between replicates", fontsize=10)
    fig.tight_layout()

    return fig, axes
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
def experiment_ratios(
    qtable: Qtable,
    experiments: Optional[list[str]] = None,
    exclude_invalid: bool = True,
    ylim: Iterable[float] = (-2, 2),
) -> (plt.Figure, list[plt.Axes]):
    """Figure to compare the similarity of expression values between experiments.

    Intended to evaluate the bulk distribution of expression values after normalization.
    For each experiment a subplot is generated, which displays the distribution of log2
    ratios to a pseudo reference experiment as a density plot. The pseudo reference
    values are calculated as the average intensity values of all experiments. Only rows
    with quantitative values in all experiments are considered.

    Requires "Events experiment" columns and that average experiment expression values
    are calculated. This can be achieved by calling
    `msreport.analyze.analyze_missingness(qtable: Qtable)` and
    `msreport.analyze.calculate_experiment_means(qtable: Qtable)`.

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        experiments: Optional, list of experiments that will be displayed. If None, all
            experiments from `qtable.design` will be used.
        exclude_invalid: If True, rows are filtered according to the Boolean entries of
            the "Valid" column.
        ylim: Specifies the displayed range for the log2 ratios on the y-axis. Default
            is from -2 to 2.

    Returns:
        A matplotlib Figure and a list of Axes objects, containing the comparison plots.

    Raises:
        MsreportError: If less than two experiments are available for plotting.
    """
    tag: str = "Expression"
    qtable_data = qtable.get_data(exclude_invalid=exclude_invalid)
    if experiments is None:
        experiments = qtable.design["Experiment"].unique().tolist()

    if len(experiments) < 2:
        raise MsreportError(
            "This plot cannot be generated with less than two experiments present in "
            "the qtable.design"
        )

    # Extract the experiment expression columns and name them by experiment
    column_mapping = {f"{tag} {exp}": exp for exp in experiments}
    exp_data = qtable_data[list(column_mapping)]
    exp_data = exp_data.rename(columns=column_mapping)

    # Keep only rows that have quantitative events in each of the experiments
    quant_mask = np.all(
        [(qtable_data[f"Events {exp}"] > 0) for exp in experiments], axis=0
    )
    exp_data = exp_data[quant_mask]
    pseudo_ref = np.nanmean(exp_data, axis=1)
    exp_ratios = exp_data.subtract(pseudo_ref, axis=0)

    color_wheel = ColorWheelDict()
    num_experiments = len(experiments)
    figwidth = (num_experiments * 0.75) + 0.82
    figheight = 2.5
    figsize = (figwidth, figheight)

    sns.set_style("whitegrid")
    fig, axes = plt.subplots(1, num_experiments, figsize=figsize, sharey=True)

    for exp_pos, experiment in enumerate(experiments):
        ax = axes[exp_pos]
        values = exp_ratios[experiment]
        color = color_wheel[experiment]
        sns.kdeplot(y=values, fill=True, ax=ax, zorder=3, color=color, alpha=0.5)
        # The number of displayed rows is annotated only in the first subplot
        if exp_pos == 0:
            ax.text(
                x=ax.get_xlim()[1] / 20,
                y=ylim[1] * 0.95,
                s=f"n={len(values)}",
                va="top",
                ha="left",
                fontsize=8,
            )
        ax.set_xticklabels("")
        ax.tick_params(axis="both", labelsize=8)
        ax.set_xlabel(experiment, rotation=90)

    axes[0].set_ylabel("Protein ratios [log2]\nto pseudo reference")
    # The y-axis is shared, setting the limits once applies them to all subplots
    axes[0].set_ylim(ylim)
    for ax in axes:
        for spine in ["bottom", "left"]:
            ax.spines[spine].set_color("#000000")
            ax.spines[spine].set_linewidth(0.5)
        # Horizontal reference line at a log2 ratio of zero
        ax.plot(ax.get_xlim(), (0, 0), color="#999999", lw=1, zorder=2)
        ax.grid(False, axis="x")
        ax.grid(axis="y", linestyle="dashed", linewidth=1, color="#cccccc")
    sns.despine(top=True, right=True)
    fig.tight_layout()
    return fig, axes
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
def sample_pca(
    qtable: Qtable,
    tag: str = "Expression",
    pc_x: str = "PC1",
    pc_y: str = "PC2",
    exclude_invalid: bool = True,
) -> (plt.Figure, list[plt.Axes]):
    """Figure to compare sample similarities with a principal component analysis.

    On the left subplot two PCA components of log2 transformed, mean centered intensity
    values are shown. On the right subplot the explained variance of the principal
    components is displayed as a bar plot.

    It is possible to use intensity columns that are either log-transformed or not. The
    intensity values undergo an automatic evaluation to determine if they are already
    in log-space, and if necessary, they are transformed accordingly. Missing values
    are replaced with zero before the PCA is calculated.

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        tag: A string that is used to extract intensity containing columns.
            Default "Expression".
        pc_x: Principal component to plot on x-axis of the scatter plot, default "PC1".
            The number of calculated principal components is equal to the number of
            samples, but is limited by the number of used table rows and to a maximum
            of nine components.
        pc_y: Principal component to plot on y-axis of the scatter plot, default "PC2".
            The number of calculated principal components is equal to the number of
            samples, but is limited by the number of used table rows and to a maximum
            of nine components.
        exclude_invalid: If True, rows are filtered according to the Boolean entries of
            the "Valid" column.

    Returns:
        A matplotlib Figure and a list of Axes objects, containing the PCA plots.
    """
    table = qtable.make_sample_table(
        tag, samples_as_columns=True, exclude_invalid=exclude_invalid
    )
    design = qtable.get_design()

    # Zero intensities are treated as missing, rows without any value are removed
    table = table.replace({0: np.nan})
    table = table[np.isfinite(table).sum(axis=1) > 0]
    if not msreport.helper.intensities_in_logspace(table):
        table = np.log2(table)
    # fillna returns a new table instead of writing into the filtered selection
    table = table.fillna(0)

    # Samples as rows, features as columns
    table = table.transpose()
    sample_index = table.index.tolist()
    num_features = table.shape[1]
    table = sklearn.preprocessing.scale(table, with_std=False)

    # PCA cannot calculate more components than there are samples or features
    n_components = min(len(sample_index), num_features, 9)
    pca = sklearn.decomposition.PCA(n_components=n_components)
    components = pca.fit_transform(table)
    component_labels = [f"PC{i + 1}" for i in range(components.shape[1])]
    components_table = pd.DataFrame(
        data=components, columns=component_labels, index=sample_index
    )
    variance = pca.explained_variance_ratio_ * 100
    variance_lookup = dict(zip(component_labels, variance))

    # Prepare colors
    color_wheel = ColorWheelDict()
    experiments = qtable.get_experiments()
    _ = [color_wheel[exp] for exp in experiments]

    # Prepare figure
    num_legend_cols = 3
    legendheight = 0.2 + 0.2 * np.ceil(len(experiments) / num_legend_cols)
    plotheight = 3.7
    figheight = plotheight + legendheight
    figwidth = 4.3 + n_components * 0.2
    width_ratios = [4, 0.2 + n_components * 0.25]
    figsize = (figwidth, figheight)

    sns.set_style("white")
    fig, axes = plt.subplots(
        1, 2, figsize=figsize, gridspec_kw={"width_ratios": width_ratios}
    )

    # Comparison of two principal components
    ax = axes[0]
    texts = []
    for sample, data in components_table.iterrows():
        experiment = qtable.get_experiment(sample)
        label = design.loc[(design["Sample"] == sample), "Replicate"].tolist()[0]
        color = color_wheel[experiment]
        ax.scatter(
            data[pc_x],
            data[pc_y],
            color=color,
            edgecolor="#999999",
            lw=1,
            s=50,
            label=experiment,
        )
        texts.append(ax.text(data[pc_x], data[pc_y], label, fontdict={"fontsize": 9}))
    # Move the replicate labels to reduce overlaps between labels and points
    adjustText.adjust_text(
        texts,
        force_text=0.15,
        arrowprops=dict(arrowstyle="-", color="#ebae34", lw=0.5),
        lim=20,
        ax=ax,
    )
    ax.tick_params(axis="both", labelsize=9)
    ax.set_xlabel(f"{pc_x} ({variance_lookup[pc_x]:.2f}%)", size=12)
    ax.set_ylabel(f"{pc_y} ({variance_lookup[pc_y]:.2f}%)", size=12)
    ax.grid(axis="both", linestyle="dotted", linewidth=1)

    # Explained variance bar plot
    ax = axes[1]
    xpos = range(len(variance))
    ax.bar(xpos, variance, color="#D0D0D0", edgecolor="#000000")
    ax.set_xticks(xpos)
    ax.set_xticklabels(component_labels, rotation="vertical", ha="center")
    ax.tick_params(axis="both", labelsize=9)
    ax.set_ylabel("Explained variance", size=12)
    ax.grid(axis="y", linestyle="dashed", linewidth=1)

    # Add legend, each experiment is listed only once
    handles, labels = axes[0].get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    handles, labels = by_label.values(), by_label.keys()
    fig.legend(
        handles,
        labels,
        bbox_to_anchor=(0.5, 0.0),
        ncol=num_legend_cols,
        fontsize=9,
        loc="lower center",
    )
    # Reserve the lower part of the figure for the legend
    legend_space = legendheight / figheight
    fig.suptitle(f'PCA of "{tag}" columns')
    fig.tight_layout(rect=[0, legend_space, 1, 1])

    return fig, axes
|
|
635
|
+
|
|
636
|
+
|
|
637
|
+
def volcano_ma(
    qtable: Qtable,
    experiment_pair: list[str],
    comparison_tag: str = " vs ",
    pvalue_tag: str = "P-value",
    special_proteins: Optional[list[str]] = None,
    exclude_invalid: bool = True,
) -> (plt.Figure, list[plt.Axes]):
    """Generates a volcano and an MA plot for the comparison of two experiments.

    The left subplot (volcano plot) shows the "Ratio [log2]" column on the x-axis
    and the -log10 transformed p-value column on the y-axis. The right subplot (MA
    plot) shows the same ratio on the x-axis and the "Average expression" column on
    the y-axis. Both subplots share the x-axis.

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        experiment_pair: The names of the two experiments that will be compared,
            experiments must be present in qtable.design.
        comparison_tag: String used in comparison columns to separate a pair of
            experiments; default " vs ", which corresponds to the MsReport convention.
        pvalue_tag: String used for matching the pvalue columns; default "P-value",
            which corresponds to the MsReport convention.
        special_proteins: Optional, allows to specify a list of entries from the
            "Representative protein" column to be annotated. Entries are annotated
            with values from the "Gene name" column if present, otherwise from the
            "Representative protein" column.
        exclude_invalid: If True, rows are filtered according to the Boolean entries of
            the "Valid" column.

    Returns:
        A matplotlib Figure object and a list of two Axes objects containing the volcano
        and the MA plot.
    """
    comparison_group = comparison_tag.join(experiment_pair)

    special_proteins = special_proteins if special_proteins is not None else []
    data = qtable.get_data(exclude_invalid=exclude_invalid)
    annotation_column = (
        "Gene name" if "Gene name" in data.columns else "Representative protein"
    )
    # Shrink the default marker size as the number of rows grows; the row count is
    # clamped to the range 1000 - 10000, giving sizes between 2 and 0.2.
    scatter_size = 2 / (max(min(data.shape[0], 10000), 1000) / 1000)

    is_special = data["Representative protein"].isin(special_proteins)
    masks = {
        "highlight": is_special,
        "default": ~is_special,
    }
    params = {
        "highlight": {
            "s": 10,
            "color": "#E73C40",
            "edgecolor": "#000000",
            "lw": 0.2,
            "zorder": 3,
        },
        "default": {"s": scatter_size, "color": "#40B7B5", "zorder": 2},
    }

    # Replace the matched p-value columns with their -log10 transformed values.
    for column in msreport.helper.find_sample_columns(
        data, pvalue_tag, [comparison_group]
    ):
        data[column] = np.log10(data[column]) * -1

    fig, axes = plt.subplots(1, 2, figsize=[8, 4], sharex=True)
    fig.suptitle(comparison_group)

    # Only the p-value axis is -log10 transformed, so only that axis label carries
    # the "[-log10]" suffix.
    for ax, x_variable, y_variable, y_label in [
        (axes[0], "Ratio [log2]", pvalue_tag, f"{pvalue_tag} [-log10]"),
        (axes[1], "Ratio [log2]", "Average expression", "Average expression"),
    ]:
        x_col = " ".join([x_variable, comparison_group])
        y_col = " ".join([y_variable, comparison_group])
        x_values = data[x_col]
        y_values = data[y_col]
        ax.grid(axis="both", linestyle="dotted", linewidth=1)

        mask = masks["default"]
        ax.scatter(x_values[mask], y_values[mask], **params["default"])

        mask = masks["highlight"]
        _annotated_scatter(
            x_values=x_values[mask],
            y_values=y_values[mask],
            labels=data[annotation_column][mask],
            ax=ax,
            scatter_kws=params["highlight"],
        )

        ax.set_xlabel(x_variable)
        ax.set_ylabel(y_label)

    fig.tight_layout()
    return fig, axes
|
|
725
|
+
|
|
726
|
+
|
|
727
|
+
def expression_comparison(
    qtable: Qtable,
    experiment_pair: list[str],
    comparison_tag: str = " vs ",
    plot_average_expression: bool = False,
    special_proteins: Optional[list[str]] = None,
    exclude_invalid: bool = True,
) -> (plt.Figure, list[plt.Axes]):
    """Generates an expression comparison plot for two experiments.

    The subplot in the middle displays the maximum (or average) expression of the two
    experiments on the y-axis and the log fold change on the x-axis. The subplots on
    the left and right display entries that have zero "Events" in one of the two
    experiments. Entries with zero "Events" in both experiments are not plotted.

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        experiment_pair: The names of the two experiments that will be compared,
            experiments must be present in qtable.design.
        comparison_tag: String used in comparison columns to separate a pair of
            experiments; default " vs ", which corresponds to the MsReport convention.
        plot_average_expression: If True plot average expression instead of maximum
            expression. Default False.
        special_proteins: Optional, allows to specify a list of entries from the
            "Representative protein" column to be annotated. Entries are annotated
            with values from the "Gene name" column if present, otherwise from the
            "Representative protein" column.
        exclude_invalid: If True, rows are filtered according to the Boolean entries of
            the "Valid" column.

    Returns:
        A matplotlib Figure object and a list of three Axes objects containing the
        expression comparison plots.
    """
    exp_1, exp_2 = experiment_pair
    comparison_group = comparison_tag.join(experiment_pair)
    special_proteins = special_proteins if special_proteins is not None else []
    qtable_data = qtable.get_data(exclude_invalid=exclude_invalid)
    annotation_column = (
        "Gene name" if "Gene name" in qtable_data.columns else "Representative protein"
    )
    total_scatter_area = 5000
    params = {
        "highlight": {
            "s": 10,
            "color": "#E73C40",
            "edgecolor": "#000000",
            "lw": 0.2,
            "zorder": 3,
        },
        "default": {"alpha": 0.75, "color": "#40B7B5", "zorder": 2},
    }

    # Keep only entries with at least one event in either experiment. The explicit
    # copy makes sure the columns added below are written to an independent table
    # and not to a slice of the table returned by qtable.get_data().
    mask = (qtable_data[f"Events {exp_1}"] + qtable_data[f"Events {exp_2}"]) > 0
    qtable_data = qtable_data[mask].copy()

    only_exp_1 = qtable_data[f"Events {exp_2}"] == 0
    only_exp_2 = qtable_data[f"Events {exp_1}"] == 0
    mask_both = np.invert(np.any([only_exp_1, only_exp_2], axis=0))

    # Both summary columns are always calculated, 'plot_average_expression' selects
    # which one is displayed in the middle subplot.
    qtable_data[f"Maximum expression {comparison_group}"] = np.max(
        [qtable_data[f"Expression {exp_2}"], qtable_data[f"Expression {exp_1}"]], axis=0
    )
    qtable_data[f"Average expression {comparison_group}"] = np.nanmean(
        [qtable_data[f"Expression {exp_2}"], qtable_data[f"Expression {exp_1}"]], axis=0
    )

    def scattersize(df: pd.DataFrame, total_area) -> float:
        """Returns a marker size between 0.5 and 4 that shrinks with more rows."""
        if len(df) > 0:
            size = min(max(np.sqrt(total_area / df.shape[0]), 0.5), 4)
        else:
            size = 1
        return size

    width_ratios = [1, 5, 1]
    fig, axes = plt.subplots(
        1, 3, figsize=[6, 4], sharey=True, gridspec_kw={"width_ratios": width_ratios}
    )

    # Plot values quantified in both experiments
    ax = axes[1]
    values = qtable_data[mask_both]
    s = scattersize(values, total_scatter_area)
    x_variable = "Ratio [log2]"
    y_variable = (
        "Average expression" if plot_average_expression else "Maximum expression"
    )
    x_col = " ".join([x_variable, comparison_group])
    y_col = " ".join([y_variable, comparison_group])
    x_values = values[x_col]
    y_values = values[y_col]
    ax.grid(axis="both", linestyle="dotted", linewidth=1)
    ax.scatter(x_values, y_values, s=s, **params["default"])
    highlight_mask = values["Representative protein"].isin(special_proteins)
    _annotated_scatter(
        x_values=x_values[highlight_mask],
        y_values=y_values[highlight_mask],
        labels=values[annotation_column][highlight_mask],
        ax=ax,
        scatter_kws=params["highlight"],
    )

    ax.set_xlabel(x_variable, fontsize=9)
    ax.set_title(comparison_group, fontsize=12)
    ax.set_ylabel(y_variable)

    # Plot values quantified only in one experiment
    for ax, mask, exp in [(axes[2], only_exp_1, exp_1), (axes[0], only_exp_2, exp_2)]:
        y_variable = f"Expression {exp}"
        values = qtable_data[mask]
        highlight_mask = values["Representative protein"].isin(special_proteins)
        s = scattersize(values, total_scatter_area)

        ax.grid(axis="y", linestyle="dotted", linewidth=1)
        ax.set_ylabel(y_variable)
        ax.tick_params(axis="x", bottom=False, labelbottom=False)

        if len(values) == 0:
            continue

        sns.stripplot(
            y=values[y_variable],
            jitter=True,
            size=np.sqrt(s * 2),
            marker="o",
            edgecolor="none",
            ax=ax,
            **params["default"],
        )

        xlim = -0.2, 0.2
        ax.set_xlim(xlim)
        # Reuse the jittered positions of the strip plot so that highlighted points
        # are drawn exactly on top of their default counterparts.
        offsets = ax.collections[0].get_offsets()[highlight_mask]
        _annotated_scatter(
            x_values=offsets[:, 0],
            y_values=offsets[:, 1],
            labels=values[annotation_column][highlight_mask],
            ax=ax,
            scatter_kws=params["highlight"],
        )
        # Restore the x-limits after the annotated points have been added.
        ax.set_xlim(xlim)

    axes[0].set_title(f"Absent in\n{exp_1}", fontsize=9)
    axes[2].set_title(f"Absent in\n{exp_2}", fontsize=9)

    fig.tight_layout()
    return fig, axes
|
|
874
|
+
|
|
875
|
+
|
|
876
|
+
def box_and_bars(
    box_values: Iterable[Iterable[float]],
    bar_values: Iterable[float],
    group_names: list[str],
    colors: Optional[list[str]] = None,
) -> (plt.Figure, list[plt.Axes]):
    """Creates a two-row figure with box plots on top and bar plots below.

    Each group is drawn at the same x position in both subplots, which share their
    x-axis. The upper subplot shows one box plot per entry of 'box_values', the lower
    subplot one bar per entry of 'bar_values'. The figure width grows with the number
    of groups. 'box_values', 'bar_values' and 'group_names' must all have the same
    length.

    Args:
        box_values: A sequence of sequences, each containing the y values of one box
            plot.
        bar_values: A sequence of y values, one per bar.
        group_names: Labels of the groups, used as x tick labels.
        colors: Optional sequence of hex color codes, one per group, used as fill
            color of the boxes and bars. Must have the same length as 'group_names'.
            If None, all groups are colored in light grey.

    Returns:
        A matplotlib Figure and a list of Axes objects containing the box and bar plots.
    """
    assert len(box_values) == len(bar_values) == len(group_names)
    assert colors is None or len(colors) == len(group_names)
    if colors is None:
        colors = ["#D0D0D0"] * len(group_names)

    group_count = len(group_names)
    positions = range(group_count)
    element_width = 0.8
    x_limits = (-1 + 0.15, group_count - 0.15)
    figure_size = ((group_count * 0.25) + 1.2, 6)

    sns.set_style("whitegrid")
    fig, axes = plt.subplots(2, figsize=figure_size, sharex=True)
    box_ax, bar_ax = axes[0], axes[1]

    # Upper subplot: horizontal zero line and one box plot per group
    box_ax.plot(x_limits, (0, 0), color="#999999", lw=1, zorder=2)
    box_artists = box_ax.boxplot(
        box_values,
        positions=positions,
        vert=True,
        showfliers=False,
        patch_artist=True,
        widths=element_width,
        medianprops={"color": "#000000"},
    )
    for fill_color, patch in zip(colors, box_artists["boxes"]):
        patch.set(facecolor=fill_color)
    # Make sure the y-axis always spans at least the range from -0.4 to 0.401
    lower, upper = box_ax.get_ylim()
    box_ax.set_ylim(min(-0.4, lower), max(0.401, upper))

    # Lower subplot: one bar per group
    bar_ax.bar(
        positions, bar_values, width=element_width, color=colors, edgecolor="#000000"
    )
    bar_ax.set_xticklabels(group_names, rotation=90)

    # Apply the same spine and grid style to both subplots
    for ax in axes:
        for side in ("bottom", "left"):
            ax.spines[side].set_color("#000000")
            ax.spines[side].set_linewidth(1)
        ax.grid(False, axis="x")
        ax.grid(axis="y", linestyle="dashed", linewidth=1, color="#cccccc")
    sns.despine(top=True, right=True)

    # The x-axis is shared, setting the limits on the last subplot is sufficient
    axes[-1].set_xlim(x_limits)
    fig.tight_layout()
    return fig, axes
|
|
950
|
+
|
|
951
|
+
|
|
952
|
+
def expression_clustermap(
    qtable: Qtable,
    exclude_invalid: bool = True,
    cluster_method: str = "average",
) -> sns.matrix.ClusterGrid:
    """Plot sample expression values as a hierarchically-clustered heatmap.

    Entries that are flagged in the "Missing {sample}" columns of the qtable are set
    to an intensity of 0 before clustering. The same entries are passed as a mask to
    the heatmap, so that they are not drawn and only the heatmap background color is
    visible at their positions.

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        exclude_invalid: If True, rows are filtered according to the Boolean entries of
            the "Valid" column.
        cluster_method: Linkage method to use for calculating clusters. See
            `scipy.cluster.hierarchy.linkage` documentation for more information.

    Returns:
        A seaborn ClusterGrid instance. Note that ClusterGrid has a `savefig` method
        that can be used for saving the figure.
    """
    samples = qtable.get_samples()
    experiments = qtable.get_experiments()
    missing_columns = [f"Missing {sample}" for sample in samples]

    expression = qtable.make_expression_table(samples_as_columns=True)
    expression = expression[samples]
    for sample, missing_column in zip(samples, missing_columns):
        expression.loc[qtable.data[missing_column], sample] = 0
    missing_mask = qtable.data[missing_columns].to_numpy()

    if exclude_invalid:
        is_valid = qtable.data["Valid"]
        expression = expression[is_valid]
        missing_mask = missing_mask[is_valid]

    color_wheel = ColorWheelDict()
    # Look up every experiment once, in experiment order, before the per-sample
    # lookups; presumably this fixes the color assigned to each experiment.
    for experiment in experiments:
        color_wheel[experiment]
    sample_colors = []
    for sample in samples:
        sample_colors.append(color_wheel[qtable.get_experiment(sample)])
    figsize = (0.3 + len(samples) * 0.4, 5)

    # Generate the plot
    cluster_grid = sns.clustermap(
        expression,
        col_colors=sample_colors,
        cmap="magma",
        yticklabels=False,
        mask=missing_mask,
        figsize=figsize,
        metric="euclidean",
        method=cluster_method,
    )
    cluster_grid.ax_row_dendrogram.set_visible(False)

    # Add background color and spines
    heatmap_ax = cluster_grid.ax_heatmap
    heatmap_ax.set_facecolor("#F9F9F9")
    for spine in heatmap_ax.spines.values():
        spine.set_visible(True)
        spine.set_linewidth(0.75)
    return cluster_grid
|
|
1011
|
+
|
|
1012
|
+
|
|
1013
|
+
def pvalue_histogram(
    qtable: Qtable,
    pvalue_tag: str = "P-value",
    comparison_tag: str = " vs ",
    experiment_pairs: Optional[Iterable[Iterable[str]]] = None,
    exclude_invalid: bool = True,
) -> (plt.Figure, list[plt.Axes]):
    """Generates p-value histograms for one or multiple experiment comparisons.

    Histograms are generated with 20 bins of size 0.05. The p-value distribution of each
    experiment comparison is shown with a separate subplot.

    Args:
        qtable: A `Qtable` instance, which data is used for plotting.
        pvalue_tag: String used for matching the pvalue columns; default "P-value",
            which corresponds to the MsReport convention.
        comparison_tag: String used in comparison columns to separate a pair of
            experiments; default " vs ", which corresponds to the MsReport convention.
        experiment_pairs: Optional, list of experiment pairs that will be used for
            plotting. For each experiment pair a p-value column must exist that follows
            the format f"{pvalue_tag} {experiment_1}{comparison_tag}{experiment_2}".
            If None, all experiment comparisons that are found in qtable.data are used.
        exclude_invalid: If True, rows are filtered according to the Boolean entries of
            the "Valid" column.

    Returns:
        A matplotlib Figure and a list of Axes objects, containing the p-value plots.

    Raises:
        ValueError: If no experiment pairs were specified or found in the data.
    """
    data = qtable.get_data(exclude_invalid=exclude_invalid)

    if experiment_pairs is None:
        # Find all experiment pairs that have a p-value column in the data
        experiment_pairs = []
        for experiment_pair in itertools.permutations(qtable.get_experiments(), 2):
            comparison_group = comparison_tag.join(experiment_pair)
            comparison_column = f"{pvalue_tag} {comparison_group}"
            if comparison_column in data.columns:
                experiment_pairs.append(experiment_pair)
    else:
        # Materialize the pairs so that generators and other one-shot iterables can
        # be counted and iterated over.
        experiment_pairs = [tuple(pair) for pair in experiment_pairs]

    num_plots = len(experiment_pairs)
    if num_plots == 0:
        # Without this check the figure creation below fails with an error message
        # that does not point to the actual problem.
        raise ValueError(
            f'No experiment comparisons with a "{pvalue_tag}" column available for '
            "plotting."
        )

    figwidth = (num_plots * 1.8) + -0.6
    figheight = 2.5
    figsize = (figwidth, figheight)

    fig, axes = plt.subplots(1, num_plots, figsize=figsize, sharex=True, sharey=True)
    # plt.subplots returns a single Axes object if only one subplot is created
    axes = axes if isinstance(axes, Iterable) else (axes,)
    fig.subplots_adjust(wspace=0.5)

    bins = np.arange(0, 1.01, 0.05)
    for plot_number, experiment_pair in enumerate(experiment_pairs):
        ax = axes[plot_number]
        comparison_group = comparison_tag.join(experiment_pair)
        comparison_column = f"{pvalue_tag} {comparison_group}"
        p_values = data[comparison_column]
        ax.hist(
            p_values,
            bins=bins,
            zorder=2,
            color="#fbc97a",
            edgecolor="#FFFFFF",
            linewidth=0.7,
        )

        # Adjust x- and y-axis
        ax.set_xlabel(None)
        ax.set_xticks(np.arange(0, 1.01, 0.5))
        ax.tick_params(labelsize=9)
        if plot_number > 0:
            ax.tick_params(axis="y", color="none")

        # Add second label; the twin axis is only used to display the name of the
        # comparison on the right side of the subplot.
        ax2 = ax.twinx()
        ax2.set_yticks([])
        ax2.set_ylabel(comparison_group, fontsize=9)

        # Adjust spines
        sns.despine(top=True, right=True)
        for spine in ["bottom", "left"]:
            ax.spines[spine].set_color("#000000")
            ax.spines[spine].set_linewidth(1)

        # Adjust grid
        ax.grid(False, axis="x")
        ax.grid(axis="y", linestyle="dashed", linewidth=1, color="#cccccc", zorder=1)

    axes[0].set_ylabel(f"{pvalue_tag} count")
    # The x-axis is shared between all subplots
    axes[-1].set_xlim(-0.05, 1.05)

    return fig, axes
|
|
1103
|
+
|
|
1104
|
+
|
|
1105
|
+
def _annotated_scatter(x_values, y_values, labels, ax=None, scatter_kws=None) -> None:
|
|
1106
|
+
ax = plt.gca() if ax is None else ax
|
|
1107
|
+
if scatter_kws is None:
|
|
1108
|
+
scatter_kws = {
|
|
1109
|
+
"s": 10,
|
|
1110
|
+
"color": "#FAB74E",
|
|
1111
|
+
"edgecolor": "#000000",
|
|
1112
|
+
"lw": 0.2,
|
|
1113
|
+
"zorder": 3,
|
|
1114
|
+
}
|
|
1115
|
+
text_params = {
|
|
1116
|
+
"force_text": 0.15,
|
|
1117
|
+
"arrowprops": dict(
|
|
1118
|
+
arrowstyle="-", color=scatter_kws["color"], lw=0.75, alpha=0.5
|
|
1119
|
+
),
|
|
1120
|
+
"lim": 100,
|
|
1121
|
+
}
|
|
1122
|
+
|
|
1123
|
+
texts = []
|
|
1124
|
+
for x, y, text in zip(x_values, y_values, labels):
|
|
1125
|
+
texts.append(ax.text(x, y, text, fontdict={"fontsize": 9}))
|
|
1126
|
+
|
|
1127
|
+
if texts:
|
|
1128
|
+
adjustText.adjust_text(texts, ax=ax, **text_params)
|
|
1129
|
+
ax.scatter(x_values, y_values, **scatter_kws)
|