msreport 0.0.30__py3-none-any.whl → 0.0.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- msreport/__init__.py +1 -1
- msreport/aggregate/__init__.py +10 -0
- msreport/aggregate/condense.py +9 -0
- msreport/aggregate/pivot.py +14 -5
- msreport/aggregate/summarize.py +14 -4
- msreport/analyze.py +67 -5
- msreport/export.py +10 -16
- msreport/fasta.py +9 -2
- msreport/helper/__init__.py +18 -0
- msreport/helper/maxlfq.py +3 -3
- msreport/impute.py +18 -10
- msreport/isobar.py +11 -14
- msreport/normalize.py +95 -10
- msreport/peptidoform.py +21 -11
- msreport/plot/__init__.py +3 -3
- msreport/plot/comparison.py +7 -2
- msreport/plot/multivariate.py +34 -15
- msreport/plot/quality.py +1 -1
- msreport/qtable.py +25 -11
- msreport/reader.py +362 -37
- msreport/rinterface/limma.py +1 -1
- {msreport-0.0.30.dist-info → msreport-0.0.32.dist-info}/METADATA +11 -1
- msreport-0.0.32.dist-info/RECORD +38 -0
- msreport-0.0.30.dist-info/RECORD +0 -38
- {msreport-0.0.30.dist-info → msreport-0.0.32.dist-info}/WHEEL +0 -0
- {msreport-0.0.30.dist-info → msreport-0.0.32.dist-info}/licenses/LICENSE.txt +0 -0
- {msreport-0.0.30.dist-info → msreport-0.0.32.dist-info}/top_level.txt +0 -0
msreport/peptidoform.py
CHANGED
|
@@ -1,5 +1,14 @@
|
|
|
1
|
+
"""Defines the `Peptide` class and associated utilities for handling peptidoforms.
|
|
2
|
+
|
|
3
|
+
This module provides a `Peptide` class for representing modified peptide sequences,
|
|
4
|
+
and their site localization probabilities. It offers methods to access and manipulate
|
|
5
|
+
peptide information, summarize isoform probabilities, and retrieve modification sites.
|
|
6
|
+
Additionally, it includes utility functions for parsing modified sequence strings and
|
|
7
|
+
converting site localization probabilities to and from a standardized string format.
|
|
8
|
+
"""
|
|
9
|
+
|
|
1
10
|
from collections import defaultdict as ddict
|
|
2
|
-
from typing import Optional
|
|
11
|
+
from typing import Optional
|
|
3
12
|
|
|
4
13
|
import numpy as np
|
|
5
14
|
|
|
@@ -10,7 +19,7 @@ class Peptide:
|
|
|
10
19
|
def __init__(
|
|
11
20
|
self,
|
|
12
21
|
modified_sequence: str,
|
|
13
|
-
localization_probabilities: Optional[dict] = None,
|
|
22
|
+
localization_probabilities: Optional[dict[str, dict[int, float]]] = None,
|
|
14
23
|
protein_position: Optional[int] = None,
|
|
15
24
|
):
|
|
16
25
|
plain_sequence, modifications = parse_modified_sequence(
|
|
@@ -28,7 +37,7 @@ class Peptide:
|
|
|
28
37
|
self.modification_positions[mod_tag].append(position)
|
|
29
38
|
self.modified_residues[position] = mod_tag
|
|
30
39
|
|
|
31
|
-
def make_modified_sequence(self, include: Optional[list] = None) -> str:
|
|
40
|
+
def make_modified_sequence(self, include: Optional[list[str]] = None) -> str:
|
|
32
41
|
"""Returns a modified sequence string.
|
|
33
42
|
|
|
34
43
|
Args:
|
|
@@ -55,7 +64,7 @@ class Peptide:
|
|
|
55
64
|
return 0
|
|
56
65
|
return len(self.modification_positions[modification])
|
|
57
66
|
|
|
58
|
-
def isoform_probability(self, modification: str) ->
|
|
67
|
+
def isoform_probability(self, modification: str) -> float | None:
|
|
59
68
|
"""Calculates the isoform probability for a given modification.
|
|
60
69
|
|
|
61
70
|
Returns:
|
|
@@ -66,12 +75,13 @@ class Peptide:
|
|
|
66
75
|
"""
|
|
67
76
|
probabilities = []
|
|
68
77
|
for site in self.list_modified_peptide_sites(modification):
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
78
|
+
probability = self.get_peptide_site_probability(site)
|
|
79
|
+
if probability is None:
|
|
80
|
+
return None
|
|
81
|
+
probabilities.append(probability)
|
|
72
82
|
return float(np.prod(probabilities))
|
|
73
83
|
|
|
74
|
-
def get_peptide_site_probability(self, position: int) ->
|
|
84
|
+
def get_peptide_site_probability(self, position: int) -> float | None:
|
|
75
85
|
"""Return the modification localization probability of the peptide position.
|
|
76
86
|
|
|
77
87
|
Args:
|
|
@@ -85,7 +95,7 @@ class Peptide:
|
|
|
85
95
|
"""
|
|
86
96
|
return self._get_site_probability(position, is_protein_position=False)
|
|
87
97
|
|
|
88
|
-
def get_protein_site_probability(self, position: int) ->
|
|
98
|
+
def get_protein_site_probability(self, position: int) -> float | None:
|
|
89
99
|
"""Return the modification localization probability of the protein position.
|
|
90
100
|
|
|
91
101
|
Args:
|
|
@@ -109,7 +119,7 @@ class Peptide:
|
|
|
109
119
|
|
|
110
120
|
def _get_site_probability(
|
|
111
121
|
self, position: int, is_protein_position: bool
|
|
112
|
-
) ->
|
|
122
|
+
) -> float | None:
|
|
113
123
|
"""Return the modification localization probability of the peptide position.
|
|
114
124
|
|
|
115
125
|
Args:
|
|
@@ -224,7 +234,7 @@ def modify_peptide(
|
|
|
224
234
|
|
|
225
235
|
|
|
226
236
|
def make_localization_string(
|
|
227
|
-
localization_probabilities: dict, decimal_places: int = 3
|
|
237
|
+
localization_probabilities: dict[str, dict[int, float]], decimal_places: int = 3
|
|
228
238
|
) -> str:
|
|
229
239
|
"""Generates a site localization probability string.
|
|
230
240
|
|
msreport/plot/__init__.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Plotting functions for visualizing proteomics data from `Qtable`.
|
|
2
2
|
|
|
3
3
|
The functions in this module generate a wide range of plots, including heatmaps, PCA
|
|
4
4
|
plots, volcano plots, and histograms, to analyze and compare expression values,
|
|
@@ -6,8 +6,8 @@ missingness, contaminants, and other features in proteomics datasets. The plots
|
|
|
6
6
|
designed to work with the Qtable class as input, which provides structured access to
|
|
7
7
|
proteomics data and experimental design information.
|
|
8
8
|
|
|
9
|
-
|
|
10
|
-
|
|
9
|
+
Users can customize plot styles via the `set_active_style` function, which allows
|
|
10
|
+
applying style sheets from the msreport library or those available in matplotlib.
|
|
11
11
|
"""
|
|
12
12
|
|
|
13
13
|
from .comparison import expression_comparison, pvalue_histogram, volcano_ma
|
msreport/plot/comparison.py
CHANGED
|
@@ -77,10 +77,15 @@ def volcano_ma(
|
|
|
77
77
|
)
|
|
78
78
|
special_entries = list(special_entries) + list(special_proteins)
|
|
79
79
|
|
|
80
|
-
|
|
81
|
-
if annotation_column not in data.columns:
|
|
80
|
+
if annotation_column not in qtable.data.columns:
|
|
82
81
|
annotation_column = qtable.id_column
|
|
83
82
|
|
|
83
|
+
data = qtable.get_data(exclude_invalid=exclude_invalid)
|
|
84
|
+
mask = np.ones(data.shape[0], dtype=bool)
|
|
85
|
+
for tag in [ratio_tag, expression_tag, pvalue_tag]:
|
|
86
|
+
mask = mask & np.isfinite(data[f"{tag} {comparison_group}"])
|
|
87
|
+
data = data[mask]
|
|
88
|
+
|
|
84
89
|
scatter_size = 2 / (max(min(data.shape[0], 10000), 1000) / 1000)
|
|
85
90
|
|
|
86
91
|
masks = {
|
msreport/plot/multivariate.py
CHANGED
|
@@ -21,6 +21,7 @@ def sample_pca(
|
|
|
21
21
|
pc_x: str = "PC1",
|
|
22
22
|
pc_y: str = "PC2",
|
|
23
23
|
exclude_invalid: bool = True,
|
|
24
|
+
exclude_missing: bool = False,
|
|
24
25
|
) -> tuple[plt.Figure, list[plt.Axes]]:
|
|
25
26
|
"""Figure to compare sample similarities with a principle component analysis.
|
|
26
27
|
|
|
@@ -44,11 +45,14 @@ def sample_pca(
|
|
|
44
45
|
samples.
|
|
45
46
|
exclude_invalid: If True, rows are filtered according to the Boolean entries of
|
|
46
47
|
the "Valid" column.
|
|
48
|
+
exclude_missing: If True, only rows without any missing values are used.
|
|
47
49
|
|
|
48
50
|
Returns:
|
|
49
51
|
A matplotlib Figure and a list of Axes objects, containing the PCA plots.
|
|
50
52
|
"""
|
|
51
53
|
design = qtable.get_design()
|
|
54
|
+
samples = qtable.get_samples()
|
|
55
|
+
|
|
52
56
|
if design.shape[0] < 3:
|
|
53
57
|
fig, ax = plt.subplots(1, 1, figsize=(2, 1.3))
|
|
54
58
|
fig.suptitle(f'PCA of "{tag}" values', y=1.1)
|
|
@@ -65,13 +69,22 @@ def sample_pca(
|
|
|
65
69
|
return fig, np.array([ax])
|
|
66
70
|
|
|
67
71
|
table = qtable.make_sample_table(
|
|
68
|
-
tag, samples_as_columns=True, exclude_invalid=
|
|
72
|
+
tag, samples_as_columns=True, exclude_invalid=False
|
|
69
73
|
)
|
|
74
|
+
|
|
75
|
+
inclusion_mask = np.ones(qtable.data.shape[0], dtype=bool)
|
|
76
|
+
if exclude_invalid:
|
|
77
|
+
inclusion_mask = inclusion_mask & qtable["Valid"]
|
|
78
|
+
if exclude_missing:
|
|
79
|
+
_non_missing_masks = [(qtable[f"Missing {s}"] == 0) for s in samples]
|
|
80
|
+
inclusion_mask = inclusion_mask & (np.all(_non_missing_masks, axis=0))
|
|
81
|
+
table = table[inclusion_mask]
|
|
82
|
+
|
|
70
83
|
table = table.replace({0: np.nan})
|
|
71
84
|
table = table[np.isfinite(table).sum(axis=1) > 0]
|
|
72
85
|
if not msreport.helper.intensities_in_logspace(table):
|
|
73
86
|
table = np.log2(table)
|
|
74
|
-
table
|
|
87
|
+
table = table.fillna(0)
|
|
75
88
|
|
|
76
89
|
table = table.transpose()
|
|
77
90
|
sample_index = table.index.tolist()
|
|
@@ -203,6 +216,7 @@ def sample_pca(
|
|
|
203
216
|
def expression_clustermap(
|
|
204
217
|
qtable: Qtable,
|
|
205
218
|
exclude_invalid: bool = True,
|
|
219
|
+
exclude_missing: bool = False,
|
|
206
220
|
remove_imputation: bool = True,
|
|
207
221
|
mean_center: bool = False,
|
|
208
222
|
cluster_samples: bool = True,
|
|
@@ -218,6 +232,7 @@ def expression_clustermap(
|
|
|
218
232
|
qtable: A `Qtable` instance, which data is used for plotting.
|
|
219
233
|
exclude_invalid: If True, rows are filtered according to the Boolean entries of
|
|
220
234
|
the "Valid" column.
|
|
235
|
+
exclude_missing: If True, only rows without any missing values are used.
|
|
221
236
|
remove_imputation: If True, imputed values are set to 0 before clustering.
|
|
222
237
|
Defaults to True.
|
|
223
238
|
mean_center: If True, the data is mean-centered before clustering. Defaults to
|
|
@@ -242,25 +257,29 @@ def expression_clustermap(
|
|
|
242
257
|
if len(samples) < 2:
|
|
243
258
|
raise ValueError("At least two samples are required to generate a clustermap.")
|
|
244
259
|
|
|
245
|
-
data = qtable.make_expression_table(samples_as_columns=True)
|
|
260
|
+
data = qtable.make_expression_table(samples_as_columns=True, exclude_invalid=False)
|
|
246
261
|
data = data[samples]
|
|
262
|
+
data = data.fillna(0)
|
|
247
263
|
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
data.loc[qtable.data[f"Missing {sample}"], sample] = 0
|
|
251
|
-
data[sample] = data[sample].fillna(0)
|
|
252
|
-
|
|
253
|
-
if not mean_center:
|
|
254
|
-
# Hide missing values in the heatmap, making them appear white
|
|
255
|
-
mask_values = qtable.data[
|
|
264
|
+
if not mean_center: # Hide missing values in the heatmap, making them appear white
|
|
265
|
+
hide_values_mask = qtable.data[
|
|
256
266
|
[f"Missing {sample}" for sample in samples]
|
|
257
267
|
].to_numpy()
|
|
258
268
|
else:
|
|
259
|
-
|
|
269
|
+
hide_values_mask = np.zeros(data.shape, dtype=bool)
|
|
270
|
+
|
|
271
|
+
if remove_imputation:
|
|
272
|
+
for sample in samples:
|
|
273
|
+
data.loc[qtable.data[f"Missing {sample}"], sample] = 0
|
|
260
274
|
|
|
275
|
+
inclusion_mask = np.ones(data.shape[0], dtype=bool)
|
|
261
276
|
if exclude_invalid:
|
|
262
|
-
|
|
263
|
-
|
|
277
|
+
inclusion_mask = inclusion_mask & qtable["Valid"]
|
|
278
|
+
if exclude_missing:
|
|
279
|
+
_non_missing_masks = [(qtable[f"Missing {s}"] == 0) for s in samples]
|
|
280
|
+
inclusion_mask = inclusion_mask & (np.all(_non_missing_masks, axis=0))
|
|
281
|
+
hide_values_mask = hide_values_mask[inclusion_mask]
|
|
282
|
+
data = data[inclusion_mask]
|
|
264
283
|
|
|
265
284
|
color_wheel = ColorWheelDict()
|
|
266
285
|
for exp in experiments:
|
|
@@ -314,7 +333,7 @@ def expression_clustermap(
|
|
|
314
333
|
col_cluster=cluster_samples,
|
|
315
334
|
col_colors=sample_colors,
|
|
316
335
|
row_colors=["#000000" for _ in range(len(data))],
|
|
317
|
-
mask=
|
|
336
|
+
mask=hide_values_mask,
|
|
318
337
|
method=cluster_method,
|
|
319
338
|
metric="euclidean",
|
|
320
339
|
**heatmap_args,
|
msreport/plot/quality.py
CHANGED
|
@@ -314,7 +314,7 @@ def sample_intensities(
|
|
|
314
314
|
|
|
315
315
|
@with_active_style
|
|
316
316
|
def sample_correlation(
|
|
317
|
-
qtable, exclude_invalid: bool = True, labels: bool = False
|
|
317
|
+
qtable: Qtable, exclude_invalid: bool = True, labels: bool = False
|
|
318
318
|
) -> tuple[plt.Figure, list[plt.Axes]]:
|
|
319
319
|
"""Generates a pair-wise correlation matrix of samples 'Expression' values.
|
|
320
320
|
|
msreport/qtable.py
CHANGED
|
@@ -1,14 +1,28 @@
|
|
|
1
|
-
|
|
1
|
+
"""Defines the `Qtable` class, the central container for quantitative proteomics data.
|
|
2
|
+
|
|
3
|
+
The `Qtable` class serves as the standardized data structure for `msreport`,
|
|
4
|
+
storing a main table with quantitative values and associated metadata for its entries;
|
|
5
|
+
it also maintains the name of the unique ID column for the main table. Additionally,
|
|
6
|
+
it stores an experimental design table that links sample names to experimental
|
|
7
|
+
conditions and replicate information.
|
|
8
|
+
|
|
9
|
+
`Qtable` provides convenience methods for creating subtables and accessing design
|
|
10
|
+
related information (e.g., samples per experiment), and instances of `Qtable` can be
|
|
11
|
+
easily saved to disk and loaded back. As the central data container, the `Qtable`
|
|
12
|
+
facilitates seamless integration with the high-level modules `analyze`, `plot` and
|
|
13
|
+
`export`, which all directly operate on `Qtable` instances.
|
|
14
|
+
"""
|
|
2
15
|
|
|
3
16
|
import copy
|
|
4
17
|
import os
|
|
5
18
|
import warnings
|
|
6
19
|
from contextlib import contextmanager
|
|
7
|
-
from typing import Any, Iterable, Optional
|
|
20
|
+
from typing import Any, Generator, Iterable, Optional
|
|
8
21
|
|
|
9
22
|
import numpy as np
|
|
10
23
|
import pandas as pd
|
|
11
24
|
import yaml
|
|
25
|
+
from typing_extensions import Self
|
|
12
26
|
|
|
13
27
|
import msreport.helper as helper
|
|
14
28
|
|
|
@@ -359,7 +373,7 @@ class Qtable:
|
|
|
359
373
|
keep_experiments: Optional[Iterable[str]] = None,
|
|
360
374
|
exclude_samples: Optional[Iterable[str]] = None,
|
|
361
375
|
keep_samples: Optional[Iterable[str]] = None,
|
|
362
|
-
):
|
|
376
|
+
) -> Generator[None, None, None]:
|
|
363
377
|
"""Context manager to temporarily modify the design table.
|
|
364
378
|
|
|
365
379
|
Args:
|
|
@@ -422,7 +436,7 @@ class Qtable:
|
|
|
422
436
|
self.design.to_csv(filepaths["design"], sep="\t", index=True)
|
|
423
437
|
|
|
424
438
|
@classmethod
|
|
425
|
-
def load(cls, directory: str, basename: str) ->
|
|
439
|
+
def load(cls, directory: str, basename: str) -> Self:
|
|
426
440
|
"""Load a qtable from disk by reading a data, design, and config file.
|
|
427
441
|
|
|
428
442
|
Loading a qtable will first import the three files generated during saving, then
|
|
@@ -470,7 +484,7 @@ class Qtable:
|
|
|
470
484
|
)
|
|
471
485
|
id_column = config_data["Unique ID column"]
|
|
472
486
|
|
|
473
|
-
qtable =
|
|
487
|
+
qtable = cls(data, design, id_column)
|
|
474
488
|
qtable._expression_columns = config_data["Expression columns"]
|
|
475
489
|
qtable._expression_features = config_data["Expression features"]
|
|
476
490
|
qtable._expression_sample_mapping = config_data["Expression sample mapping"]
|
|
@@ -486,11 +500,11 @@ class Qtable:
|
|
|
486
500
|
)
|
|
487
501
|
self.data.to_csv(path, sep="\t", index=index)
|
|
488
502
|
|
|
489
|
-
def to_clipboard(self, index: bool = False):
|
|
503
|
+
def to_clipboard(self, index: bool = False) -> None:
|
|
490
504
|
"""Writes the data table to the system clipboard."""
|
|
491
505
|
self.data.to_clipboard(sep="\t", index=index)
|
|
492
506
|
|
|
493
|
-
def copy(self) ->
|
|
507
|
+
def copy(self) -> Self:
|
|
494
508
|
"""Returns a copy of this Qtable instance."""
|
|
495
509
|
return self.__copy__()
|
|
496
510
|
|
|
@@ -579,8 +593,8 @@ class Qtable:
|
|
|
579
593
|
self._expression_features = []
|
|
580
594
|
self._expression_sample_mapping = {}
|
|
581
595
|
|
|
582
|
-
def __copy__(self) ->
|
|
583
|
-
new_instance =
|
|
596
|
+
def __copy__(self) -> Self:
|
|
597
|
+
new_instance = type(self)(self.data, self.design, self.id_column)
|
|
584
598
|
# Copy all private attributes
|
|
585
599
|
for attr in dir(self):
|
|
586
600
|
if (
|
|
@@ -609,7 +623,7 @@ def _match_samples_to_tag_columns(
|
|
|
609
623
|
samples: Iterable[str],
|
|
610
624
|
columns: Iterable[str],
|
|
611
625
|
tag: str,
|
|
612
|
-
) -> dict:
|
|
626
|
+
) -> dict[str, str]:
|
|
613
627
|
"""Mapping of samples to columns which contain the sample and the tag.
|
|
614
628
|
|
|
615
629
|
Args:
|
|
@@ -632,7 +646,7 @@ def _match_samples_to_tag_columns(
|
|
|
632
646
|
return mapping
|
|
633
647
|
|
|
634
648
|
|
|
635
|
-
def _get_qtable_export_filepaths(directory: str, name: str):
|
|
649
|
+
def _get_qtable_export_filepaths(directory: str, name: str) -> dict[str, str]:
|
|
636
650
|
"""Returns a dictionary of standard filepaths for loading and saving a qtable."""
|
|
637
651
|
filenames = {
|
|
638
652
|
"data": f"{name}.data.tsv",
|