msreport 0.0.30__py3-none-any.whl → 0.0.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- msreport/__init__.py +1 -1
- msreport/aggregate/__init__.py +10 -0
- msreport/aggregate/condense.py +9 -0
- msreport/aggregate/pivot.py +14 -5
- msreport/aggregate/summarize.py +14 -4
- msreport/analyze.py +67 -5
- msreport/export.py +9 -15
- msreport/fasta.py +9 -2
- msreport/helper/__init__.py +18 -0
- msreport/impute.py +18 -10
- msreport/isobar.py +11 -14
- msreport/normalize.py +95 -10
- msreport/peptidoform.py +21 -11
- msreport/plot/__init__.py +3 -3
- msreport/plot/quality.py +1 -1
- msreport/qtable.py +25 -11
- msreport/reader.py +216 -23
- msreport/rinterface/limma.py +1 -1
- {msreport-0.0.30.dist-info → msreport-0.0.31.dist-info}/METADATA +11 -1
- msreport-0.0.31.dist-info/RECORD +38 -0
- msreport-0.0.30.dist-info/RECORD +0 -38
- {msreport-0.0.30.dist-info → msreport-0.0.31.dist-info}/WHEEL +0 -0
- {msreport-0.0.30.dist-info → msreport-0.0.31.dist-info}/licenses/LICENSE.txt +0 -0
- {msreport-0.0.30.dist-info → msreport-0.0.31.dist-info}/top_level.txt +0 -0
msreport/peptidoform.py
CHANGED
|
@@ -1,5 +1,14 @@
|
|
|
1
|
+
"""Defines the `Peptide` class and associated utilities for handling peptidoforms.
|
|
2
|
+
|
|
3
|
+
This module provides a `Peptide` class for representing modified peptide sequences,
|
|
4
|
+
and their site localization probabilities. It offers methods to access and manipulate
|
|
5
|
+
peptide information, summarize isoform probabilities, and retrieve modification sites.
|
|
6
|
+
Additionally, it includes utility functions for parsing modified sequence strings and
|
|
7
|
+
converting site localization probabilities to and from a standardized string format.
|
|
8
|
+
"""
|
|
9
|
+
|
|
1
10
|
from collections import defaultdict as ddict
|
|
2
|
-
from typing import Optional
|
|
11
|
+
from typing import Optional
|
|
3
12
|
|
|
4
13
|
import numpy as np
|
|
5
14
|
|
|
@@ -10,7 +19,7 @@ class Peptide:
|
|
|
10
19
|
def __init__(
|
|
11
20
|
self,
|
|
12
21
|
modified_sequence: str,
|
|
13
|
-
localization_probabilities: Optional[dict] = None,
|
|
22
|
+
localization_probabilities: Optional[dict[str, dict[int, float]]] = None,
|
|
14
23
|
protein_position: Optional[int] = None,
|
|
15
24
|
):
|
|
16
25
|
plain_sequence, modifications = parse_modified_sequence(
|
|
@@ -28,7 +37,7 @@ class Peptide:
|
|
|
28
37
|
self.modification_positions[mod_tag].append(position)
|
|
29
38
|
self.modified_residues[position] = mod_tag
|
|
30
39
|
|
|
31
|
-
def make_modified_sequence(self, include: Optional[list] = None) -> str:
|
|
40
|
+
def make_modified_sequence(self, include: Optional[list[str]] = None) -> str:
|
|
32
41
|
"""Returns a modified sequence string.
|
|
33
42
|
|
|
34
43
|
Args:
|
|
@@ -55,7 +64,7 @@ class Peptide:
|
|
|
55
64
|
return 0
|
|
56
65
|
return len(self.modification_positions[modification])
|
|
57
66
|
|
|
58
|
-
def isoform_probability(self, modification: str) ->
|
|
67
|
+
def isoform_probability(self, modification: str) -> float | None:
|
|
59
68
|
"""Calculates the isoform probability for a given modification.
|
|
60
69
|
|
|
61
70
|
Returns:
|
|
@@ -66,12 +75,13 @@ class Peptide:
|
|
|
66
75
|
"""
|
|
67
76
|
probabilities = []
|
|
68
77
|
for site in self.list_modified_peptide_sites(modification):
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
78
|
+
probability = self.get_peptide_site_probability(site)
|
|
79
|
+
if probability is None:
|
|
80
|
+
return None
|
|
81
|
+
probabilities.append(probability)
|
|
72
82
|
return float(np.prod(probabilities))
|
|
73
83
|
|
|
74
|
-
def get_peptide_site_probability(self, position: int) ->
|
|
84
|
+
def get_peptide_site_probability(self, position: int) -> float | None:
|
|
75
85
|
"""Return the modification localization probability of the peptide position.
|
|
76
86
|
|
|
77
87
|
Args:
|
|
@@ -85,7 +95,7 @@ class Peptide:
|
|
|
85
95
|
"""
|
|
86
96
|
return self._get_site_probability(position, is_protein_position=False)
|
|
87
97
|
|
|
88
|
-
def get_protein_site_probability(self, position: int) ->
|
|
98
|
+
def get_protein_site_probability(self, position: int) -> float | None:
|
|
89
99
|
"""Return the modification localization probability of the protein position.
|
|
90
100
|
|
|
91
101
|
Args:
|
|
@@ -109,7 +119,7 @@ class Peptide:
|
|
|
109
119
|
|
|
110
120
|
def _get_site_probability(
|
|
111
121
|
self, position: int, is_protein_position: bool
|
|
112
|
-
) ->
|
|
122
|
+
) -> float | None:
|
|
113
123
|
"""Return the modification localization probability of the peptide position.
|
|
114
124
|
|
|
115
125
|
Args:
|
|
@@ -224,7 +234,7 @@ def modify_peptide(
|
|
|
224
234
|
|
|
225
235
|
|
|
226
236
|
def make_localization_string(
|
|
227
|
-
localization_probabilities: dict, decimal_places: int = 3
|
|
237
|
+
localization_probabilities: dict[str, dict[int, float]], decimal_places: int = 3
|
|
228
238
|
) -> str:
|
|
229
239
|
"""Generates a site localization probability string.
|
|
230
240
|
|
msreport/plot/__init__.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Plotting functions for visualizing proteomics data from `Qtable`.
|
|
2
2
|
|
|
3
3
|
The functions in this module generate a wide range of plots, including heatmaps, PCA
|
|
4
4
|
plots, volcano plots, and histograms, to analyze and compare expression values,
|
|
@@ -6,8 +6,8 @@ missingness, contaminants, and other features in proteomics datasets. The plots
|
|
|
6
6
|
designed to work with the Qtable class as input, which provides structured access to
|
|
7
7
|
proteomics data and experimental design information.
|
|
8
8
|
|
|
9
|
-
|
|
10
|
-
|
|
9
|
+
Users can customize plot styles via the `set_active_style` function, which allows
|
|
10
|
+
applying style sheets from the msreport library or those available in matplotlib.
|
|
11
11
|
"""
|
|
12
12
|
|
|
13
13
|
from .comparison import expression_comparison, pvalue_histogram, volcano_ma
|
msreport/plot/quality.py
CHANGED
|
@@ -314,7 +314,7 @@ def sample_intensities(
|
|
|
314
314
|
|
|
315
315
|
@with_active_style
|
|
316
316
|
def sample_correlation(
|
|
317
|
-
qtable, exclude_invalid: bool = True, labels: bool = False
|
|
317
|
+
qtable: Qtable, exclude_invalid: bool = True, labels: bool = False
|
|
318
318
|
) -> tuple[plt.Figure, list[plt.Axes]]:
|
|
319
319
|
"""Generates a pair-wise correlation matrix of samples 'Expression' values.
|
|
320
320
|
|
msreport/qtable.py
CHANGED
|
@@ -1,14 +1,28 @@
|
|
|
1
|
-
|
|
1
|
+
"""Defines the `Qtable` class, the central container for quantitative proteomics data.
|
|
2
|
+
|
|
3
|
+
The `Qtable` class serves as the standardized data structure for `msreport`,
|
|
4
|
+
storing a main table with quantitative values and associated metadata for its entries;
|
|
5
|
+
it also maintains the name of the unique ID column for the main table. Additionally,
|
|
6
|
+
it stores an experimental design table that links sample names to experimental
|
|
7
|
+
conditions and replicate information.
|
|
8
|
+
|
|
9
|
+
`Qtable` provides convenience methods for creating subtables and accessing design
|
|
10
|
+
related information (e.g., samples per experiment), and instances of `Qtable` can be
|
|
11
|
+
easily saved to disk and loaded back. As the central data container, the `Qtable`
|
|
12
|
+
facilitates seamless integration with the high-level modules `analyze`, `plot` and
|
|
13
|
+
`export`, which all directly operate on `Qtable` instances.
|
|
14
|
+
"""
|
|
2
15
|
|
|
3
16
|
import copy
|
|
4
17
|
import os
|
|
5
18
|
import warnings
|
|
6
19
|
from contextlib import contextmanager
|
|
7
|
-
from typing import Any, Iterable, Optional
|
|
20
|
+
from typing import Any, Generator, Iterable, Optional
|
|
8
21
|
|
|
9
22
|
import numpy as np
|
|
10
23
|
import pandas as pd
|
|
11
24
|
import yaml
|
|
25
|
+
from typing_extensions import Self
|
|
12
26
|
|
|
13
27
|
import msreport.helper as helper
|
|
14
28
|
|
|
@@ -359,7 +373,7 @@ class Qtable:
|
|
|
359
373
|
keep_experiments: Optional[Iterable[str]] = None,
|
|
360
374
|
exclude_samples: Optional[Iterable[str]] = None,
|
|
361
375
|
keep_samples: Optional[Iterable[str]] = None,
|
|
362
|
-
):
|
|
376
|
+
) -> Generator[None, None, None]:
|
|
363
377
|
"""Context manager to temporarily modify the design table.
|
|
364
378
|
|
|
365
379
|
Args:
|
|
@@ -422,7 +436,7 @@ class Qtable:
|
|
|
422
436
|
self.design.to_csv(filepaths["design"], sep="\t", index=True)
|
|
423
437
|
|
|
424
438
|
@classmethod
|
|
425
|
-
def load(cls, directory: str, basename: str) ->
|
|
439
|
+
def load(cls, directory: str, basename: str) -> Self:
|
|
426
440
|
"""Load a qtable from disk by reading a data, design, and config file.
|
|
427
441
|
|
|
428
442
|
Loading a qtable will first import the three files generated during saving, then
|
|
@@ -470,7 +484,7 @@ class Qtable:
|
|
|
470
484
|
)
|
|
471
485
|
id_column = config_data["Unique ID column"]
|
|
472
486
|
|
|
473
|
-
qtable =
|
|
487
|
+
qtable = cls(data, design, id_column)
|
|
474
488
|
qtable._expression_columns = config_data["Expression columns"]
|
|
475
489
|
qtable._expression_features = config_data["Expression features"]
|
|
476
490
|
qtable._expression_sample_mapping = config_data["Expression sample mapping"]
|
|
@@ -486,11 +500,11 @@ class Qtable:
|
|
|
486
500
|
)
|
|
487
501
|
self.data.to_csv(path, sep="\t", index=index)
|
|
488
502
|
|
|
489
|
-
def to_clipboard(self, index: bool = False):
|
|
503
|
+
def to_clipboard(self, index: bool = False) -> None:
|
|
490
504
|
"""Writes the data table to the system clipboard."""
|
|
491
505
|
self.data.to_clipboard(sep="\t", index=index)
|
|
492
506
|
|
|
493
|
-
def copy(self) ->
|
|
507
|
+
def copy(self) -> Self:
|
|
494
508
|
"""Returns a copy of this Qtable instance."""
|
|
495
509
|
return self.__copy__()
|
|
496
510
|
|
|
@@ -579,8 +593,8 @@ class Qtable:
|
|
|
579
593
|
self._expression_features = []
|
|
580
594
|
self._expression_sample_mapping = {}
|
|
581
595
|
|
|
582
|
-
def __copy__(self) ->
|
|
583
|
-
new_instance =
|
|
596
|
+
def __copy__(self) -> Self:
|
|
597
|
+
new_instance = type(self)(self.data, self.design, self.id_column)
|
|
584
598
|
# Copy all private attributes
|
|
585
599
|
for attr in dir(self):
|
|
586
600
|
if (
|
|
@@ -609,7 +623,7 @@ def _match_samples_to_tag_columns(
|
|
|
609
623
|
samples: Iterable[str],
|
|
610
624
|
columns: Iterable[str],
|
|
611
625
|
tag: str,
|
|
612
|
-
) -> dict:
|
|
626
|
+
) -> dict[str, str]:
|
|
613
627
|
"""Mapping of samples to columns which contain the sample and the tag.
|
|
614
628
|
|
|
615
629
|
Args:
|
|
@@ -632,7 +646,7 @@ def _match_samples_to_tag_columns(
|
|
|
632
646
|
return mapping
|
|
633
647
|
|
|
634
648
|
|
|
635
|
-
def _get_qtable_export_filepaths(directory: str, name: str):
|
|
649
|
+
def _get_qtable_export_filepaths(directory: str, name: str) -> dict[str, str]:
|
|
636
650
|
"""Returns a dictionary of standard filepaths for loading and saving a qtable."""
|
|
637
651
|
filenames = {
|
|
638
652
|
"data": f"{name}.data.tsv",
|
msreport/reader.py
CHANGED
|
@@ -1,17 +1,18 @@
|
|
|
1
|
-
"""
|
|
2
|
-
to a standardized format following the MsReport convention.
|
|
1
|
+
"""Provides tools for importing and standardizing quantitative proteomics data.
|
|
3
2
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
3
|
+
This module offers software-specific reader classes to import raw result tables (e.g.,
|
|
4
|
+
proteins, peptides, ions) from various proteomics software (MaxQuant, FragPipe,
|
|
5
|
+
Spectronaut) and convert them into a standardized `msreport` format. Additionally, it
|
|
6
|
+
provides functions for annotating imported data with biological metadata, such as
|
|
7
|
+
protein information (e.g., sequence length, molecular weight) and peptide positions,
|
|
8
|
+
extracted from a ProteinDatabase (FASTA file).
|
|
7
9
|
|
|
8
|
-
New
|
|
10
|
+
New columns added to imported protein tables:
|
|
9
11
|
- Representative protein
|
|
10
12
|
- Leading proteins
|
|
11
13
|
- Protein reported by software
|
|
12
14
|
|
|
13
|
-
|
|
14
|
-
- Total peptides
|
|
15
|
+
Standardized column names for quantitative values (if available in the software output):
|
|
15
16
|
- Spectral count "sample name"
|
|
16
17
|
- Unique spectral count "sample name"
|
|
17
18
|
- Total spectral count "sample name"
|
|
@@ -38,6 +39,7 @@ from msreport.helper.temp import extract_window_around_position
|
|
|
38
39
|
class Protein(Protocol):
|
|
39
40
|
"""Abstract protein entry"""
|
|
40
41
|
|
|
42
|
+
# identifier: str
|
|
41
43
|
header: str
|
|
42
44
|
sequence: str
|
|
43
45
|
header_fields: dict[str, str]
|
|
@@ -46,9 +48,9 @@ class Protein(Protocol):
|
|
|
46
48
|
class ProteinDatabase(Protocol):
|
|
47
49
|
"""Abstract protein database"""
|
|
48
50
|
|
|
49
|
-
def __getitem__(self,
|
|
51
|
+
def __getitem__(self, identifier: str) -> Protein: ...
|
|
50
52
|
|
|
51
|
-
def __contains__(self,
|
|
53
|
+
def __contains__(self, identifier: str) -> bool: ...
|
|
52
54
|
|
|
53
55
|
|
|
54
56
|
class ResultReader:
|
|
@@ -497,7 +499,9 @@ class MaxQuantReader(ResultReader):
|
|
|
497
499
|
mod_probability_columns = msreport.helper.find_columns(new_df, "Probabilities")
|
|
498
500
|
localization_string_column = "Modification localization string"
|
|
499
501
|
|
|
500
|
-
mod_localization_probabilities
|
|
502
|
+
mod_localization_probabilities: list[dict[str, dict[int, float]]] = [
|
|
503
|
+
{} for _ in range(new_df.shape[0])
|
|
504
|
+
]
|
|
501
505
|
for probability_column in mod_probability_columns:
|
|
502
506
|
# FUTURE: Type should be checked and enforced during the import
|
|
503
507
|
if not pd.api.types.is_string_dtype(new_df[probability_column].dtype):
|
|
@@ -681,7 +685,15 @@ class FragPipeReader(ResultReader):
|
|
|
681
685
|
def import_design(
|
|
682
686
|
self, filename: Optional[str] = None, sort: bool = False
|
|
683
687
|
) -> pd.DataFrame:
|
|
684
|
-
"""
|
|
688
|
+
"""Reads a 'fp-manifest' file and returns a processed design dataframe.
|
|
689
|
+
|
|
690
|
+
The manifest columns "Path", "Experiment", and "Bioreplicate" are mapped to the
|
|
691
|
+
design table columns "Rawfile", "Experiment", and "Replicate". The "Rawfile"
|
|
692
|
+
column is extracted as the filename from the full path. The "Sample" column is
|
|
693
|
+
generated by combining "Experiment" and "Replicate" with an underscore
|
|
694
|
+
(e.g., "Experiment_Replicate"), except when "Replicate" is empty, in which case
|
|
695
|
+
"Sample" is set to "Experiment". If "Experiment" is missing, it is set to "exp"
|
|
696
|
+
by default.
|
|
685
697
|
|
|
686
698
|
Args:
|
|
687
699
|
filename: Allows specifying an alternative filename, otherwise the default
|
|
@@ -704,12 +716,18 @@ class FragPipeReader(ResultReader):
|
|
|
704
716
|
raise FileNotFoundError(
|
|
705
717
|
f"File '{filepath}' does not exist. Please check the file path."
|
|
706
718
|
)
|
|
707
|
-
fp_manifest =
|
|
719
|
+
fp_manifest = (
|
|
720
|
+
pd.read_csv(
|
|
721
|
+
filepath, sep="\t", header=None, na_values=[""], keep_default_na=False
|
|
722
|
+
)
|
|
723
|
+
.fillna("")
|
|
724
|
+
.astype(str)
|
|
725
|
+
)
|
|
708
726
|
fp_manifest.columns = ["Path", "Experiment", "Bioreplicate", "Data type"]
|
|
709
727
|
|
|
710
728
|
design = pd.DataFrame(
|
|
711
729
|
{
|
|
712
|
-
"Sample":
|
|
730
|
+
"Sample": "",
|
|
713
731
|
"Experiment": fp_manifest["Experiment"],
|
|
714
732
|
"Replicate": fp_manifest["Bioreplicate"],
|
|
715
733
|
"Rawfile": fp_manifest["Path"].apply(
|
|
@@ -718,6 +736,12 @@ class FragPipeReader(ResultReader):
|
|
|
718
736
|
),
|
|
719
737
|
}
|
|
720
738
|
)
|
|
739
|
+
# FragPipe uses "exp" for missing 'Experiment' values
|
|
740
|
+
design.loc[design["Experiment"] == "", "Experiment"] = "exp"
|
|
741
|
+
# FragPipe combines 'Experiment' + "_" + 'Replicate' into 'Sample', except when
|
|
742
|
+
# 'Replicate' is empty, in which case 'Sample' is set to 'Experiment'.
|
|
743
|
+
design["Sample"] = design["Experiment"] + "_" + design["Replicate"]
|
|
744
|
+
design.loc[design["Replicate"] == "", "Sample"] = design["Experiment"]
|
|
721
745
|
|
|
722
746
|
if sort:
|
|
723
747
|
design.sort_values(by=["Experiment", "Replicate"], inplace=True)
|
|
@@ -963,7 +987,7 @@ class FragPipeReader(ResultReader):
|
|
|
963
987
|
filename: Optional[str] = None,
|
|
964
988
|
rename_columns: bool = True,
|
|
965
989
|
rewrite_modifications: bool = True,
|
|
966
|
-
):
|
|
990
|
+
) -> pd.DataFrame:
|
|
967
991
|
"""Concatenate all "psm.tsv" files and return a processed dataframe.
|
|
968
992
|
|
|
969
993
|
Args:
|
|
@@ -1499,6 +1523,7 @@ class SpectronautReader(ResultReader):
|
|
|
1499
1523
|
filename: Optional[str] = None,
|
|
1500
1524
|
filetag: Optional[str] = None,
|
|
1501
1525
|
rename_columns: bool = True,
|
|
1526
|
+
rewrite_modifications: bool = True,
|
|
1502
1527
|
) -> pd.DataFrame:
|
|
1503
1528
|
"""Reads an ion evidence file (long format) and returns a processed dataframe.
|
|
1504
1529
|
|
|
@@ -1508,8 +1533,15 @@ class SpectronautReader(ResultReader):
|
|
|
1508
1533
|
generated by concatenating the "Modified sequence" and "Charge" columns, and if
|
|
1509
1534
|
present, the "Compensation voltage" column.
|
|
1510
1535
|
|
|
1511
|
-
|
|
1512
|
-
are
|
|
1536
|
+
"Modified sequence" entries contain modifications within square brackets.
|
|
1537
|
+
"Modifications" entries are strings in the form of "position:modification_tag",
|
|
1538
|
+
multiple modifications are joined by ";". An example for a modified sequence and
|
|
1539
|
+
a modification entry: "PEPT[Phospho]IDO[Oxidation]", "4:Phospho;7:Oxidation".
|
|
1540
|
+
|
|
1541
|
+
"Modification localization string" contains localization probabilities in the
|
|
1542
|
+
format "Mod1@Site1:Probability1,Site2:Probability2;Mod2@Site3:Probability3",
|
|
1543
|
+
e.g. "15.9949@11:1.000;79.9663@3:0.200,4:0.800". Refer to
|
|
1544
|
+
`msreport.peptidoform.make_localization_string` for details.
|
|
1513
1545
|
|
|
1514
1546
|
Args:
|
|
1515
1547
|
filename: Optional, allows specifying a specific file that will be imported.
|
|
@@ -1517,6 +1549,10 @@ class SpectronautReader(ResultReader):
|
|
|
1517
1549
|
a substring, instead of specifying a filename.
|
|
1518
1550
|
rename_columns: If True, columns are renamed according to the MsReport
|
|
1519
1551
|
convention; default True.
|
|
1552
|
+
rewrite_modifications: If True, the peptide format in "Modified sequence" is
|
|
1553
|
+
changed according to the MsReport convention, and a "Modifications"
|
|
1554
|
+
column is added containing the amino acid position of all modifications.
|
|
1555
|
+
Requires 'rename_columns' to be true. Default True.
|
|
1520
1556
|
|
|
1521
1557
|
Returns:
|
|
1522
1558
|
A dataframe containing the processed ion table.
|
|
@@ -1544,6 +1580,9 @@ class SpectronautReader(ResultReader):
|
|
|
1544
1580
|
df = self._add_protein_entries(df)
|
|
1545
1581
|
if rename_columns:
|
|
1546
1582
|
df = self._rename_columns(df, True)
|
|
1583
|
+
if rewrite_modifications and rename_columns:
|
|
1584
|
+
df = self._add_peptide_modification_entries(df)
|
|
1585
|
+
df = self._add_modification_localization_string(df)
|
|
1547
1586
|
df["Ion ID"] = df["Modified sequence"] + "_c" + df["Charge"].astype(str)
|
|
1548
1587
|
if "Compensation voltage" in df.columns:
|
|
1549
1588
|
_cv = df["Compensation voltage"].astype(str)
|
|
@@ -1597,6 +1636,70 @@ class SpectronautReader(ResultReader):
|
|
|
1597
1636
|
leading_protein_entries = df["PG.ProteinAccessions"].str.split(";").tolist()
|
|
1598
1637
|
return leading_protein_entries
|
|
1599
1638
|
|
|
1639
|
+
def _add_peptide_modification_entries(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
1640
|
+
"""Adds standardized "Modified sequence" and "Modifications" columns.
|
|
1641
|
+
|
|
1642
|
+
"Modified sequence" entries contain modifications within square brackets.
|
|
1643
|
+
"Modifications" entries are strings in the form of "position:modification_text",
|
|
1644
|
+
multiple modifications are joined by ";". An example for a modified sequence and
|
|
1645
|
+
a modification entry: "PEPT[Phospho]IDO[Oxidation]", "4:Phospho;7:Oxidation".
|
|
1646
|
+
|
|
1647
|
+
Requires the columns "Peptide sequence" and "Modified sequence" from the
|
|
1648
|
+
software output.
|
|
1649
|
+
|
|
1650
|
+
Args:
|
|
1651
|
+
df: Dataframe containing "Peptide sequence" and "Modified sequence" columns.
|
|
1652
|
+
|
|
1653
|
+
Returns:
|
|
1654
|
+
A copy of the input dataframe with updated columns.
|
|
1655
|
+
"""
|
|
1656
|
+
# TODO: not tested
|
|
1657
|
+
mod_sequences = df["Modified sequence"].str[1:-1] # Remove surrounding "_"
|
|
1658
|
+
mod_entries = _generate_modification_entries(
|
|
1659
|
+
df["Peptide sequence"], mod_sequences, "[", "]"
|
|
1660
|
+
)
|
|
1661
|
+
new_df = df.copy()
|
|
1662
|
+
new_df["Modified sequence"] = mod_entries["Modified sequence"]
|
|
1663
|
+
new_df["Modifications"] = mod_entries["Modifications"]
|
|
1664
|
+
return new_df
|
|
1665
|
+
|
|
1666
|
+
def _add_modification_localization_string(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
1667
|
+
"""Adds modification localization string columns.
|
|
1668
|
+
|
|
1669
|
+
Extracts localization probabilities from the "EG.PTMLocalizationProbabilities"
|
|
1670
|
+
column, converts them into the standardized modification localization string
|
|
1671
|
+
format used by msreport, and adds a column "Modification localization string".
|
|
1672
|
+
|
|
1673
|
+
Probabilities are written in the format
|
|
1674
|
+
"Mod1@Site1:Probability1,Site2:Probability2;Mod2@Site3:Probability3",
|
|
1675
|
+
e.g. "15.9949@11:1.000;79.9663@3:0.200,4:0.800". Refer to
|
|
1676
|
+
`msreport.peptidoform.make_localization_string` for details.
|
|
1677
|
+
|
|
1678
|
+
Args:
|
|
1679
|
+
df: Dataframe containing a "EG.PTMLocalizationProbabilities" column.
|
|
1680
|
+
|
|
1681
|
+
Returns:
|
|
1682
|
+
A copy of the input dataframe with the added column
|
|
1683
|
+
"Modification localization string".
|
|
1684
|
+
"""
|
|
1685
|
+
# TODO: not tested
|
|
1686
|
+
new_df = df.copy()
|
|
1687
|
+
localization_strings = []
|
|
1688
|
+
for localization_entry in new_df["EG.PTMLocalizationProbabilities"]:
|
|
1689
|
+
if localization_entry == "":
|
|
1690
|
+
localization_strings.append("")
|
|
1691
|
+
continue
|
|
1692
|
+
|
|
1693
|
+
localization_probabilities = extract_spectronaut_localization_probabilities(
|
|
1694
|
+
localization_entry
|
|
1695
|
+
)
|
|
1696
|
+
localization_string = msreport.peptidoform.make_localization_string(
|
|
1697
|
+
localization_probabilities
|
|
1698
|
+
)
|
|
1699
|
+
localization_strings.append(localization_string)
|
|
1700
|
+
new_df["Modification localization string"] = localization_strings
|
|
1701
|
+
return new_df
|
|
1702
|
+
|
|
1600
1703
|
|
|
1601
1704
|
def sort_leading_proteins(
|
|
1602
1705
|
table: pd.DataFrame,
|
|
@@ -1639,7 +1742,7 @@ def sort_leading_proteins(
|
|
|
1639
1742
|
db_origins_present = "Leading proteins database origin" in table
|
|
1640
1743
|
|
|
1641
1744
|
if database_order is not None:
|
|
1642
|
-
database_encoding = defaultdict(lambda: 999)
|
|
1745
|
+
database_encoding: dict[str, int] = defaultdict(lambda: 999)
|
|
1643
1746
|
database_encoding.update({db: i for i, db in enumerate(database_order)})
|
|
1644
1747
|
if penalize_contaminants is not None:
|
|
1645
1748
|
contaminant_encoding = {"False": 0, "True": 1, False: 0, True: 1}
|
|
@@ -1647,7 +1750,7 @@ def sort_leading_proteins(
|
|
|
1647
1750
|
for _, row in table.iterrows():
|
|
1648
1751
|
protein_ids = row["Leading proteins"].split(";")
|
|
1649
1752
|
|
|
1650
|
-
sorting_info = [[] for _ in protein_ids]
|
|
1753
|
+
sorting_info: list[list] = [[] for _ in protein_ids]
|
|
1651
1754
|
if special_proteins is not None:
|
|
1652
1755
|
for i, _id in enumerate(protein_ids):
|
|
1653
1756
|
sorting_info[i].append(_id not in special_proteins)
|
|
@@ -1787,7 +1890,7 @@ def add_protein_site_annotation(
|
|
|
1787
1890
|
protein_db: ProteinDatabase,
|
|
1788
1891
|
protein_column: str = "Representative protein",
|
|
1789
1892
|
site_column: str = "Protein site",
|
|
1790
|
-
):
|
|
1893
|
+
) -> pd.DataFrame:
|
|
1791
1894
|
"""Uses a FASTA protein database to add protein site annotation columns.
|
|
1792
1895
|
|
|
1793
1896
|
Adds the columns "Modified residue", which corresponds to the amino acid at the
|
|
@@ -1925,6 +2028,61 @@ def add_leading_proteins_annotation(
|
|
|
1925
2028
|
return table
|
|
1926
2029
|
|
|
1927
2030
|
|
|
2031
|
+
def add_protein_site_identifiers(
|
|
2032
|
+
table: pd.DataFrame,
|
|
2033
|
+
protein_db: ProteinDatabase,
|
|
2034
|
+
site_column: str,
|
|
2035
|
+
protein_name_column: str,
|
|
2036
|
+
):
|
|
2037
|
+
"""Adds a "Protein site identifier" column to the 'table'.
|
|
2038
|
+
|
|
2039
|
+
The "Protein site identifier" is generated by concatenating the protein name
|
|
2040
|
+
with the amino acid and position of the protein site or sites, e.g. "P12345 - S123"
|
|
2041
|
+
or "P12345 - S123 / T125". The amino acid is extracted from the protein sequence at
|
|
2042
|
+
the position of the site. If the protein name is not available, the
|
|
2043
|
+
"Representative protein" entry is used instead.
|
|
2044
|
+
|
|
2045
|
+
Args:
|
|
2046
|
+
table: Dataframe to which the protein site identifiers are added.
|
|
2047
|
+
protein_db: A protein database containing entries from one or multiple FASTA
|
|
2048
|
+
files. Protein identifiers in the 'table' column "Representative protein"
|
|
2049
|
+
are used to look up entries in the 'protein_db'.
|
|
2050
|
+
site_column: Column in 'table' that contains protein site positions. Positions
|
|
2051
|
+
are one-indexed, meaning the first amino acid of the protein is position 1.
|
|
2052
|
+
Multiple sites in a single entry should be separated by ";".
|
|
2053
|
+
protein_name_column: Column in 'table' that contains protein names, which will
|
|
2054
|
+
be used to generate the identifier. If no name is available, the accession
|
|
2055
|
+
is used instead.
|
|
2056
|
+
|
|
2057
|
+
Raises:
|
|
2058
|
+
ValueError: If the "Representative protein", 'protein_name_column' or
|
|
2059
|
+
'site_column' is not found in the 'table'.
|
|
2060
|
+
"""
|
|
2061
|
+
if site_column not in table.columns:
|
|
2062
|
+
raise ValueError(f"Column '{site_column}' not found in the table.")
|
|
2063
|
+
if protein_name_column not in table.columns:
|
|
2064
|
+
raise ValueError(f"Column '{protein_name_column}' not found in the table.")
|
|
2065
|
+
if "Representative protein" not in table.columns:
|
|
2066
|
+
raise ValueError("Column 'Representative protein' not found in the table.")
|
|
2067
|
+
|
|
2068
|
+
site_identifiers = []
|
|
2069
|
+
for accession, sites, name in zip(
|
|
2070
|
+
table["Representative protein"],
|
|
2071
|
+
table[site_column].astype(str),
|
|
2072
|
+
table[protein_name_column],
|
|
2073
|
+
):
|
|
2074
|
+
protein_sequence = protein_db[accession].sequence
|
|
2075
|
+
protein_identifier = name if name else accession
|
|
2076
|
+
aa_sites = []
|
|
2077
|
+
for site in sites.split(";"):
|
|
2078
|
+
aa = protein_sequence[int(site) - 1]
|
|
2079
|
+
aa_sites.append(f"{aa}{site}")
|
|
2080
|
+
aa_site_tag = " / ".join(aa_sites)
|
|
2081
|
+
site_identifier = f"{protein_identifier} - {aa_site_tag}"
|
|
2082
|
+
site_identifiers.append(site_identifier)
|
|
2083
|
+
table["Protein site identifier"] = site_identifiers
|
|
2084
|
+
|
|
2085
|
+
|
|
1928
2086
|
def add_sequence_coverage(
|
|
1929
2087
|
protein_table: pd.DataFrame,
|
|
1930
2088
|
peptide_table: pd.DataFrame,
|
|
@@ -2384,7 +2542,9 @@ def _extract_fragpipe_assigned_modifications(
|
|
|
2384
2542
|
return modifications
|
|
2385
2543
|
|
|
2386
2544
|
|
|
2387
|
-
def extract_maxquant_localization_probabilities(
|
|
2545
|
+
def extract_maxquant_localization_probabilities(
|
|
2546
|
+
localization_entry: str,
|
|
2547
|
+
) -> dict[int, float]:
|
|
2388
2548
|
"""Extract localization probabilities from a MaxQuant "Probabilities" entry.
|
|
2389
2549
|
|
|
2390
2550
|
Args:
|
|
@@ -2441,6 +2601,39 @@ def extract_fragpipe_localization_probabilities(localization_entry: str) -> dict
|
|
|
2441
2601
|
return modification_probabilities
|
|
2442
2602
|
|
|
2443
2603
|
|
|
2604
|
+
def extract_spectronaut_localization_probabilities(localization_entry: str) -> dict:
|
|
2605
|
+
"""Extract localization probabilities from a Spectronaut localization entry.
|
|
2606
|
+
|
|
2607
|
+
Args:
|
|
2608
|
+
localization_entry: Entry from the "EG.PTMLocalizationProbabilities" column of a
|
|
2609
|
+
Spectronaut elution group (EG) output table.
|
|
2610
|
+
|
|
2611
|
+
Returns:
|
|
2612
|
+
A dictionary of modifications containing a dictionary of {position: probability}
|
|
2613
|
+
mappings. Positions are one-indexed, which means that the first amino acid
|
|
2614
|
+
position is 1.
|
|
2615
|
+
|
|
2616
|
+
Example:
|
|
2617
|
+
>>> extract_spectronaut_localization_probabilities(
|
|
2618
|
+
... "_HM[Oxidation (M): 100%]S[Phospho (STY): 45.5%]GS[Phospho (STY): 54.5%]PG_"
|
|
2619
|
+
... )
|
|
2620
|
+
{'Oxidation (M)': {2: 1.0}, 'Phospho (STY)': {3: 0.455, 5: 0.545}}
|
|
2621
|
+
"""
|
|
2622
|
+
modification_probabilities: dict[str, dict[int, float]] = {}
|
|
2623
|
+
localization_entry = localization_entry.strip("_")
|
|
2624
|
+
_, raw_probability_entries = msreport.peptidoform.parse_modified_sequence(
|
|
2625
|
+
localization_entry, "[", "]"
|
|
2626
|
+
)
|
|
2627
|
+
|
|
2628
|
+
for site, mod_probability_entry in raw_probability_entries:
|
|
2629
|
+
modification, probability_entry = mod_probability_entry.split(": ")
|
|
2630
|
+
if modification not in modification_probabilities:
|
|
2631
|
+
modification_probabilities[modification] = {}
|
|
2632
|
+
probability = float(probability_entry.replace("%", "")) / 100.0
|
|
2633
|
+
modification_probabilities[modification][site] = probability
|
|
2634
|
+
return modification_probabilities
|
|
2635
|
+
|
|
2636
|
+
|
|
2444
2637
|
def _extract_protein_ids(entries: list[str]) -> list[str]:
|
|
2445
2638
|
"""Returns a list of protein IDs, extracted from protein entries.
|
|
2446
2639
|
|
|
@@ -2554,8 +2747,8 @@ def _create_multi_protein_annotations_from_db(
|
|
|
2554
2747
|
query_result.append(query_function(db_entry, default_value))
|
|
2555
2748
|
else:
|
|
2556
2749
|
query_result.append(default_value)
|
|
2557
|
-
|
|
2558
|
-
annotation_values.append(
|
|
2750
|
+
annotation_value = ";".join(map(str, query_result))
|
|
2751
|
+
annotation_values.append(annotation_value)
|
|
2559
2752
|
return annotation_values
|
|
2560
2753
|
|
|
2561
2754
|
|
msreport/rinterface/limma.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: msreport
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.31
|
|
4
4
|
Summary: Post processing and analysis of quantitative proteomics data
|
|
5
5
|
Author-email: "David M. Hollenstein" <hollenstein.david@gmail.com>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
7
7
|
Project-URL: homepage, https://github.com/hollenstein/msreport
|
|
8
|
+
Project-URL: documentation, https://hollenstein.github.io/msreport/
|
|
8
9
|
Project-URL: changelog, https://github.com/hollenstein/msreport/blob/main/CHANGELOG.md
|
|
9
10
|
Keywords: mass spectrometry,proteomics,post processing,data analysis
|
|
10
11
|
Classifier: Development Status :: 4 - Beta
|
|
@@ -33,6 +34,13 @@ Requires-Dist: rpy2<3.5.13,>=3.5.3; extra == "r"
|
|
|
33
34
|
Provides-Extra: dev
|
|
34
35
|
Requires-Dist: mypy>=1.15.0; extra == "dev"
|
|
35
36
|
Requires-Dist: pytest>=8.3.5; extra == "dev"
|
|
37
|
+
Provides-Extra: docs
|
|
38
|
+
Requires-Dist: mkdocs-awesome-nav>=3.1.2; extra == "docs"
|
|
39
|
+
Requires-Dist: mkdocs-macros-plugin>=1.3.7; extra == "docs"
|
|
40
|
+
Requires-Dist: mkdocs-material>=9.6.15; extra == "docs"
|
|
41
|
+
Requires-Dist: mkdocs-roamlinks-plugin>=0.3.2; extra == "docs"
|
|
42
|
+
Requires-Dist: mkdocstrings-python>=1.16.12; extra == "docs"
|
|
43
|
+
Requires-Dist: ruff>=0.12.2; extra == "docs"
|
|
36
44
|
Provides-Extra: test
|
|
37
45
|
Requires-Dist: pytest>=8.3.5; extra == "test"
|
|
38
46
|
Dynamic: license-file
|
|
@@ -64,6 +72,8 @@ MsReport is a Python library designed to simplify the post-processing and analys
|
|
|
64
72
|
|
|
65
73
|
The library supports importing protein and peptide-level quantification results from MaxQuant, FragPipe, and Spectronaut, as well as post-translational modification (PTM) data from MaxQuant and FragPipe. MsReport provides tools for data annotation, normalization and transformation, statistical testing, and data visualization.
|
|
66
74
|
|
|
75
|
+
The [documentation](https://hollenstein.github.io/msreport/) provides an overview of the library's public API.
|
|
76
|
+
|
|
67
77
|
### Key features of MsReport
|
|
68
78
|
|
|
69
79
|
#### Data Import and Standardization
|