msreport 0.0.29__py3-none-any.whl → 0.0.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- msreport/__init__.py +1 -1
- msreport/aggregate/__init__.py +10 -0
- msreport/aggregate/condense.py +9 -0
- msreport/aggregate/pivot.py +14 -5
- msreport/aggregate/summarize.py +14 -4
- msreport/analyze.py +67 -5
- msreport/export.py +9 -15
- msreport/fasta.py +9 -2
- msreport/helper/__init__.py +18 -0
- msreport/impute.py +18 -10
- msreport/isobar.py +11 -14
- msreport/normalize.py +95 -10
- msreport/peptidoform.py +21 -11
- msreport/plot/__init__.py +3 -3
- msreport/plot/distribution.py +2 -1
- msreport/plot/quality.py +1 -1
- msreport/qtable.py +44 -20
- msreport/reader.py +321 -40
- msreport/rinterface/limma.py +1 -1
- {msreport-0.0.29.dist-info → msreport-0.0.31.dist-info}/METADATA +20 -2
- msreport-0.0.31.dist-info/RECORD +38 -0
- {msreport-0.0.29.dist-info → msreport-0.0.31.dist-info}/WHEEL +1 -1
- msreport-0.0.29.dist-info/RECORD +0 -38
- {msreport-0.0.29.dist-info → msreport-0.0.31.dist-info}/licenses/LICENSE.txt +0 -0
- {msreport-0.0.29.dist-info → msreport-0.0.31.dist-info}/top_level.txt +0 -0
msreport/reader.py
CHANGED
|
@@ -1,17 +1,18 @@
|
|
|
1
|
-
"""
|
|
2
|
-
to a standardized format following the MsReport convention.
|
|
1
|
+
"""Provides tools for importing and standardizing quantitative proteomics data.
|
|
3
2
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
3
|
+
This module offers software-specific reader classes to import raw result tables (e.g.,
|
|
4
|
+
proteins, peptides, ions) from various proteomics software (MaxQuant, FragPipe,
|
|
5
|
+
Spectronaut) and convert them into a standardized `msreport` format. Additionally, it
|
|
6
|
+
provides functions for annotating imported data with biological metadata, such as
|
|
7
|
+
protein information (e.g., sequence length, molecular weight) and peptide positions,
|
|
8
|
+
extracted from a ProteinDatabase (FASTA file).
|
|
7
9
|
|
|
8
|
-
New
|
|
10
|
+
New columns added to imported protein tables:
|
|
9
11
|
- Representative protein
|
|
10
12
|
- Leading proteins
|
|
11
13
|
- Protein reported by software
|
|
12
14
|
|
|
13
|
-
|
|
14
|
-
- Total peptides
|
|
15
|
+
Standardized column names for quantitative values (if available in the software output):
|
|
15
16
|
- Spectral count "sample name"
|
|
16
17
|
- Unique spectral count "sample name"
|
|
17
18
|
- Total spectral count "sample name"
|
|
@@ -38,6 +39,7 @@ from msreport.helper.temp import extract_window_around_position
|
|
|
38
39
|
class Protein(Protocol):
|
|
39
40
|
"""Abstract protein entry"""
|
|
40
41
|
|
|
42
|
+
# identifier: str
|
|
41
43
|
header: str
|
|
42
44
|
sequence: str
|
|
43
45
|
header_fields: dict[str, str]
|
|
@@ -46,9 +48,9 @@ class Protein(Protocol):
|
|
|
46
48
|
class ProteinDatabase(Protocol):
|
|
47
49
|
"""Abstract protein database"""
|
|
48
50
|
|
|
49
|
-
def __getitem__(self,
|
|
51
|
+
def __getitem__(self, identifier: str) -> Protein: ...
|
|
50
52
|
|
|
51
|
-
def __contains__(self,
|
|
53
|
+
def __contains__(self, identifier: str) -> bool: ...
|
|
52
54
|
|
|
53
55
|
|
|
54
56
|
class ResultReader:
|
|
@@ -497,7 +499,9 @@ class MaxQuantReader(ResultReader):
|
|
|
497
499
|
mod_probability_columns = msreport.helper.find_columns(new_df, "Probabilities")
|
|
498
500
|
localization_string_column = "Modification localization string"
|
|
499
501
|
|
|
500
|
-
mod_localization_probabilities
|
|
502
|
+
mod_localization_probabilities: list[dict[str, dict[int, float]]] = [
|
|
503
|
+
{} for _ in range(new_df.shape[0])
|
|
504
|
+
]
|
|
501
505
|
for probability_column in mod_probability_columns:
|
|
502
506
|
# FUTURE: Type should be checked and enforced during the import
|
|
503
507
|
if not pd.api.types.is_string_dtype(new_df[probability_column].dtype):
|
|
@@ -541,6 +545,8 @@ class FragPipeReader(ResultReader):
|
|
|
541
545
|
"""FragPipe result reader.
|
|
542
546
|
|
|
543
547
|
Methods:
|
|
548
|
+
import_design: Reads a "fragpipe-files.fp-manifest" file and returns a
|
|
549
|
+
processed design dataframe.
|
|
544
550
|
import_proteins: Reads a "combined_protein.tsv" or "protein.tsv" file and
|
|
545
551
|
returns a processed dataframe, conforming to the MsReport naming
|
|
546
552
|
convention.
|
|
@@ -583,12 +589,19 @@ class FragPipeReader(ResultReader):
|
|
|
583
589
|
"ions": "combined_ion.tsv",
|
|
584
590
|
"ion_evidence": "ion.tsv",
|
|
585
591
|
"psm_evidence": "psm.tsv",
|
|
592
|
+
"design": "fragpipe-files.fp-manifest",
|
|
586
593
|
}
|
|
587
594
|
isobar_filenames: dict[str, str] = {
|
|
588
595
|
"proteins": "protein.tsv",
|
|
589
596
|
"peptides": "peptide.tsv",
|
|
590
597
|
"ions": "ion.tsv",
|
|
591
598
|
}
|
|
599
|
+
sil_filenames: dict[str, str] = {
|
|
600
|
+
"proteins": "combined_protein_label_quant.tsv",
|
|
601
|
+
"peptides": "combined_modified_peptide_label_quant.tsv",
|
|
602
|
+
"ions": "combined_ion_label_quant.tsv",
|
|
603
|
+
}
|
|
604
|
+
|
|
592
605
|
protected_columns: list[str] = []
|
|
593
606
|
sample_column_tags: list[str] = [
|
|
594
607
|
"Spectral Count",
|
|
@@ -609,6 +622,7 @@ class FragPipeReader(ResultReader):
|
|
|
609
622
|
"Modified Sequence": "Modified sequence", # Modified peptide and ion
|
|
610
623
|
"Start": "Start position", # Peptide and ion
|
|
611
624
|
"End": "End position", # Peptide and ion
|
|
625
|
+
"Mapped Proteins": "Mapped proteins", # All PSM, ion, and peptide tables
|
|
612
626
|
"Combined Total Peptides": "Total peptides", # From LFQ
|
|
613
627
|
"Total Peptides": "Total peptides", # From TMT
|
|
614
628
|
"Description": "Protein name",
|
|
@@ -638,7 +652,11 @@ class FragPipeReader(ResultReader):
|
|
|
638
652
|
protein_info_tags: list[str] = []
|
|
639
653
|
|
|
640
654
|
def __init__(
|
|
641
|
-
self,
|
|
655
|
+
self,
|
|
656
|
+
directory: str,
|
|
657
|
+
isobar: bool = False,
|
|
658
|
+
sil: bool = False,
|
|
659
|
+
contaminant_tag: str = "contam_",
|
|
642
660
|
) -> None:
|
|
643
661
|
"""Initializes the FragPipeReader.
|
|
644
662
|
|
|
@@ -646,16 +664,89 @@ class FragPipeReader(ResultReader):
|
|
|
646
664
|
directory: Location of the FragPipe result folder
|
|
647
665
|
isobar: Set to True if quantification strategy was TMT, iTRAQ or similar;
|
|
648
666
|
default False.
|
|
667
|
+
sil: Set to True if the FragPipe result files are from a stable isotope
|
|
668
|
+
labeling experiment, such as SILAC; default False.
|
|
649
669
|
contaminant_tag: Prefix of Protein ID entries to identify contaminants;
|
|
650
670
|
default "contam_".
|
|
651
671
|
"""
|
|
672
|
+
if sil and isobar:
|
|
673
|
+
raise ValueError("Cannot set both 'isobar' and 'sil' to True.")
|
|
652
674
|
self._add_data_directory(directory)
|
|
653
675
|
self._isobar: bool = isobar
|
|
676
|
+
self._sil: bool = sil
|
|
654
677
|
self._contaminant_tag: str = contaminant_tag
|
|
655
|
-
if
|
|
678
|
+
if isobar:
|
|
679
|
+
self.filenames = self.isobar_filenames
|
|
680
|
+
elif sil:
|
|
681
|
+
self.filenames = self.sil_filenames
|
|
682
|
+
else:
|
|
656
683
|
self.filenames = self.default_filenames
|
|
684
|
+
|
|
685
|
+
def import_design(
|
|
686
|
+
self, filename: Optional[str] = None, sort: bool = False
|
|
687
|
+
) -> pd.DataFrame:
|
|
688
|
+
"""Read a 'fp-manifest' file and returns a processed design dataframe.
|
|
689
|
+
|
|
690
|
+
The manifest columns "Path", "Experiment", and "Bioreplicate" are mapped to the
|
|
691
|
+
design table columns "Rawfile", "Experiment", and "Replicate". The "Rawfile"
|
|
692
|
+
column is extracted as the filename from the full path. The "Sample" column is
|
|
693
|
+
generated by combining "Experiment" and "Replicate" with an underscore
|
|
694
|
+
(e.g., "Experiment_Replicate"), except when "Replicate" is empty, in which case
|
|
695
|
+
"Sample" is set to "Experiment". If "Experiment" is missing, it is set to "exp"
|
|
696
|
+
by default.
|
|
697
|
+
|
|
698
|
+
Args:
|
|
699
|
+
filename: Allows specifying an alternative filename, otherwise the default
|
|
700
|
+
filename is used.
|
|
701
|
+
sort: If True, the design dataframe is sorted by "Experiment" and
|
|
702
|
+
"Replicate"; default False.
|
|
703
|
+
|
|
704
|
+
Returns:
|
|
705
|
+
A dataframe containing the processed design table with columns:
|
|
706
|
+
"Sample", "Experiment", "Replicate", "Rawfile".
|
|
707
|
+
|
|
708
|
+
Raises:
|
|
709
|
+
FileNotFoundError: If the specified manifest file does not exist.
|
|
710
|
+
"""
|
|
711
|
+
if filename is None:
|
|
712
|
+
filepath = os.path.join(self.data_directory, self.filenames["design"])
|
|
657
713
|
else:
|
|
658
|
-
|
|
714
|
+
filepath = os.path.join(self.data_directory, filename)
|
|
715
|
+
if not os.path.exists(filepath):
|
|
716
|
+
raise FileNotFoundError(
|
|
717
|
+
f"File '{filepath}' does not exist. Please check the file path."
|
|
718
|
+
)
|
|
719
|
+
fp_manifest = (
|
|
720
|
+
pd.read_csv(
|
|
721
|
+
filepath, sep="\t", header=None, na_values=[""], keep_default_na=False
|
|
722
|
+
)
|
|
723
|
+
.fillna("")
|
|
724
|
+
.astype(str)
|
|
725
|
+
)
|
|
726
|
+
fp_manifest.columns = ["Path", "Experiment", "Bioreplicate", "Data type"]
|
|
727
|
+
|
|
728
|
+
design = pd.DataFrame(
|
|
729
|
+
{
|
|
730
|
+
"Sample": "",
|
|
731
|
+
"Experiment": fp_manifest["Experiment"],
|
|
732
|
+
"Replicate": fp_manifest["Bioreplicate"],
|
|
733
|
+
"Rawfile": fp_manifest["Path"].apply(
|
|
734
|
+
# Required to handle Windows and Unix style paths on either system
|
|
735
|
+
lambda x: x.replace("\\", "/").split("/")[-1]
|
|
736
|
+
),
|
|
737
|
+
}
|
|
738
|
+
)
|
|
739
|
+
# FragPipe uses "exp" for missing 'Experiment' values
|
|
740
|
+
design.loc[design["Experiment"] == "", "Experiment"] = "exp"
|
|
741
|
+
# FragPipe combines 'Experiment' + "_" + 'Replicate' into 'Sample', except when
|
|
742
|
+
# 'Replicate' is empty, in which case 'Sample' is set to 'Experiment'.
|
|
743
|
+
design["Sample"] = design["Experiment"] + "_" + design["Replicate"]
|
|
744
|
+
design.loc[design["Replicate"] == "", "Sample"] = design["Experiment"]
|
|
745
|
+
|
|
746
|
+
if sort:
|
|
747
|
+
design.sort_values(by=["Experiment", "Replicate"], inplace=True)
|
|
748
|
+
design.reset_index(drop=True, inplace=True)
|
|
749
|
+
return design
|
|
659
750
|
|
|
660
751
|
def import_proteins(
|
|
661
752
|
self,
|
|
@@ -737,6 +828,7 @@ class FragPipeReader(ResultReader):
|
|
|
737
828
|
df = self._read_file("peptides" if filename is None else filename)
|
|
738
829
|
df["Protein reported by software"] = _extract_protein_ids(df["Protein"])
|
|
739
830
|
df["Representative protein"] = df["Protein reported by software"]
|
|
831
|
+
df["Mapped Proteins"] = self._collect_mapped_proteins(df)
|
|
740
832
|
# Note that _add_protein_entries would need to be adapted for the peptide table.
|
|
741
833
|
# df = self._add_protein_entries(df)
|
|
742
834
|
if rename_columns:
|
|
@@ -793,6 +885,8 @@ class FragPipeReader(ResultReader):
|
|
|
793
885
|
# 'Indistinguishable Proteins' to the ion table.
|
|
794
886
|
df["Protein reported by software"] = _extract_protein_ids(df["Protein"])
|
|
795
887
|
df["Representative protein"] = df["Protein reported by software"]
|
|
888
|
+
df["Mapped Proteins"] = self._collect_mapped_proteins(df)
|
|
889
|
+
|
|
796
890
|
if rename_columns:
|
|
797
891
|
df = self._rename_columns(df, prefix_column_tags)
|
|
798
892
|
if rewrite_modifications and rename_columns:
|
|
@@ -879,6 +973,8 @@ class FragPipeReader(ResultReader):
|
|
|
879
973
|
# 'Indistinguishable Proteins' to the ion table.
|
|
880
974
|
df["Protein reported by software"] = _extract_protein_ids(df["Protein"])
|
|
881
975
|
df["Representative protein"] = df["Protein reported by software"]
|
|
976
|
+
df["Mapped Proteins"] = self._collect_mapped_proteins(df)
|
|
977
|
+
|
|
882
978
|
if rename_columns:
|
|
883
979
|
df = self._rename_columns(df, prefix_column_tags)
|
|
884
980
|
if rewrite_modifications and rename_columns:
|
|
@@ -891,7 +987,7 @@ class FragPipeReader(ResultReader):
|
|
|
891
987
|
filename: Optional[str] = None,
|
|
892
988
|
rename_columns: bool = True,
|
|
893
989
|
rewrite_modifications: bool = True,
|
|
894
|
-
):
|
|
990
|
+
) -> pd.DataFrame:
|
|
895
991
|
"""Concatenate all "psm.tsv" files and return a processed dataframe.
|
|
896
992
|
|
|
897
993
|
Args:
|
|
@@ -928,23 +1024,7 @@ class FragPipeReader(ResultReader):
|
|
|
928
1024
|
|
|
929
1025
|
df["Protein reported by software"] = _extract_protein_ids(df["Protein"])
|
|
930
1026
|
df["Representative protein"] = df["Protein reported by software"]
|
|
931
|
-
df["Mapped Proteins"] =
|
|
932
|
-
|
|
933
|
-
# FP only lists additional mapped proteins in the "Mapped Proteins" column
|
|
934
|
-
# MsReport reports all matching proteins in the "Mapped proteins" column
|
|
935
|
-
mapped_proteins_entries = []
|
|
936
|
-
for protein, mapped_protein_fp in zip(
|
|
937
|
-
df["Representative protein"], df["Mapped Proteins"], strict=True
|
|
938
|
-
):
|
|
939
|
-
if mapped_protein_fp == "":
|
|
940
|
-
mapped_proteins = [protein]
|
|
941
|
-
else:
|
|
942
|
-
additional_mapped_proteins = msreport.reader._extract_protein_ids(
|
|
943
|
-
mapped_protein_fp.split(", ")
|
|
944
|
-
)
|
|
945
|
-
mapped_proteins = [protein] + additional_mapped_proteins
|
|
946
|
-
mapped_proteins_entries.append(";".join(mapped_proteins))
|
|
947
|
-
df["Mapped proteins"] = mapped_proteins_entries
|
|
1027
|
+
df["Mapped Proteins"] = self._collect_mapped_proteins(df)
|
|
948
1028
|
|
|
949
1029
|
if rename_columns:
|
|
950
1030
|
df = self._rename_columns(df, prefix_tag=True)
|
|
@@ -980,6 +1060,35 @@ class FragPipeReader(ResultReader):
|
|
|
980
1060
|
df[key] = protein_entry_table[key]
|
|
981
1061
|
return df
|
|
982
1062
|
|
|
1063
|
+
def _collect_mapped_proteins(self, df: pd.DataFrame) -> list[str]:
|
|
1064
|
+
"""Generates a list of mapped proteins entries.
|
|
1065
|
+
|
|
1066
|
+
This method extracts protein IDs from the 'Representative protein' and the
|
|
1067
|
+
'Mapped Proteins' column and combines them into a single string for each row,
|
|
1068
|
+
where multiple protein IDs are separated by semicolons.
|
|
1069
|
+
|
|
1070
|
+
Args:
|
|
1071
|
+
df: DataFrame containing the 'Representative protein' and 'Mapped Proteins' columns.
|
|
1072
|
+
|
|
1073
|
+
Returns:
|
|
1074
|
+
A list of mapped proteins entries.
|
|
1075
|
+
"""
|
|
1076
|
+
mapped_proteins_entries = []
|
|
1077
|
+
for protein, mapped_protein_fp in zip(
|
|
1078
|
+
df["Representative protein"],
|
|
1079
|
+
df["Mapped Proteins"].astype(str).replace("nan", ""),
|
|
1080
|
+
strict=True,
|
|
1081
|
+
):
|
|
1082
|
+
if mapped_protein_fp == "":
|
|
1083
|
+
mapped_proteins = [protein]
|
|
1084
|
+
else:
|
|
1085
|
+
additional_mapped_proteins = msreport.reader._extract_protein_ids(
|
|
1086
|
+
mapped_protein_fp.split(", ")
|
|
1087
|
+
)
|
|
1088
|
+
mapped_proteins = [protein] + additional_mapped_proteins
|
|
1089
|
+
mapped_proteins_entries.append(";".join(mapped_proteins))
|
|
1090
|
+
return mapped_proteins_entries
|
|
1091
|
+
|
|
983
1092
|
def _collect_leading_protein_entries(self, df: pd.DataFrame) -> list[list[str]]:
|
|
984
1093
|
"""Generates a list of leading protein entries.
|
|
985
1094
|
|
|
@@ -995,6 +1104,9 @@ class FragPipeReader(ResultReader):
|
|
|
995
1104
|
A list of the same length as the input dataframe. Each position contains a
|
|
996
1105
|
list of leading protein entries, with a minimum of one entry.
|
|
997
1106
|
"""
|
|
1107
|
+
if self._sil: # No "Indistinguishable Proteins" columns in 'SIL' data
|
|
1108
|
+
return [[p] for p in df["Protein"]]
|
|
1109
|
+
|
|
998
1110
|
leading_protein_entries = []
|
|
999
1111
|
for protein_entry, indist_protein_entry in zip(
|
|
1000
1112
|
df["Protein"], df["Indistinguishable Proteins"].fillna("").astype(str)
|
|
@@ -1411,6 +1523,7 @@ class SpectronautReader(ResultReader):
|
|
|
1411
1523
|
filename: Optional[str] = None,
|
|
1412
1524
|
filetag: Optional[str] = None,
|
|
1413
1525
|
rename_columns: bool = True,
|
|
1526
|
+
rewrite_modifications: bool = True,
|
|
1414
1527
|
) -> pd.DataFrame:
|
|
1415
1528
|
"""Reads an ion evidence file (long format) and returns a processed dataframe.
|
|
1416
1529
|
|
|
@@ -1420,8 +1533,15 @@ class SpectronautReader(ResultReader):
|
|
|
1420
1533
|
generated by concatenating the "Modified sequence" and "Charge" columns, and if
|
|
1421
1534
|
present, the "Compensation voltage" column.
|
|
1422
1535
|
|
|
1423
|
-
|
|
1424
|
-
are
|
|
1536
|
+
"Modified sequence" entries contain modifications within square brackets.
|
|
1537
|
+
"Modification" entries are strings in the form of "position:modification_tag",
|
|
1538
|
+
multiple modifications are joined by ";". An example for a modified sequence and
|
|
1539
|
+
a modification entry: "PEPT[Phospho]IDO[Oxidation]", "4:Phospho;7:Oxidation".
|
|
1540
|
+
|
|
1541
|
+
"Modification localization string" contains localization probabilities in the
|
|
1542
|
+
format "Mod1@Site1:Probability1,Site2:Probability2;Mod2@Site3:Probability3",
|
|
1543
|
+
e.g. "15.9949@11:1.000;79.9663@3:0.200,4:0.800". Refer to
|
|
1544
|
+
`msreport.peptidoform.make_localization_string` for details.
|
|
1425
1545
|
|
|
1426
1546
|
Args:
|
|
1427
1547
|
filename: Optional, allows specifying a specific file that will be imported.
|
|
@@ -1429,6 +1549,10 @@ class SpectronautReader(ResultReader):
|
|
|
1429
1549
|
a substring, instead of specifying a filename.
|
|
1430
1550
|
rename_columns: If True, columns are renamed according to the MsReport
|
|
1431
1551
|
convention; default True.
|
|
1552
|
+
rewrite_modifications: If True, the peptide format in "Modified sequence" is
|
|
1553
|
+
changed according to the MsReport convention, and a "Modifications" column is
|
|
1554
|
+
added to contain the amino acid position for all modifications.
|
|
1555
|
+
Requires 'rename_columns' to be true. Default True.
|
|
1432
1556
|
|
|
1433
1557
|
Returns:
|
|
1434
1558
|
A dataframe containing the processed ion table.
|
|
@@ -1456,6 +1580,9 @@ class SpectronautReader(ResultReader):
|
|
|
1456
1580
|
df = self._add_protein_entries(df)
|
|
1457
1581
|
if rename_columns:
|
|
1458
1582
|
df = self._rename_columns(df, True)
|
|
1583
|
+
if rewrite_modifications and rename_columns:
|
|
1584
|
+
df = self._add_peptide_modification_entries(df)
|
|
1585
|
+
df = self._add_modification_localization_string(df)
|
|
1459
1586
|
df["Ion ID"] = df["Modified sequence"] + "_c" + df["Charge"].astype(str)
|
|
1460
1587
|
if "Compensation voltage" in df.columns:
|
|
1461
1588
|
_cv = df["Compensation voltage"].astype(str)
|
|
@@ -1509,6 +1636,70 @@ class SpectronautReader(ResultReader):
|
|
|
1509
1636
|
leading_protein_entries = df["PG.ProteinAccessions"].str.split(";").tolist()
|
|
1510
1637
|
return leading_protein_entries
|
|
1511
1638
|
|
|
1639
|
+
def _add_peptide_modification_entries(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
1640
|
+
"""Adds standardized "Modified sequence" and "Modifications" columns.
|
|
1641
|
+
|
|
1642
|
+
"Modified sequence" entries contain modifications within square brackets.
|
|
1643
|
+
"Modifications" entries are strings in the form of "position:modification_text",
|
|
1644
|
+
multiple modifications are joined by ";". An example for a modified sequence and
|
|
1645
|
+
a modification entry: "PEPT[Phospho]IDO[Oxidation]", "4:Phospho;7:Oxidation".
|
|
1646
|
+
|
|
1647
|
+
Requires the columns "Peptide sequence" and "Modified sequence" from the
|
|
1648
|
+
software output.
|
|
1649
|
+
|
|
1650
|
+
Args:
|
|
1651
|
+
df: Dataframe containing "Peptide sequence" and "Modified sequence" columns.
|
|
1652
|
+
|
|
1653
|
+
Returns:
|
|
1654
|
+
A copy of the input dataframe with updated columns.
|
|
1655
|
+
"""
|
|
1656
|
+
# TODO: not tested
|
|
1657
|
+
mod_sequences = df["Modified sequence"].str[1:-1] # Remove surrounding "_"
|
|
1658
|
+
mod_entries = _generate_modification_entries(
|
|
1659
|
+
df["Peptide sequence"], mod_sequences, "[", "]"
|
|
1660
|
+
)
|
|
1661
|
+
new_df = df.copy()
|
|
1662
|
+
new_df["Modified sequence"] = mod_entries["Modified sequence"]
|
|
1663
|
+
new_df["Modifications"] = mod_entries["Modifications"]
|
|
1664
|
+
return new_df
|
|
1665
|
+
|
|
1666
|
+
def _add_modification_localization_string(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
1667
|
+
"""Adds modification localization string columns.
|
|
1668
|
+
|
|
1669
|
+
Extracts localization probabilities from the "EG.PTMLocalizationProbabilities"
|
|
1670
|
+
column, converts them into the standardized modification localization string
|
|
1671
|
+
format used by msreport, and adds a new column "Modification localization string".
|
|
1672
|
+
|
|
1673
|
+
Probabilities are written in the format
|
|
1674
|
+
"Mod1@Site1:Probability1,Site2:Probability2;Mod2@Site3:Probability3",
|
|
1675
|
+
e.g. "15.9949@11:1.000;79.9663@3:0.200,4:0.800". Refer to
|
|
1676
|
+
`msreport.peptidoform.make_localization_string` for details.
|
|
1677
|
+
|
|
1678
|
+
Args:
|
|
1679
|
+
df: Dataframe containing a "EG.PTMLocalizationProbabilities" column.
|
|
1680
|
+
|
|
1681
|
+
Returns:
|
|
1682
|
+
A copy of the input dataframe with the added column
|
|
1683
|
+
"Modification localization string".
|
|
1684
|
+
"""
|
|
1685
|
+
# TODO: not tested
|
|
1686
|
+
new_df = df.copy()
|
|
1687
|
+
localization_strings = []
|
|
1688
|
+
for localization_entry in new_df["EG.PTMLocalizationProbabilities"]:
|
|
1689
|
+
if localization_entry == "":
|
|
1690
|
+
localization_strings.append("")
|
|
1691
|
+
continue
|
|
1692
|
+
|
|
1693
|
+
localization_probabilities = extract_spectronaut_localization_probabilities(
|
|
1694
|
+
localization_entry
|
|
1695
|
+
)
|
|
1696
|
+
localization_string = msreport.peptidoform.make_localization_string(
|
|
1697
|
+
localization_probabilities
|
|
1698
|
+
)
|
|
1699
|
+
localization_strings.append(localization_string)
|
|
1700
|
+
new_df["Modification localization string"] = localization_strings
|
|
1701
|
+
return new_df
|
|
1702
|
+
|
|
1512
1703
|
|
|
1513
1704
|
def sort_leading_proteins(
|
|
1514
1705
|
table: pd.DataFrame,
|
|
@@ -1551,7 +1742,7 @@ def sort_leading_proteins(
|
|
|
1551
1742
|
db_origins_present = "Leading proteins database origin" in table
|
|
1552
1743
|
|
|
1553
1744
|
if database_order is not None:
|
|
1554
|
-
database_encoding = defaultdict(lambda: 999)
|
|
1745
|
+
database_encoding: dict[str, int] = defaultdict(lambda: 999)
|
|
1555
1746
|
database_encoding.update({db: i for i, db in enumerate(database_order)})
|
|
1556
1747
|
if penalize_contaminants is not None:
|
|
1557
1748
|
contaminant_encoding = {"False": 0, "True": 1, False: 0, True: 1}
|
|
@@ -1559,7 +1750,7 @@ def sort_leading_proteins(
|
|
|
1559
1750
|
for _, row in table.iterrows():
|
|
1560
1751
|
protein_ids = row["Leading proteins"].split(";")
|
|
1561
1752
|
|
|
1562
|
-
sorting_info = [[] for _ in protein_ids]
|
|
1753
|
+
sorting_info: list[list] = [[] for _ in protein_ids]
|
|
1563
1754
|
if special_proteins is not None:
|
|
1564
1755
|
for i, _id in enumerate(protein_ids):
|
|
1565
1756
|
sorting_info[i].append(_id not in special_proteins)
|
|
@@ -1699,7 +1890,7 @@ def add_protein_site_annotation(
|
|
|
1699
1890
|
protein_db: ProteinDatabase,
|
|
1700
1891
|
protein_column: str = "Representative protein",
|
|
1701
1892
|
site_column: str = "Protein site",
|
|
1702
|
-
):
|
|
1893
|
+
) -> pd.DataFrame:
|
|
1703
1894
|
"""Uses a FASTA protein database to add protein site annotation columns.
|
|
1704
1895
|
|
|
1705
1896
|
Adds the columns "Modified residue", which corresponds to the amino acid at the
|
|
@@ -1837,6 +2028,61 @@ def add_leading_proteins_annotation(
|
|
|
1837
2028
|
return table
|
|
1838
2029
|
|
|
1839
2030
|
|
|
2031
|
+
def add_protein_site_identifiers(
|
|
2032
|
+
table: pd.DataFrame,
|
|
2033
|
+
protein_db: ProteinDatabase,
|
|
2034
|
+
site_column: str,
|
|
2035
|
+
protein_name_column: str,
|
|
2036
|
+
):
|
|
2037
|
+
"""Adds a "Protein site identifier" column to the 'table'.
|
|
2038
|
+
|
|
2039
|
+
The "Protein site identifier" is generated by concatenating the protein name
|
|
2040
|
+
with the amino acid and position of the protein site or sites, e.g. "P12345 - S123"
|
|
2041
|
+
or "P12345 - S123 / T125". The amino acid is extracted from the protein sequence at
|
|
2042
|
+
the position of the site. If the protein name is not available, the
|
|
2043
|
+
"Representative protein" entry is used instead.
|
|
2044
|
+
|
|
2045
|
+
Args:
|
|
2046
|
+
table: Dataframe to which the protein site identifiers are added.
|
|
2047
|
+
protein_db: A protein database containing entries from one or multiple FASTA
|
|
2048
|
+
files. Protein identifiers in the 'table' column "Representative protein"
|
|
2049
|
+
are used to look up entries in the 'protein_db'.
|
|
2050
|
+
site_column: Column in 'table' that contains protein site positions. Positions
|
|
2051
|
+
are one-indexed, meaning the first amino acid of the protein is position 1.
|
|
2052
|
+
Multiple sites in a single entry should be separated by ";".
|
|
2053
|
+
protein_name_column: Column in 'table' that contains protein names, which will
|
|
2054
|
+
be used to generate the identifier. If no name is available, the accession
|
|
2055
|
+
is used instead.
|
|
2056
|
+
|
|
2057
|
+
Raises:
|
|
2058
|
+
ValueError: If the "Representative protein", 'protein_name_column' or
|
|
2059
|
+
'site_column' is not found in the 'table'.
|
|
2060
|
+
"""
|
|
2061
|
+
if site_column not in table.columns:
|
|
2062
|
+
raise ValueError(f"Column '{site_column}' not found in the table.")
|
|
2063
|
+
if protein_name_column not in table.columns:
|
|
2064
|
+
raise ValueError(f"Column '{protein_name_column}' not found in the table.")
|
|
2065
|
+
if "Representative protein" not in table.columns:
|
|
2066
|
+
raise ValueError("Column 'Representative protein' not found in the table.")
|
|
2067
|
+
|
|
2068
|
+
site_identifiers = []
|
|
2069
|
+
for accession, sites, name in zip(
|
|
2070
|
+
table["Representative protein"],
|
|
2071
|
+
table[site_column].astype(str),
|
|
2072
|
+
table[protein_name_column],
|
|
2073
|
+
):
|
|
2074
|
+
protein_sequence = protein_db[accession].sequence
|
|
2075
|
+
protein_identifier = name if name else accession
|
|
2076
|
+
aa_sites = []
|
|
2077
|
+
for site in sites.split(";"):
|
|
2078
|
+
aa = protein_sequence[int(site) - 1]
|
|
2079
|
+
aa_sites.append(f"{aa}{site}")
|
|
2080
|
+
aa_site_tag = " / ".join(aa_sites)
|
|
2081
|
+
site_identifier = f"{protein_identifier} - {aa_site_tag}"
|
|
2082
|
+
site_identifiers.append(site_identifier)
|
|
2083
|
+
table["Protein site identifier"] = site_identifiers
|
|
2084
|
+
|
|
2085
|
+
|
|
1840
2086
|
def add_sequence_coverage(
|
|
1841
2087
|
protein_table: pd.DataFrame,
|
|
1842
2088
|
peptide_table: pd.DataFrame,
|
|
@@ -2296,7 +2542,9 @@ def _extract_fragpipe_assigned_modifications(
|
|
|
2296
2542
|
return modifications
|
|
2297
2543
|
|
|
2298
2544
|
|
|
2299
|
-
def extract_maxquant_localization_probabilities(
|
|
2545
|
+
def extract_maxquant_localization_probabilities(
|
|
2546
|
+
localization_entry: str,
|
|
2547
|
+
) -> dict[int, float]:
|
|
2300
2548
|
"""Extract localization probabilites from a MaxQuant "Probabilities" entry.
|
|
2301
2549
|
|
|
2302
2550
|
Args:
|
|
@@ -2353,6 +2601,39 @@ def extract_fragpipe_localization_probabilities(localization_entry: str) -> dict
|
|
|
2353
2601
|
return modification_probabilities
|
|
2354
2602
|
|
|
2355
2603
|
|
|
2604
|
+
def extract_spectronaut_localization_probabilities(localization_entry: str) -> dict:
|
|
2605
|
+
"""Extract localization probabilites from a Spectronaut localization entry.
|
|
2606
|
+
|
|
2607
|
+
Args:
|
|
2608
|
+
localization_entry: Entry from the "EG.PTMLocalizationProbabilities" column of a
|
|
2609
|
+
Spectronaut elution group (EG) output table.
|
|
2610
|
+
|
|
2611
|
+
Returns:
|
|
2612
|
+
A dictionary of modifications containing a dictionary of {position: probability}
|
|
2613
|
+
mappings. Positions are one-indexed, which means that the first amino acid
|
|
2614
|
+
position is 1.
|
|
2615
|
+
|
|
2616
|
+
Example:
|
|
2617
|
+
>>> extract_spectronaut_localization_probabilities(
|
|
2618
|
+
... "_HM[Oxidation (M): 100%]S[Phospho (STY): 45.5%]GS[Phospho (STY): 54.5%]PG_"
|
|
2619
|
+
... )
|
|
2620
|
+
{'Oxidation (M)': {2: 1.0}, 'Phospho (STY)': {3: 0.455, 5: 0.545}}
|
|
2621
|
+
"""
|
|
2622
|
+
modification_probabilities: dict[str, dict[int, float]] = {}
|
|
2623
|
+
localization_entry = localization_entry.strip("_")
|
|
2624
|
+
_, raw_probability_entries = msreport.peptidoform.parse_modified_sequence(
|
|
2625
|
+
localization_entry, "[", "]"
|
|
2626
|
+
)
|
|
2627
|
+
|
|
2628
|
+
for site, mod_probability_entry in raw_probability_entries:
|
|
2629
|
+
modification, probability_entry = mod_probability_entry.split(": ")
|
|
2630
|
+
if modification not in modification_probabilities:
|
|
2631
|
+
modification_probabilities[modification] = {}
|
|
2632
|
+
probability = float(probability_entry.replace("%", "")) / 100.0
|
|
2633
|
+
modification_probabilities[modification][site] = probability
|
|
2634
|
+
return modification_probabilities
|
|
2635
|
+
|
|
2636
|
+
|
|
2356
2637
|
def _extract_protein_ids(entries: list[str]) -> list[str]:
|
|
2357
2638
|
"""Returns a list of protein IDs, extracted from protein entries.
|
|
2358
2639
|
|
|
@@ -2466,8 +2747,8 @@ def _create_multi_protein_annotations_from_db(
|
|
|
2466
2747
|
query_result.append(query_function(db_entry, default_value))
|
|
2467
2748
|
else:
|
|
2468
2749
|
query_result.append(default_value)
|
|
2469
|
-
|
|
2470
|
-
annotation_values.append(
|
|
2750
|
+
annotation_value = ";".join(map(str, query_result))
|
|
2751
|
+
annotation_values.append(annotation_value)
|
|
2471
2752
|
return annotation_values
|
|
2472
2753
|
|
|
2473
2754
|
|
msreport/rinterface/limma.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: msreport
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.31
|
|
4
4
|
Summary: Post processing and analysis of quantitative proteomics data
|
|
5
5
|
Author-email: "David M. Hollenstein" <hollenstein.david@gmail.com>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
7
7
|
Project-URL: homepage, https://github.com/hollenstein/msreport
|
|
8
|
+
Project-URL: documentation, https://hollenstein.github.io/msreport/
|
|
8
9
|
Project-URL: changelog, https://github.com/hollenstein/msreport/blob/main/CHANGELOG.md
|
|
9
10
|
Keywords: mass spectrometry,proteomics,post processing,data analysis
|
|
10
11
|
Classifier: Development Status :: 4 - Beta
|
|
@@ -29,10 +30,17 @@ Requires-Dist: seaborn>=0.12.0
|
|
|
29
30
|
Requires-Dist: statsmodels>=0.13.2
|
|
30
31
|
Requires-Dist: typing_extensions>=4
|
|
31
32
|
Provides-Extra: r
|
|
32
|
-
Requires-Dist: rpy2
|
|
33
|
+
Requires-Dist: rpy2<3.5.13,>=3.5.3; extra == "r"
|
|
33
34
|
Provides-Extra: dev
|
|
34
35
|
Requires-Dist: mypy>=1.15.0; extra == "dev"
|
|
35
36
|
Requires-Dist: pytest>=8.3.5; extra == "dev"
|
|
37
|
+
Provides-Extra: docs
|
|
38
|
+
Requires-Dist: mkdocs-awesome-nav>=3.1.2; extra == "docs"
|
|
39
|
+
Requires-Dist: mkdocs-macros-plugin>=1.3.7; extra == "docs"
|
|
40
|
+
Requires-Dist: mkdocs-material>=9.6.15; extra == "docs"
|
|
41
|
+
Requires-Dist: mkdocs-roamlinks-plugin>=0.3.2; extra == "docs"
|
|
42
|
+
Requires-Dist: mkdocstrings-python>=1.16.12; extra == "docs"
|
|
43
|
+
Requires-Dist: ruff>=0.12.2; extra == "docs"
|
|
36
44
|
Provides-Extra: test
|
|
37
45
|
Requires-Dist: pytest>=8.3.5; extra == "test"
|
|
38
46
|
Dynamic: license-file
|
|
@@ -40,6 +48,7 @@ Dynamic: license-file
|
|
|
40
48
|
# MsReport
|
|
41
49
|
|
|
42
50
|
[Project Status: WIP](https://www.repostatus.org/#wip)
|
|
51
|
+
[DOI: 10.5281/zenodo.15309090](https://doi.org/10.5281/zenodo.15309090)
|
|
43
52
|

|
|
44
53
|
[Run tests](https://github.com/hollenstein/msreport/actions/workflows/run-tests.yml)
|
|
45
54
|
|
|
@@ -55,6 +64,7 @@ bottom-up mass spectrometry experiments.
|
|
|
55
64
|
- [Additional requirements](#additional-requirements)
|
|
56
65
|
- [Optional Dependencies](#optional-dependencies)
|
|
57
66
|
- [Development status](#development-status)
|
|
67
|
+
- [How to cite](#how-to-cite)
|
|
58
68
|
|
|
59
69
|
## What is MsReport?
|
|
60
70
|
|
|
@@ -62,6 +72,8 @@ MsReport is a Python library designed to simplify the post-processing and analys
|
|
|
62
72
|
|
|
63
73
|
The library supports importing protein and peptide-level quantification results from MaxQuant, FragPipe, and Spectronaut, as well as post-translational modification (PTM) data from MaxQuant and FragPipe. MsReport provides tools for data annotation, normalization and transformation, statistical testing, and data visualization.
|
|
64
74
|
|
|
75
|
+
The [documentation](https://hollenstein.github.io/msreport/) provides an overview of the library's public API.
|
|
76
|
+
|
|
65
77
|
### Key features of MsReport
|
|
66
78
|
|
|
67
79
|
#### Data Import and Standardization
|
|
@@ -134,3 +146,9 @@ For example, the R home directory might look like this on Windows: `C:\Program F
|
|
|
134
146
|
## Development status
|
|
135
147
|
|
|
136
148
|
MsReport is a stable and reliable library that has been used on a daily basis for over two years in the Mass Spectrometry Facility at the Max Perutz Labs and the Mass Spectrometry Facility of IMP/IMBA/GMI. While the current interface of MsReport is stable, the library is still under active development, with new features being added regularly. Please note that a major rewrite is planned, which may introduce changes to the API in the future.
|
|
149
|
+
|
|
150
|
+
## How to cite
|
|
151
|
+
|
|
152
|
+
If you use MsReport for your research or publications, please include the following citation and consider giving the project a star on GitHub.
|
|
153
|
+
|
|
154
|
+
> Hollenstein, D. M., & Hartl, M. (2025). hollenstein/msreport: v0.0.29 (0.0.29). Zenodo. https://doi.org/10.5281/zenodo.15309090
|