msreport 0.0.30__py3-none-any.whl → 0.0.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- msreport/__init__.py +1 -1
- msreport/aggregate/__init__.py +10 -0
- msreport/aggregate/condense.py +9 -0
- msreport/aggregate/pivot.py +14 -5
- msreport/aggregate/summarize.py +14 -4
- msreport/analyze.py +67 -5
- msreport/export.py +10 -16
- msreport/fasta.py +9 -2
- msreport/helper/__init__.py +18 -0
- msreport/helper/maxlfq.py +3 -3
- msreport/impute.py +18 -10
- msreport/isobar.py +11 -14
- msreport/normalize.py +95 -10
- msreport/peptidoform.py +21 -11
- msreport/plot/__init__.py +3 -3
- msreport/plot/comparison.py +7 -2
- msreport/plot/multivariate.py +34 -15
- msreport/plot/quality.py +1 -1
- msreport/qtable.py +25 -11
- msreport/reader.py +362 -37
- msreport/rinterface/limma.py +1 -1
- {msreport-0.0.30.dist-info → msreport-0.0.32.dist-info}/METADATA +11 -1
- msreport-0.0.32.dist-info/RECORD +38 -0
- msreport-0.0.30.dist-info/RECORD +0 -38
- {msreport-0.0.30.dist-info → msreport-0.0.32.dist-info}/WHEEL +0 -0
- {msreport-0.0.30.dist-info → msreport-0.0.32.dist-info}/licenses/LICENSE.txt +0 -0
- {msreport-0.0.30.dist-info → msreport-0.0.32.dist-info}/top_level.txt +0 -0
msreport/reader.py
CHANGED
|
@@ -1,17 +1,18 @@
|
|
|
1
|
-
"""
|
|
2
|
-
to a standardized format following the MsReport convention.
|
|
1
|
+
"""Provides tools for importing and standardizing quantitative proteomics data.
|
|
3
2
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
3
|
+
This module offers software-specific reader classes to import raw result tables (e.g.,
|
|
4
|
+
proteins, peptides, ions) from various proteomics software (MaxQuant, FragPipe,
|
|
5
|
+
Spectronaut) and convert them into a standardized `msreport` format. Additionally, it
|
|
6
|
+
provides functions for annotating imported data with biological metadata, such as
|
|
7
|
+
protein information (e.g., sequence length, molecular weight) and peptide positions,
|
|
8
|
+
extracted from a ProteinDatabase (FASTA file).
|
|
7
9
|
|
|
8
|
-
New
|
|
10
|
+
New columns added to imported protein tables:
|
|
9
11
|
- Representative protein
|
|
10
12
|
- Leading proteins
|
|
11
13
|
- Protein reported by software
|
|
12
14
|
|
|
13
|
-
|
|
14
|
-
- Total peptides
|
|
15
|
+
Standardized column names for quantitative values (if available in the software output):
|
|
15
16
|
- Spectral count "sample name"
|
|
16
17
|
- Unique spectral count "sample name"
|
|
17
18
|
- Total spectral count "sample name"
|
|
@@ -38,6 +39,7 @@ from msreport.helper.temp import extract_window_around_position
|
|
|
38
39
|
class Protein(Protocol):
|
|
39
40
|
"""Abstract protein entry"""
|
|
40
41
|
|
|
42
|
+
# identifier: str
|
|
41
43
|
header: str
|
|
42
44
|
sequence: str
|
|
43
45
|
header_fields: dict[str, str]
|
|
@@ -46,9 +48,9 @@ class Protein(Protocol):
|
|
|
46
48
|
class ProteinDatabase(Protocol):
    """Structural interface of a protein database.

    Any object that supports item access and membership tests by protein identifier
    satisfies this protocol.
    """

    def __getitem__(self, identifier: str) -> Protein:
        """Returns the protein entry stored under 'identifier'."""
        ...

    def __contains__(self, identifier: str) -> bool:
        """Returns whether an entry for 'identifier' is present."""
        ...
|
|
52
54
|
|
|
53
55
|
|
|
54
56
|
class ResultReader:
|
|
@@ -497,7 +499,9 @@ class MaxQuantReader(ResultReader):
|
|
|
497
499
|
mod_probability_columns = msreport.helper.find_columns(new_df, "Probabilities")
|
|
498
500
|
localization_string_column = "Modification localization string"
|
|
499
501
|
|
|
500
|
-
mod_localization_probabilities
|
|
502
|
+
mod_localization_probabilities: list[dict[str, dict[int, float]]] = [
|
|
503
|
+
{} for _ in range(new_df.shape[0])
|
|
504
|
+
]
|
|
501
505
|
for probability_column in mod_probability_columns:
|
|
502
506
|
# FUTURE: Type should be checked and enforced during the import
|
|
503
507
|
if not pd.api.types.is_string_dtype(new_df[probability_column].dtype):
|
|
@@ -541,7 +545,12 @@ class FragPipeReader(ResultReader):
|
|
|
541
545
|
"""FragPipe result reader.
|
|
542
546
|
|
|
543
547
|
Methods:
|
|
544
|
-
import_design:
|
|
548
|
+
import_design: Depending on the quantification strategy, imports either the
|
|
549
|
+
manifest file or the experiment annotation file and returns a processed
|
|
550
|
+
design dataframe.
|
|
551
|
+
import_manifest: Reads a "fragpipe-files.fp-manifest" file and returns a
|
|
552
|
+
processed design dataframe.
|
|
553
|
+
import_experiment_annotation: Reads a "experiment_annotation" file and returns a
|
|
545
554
|
processed design dataframe.
|
|
546
555
|
import_proteins: Reads a "combined_protein.tsv" or "protein.tsv" file and
|
|
547
556
|
returns a processed dataframe, conforming to the MsReport naming
|
|
@@ -585,12 +594,8 @@ class FragPipeReader(ResultReader):
|
|
|
585
594
|
"ions": "combined_ion.tsv",
|
|
586
595
|
"ion_evidence": "ion.tsv",
|
|
587
596
|
"psm_evidence": "psm.tsv",
|
|
588
|
-
"
|
|
589
|
-
|
|
590
|
-
isobar_filenames: dict[str, str] = {
|
|
591
|
-
"proteins": "protein.tsv",
|
|
592
|
-
"peptides": "peptide.tsv",
|
|
593
|
-
"ions": "ion.tsv",
|
|
597
|
+
"manifest": "fragpipe-files.fp-manifest",
|
|
598
|
+
"experiment_annotation": "experiment_annotation.tsv",
|
|
594
599
|
}
|
|
595
600
|
sil_filenames: dict[str, str] = {
|
|
596
601
|
"proteins": "combined_protein_label_quant.tsv",
|
|
@@ -671,17 +676,38 @@ class FragPipeReader(ResultReader):
|
|
|
671
676
|
self._isobar: bool = isobar
|
|
672
677
|
self._sil: bool = sil
|
|
673
678
|
self._contaminant_tag: str = contaminant_tag
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
self.filenames
|
|
679
|
+
|
|
680
|
+
self.filenames = self.default_filenames.copy()
|
|
681
|
+
if sil:
|
|
682
|
+
self.filenames.update(self.sil_filenames)
|
|
683
|
+
|
|
684
|
+
def import_design(self, sort: bool = False) -> pd.DataFrame:
    """Imports the experimental design table matching the quantification strategy.

    For isobaric data (`self._isobar` is True) the experiment annotation file is
    imported via `import_experiment_annotation`, otherwise the manifest file is
    imported via `import_manifest`.

    Args:
        sort: Forwarded to the selected import method. If True, the design dataframe
            is sorted by "Experiment" and "Replicate"; default False.

    Returns:
        The processed design dataframe returned by the selected import method.
    """
    # Select the bound import method first, then call it with the shared arguments.
    import_method = (
        self.import_experiment_annotation if self._isobar else self.import_manifest
    )
    return import_method(sort=sort)
|
|
680
698
|
|
|
681
|
-
def
|
|
699
|
+
def import_manifest(
|
|
682
700
|
self, filename: Optional[str] = None, sort: bool = False
|
|
683
701
|
) -> pd.DataFrame:
|
|
684
|
-
"""
|
|
702
|
+
"""Reads a 'fp-manifest' file and returns a processed design dataframe.
|
|
703
|
+
|
|
704
|
+
The manifest columns "Path", "Experiment", and "Bioreplicate" are mapped to the
|
|
705
|
+
design table columns "Rawfile", "Experiment", and "Replicate". The "Rawfile"
|
|
706
|
+
column is extracted as the filename from the full path. The "Sample" column is
|
|
707
|
+
generated by combining "Experiment" and "Replicate" with an underscore
|
|
708
|
+
(e.g., "Experiment_Replicate"), except when "Replicate" is empty, in which case
|
|
709
|
+
"Sample" is set to "Experiment". If "Experiment" is missing, it is set to "exp"
|
|
710
|
+
by default.
|
|
685
711
|
|
|
686
712
|
Args:
|
|
687
713
|
filename: Allows specifying an alternative filename, otherwise the default
|
|
@@ -697,19 +723,25 @@ class FragPipeReader(ResultReader):
|
|
|
697
723
|
FileNotFoundError: If the specified manifest file does not exist.
|
|
698
724
|
"""
|
|
699
725
|
if filename is None:
|
|
700
|
-
filepath = os.path.join(self.data_directory, self.filenames["
|
|
726
|
+
filepath = os.path.join(self.data_directory, self.filenames["manifest"])
|
|
701
727
|
else:
|
|
702
728
|
filepath = os.path.join(self.data_directory, filename)
|
|
703
729
|
if not os.path.exists(filepath):
|
|
704
730
|
raise FileNotFoundError(
|
|
705
731
|
f"File '{filepath}' does not exist. Please check the file path."
|
|
706
732
|
)
|
|
707
|
-
fp_manifest =
|
|
733
|
+
fp_manifest = (
|
|
734
|
+
pd.read_csv(
|
|
735
|
+
filepath, sep="\t", header=None, na_values=[""], keep_default_na=False
|
|
736
|
+
)
|
|
737
|
+
.fillna("")
|
|
738
|
+
.astype(str)
|
|
739
|
+
)
|
|
708
740
|
fp_manifest.columns = ["Path", "Experiment", "Bioreplicate", "Data type"]
|
|
709
741
|
|
|
710
742
|
design = pd.DataFrame(
|
|
711
743
|
{
|
|
712
|
-
"Sample":
|
|
744
|
+
"Sample": "",
|
|
713
745
|
"Experiment": fp_manifest["Experiment"],
|
|
714
746
|
"Replicate": fp_manifest["Bioreplicate"],
|
|
715
747
|
"Rawfile": fp_manifest["Path"].apply(
|
|
@@ -718,10 +750,73 @@ class FragPipeReader(ResultReader):
|
|
|
718
750
|
),
|
|
719
751
|
}
|
|
720
752
|
)
|
|
753
|
+
# FragPipe uses "exp" for missing 'Experiment' values
|
|
754
|
+
design.loc[design["Experiment"] == "", "Experiment"] = "exp"
|
|
755
|
+
# FragPipe combines 'Experiment' + "_" + 'Replicate' into 'Sample', except when
|
|
756
|
+
# 'Replicate' is empty, in which case 'Sample' is set to 'Experiment'.
|
|
757
|
+
design["Sample"] = design["Experiment"] + "_" + design["Replicate"]
|
|
758
|
+
design.loc[design["Replicate"] == "", "Sample"] = design["Experiment"]
|
|
759
|
+
|
|
760
|
+
if sort:
|
|
761
|
+
design.sort_values(by=["Experiment", "Replicate"], inplace=True)
|
|
762
|
+
design.reset_index(drop=True, inplace=True)
|
|
763
|
+
return design
|
|
764
|
+
|
|
765
|
+
def import_experiment_annotation(
    self, filename: Optional[str] = None, sort: bool = False
) -> pd.DataFrame:
    """Reads an 'experiment_annotation' file and returns a processed design dataframe.

    The annotation columns "sample", "channel", and "plex" are mapped to the design
    table columns "Sample", "Channel", and "Plex". "Experiment" and "Replicate" are
    derived from "Sample" by splitting at the last underscore; if a sample name
    contains no underscore, "Replicate" is set to an empty string.

    Note that splitting the "Sample" column this way conforms to the FragPipe
    convention, but FragPipe does not enforce it for the experiment annotation file.

    Args:
        filename: Allows specifying an alternative filename, otherwise the default
            filename is used.
        sort: If True, the design dataframe is sorted by "Experiment" and
            "Replicate"; default False.

    Returns:
        A dataframe containing the processed design table with columns:
        "Sample", "Experiment", "Replicate", "Channel", and "Plex".

    Raises:
        FileNotFoundError: If the specified experiment annotation file does not
            exist.
    """
    annotation_filename = (
        self.filenames["experiment_annotation"] if filename is None else filename
    )
    filepath = os.path.join(self.data_directory, annotation_filename)
    if not os.path.exists(filepath):
        raise FileNotFoundError(
            f"File '{filepath}' does not exist. Please check the file path."
        )

    annotation = pd.read_csv(filepath, sep="\t")

    # Split once at the last underscore; names without an underscore yield a
    # single-element list, so the second element is missing and becomes "".
    sample_parts = annotation["sample"].str.rsplit("_", n=1)
    design = pd.DataFrame(
        {
            "Sample": annotation["sample"],
            "Experiment": sample_parts.str[0],
            "Replicate": sample_parts.str[1],
            "Channel": annotation["channel"],
            "Plex": annotation["plex"],
        }
    )
    design["Replicate"] = design["Replicate"].fillna("")

    if not sort:
        return design

    design.sort_values(by=["Experiment", "Replicate"], inplace=True)
    design.reset_index(drop=True, inplace=True)
    return design
|
|
726
821
|
|
|
727
822
|
def import_proteins(
|
|
@@ -963,7 +1058,7 @@ class FragPipeReader(ResultReader):
|
|
|
963
1058
|
filename: Optional[str] = None,
|
|
964
1059
|
rename_columns: bool = True,
|
|
965
1060
|
rewrite_modifications: bool = True,
|
|
966
|
-
):
|
|
1061
|
+
) -> pd.DataFrame:
|
|
967
1062
|
"""Concatenate all "psm.tsv" files and return a processed dataframe.
|
|
968
1063
|
|
|
969
1064
|
Args:
|
|
@@ -1010,6 +1105,7 @@ class FragPipeReader(ResultReader):
|
|
|
1010
1105
|
)
|
|
1011
1106
|
df["Modified sequence"] = mod_entries["Modified sequence"]
|
|
1012
1107
|
df["Modifications"] = mod_entries["Modifications"]
|
|
1108
|
+
df = self._add_modification_localization_string_to_psm_evidence(df)
|
|
1013
1109
|
return df
|
|
1014
1110
|
|
|
1015
1111
|
def _add_protein_entries(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -1183,6 +1279,66 @@ class FragPipeReader(ResultReader):
|
|
|
1183
1279
|
new_df[new_column] = localization_strings
|
|
1184
1280
|
return new_df
|
|
1185
1281
|
|
|
1282
|
+
def _add_modification_localization_string_to_psm_evidence(
|
|
1283
|
+
self, df: pd.DataFrame
|
|
1284
|
+
) -> pd.DataFrame:
|
|
1285
|
+
"""Adds a modification localization string column to a PSM evidence table.
|
|
1286
|
+
|
|
1287
|
+
Extracts localization probabilities from all columns in the form
|
|
1288
|
+
f"{aa:modification}", converts them into the standardized modification
|
|
1289
|
+
localization string format used by msreport, and adds a new column
|
|
1290
|
+
"Modification localization string".
|
|
1291
|
+
|
|
1292
|
+
Probabilities are written in the format
|
|
1293
|
+
"Mod1@Site1:Probability1,Site2:Probability2;Mod2@Site3:Probability3",
|
|
1294
|
+
e.g. "15.9949@11:1.000;79.9663@3:0.200,4:0.800". Refer to
|
|
1295
|
+
`msreport.peptidoform.make_localization_string` for details.
|
|
1296
|
+
|
|
1297
|
+
Args:
|
|
1298
|
+
df: A dataframe containing PSM tables from FragPipe.
|
|
1299
|
+
|
|
1300
|
+
Returns:
|
|
1301
|
+
A copy of the input dataframe with the added
|
|
1302
|
+
"Modification localization string" column.
|
|
1303
|
+
"""
|
|
1304
|
+
new_df = df.copy()
|
|
1305
|
+
_search_tag = " Best Localization"
|
|
1306
|
+
mod_localization_columns = [
|
|
1307
|
+
c.strip(_search_tag) for c in new_df.columns if c.endswith(_search_tag)
|
|
1308
|
+
]
|
|
1309
|
+
if not mod_localization_columns:
|
|
1310
|
+
new_df["Modification localization string"] = ""
|
|
1311
|
+
return new_df
|
|
1312
|
+
|
|
1313
|
+
df[mod_localization_columns] = (
|
|
1314
|
+
df[mod_localization_columns].astype(str).replace("nan", "")
|
|
1315
|
+
)
|
|
1316
|
+
row_mod_probabilities: list[dict[str, dict[int, float]]] = [
|
|
1317
|
+
{} for i in range(df.shape[0])
|
|
1318
|
+
]
|
|
1319
|
+
for mod_localization_column in mod_localization_columns:
|
|
1320
|
+
modification = mod_localization_column.split(":")[1]
|
|
1321
|
+
for modification_probabilities, probability_sequence in zip(
|
|
1322
|
+
row_mod_probabilities, df[mod_localization_column]
|
|
1323
|
+
):
|
|
1324
|
+
if not probability_sequence:
|
|
1325
|
+
continue
|
|
1326
|
+
_, probabilities = msreport.peptidoform.parse_modified_sequence(
|
|
1327
|
+
probability_sequence, "(", ")"
|
|
1328
|
+
)
|
|
1329
|
+
modification_probabilities[modification] = {
|
|
1330
|
+
site: float(probability) for site, probability in probabilities
|
|
1331
|
+
}
|
|
1332
|
+
|
|
1333
|
+
localization_strings = []
|
|
1334
|
+
for localization_probabilities in row_mod_probabilities:
|
|
1335
|
+
localization_string = msreport.peptidoform.make_localization_string(
|
|
1336
|
+
localization_probabilities
|
|
1337
|
+
)
|
|
1338
|
+
localization_strings.append(localization_string)
|
|
1339
|
+
new_df["Modification localization string"] = localization_strings
|
|
1340
|
+
return new_df
|
|
1341
|
+
|
|
1186
1342
|
|
|
1187
1343
|
class SpectronautReader(ResultReader):
|
|
1188
1344
|
"""Spectronaut result reader.
|
|
@@ -1499,6 +1655,7 @@ class SpectronautReader(ResultReader):
|
|
|
1499
1655
|
filename: Optional[str] = None,
|
|
1500
1656
|
filetag: Optional[str] = None,
|
|
1501
1657
|
rename_columns: bool = True,
|
|
1658
|
+
rewrite_modifications: bool = True,
|
|
1502
1659
|
) -> pd.DataFrame:
|
|
1503
1660
|
"""Reads an ion evidence file (long format) and returns a processed dataframe.
|
|
1504
1661
|
|
|
@@ -1508,8 +1665,15 @@ class SpectronautReader(ResultReader):
|
|
|
1508
1665
|
generated by concatenating the "Modified sequence" and "Charge" columns, and if
|
|
1509
1666
|
present, the "Compensation voltage" column.
|
|
1510
1667
|
|
|
1511
|
-
|
|
1512
|
-
are
|
|
1668
|
+
"Modified sequence" entries contain modifications within square brackets.
|
|
1669
|
+
"Modifications" entries are strings in the form of "position:modification_tag",
|
|
1670
|
+
multiple modifications are joined by ";". An example for a modified sequence and
|
|
1671
|
+
a modification entry: "PEPT[Phospho]IDO[Oxidation]", "4:Phospho;7:Oxidation".
|
|
1672
|
+
|
|
1673
|
+
"Modification localization string" contains localization probabilities in the
|
|
1674
|
+
format "Mod1@Site1:Probability1,Site2:Probability2;Mod2@Site3:Probability3",
|
|
1675
|
+
e.g. "15.9949@11:1.000;79.9663@3:0.200,4:0.800". Refer to
|
|
1676
|
+
`msreport.peptidoform.make_localization_string` for details.
|
|
1513
1677
|
|
|
1514
1678
|
Args:
|
|
1515
1679
|
filename: Optional, allows specifying a specific file that will be imported.
|
|
@@ -1517,6 +1681,10 @@ class SpectronautReader(ResultReader):
|
|
|
1517
1681
|
a substring, instead of specifying a filename.
|
|
1518
1682
|
rename_columns: If True, columns are renamed according to the MsReport
|
|
1519
1683
|
convention; default True.
|
|
1684
|
+
rewrite_modifications: If True, the peptide format in "Modified sequence" is
|
|
1685
|
+
changed according to the MsReport convention, and a "Modifications" column is
|
|
1686
|
+
added that contains the amino acid position for all modifications.
|
|
1687
|
+
Requires 'rename_columns' to be true. Default True.
|
|
1520
1688
|
|
|
1521
1689
|
Returns:
|
|
1522
1690
|
A dataframe containing the processed ion table.
|
|
@@ -1544,6 +1712,9 @@ class SpectronautReader(ResultReader):
|
|
|
1544
1712
|
df = self._add_protein_entries(df)
|
|
1545
1713
|
if rename_columns:
|
|
1546
1714
|
df = self._rename_columns(df, True)
|
|
1715
|
+
if rewrite_modifications and rename_columns:
|
|
1716
|
+
df = self._add_peptide_modification_entries(df)
|
|
1717
|
+
df = self._add_modification_localization_string(df)
|
|
1547
1718
|
df["Ion ID"] = df["Modified sequence"] + "_c" + df["Charge"].astype(str)
|
|
1548
1719
|
if "Compensation voltage" in df.columns:
|
|
1549
1720
|
_cv = df["Compensation voltage"].astype(str)
|
|
@@ -1597,6 +1768,70 @@ class SpectronautReader(ResultReader):
|
|
|
1597
1768
|
leading_protein_entries = df["PG.ProteinAccessions"].str.split(";").tolist()
|
|
1598
1769
|
return leading_protein_entries
|
|
1599
1770
|
|
|
1771
|
+
def _add_peptide_modification_entries(self, df: pd.DataFrame) -> pd.DataFrame:
    """Adds standardized "Modified sequence" and "Modifications" columns.

    "Modified sequence" entries contain modifications within square brackets.
    "Modifications" entries are strings in the form of "position:modification_text",
    multiple modifications are joined by ";". An example for a modified sequence and
    a modification entry: "PEPT[Phospho]IDO[Oxidation]", "4:Phospho;7:Oxidation".

    Both columns are taken from the result of `_generate_modification_entries`,
    which is called with "[" and "]" as modification tag delimiters.

    Args:
        df: Dataframe containing "Peptide sequence" and "Modified sequence" columns.

    Returns:
        A copy of the input dataframe with updated columns.
    """
    # TODO: not tested
    new_df = df.copy()
    # Drop the first and last character of each entry, which are expected to be
    # the surrounding "_" characters.
    trimmed_sequences = df["Modified sequence"].str[1:-1]
    modification_entries = _generate_modification_entries(
        df["Peptide sequence"], trimmed_sequences, "[", "]"
    )
    for column in ("Modified sequence", "Modifications"):
        new_df[column] = modification_entries[column]
    return new_df
|
|
1797
|
+
|
|
1798
|
+
def _add_modification_localization_string(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
1799
|
+
"""Adds modification localization string columns.
|
|
1800
|
+
|
|
1801
|
+
Extracts localization probabilities from the "EG.PTMLocalizationProbabilities"
|
|
1802
|
+
column, converts them into the standardized modification localization string
|
|
1803
|
+
format used by msreport, and adds new column "Modification localization string".
|
|
1804
|
+
|
|
1805
|
+
Probabilities are written in the format
|
|
1806
|
+
"Mod1@Site1:Probability1,Site2:Probability2;Mod2@Site3:Probability3",
|
|
1807
|
+
e.g. "15.9949@11:1.000;79.9663@3:0.200,4:0.800". Refer to
|
|
1808
|
+
`msreport.peptidoform.make_localization_string` for details.
|
|
1809
|
+
|
|
1810
|
+
Args:
|
|
1811
|
+
df: Dataframe containing a "EG.PTMLocalizationProbabilities" column.
|
|
1812
|
+
|
|
1813
|
+
Returns:
|
|
1814
|
+
A copy of the input dataframe with the added column
|
|
1815
|
+
"Modification localization string".
|
|
1816
|
+
"""
|
|
1817
|
+
# TODO: not tested
|
|
1818
|
+
new_df = df.copy()
|
|
1819
|
+
localization_strings = []
|
|
1820
|
+
for localization_entry in new_df["EG.PTMLocalizationProbabilities"]:
|
|
1821
|
+
if localization_entry == "":
|
|
1822
|
+
localization_strings.append("")
|
|
1823
|
+
continue
|
|
1824
|
+
|
|
1825
|
+
localization_probabilities = extract_spectronaut_localization_probabilities(
|
|
1826
|
+
localization_entry
|
|
1827
|
+
)
|
|
1828
|
+
localization_string = msreport.peptidoform.make_localization_string(
|
|
1829
|
+
localization_probabilities
|
|
1830
|
+
)
|
|
1831
|
+
localization_strings.append(localization_string)
|
|
1832
|
+
new_df["Modification localization string"] = localization_strings
|
|
1833
|
+
return new_df
|
|
1834
|
+
|
|
1600
1835
|
|
|
1601
1836
|
def sort_leading_proteins(
|
|
1602
1837
|
table: pd.DataFrame,
|
|
@@ -1639,7 +1874,7 @@ def sort_leading_proteins(
|
|
|
1639
1874
|
db_origins_present = "Leading proteins database origin" in table
|
|
1640
1875
|
|
|
1641
1876
|
if database_order is not None:
|
|
1642
|
-
database_encoding = defaultdict(lambda: 999)
|
|
1877
|
+
database_encoding: dict[str, int] = defaultdict(lambda: 999)
|
|
1643
1878
|
database_encoding.update({db: i for i, db in enumerate(database_order)})
|
|
1644
1879
|
if penalize_contaminants is not None:
|
|
1645
1880
|
contaminant_encoding = {"False": 0, "True": 1, False: 0, True: 1}
|
|
@@ -1647,7 +1882,7 @@ def sort_leading_proteins(
|
|
|
1647
1882
|
for _, row in table.iterrows():
|
|
1648
1883
|
protein_ids = row["Leading proteins"].split(";")
|
|
1649
1884
|
|
|
1650
|
-
sorting_info = [[] for _ in protein_ids]
|
|
1885
|
+
sorting_info: list[list] = [[] for _ in protein_ids]
|
|
1651
1886
|
if special_proteins is not None:
|
|
1652
1887
|
for i, _id in enumerate(protein_ids):
|
|
1653
1888
|
sorting_info[i].append(_id not in special_proteins)
|
|
@@ -1787,7 +2022,7 @@ def add_protein_site_annotation(
|
|
|
1787
2022
|
protein_db: ProteinDatabase,
|
|
1788
2023
|
protein_column: str = "Representative protein",
|
|
1789
2024
|
site_column: str = "Protein site",
|
|
1790
|
-
):
|
|
2025
|
+
) -> pd.DataFrame:
|
|
1791
2026
|
"""Uses a FASTA protein database to add protein site annotation columns.
|
|
1792
2027
|
|
|
1793
2028
|
Adds the columns "Modified residue", which corresponds to the amino acid at the
|
|
@@ -1925,6 +2160,61 @@ def add_leading_proteins_annotation(
|
|
|
1925
2160
|
return table
|
|
1926
2161
|
|
|
1927
2162
|
|
|
2163
|
+
def add_protein_site_identifiers(
    table: pd.DataFrame,
    protein_db: ProteinDatabase,
    site_column: str,
    protein_name_column: str,
) -> None:
    """Adds a "Protein site identifier" column to the 'table'.

    The "Protein site identifier" is generated by concatenating the protein name
    with the amino acid and position of the protein site or sites, e.g. "P12345 - S123"
    or "P12345 - S123 / T125". The amino acid is extracted from the protein sequence at
    the position of the site. If the protein name is not available (missing value or
    empty string), the "Representative protein" entry is used instead.

    The 'table' is modified in place; nothing is returned.

    Args:
        table: Dataframe to which the protein site identifiers are added.
        protein_db: A protein database containing entries from one or multiple FASTA
            files. Protein identifiers in the 'table' column "Representative protein"
            are used to look up entries in the 'protein_db'.
        site_column: Column in 'table' that contains protein site positions. Positions
            are one-indexed, meaning the first amino acid of the protein is position 1.
            Multiple sites in a single entry should be separated by ";".
        protein_name_column: Column in 'table' that contains protein names, which will
            be used to generate the identifier. If no name is available, the accession
            is used instead.

    Raises:
        ValueError: If the "Representative protein", 'protein_name_column' or
            'site_column' is not found in the 'table'.
    """
    for required_column in (site_column, protein_name_column, "Representative protein"):
        if required_column not in table.columns:
            raise ValueError(f"Column '{required_column}' not found in the table.")

    site_identifiers = []
    for accession, sites, name in zip(
        table["Representative protein"],
        table[site_column].astype(str),
        table[protein_name_column],
    ):
        protein_sequence = protein_db[accession].sequence
        # Missing names are usually stored as NaN/None, and NaN is truthy; check for
        # them explicitly so that they fall back to the accession as well.
        name_is_missing = pd.isna(name) or not name
        protein_identifier = accession if name_is_missing else name
        # Site positions are one-indexed, sequence indexing is zero-based.
        aa_sites = [
            f"{protein_sequence[int(site) - 1]}{site}" for site in sites.split(";")
        ]
        aa_site_tag = " / ".join(aa_sites)
        site_identifiers.append(f"{protein_identifier} - {aa_site_tag}")
    table["Protein site identifier"] = site_identifiers
|
|
2216
|
+
|
|
2217
|
+
|
|
1928
2218
|
def add_sequence_coverage(
|
|
1929
2219
|
protein_table: pd.DataFrame,
|
|
1930
2220
|
peptide_table: pd.DataFrame,
|
|
@@ -2384,7 +2674,9 @@ def _extract_fragpipe_assigned_modifications(
|
|
|
2384
2674
|
return modifications
|
|
2385
2675
|
|
|
2386
2676
|
|
|
2387
|
-
def extract_maxquant_localization_probabilities(
|
|
2677
|
+
def extract_maxquant_localization_probabilities(
|
|
2678
|
+
localization_entry: str,
|
|
2679
|
+
) -> dict[int, float]:
|
|
2388
2680
|
"""Extract localization probabilities from a MaxQuant "Probabilities" entry.
|
|
2389
2681
|
|
|
2390
2682
|
Args:
|
|
@@ -2441,6 +2733,39 @@ def extract_fragpipe_localization_probabilities(localization_entry: str) -> dict
|
|
|
2441
2733
|
return modification_probabilities
|
|
2442
2734
|
|
|
2443
2735
|
|
|
2736
|
+
def extract_spectronaut_localization_probabilities(localization_entry: str) -> dict:
    """Extract localization probabilities from a Spectronaut localization entry.

    Leading and trailing "_" characters are removed from the entry before it is
    parsed with `msreport.peptidoform.parse_modified_sequence`, using "[" and "]" as
    tag delimiters. Each tag is split at ": " into the modification name and a
    percentage, which is converted into a fraction.

    Args:
        localization_entry: Entry from the "EG.PTMLocalizationProbabilities" column of a
            spectronaut elution group (EG) output table.

    Returns:
        A dictionary of modifications containing a dictionary of {position: probability}
        mappings. Positions are one-indexed, which means that the first amino acid
        position is 1.

    Example:
        >>> extract_spectronaut_localization_probabilities(
        ...     "_HM[Oxidation (M): 100%]S[Phospho (STY): 45.5%]GS[Phospho (STY): 54.5%]PG_"
        ... )
        {'Oxidation (M)': {2: 1.0}, 'Phospho (STY)': {3: 0.455, 5: 0.545}}
    """
    probabilities_by_modification: dict[str, dict[int, float]] = {}
    _, tagged_sites = msreport.peptidoform.parse_modified_sequence(
        localization_entry.strip("_"), "[", "]"
    )

    for site, tag in tagged_sites:
        modification, percent_text = tag.split(": ")
        site_probabilities = probabilities_by_modification.setdefault(modification, {})
        # Percentages such as "45.5%" are stored as fractions, e.g. 0.455.
        site_probabilities[site] = float(percent_text.replace("%", "")) / 100.0
    return probabilities_by_modification
|
|
2767
|
+
|
|
2768
|
+
|
|
2444
2769
|
def _extract_protein_ids(entries: list[str]) -> list[str]:
|
|
2445
2770
|
"""Returns a list of protein IDs, extracted from protein entries.
|
|
2446
2771
|
|
|
@@ -2554,8 +2879,8 @@ def _create_multi_protein_annotations_from_db(
|
|
|
2554
2879
|
query_result.append(query_function(db_entry, default_value))
|
|
2555
2880
|
else:
|
|
2556
2881
|
query_result.append(default_value)
|
|
2557
|
-
|
|
2558
|
-
annotation_values.append(
|
|
2882
|
+
annotation_value = ";".join(map(str, query_result))
|
|
2883
|
+
annotation_values.append(annotation_value)
|
|
2559
2884
|
return annotation_values
|
|
2560
2885
|
|
|
2561
2886
|
|
msreport/rinterface/limma.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: msreport
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.32
|
|
4
4
|
Summary: Post processing and analysis of quantitative proteomics data
|
|
5
5
|
Author-email: "David M. Hollenstein" <hollenstein.david@gmail.com>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
7
7
|
Project-URL: homepage, https://github.com/hollenstein/msreport
|
|
8
|
+
Project-URL: documentation, https://hollenstein.github.io/msreport/
|
|
8
9
|
Project-URL: changelog, https://github.com/hollenstein/msreport/blob/main/CHANGELOG.md
|
|
9
10
|
Keywords: mass spectrometry,proteomics,post processing,data analysis
|
|
10
11
|
Classifier: Development Status :: 4 - Beta
|
|
@@ -33,6 +34,13 @@ Requires-Dist: rpy2<3.5.13,>=3.5.3; extra == "r"
|
|
|
33
34
|
Provides-Extra: dev
|
|
34
35
|
Requires-Dist: mypy>=1.15.0; extra == "dev"
|
|
35
36
|
Requires-Dist: pytest>=8.3.5; extra == "dev"
|
|
37
|
+
Provides-Extra: docs
|
|
38
|
+
Requires-Dist: mkdocs-awesome-nav>=3.1.2; extra == "docs"
|
|
39
|
+
Requires-Dist: mkdocs-macros-plugin>=1.3.7; extra == "docs"
|
|
40
|
+
Requires-Dist: mkdocs-material>=9.6.15; extra == "docs"
|
|
41
|
+
Requires-Dist: mkdocs-roamlinks-plugin>=0.3.2; extra == "docs"
|
|
42
|
+
Requires-Dist: mkdocstrings-python>=1.16.12; extra == "docs"
|
|
43
|
+
Requires-Dist: ruff>=0.12.2; extra == "docs"
|
|
36
44
|
Provides-Extra: test
|
|
37
45
|
Requires-Dist: pytest>=8.3.5; extra == "test"
|
|
38
46
|
Dynamic: license-file
|
|
@@ -64,6 +72,8 @@ MsReport is a Python library designed to simplify the post-processing and analys
|
|
|
64
72
|
|
|
65
73
|
The library supports importing protein and peptide-level quantification results from MaxQuant, FragPipe, and Spectronaut, as well as post-translational modification (PTM) data from MaxQuant and FragPipe. MsReport provides tools for data annotation, normalization and transformation, statistical testing, and data visualization.
|
|
66
74
|
|
|
75
|
+
The [documentation](https://hollenstein.github.io/msreport/) provides an overview of the library's public API.
|
|
76
|
+
|
|
67
77
|
### Key features of MsReport
|
|
68
78
|
|
|
69
79
|
#### Data Import and Standardization
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
msreport/__init__.py,sha256=hmq4---v9oHxQm9gidnxGryrWB8HqPfMPHaPryBS_Oc,339
|
|
2
|
+
msreport/analyze.py,sha256=T6ORhBYP3Qnil0r7qF5CkwS2KHUsedpU5P-0paqUmaA,33838
|
|
3
|
+
msreport/errors.py,sha256=X9yFxMiIOCWQdxuqBGr8L7O3vRV2KElXdX1uHbFcZMk,421
|
|
4
|
+
msreport/export.py,sha256=wXQfaVd5UHlGKyKdrt2UWbhzNf-VyJy2Up5qfrPzO2M,20229
|
|
5
|
+
msreport/fasta.py,sha256=hPz4xlkjeTV-2YCrtWMsQQJSkJSmH1ZzNZBxHI89Nqk,1489
|
|
6
|
+
msreport/impute.py,sha256=q21cFKnpENE4GHUPz-R5FipkvagWjX4fa31qeb8uaxc,10782
|
|
7
|
+
msreport/isobar.py,sha256=nh2Wem1wheqJ6wAJYm8be9FuK21c7T1k7nectJjPw7o,6729
|
|
8
|
+
msreport/normalize.py,sha256=73n344jBQ9u-Ube_wOxF5Svi2ltKMnBKaw8M36hEaQM,23441
|
|
9
|
+
msreport/peptidoform.py,sha256=mJhqoolFL6ZzwnmQkWhgJn8zIBoxv_GdYVSb-6gw37g,12615
|
|
10
|
+
msreport/qtable.py,sha256=RhfGdij7cIVO5JiUC-xSQkd7zV-Q8KmC94daA9JotHc,28203
|
|
11
|
+
msreport/reader.py,sha256=02cst1NRyBoeBaspfM67BM_KsTR9pt1NZQX49J_Wev0,131276
|
|
12
|
+
msreport/aggregate/__init__.py,sha256=Y5HnN9C2PRjWfq4epJAoNqyp4Pv6WQfguAcSYKIhRuw,609
|
|
13
|
+
msreport/aggregate/condense.py,sha256=fspY8osQfjzzehw3v4Up2QSihNiixhQpAiCiwXLIpCQ,6301
|
|
14
|
+
msreport/aggregate/pivot.py,sha256=Myk9QhOmQWge7MvGlFYwdD4u7pdqYaAaFZ0uxZH4d28,5491
|
|
15
|
+
msreport/aggregate/summarize.py,sha256=_KbSuLS3rRxIMpoIXfPyC2--5sACV9NsivbS0BPFr9o,12736
|
|
16
|
+
msreport/helper/__init__.py,sha256=IG4xaP_iIugqBLUpDHMj-SbD2_elL5on_V4whLIQTbM,1003
|
|
17
|
+
msreport/helper/calc.py,sha256=J4XltEnMrFR9IQlPtrZhyxlSTj15072huHCMA_nqQ6E,4245
|
|
18
|
+
msreport/helper/maxlfq.py,sha256=kFm3hRNWntM067EuoSrO_x-i5YNXphBfrvssMA3OM1g,14947
|
|
19
|
+
msreport/helper/table.py,sha256=x-Wo8mTENsUxc_gtF-wgOyQa9g7W2fK6tuRiEX7bda0,11430
|
|
20
|
+
msreport/helper/temp.py,sha256=jNulgDATf9sKXEFWMXAhjflciOZPAqlxg_7QZS7IkW8,3736
|
|
21
|
+
msreport/plot/__init__.py,sha256=p-oLxmZIvfC--xkjB0ka321xddW-lst19PmokJq9lTk,1457
|
|
22
|
+
msreport/plot/_partial_plots.py,sha256=tqZTSXEPuruMgVakaGR2tUQl5OrHgo2cROJ0S4cqkR0,5598
|
|
23
|
+
msreport/plot/comparison.py,sha256=Y2KOuakj-TxqdT2XNt7lnVZwimKSszvFQI-K9Pm80k8,18770
|
|
24
|
+
msreport/plot/distribution.py,sha256=QNFL5vG9p-vqhwEk5WcCSXa2B8u5QgySZlAQIPys0-0,10248
|
|
25
|
+
msreport/plot/multivariate.py,sha256=v79gcb-8s5bZVpaJn13MOmqsNA0ZvrV25JlXmHmp4WA,14046
|
|
26
|
+
msreport/plot/quality.py,sha256=ZZKMkghmVESjA49Qg-iukVFBoDIgI2iWLlFa7vJWX7M,15869
|
|
27
|
+
msreport/plot/style.py,sha256=67jWf4uA1ub9RJDu4xhuSoXAW0lbLj6SMP4QXQO76Pc,10591
|
|
28
|
+
msreport/plot/style_sheets/msreport-notebook.mplstyle,sha256=SPYO_7vYT8Ha7tQ0KCTLtykiRQ13-_igAm7kyvsZj1I,1266
|
|
29
|
+
msreport/plot/style_sheets/seaborn-whitegrid.mplstyle,sha256=eC8Zboy8R7ybBwbHPKvKbMIHACystN6X6I0lqm7B80U,833
|
|
30
|
+
msreport/rinterface/__init__.py,sha256=Zs6STvbDqaVZVPRM6iU0kKjq0TWz_2p2ChvNAveRdTA,616
|
|
31
|
+
msreport/rinterface/limma.py,sha256=P-Fs8HARSXz60rO_vLc--of1hafk_IgGgPaNXnS_aKg,5424
|
|
32
|
+
msreport/rinterface/rinstaller.py,sha256=AGs6NFMSwTLrzrIJz1E5BE5jFUz8eQBHlpM_MWVChzA,1370
|
|
33
|
+
msreport/rinterface/rscripts/limma.R,sha256=gr_yjMm_YoG45irDhWOo6gkRQSTwj_7uU_p3NBRHPm8,4331
|
|
34
|
+
msreport-0.0.32.dist-info/licenses/LICENSE.txt,sha256=Pd-b5cKP4n2tFDpdx27qJSIq0d1ok0oEcGTlbtL6QMU,11560
|
|
35
|
+
msreport-0.0.32.dist-info/METADATA,sha256=_OI-LkqJoperzDBo6KjAir7Xq6jANqyjpqyxUfu9T-4,8998
|
|
36
|
+
msreport-0.0.32.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
37
|
+
msreport-0.0.32.dist-info/top_level.txt,sha256=Drl8mCckJHFIw-Ovh5AnyjKnqvLJltDOBUr1JAcHAlI,9
|
|
38
|
+
msreport-0.0.32.dist-info/RECORD,,
|