msreport 0.0.27__py3-none-any.whl → 0.0.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- msreport/__init__.py +4 -6
- msreport/aggregate/condense.py +1 -1
- msreport/aggregate/pivot.py +1 -0
- msreport/aggregate/summarize.py +2 -2
- msreport/analyze.py +117 -36
- msreport/errors.py +5 -2
- msreport/export.py +16 -13
- msreport/fasta.py +2 -1
- msreport/helper/__init__.py +7 -7
- msreport/helper/calc.py +14 -15
- msreport/helper/maxlfq.py +2 -2
- msreport/helper/table.py +5 -6
- msreport/impute.py +4 -3
- msreport/isobar.py +10 -9
- msreport/normalize.py +2 -1
- msreport/peptidoform.py +6 -4
- msreport/plot/__init__.py +41 -0
- msreport/plot/_partial_plots.py +159 -0
- msreport/plot/comparison.py +490 -0
- msreport/plot/distribution.py +253 -0
- msreport/plot/multivariate.py +355 -0
- msreport/plot/quality.py +431 -0
- msreport/plot/style.py +286 -0
- msreport/plot/style_sheets/msreport-notebook.mplstyle +57 -0
- msreport/plot/style_sheets/seaborn-whitegrid.mplstyle +45 -0
- msreport/qtable.py +109 -17
- msreport/reader.py +235 -86
- msreport/rinterface/__init__.py +16 -3
- msreport/rinterface/limma.py +2 -1
- msreport/rinterface/rinstaller.py +3 -3
- msreport-0.0.29.dist-info/METADATA +136 -0
- msreport-0.0.29.dist-info/RECORD +38 -0
- {msreport-0.0.27.dist-info → msreport-0.0.29.dist-info}/WHEEL +1 -1
- msreport/plot.py +0 -1134
- msreport-0.0.27.dist-info/METADATA +0 -129
- msreport-0.0.27.dist-info/RECORD +0 -30
- {msreport-0.0.27.dist-info → msreport-0.0.29.dist-info}/licenses/LICENSE.txt +0 -0
- {msreport-0.0.27.dist-info → msreport-0.0.29.dist-info}/top_level.txt +0 -0
msreport/reader.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Module for reading result tables from various MS analysis tools and converting them
|
|
2
2
|
to a standardized format following the MsReport convention.
|
|
3
3
|
|
|
4
4
|
Currently for MaxQuant and FragPipe protein, peptide, and ion tables are supported, and
|
|
@@ -20,19 +20,19 @@ Unified column names:
|
|
|
20
20
|
- iBAQ intensity "sample name"
|
|
21
21
|
"""
|
|
22
22
|
|
|
23
|
-
from collections import OrderedDict, defaultdict
|
|
24
23
|
import os
|
|
25
|
-
from typing import Any, Callable, Iterable, Optional, Protocol
|
|
26
24
|
import pathlib
|
|
27
25
|
import warnings
|
|
26
|
+
from collections import OrderedDict, defaultdict
|
|
27
|
+
from typing import Any, Callable, Iterable, Optional, Protocol
|
|
28
28
|
|
|
29
29
|
import numpy as np
|
|
30
30
|
import pandas as pd
|
|
31
31
|
|
|
32
32
|
import msreport.helper as helper
|
|
33
|
-
from msreport.helper.temp import extract_window_around_position
|
|
34
|
-
from msreport.errors import ProteinsNotInFastaWarning
|
|
35
33
|
import msreport.peptidoform
|
|
34
|
+
from msreport.errors import ProteinsNotInFastaWarning
|
|
35
|
+
from msreport.helper.temp import extract_window_around_position
|
|
36
36
|
|
|
37
37
|
|
|
38
38
|
class Protein(Protocol):
|
|
@@ -54,6 +54,8 @@ class ProteinDatabase(Protocol):
|
|
|
54
54
|
class ResultReader:
|
|
55
55
|
"""Base Reader class, is by itself not functional."""
|
|
56
56
|
|
|
57
|
+
data_directory: str
|
|
58
|
+
filenames: dict[str, str]
|
|
57
59
|
default_filenames: dict[str, str]
|
|
58
60
|
protected_columns: list[str]
|
|
59
61
|
column_mapping: dict[str, str]
|
|
@@ -61,8 +63,8 @@ class ResultReader:
|
|
|
61
63
|
sample_column_tags: list[str]
|
|
62
64
|
|
|
63
65
|
def __init__(self):
|
|
64
|
-
self.data_directory
|
|
65
|
-
self.filenames
|
|
66
|
+
self.data_directory = ""
|
|
67
|
+
self.filenames = {}
|
|
66
68
|
|
|
67
69
|
def _read_file(self, which: str, sep: str = "\t") -> pd.DataFrame:
|
|
68
70
|
"""Read a result table.
|
|
@@ -183,18 +185,16 @@ class MaxQuantReader(ResultReader):
|
|
|
183
185
|
"MS/MS count",
|
|
184
186
|
"Sequence coverage",
|
|
185
187
|
]
|
|
186
|
-
column_mapping: dict[str, str] =
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
]
|
|
197
|
-
)
|
|
188
|
+
column_mapping: dict[str, str] = {
|
|
189
|
+
"Peptides": "Total peptides",
|
|
190
|
+
"Sequence coverage [%]": "Sequence coverage",
|
|
191
|
+
"MS/MS count": "Spectral count Combined", # proteinGroups, evidence
|
|
192
|
+
"MS/MS Count": "Spectral count Combined", # peptides
|
|
193
|
+
"Sequence": "Peptide sequence", # peptides, evidence
|
|
194
|
+
"Sequence length": "Protein length",
|
|
195
|
+
"Mol. weight [kDa]": "Molecular weight [kDa]",
|
|
196
|
+
"Experiment": "Sample",
|
|
197
|
+
}
|
|
198
198
|
column_tag_mapping: OrderedDict[str, str] = OrderedDict(
|
|
199
199
|
[("MS/MS count", "Spectral count"), ("iBAQ", "iBAQ intensity")]
|
|
200
200
|
)
|
|
@@ -343,7 +343,9 @@ class MaxQuantReader(ResultReader):
|
|
|
343
343
|
Adds new columns to comply with the MsReport convention. "Modified sequence",
|
|
344
344
|
"Modifications columns", "Modification localization string". "Protein reported
|
|
345
345
|
by software" and "Representative protein", both contain the first entry from
|
|
346
|
-
"Leading razor protein".
|
|
346
|
+
"Leading razor protein". "Ion ID" contains unique entries for each ion, which
|
|
347
|
+
are generated by concatenating the "Modified sequence" and "Charge" columns, and
|
|
348
|
+
if present, the "Compensation voltage" column.
|
|
347
349
|
|
|
348
350
|
"Modified sequence" entries contain modifications within square brackets.
|
|
349
351
|
"Modification" entries are strings in the form of "position:modification_tag",
|
|
@@ -376,15 +378,19 @@ class MaxQuantReader(ResultReader):
|
|
|
376
378
|
df["Leading razor protein"]
|
|
377
379
|
)
|
|
378
380
|
df["Representative protein"] = df["Protein reported by software"]
|
|
381
|
+
|
|
379
382
|
if drop_decoy:
|
|
380
383
|
df = self._drop_decoy(df)
|
|
381
384
|
if rename_columns:
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
) # Actually there are no column tags as the table is in long format
|
|
385
|
+
# Actually there are no column tags as the table is in long format
|
|
386
|
+
df = self._rename_columns(df, prefix_tag=True)
|
|
385
387
|
if rewrite_modifications and rename_columns:
|
|
386
388
|
df = self._add_peptide_modification_entries(df)
|
|
387
389
|
df = self._add_modification_localization_string(df)
|
|
390
|
+
df["Ion ID"] = df["Modified sequence"] + "_c" + df["Charge"].astype(str)
|
|
391
|
+
if "Compensation voltage" in df.columns:
|
|
392
|
+
_cv = df["Compensation voltage"].astype(str)
|
|
393
|
+
df["Ion ID"] = df["Ion ID"] + "_cv" + _cv
|
|
388
394
|
return df
|
|
389
395
|
|
|
390
396
|
def _add_protein_entries(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -576,6 +582,7 @@ class FragPipeReader(ResultReader):
|
|
|
576
582
|
"peptides": "combined_peptide.tsv",
|
|
577
583
|
"ions": "combined_ion.tsv",
|
|
578
584
|
"ion_evidence": "ion.tsv",
|
|
585
|
+
"psm_evidence": "psm.tsv",
|
|
579
586
|
}
|
|
580
587
|
isobar_filenames: dict[str, str] = {
|
|
581
588
|
"proteins": "protein.tsv",
|
|
@@ -590,20 +597,25 @@ class FragPipeReader(ResultReader):
|
|
|
590
597
|
"Intensity",
|
|
591
598
|
"MaxLFQ Intensity",
|
|
592
599
|
]
|
|
593
|
-
column_mapping: dict[str, str] =
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
600
|
+
column_mapping: dict[str, str] = {
|
|
601
|
+
"Peptide": "Peptide sequence", # PSM
|
|
602
|
+
"Modified Peptide": "Modified sequence", # PSM
|
|
603
|
+
"Protein Start": "Start position", # PSM
|
|
604
|
+
"Protein End": "End position", # PSM
|
|
605
|
+
"Number of Missed Cleavages": "Missed cleavage", # PSM
|
|
606
|
+
"PeptideProphet Probability": "Probability", # PSM
|
|
607
|
+
"Compensation Voltage": "Compensation voltage", # PSM and ion
|
|
608
|
+
"Peptide Sequence": "Peptide sequence", # Peptide and ion
|
|
609
|
+
"Modified Sequence": "Modified sequence", # Modified peptide and ion
|
|
610
|
+
"Start": "Start position", # Peptide and ion
|
|
611
|
+
"End": "End position", # Peptide and ion
|
|
612
|
+
"Combined Total Peptides": "Total peptides", # From LFQ
|
|
613
|
+
"Total Peptides": "Total peptides", # From TMT
|
|
614
|
+
"Description": "Protein name",
|
|
615
|
+
"Protein Length": "Protein length",
|
|
616
|
+
"Entry Name": "Protein entry name",
|
|
617
|
+
"Gene": "Gene name",
|
|
618
|
+
}
|
|
607
619
|
column_tag_mapping: OrderedDict[str, str] = OrderedDict(
|
|
608
620
|
[
|
|
609
621
|
("MaxLFQ Intensity", "LFQ intensity"),
|
|
@@ -743,7 +755,10 @@ class FragPipeReader(ResultReader):
|
|
|
743
755
|
|
|
744
756
|
Adds new columns to comply with the MsReport convention. "Modified sequence"
|
|
745
757
|
and "Modifications columns". "Protein reported by software" and "Representative
|
|
746
|
-
protein", both contain the first entry from "Leading razor protein".
|
|
758
|
+
protein", both contain the first entry from "Leading razor protein". "Ion ID"
|
|
759
|
+
contains unique entries for each ion, which are generated by concatenating the
|
|
760
|
+
"Modified sequence" and "Charge" columns, and if present, the
|
|
761
|
+
"Compensation voltage" column.
|
|
747
762
|
|
|
748
763
|
"Modified sequence" entries contain modifications within square brackets.
|
|
749
764
|
"Modification" entries are strings in the form of "position:modification_text",
|
|
@@ -783,6 +798,11 @@ class FragPipeReader(ResultReader):
|
|
|
783
798
|
if rewrite_modifications and rename_columns:
|
|
784
799
|
df = self._add_peptide_modification_entries(df)
|
|
785
800
|
df = self._add_modification_localization_string(df, prefix_column_tags)
|
|
801
|
+
df["Ion ID"] = df["Modified sequence"] + "_c" + df["Charge"].astype(str)
|
|
802
|
+
if "Compensation voltage" in df.columns:
|
|
803
|
+
_cv = df["Compensation voltage"].astype(str)
|
|
804
|
+
df["Ion ID"] = df["Ion ID"] + "_cv" + _cv
|
|
805
|
+
|
|
786
806
|
return df
|
|
787
807
|
|
|
788
808
|
def import_ion_evidence(
|
|
@@ -797,7 +817,9 @@ class FragPipeReader(ResultReader):
|
|
|
797
817
|
Adds new columns to comply with the MsReport convention. "Modified sequence",
|
|
798
818
|
"Modifications", and "Modification localization string" columns. "Protein
|
|
799
819
|
reported by software" and "Representative protein", both contain the first entry
|
|
800
|
-
from "Leading razor protein".
|
|
820
|
+
from "Leading razor protein". "Ion ID" contains unique entries for each ion,
|
|
821
|
+
which are generated by concatenating the "Modified sequence" and "Charge"
|
|
822
|
+
columns, and if present, the "Compensation voltage" column.
|
|
801
823
|
|
|
802
824
|
"Modified sequence" entries contain modifications within square brackets.
|
|
803
825
|
"Modification" entries are strings in the form of "position:modification_text",
|
|
@@ -850,6 +872,9 @@ class FragPipeReader(ResultReader):
|
|
|
850
872
|
df = pd.concat(ion_tables, ignore_index=True)
|
|
851
873
|
|
|
852
874
|
# --- Process dataframe --- #
|
|
875
|
+
df["Ion ID"] = df["Modified Sequence"] + "_c" + df["Charge"].astype(str)
|
|
876
|
+
if "Compensation Voltage" in df.columns:
|
|
877
|
+
df["Ion ID"] = df["Ion ID"] + "_cv" + df["Compensation Voltage"].astype(str)
|
|
853
878
|
# FUTURE: replace this by _add_protein_entries(df, False) if FragPipe adds
|
|
854
879
|
# 'Indistinguishable Proteins' to the ion table.
|
|
855
880
|
df["Protein reported by software"] = _extract_protein_ids(df["Protein"])
|
|
@@ -861,6 +886,76 @@ class FragPipeReader(ResultReader):
|
|
|
861
886
|
df = self._add_modification_localization_string(df, prefix_column_tags)
|
|
862
887
|
return df
|
|
863
888
|
|
|
889
|
+
def import_psm_evidence(
    self,
    filename: Optional[str] = None,
    rename_columns: bool = True,
    rewrite_modifications: bool = True,
) -> pd.DataFrame:
    """Concatenate all "psm.tsv" files and return a processed dataframe.

    Every direct subdirectory of the data directory that contains a file named
    'filename' contributes one table. The name of that subdirectory is written to
    the "Sample" column of the corresponding rows.

    Adds new columns to comply with the MsReport convention. "Protein reported by
    software" and "Representative protein" both contain the protein ID extracted
    from the "Protein" column. "Mapped proteins" contains the representative
    protein followed by the protein IDs extracted from the FragPipe
    "Mapped Proteins" column, joined with ";".

    Args:
        filename: Allows specifying an alternative filename, otherwise the default
            filename is used.
        rename_columns: If True, columns are renamed according to the MsReport
            convention; default True.
        rewrite_modifications: If True, the peptide format in "Modified sequence" is
            changed according to the MsReport convention, and a "Modifications"
            column is added that contains the amino acid position of all
            modifications. Requires 'rename_columns' to be true. Default True.

    Returns:
        A DataFrame containing the processed psm evidence tables.

    Raises:
        ValueError: If no subdirectory contains a file named 'filename'; raised by
            pandas when there are no tables to concatenate.
    """
    if filename is None:
        filename = self.default_filenames["psm_evidence"]

    # Collect "<data_directory>/<sample directory>/<filename>" for each sample.
    psm_table_paths = []
    for path in pathlib.Path(self.data_directory).iterdir():
        psm_table_path = path / filename
        if path.is_dir() and psm_table_path.exists():
            psm_table_paths.append(psm_table_path)

    psm_tables = []
    for filepath in psm_table_paths:
        table = pd.read_csv(filepath, sep="\t", low_memory=False)
        # Missing values in text columns are represented as empty strings.
        str_cols = table.select_dtypes(include=["object"]).columns
        table.loc[:, str_cols] = table.loc[:, str_cols].fillna("")

        table["Sample"] = filepath.parent.name
        psm_tables.append(table)
    df = pd.concat(psm_tables, ignore_index=True)

    df["Protein reported by software"] = _extract_protein_ids(df["Protein"])
    df["Representative protein"] = df["Protein reported by software"]
    # A column without any text entry is not an object column and still holds NaN,
    # which becomes the string "nan" after the conversion to str.
    df["Mapped Proteins"] = df["Mapped Proteins"].astype(str).replace("nan", "")

    # FP only lists additional mapped proteins in the "Mapped Proteins" column
    # MsReport reports all matching proteins in the "Mapped proteins" column
    mapped_proteins_entries = []
    for protein, mapped_protein_fp in zip(
        df["Representative protein"], df["Mapped Proteins"], strict=True
    ):
        if mapped_protein_fp == "":
            mapped_proteins = [protein]
        else:
            # Use the module-level helper directly, as done for "Protein" above.
            additional_mapped_proteins = _extract_protein_ids(
                mapped_protein_fp.split(", ")
            )
            mapped_proteins = [protein] + additional_mapped_proteins
        mapped_proteins_entries.append(";".join(mapped_proteins))
    df["Mapped proteins"] = mapped_proteins_entries

    if rename_columns:
        df = self._rename_columns(df, prefix_tag=True)
    if rewrite_modifications and rename_columns:
        # "Peptide sequence" only exists after the columns have been renamed.
        mod_entries = _generate_modification_entries_from_assigned_modifications(
            df["Peptide sequence"], df["Assigned Modifications"]
        )
        df["Modified sequence"] = mod_entries["Modified sequence"]
        df["Modifications"] = mod_entries["Modifications"]
    return df
|
|
958
|
+
|
|
864
959
|
def _add_protein_entries(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
865
960
|
"""Adds standardized protein entry columns to the data frame.
|
|
866
961
|
|
|
@@ -1038,40 +1133,32 @@ class SpectronautReader(ResultReader):
|
|
|
1038
1133
|
"design": "conditionsetup",
|
|
1039
1134
|
}
|
|
1040
1135
|
protected_columns: list[str] = []
|
|
1041
|
-
column_mapping: dict[str, str] =
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
("FG.CalibratedMz", "Calibrated m/z"),
|
|
1068
|
-
# ("PG.ProteinAccessions", ""),
|
|
1069
|
-
# ("EG.HasLocalizationInformation", ""),
|
|
1070
|
-
# ("EG.PTMLocalizationProbabilities", ""),
|
|
1071
|
-
# ("EG.UsedForProteinGroupQuantity", ""),
|
|
1072
|
-
# Modified peptides need to be parsed and rewritten
|
|
1073
|
-
]
|
|
1074
|
-
)
|
|
1136
|
+
column_mapping: dict[str, str] = {
|
|
1137
|
+
"R.FileName": "Filename",
|
|
1138
|
+
"R.Label": "Sample",
|
|
1139
|
+
"PG.Qvalue": "Protein qvalue",
|
|
1140
|
+
"PG.Cscore": "Protein cscore",
|
|
1141
|
+
"PG.NrOfStrippedSequencesIdentified (Experiment-wide)": "Total peptides",
|
|
1142
|
+
"PG.NrOfPrecursorsIdentified (Experiment-wide)": "Total ions",
|
|
1143
|
+
"PEP.StrippedSequence": "Peptide sequence",
|
|
1144
|
+
"PEP.AllOccurringProteinAccessions": "Mapped proteins",
|
|
1145
|
+
"EG.ModifiedSequence": "Modified sequence",
|
|
1146
|
+
"EG.CompensationVoltage": "Compensation voltage",
|
|
1147
|
+
"EG.Qvalue": "Qvalue",
|
|
1148
|
+
"EG.ApexRT": "Apex retention time",
|
|
1149
|
+
"EG.DatapointsPerPeak": "Datapoints per peak",
|
|
1150
|
+
"EG.FWHM": "FWHM",
|
|
1151
|
+
"EG.SignalToNoise": "Signal to noise",
|
|
1152
|
+
"FG.FragmentCount": "Fragment count",
|
|
1153
|
+
"FG.Charge": "Charge",
|
|
1154
|
+
"FG.MS1Quantity": "MS1 intensity",
|
|
1155
|
+
"FG.MS1RawQuantity": "MS1 raw intensity",
|
|
1156
|
+
"FG.MS2Quantity": "MS2 intensity",
|
|
1157
|
+
"FG.MS2RawQuantity": "MS2 raw intensity",
|
|
1158
|
+
"FG.MeasuredMz": "Observed m/z",
|
|
1159
|
+
"FG.TheoreticalMz": "Theoretical m/z",
|
|
1160
|
+
"FG.CalibratedMz": "Calibrated m/z",
|
|
1161
|
+
}
|
|
1075
1162
|
sample_column_tags: list[str] = [
|
|
1076
1163
|
".PG.NrOfPrecursorsIdentified",
|
|
1077
1164
|
".PG.IBAQ",
|
|
@@ -1324,12 +1411,14 @@ class SpectronautReader(ResultReader):
|
|
|
1324
1411
|
filename: Optional[str] = None,
|
|
1325
1412
|
filetag: Optional[str] = None,
|
|
1326
1413
|
rename_columns: bool = True,
|
|
1327
|
-
) ->
|
|
1414
|
+
) -> pd.DataFrame:
|
|
1328
1415
|
"""Reads an ion evidence file (long format) and returns a processed dataframe.
|
|
1329
1416
|
|
|
1330
1417
|
Adds new columns to comply with the MsReport convention. "Protein reported
|
|
1331
1418
|
by software" and "Representative protein", both contain the first entry from
|
|
1332
|
-
"PG.ProteinAccessions".
|
|
1419
|
+
"PG.ProteinAccessions". "Ion ID" contains unique entries for each ion, which are
|
|
1420
|
+
generated by concatenating the "Modified sequence" and "Charge" columns, and if
|
|
1421
|
+
present, the "Compensation voltage" column.
|
|
1333
1422
|
|
|
1334
1423
|
(!) Note that the modified sequence and modification localization probabilities
|
|
1335
1424
|
are currently not processed.
|
|
@@ -1367,6 +1456,11 @@ class SpectronautReader(ResultReader):
|
|
|
1367
1456
|
df = self._add_protein_entries(df)
|
|
1368
1457
|
if rename_columns:
|
|
1369
1458
|
df = self._rename_columns(df, True)
|
|
1459
|
+
df["Ion ID"] = df["Modified sequence"] + "_c" + df["Charge"].astype(str)
|
|
1460
|
+
if "Compensation voltage" in df.columns:
|
|
1461
|
+
_cv = df["Compensation voltage"].astype(str)
|
|
1462
|
+
df["Ion ID"] = df["Ion ID"] + "_cv" + _cv
|
|
1463
|
+
|
|
1370
1464
|
return df
|
|
1371
1465
|
|
|
1372
1466
|
def _tidy_up_sample_columns(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -1462,7 +1556,7 @@ def sort_leading_proteins(
|
|
|
1462
1556
|
if penalize_contaminants is not None:
|
|
1463
1557
|
contaminant_encoding = {"False": 0, "True": 1, False: 0, True: 1}
|
|
1464
1558
|
|
|
1465
|
-
for
|
|
1559
|
+
for _, row in table.iterrows():
|
|
1466
1560
|
protein_ids = row["Leading proteins"].split(";")
|
|
1467
1561
|
|
|
1468
1562
|
sorting_info = [[] for _ in protein_ids]
|
|
@@ -1559,6 +1653,7 @@ def add_protein_annotation(
|
|
|
1559
1653
|
warnings.warn(
|
|
1560
1654
|
f"Some proteins could not be annotated: {repr(proteins_not_in_db)}",
|
|
1561
1655
|
ProteinsNotInFastaWarning,
|
|
1656
|
+
stacklevel=2,
|
|
1562
1657
|
)
|
|
1563
1658
|
|
|
1564
1659
|
annotations = {}
|
|
@@ -1636,9 +1731,10 @@ def add_protein_site_annotation(
|
|
|
1636
1731
|
warnings.warn(
|
|
1637
1732
|
f"Some proteins could not be annotated: {repr(proteins_not_in_db)}",
|
|
1638
1733
|
ProteinsNotInFastaWarning,
|
|
1734
|
+
stacklevel=2,
|
|
1639
1735
|
)
|
|
1640
1736
|
|
|
1641
|
-
annotations = {
|
|
1737
|
+
annotations: dict[str, list[str]] = {
|
|
1642
1738
|
"Modified residue": [],
|
|
1643
1739
|
"Sequence window": [],
|
|
1644
1740
|
}
|
|
@@ -1702,6 +1798,7 @@ def add_leading_proteins_annotation(
|
|
|
1702
1798
|
warnings.warn(
|
|
1703
1799
|
f"Some proteins could not be annotated: {repr(proteins_not_in_db)}",
|
|
1704
1800
|
ProteinsNotInFastaWarning,
|
|
1801
|
+
stacklevel=2,
|
|
1705
1802
|
)
|
|
1706
1803
|
|
|
1707
1804
|
annotations = {}
|
|
@@ -1853,7 +1950,7 @@ def add_peptide_positions(
|
|
|
1853
1950
|
find matching entries in the FASTA files.
|
|
1854
1951
|
"""
|
|
1855
1952
|
# not tested #
|
|
1856
|
-
peptide_positions = {"Start position": [], "End position": []}
|
|
1953
|
+
peptide_positions: dict[str, list[int]] = {"Start position": [], "End position": []}
|
|
1857
1954
|
proteins_not_in_db = []
|
|
1858
1955
|
for peptide, protein_id in zip(table[peptide_column], table[protein_column]):
|
|
1859
1956
|
if protein_id in protein_db:
|
|
@@ -1875,6 +1972,7 @@ def add_peptide_positions(
|
|
|
1875
1972
|
warnings.warn(
|
|
1876
1973
|
f"Some peptides could not be annotated: {repr(proteins_not_in_db)}",
|
|
1877
1974
|
ProteinsNotInFastaWarning,
|
|
1975
|
+
stacklevel=2,
|
|
1878
1976
|
)
|
|
1879
1977
|
|
|
1880
1978
|
|
|
@@ -1894,10 +1992,10 @@ def add_protein_modifications(table: pd.DataFrame):
|
|
|
1894
1992
|
for peptide_site, mod in [m.split(":") for m in mod_entry.split(";")]:
|
|
1895
1993
|
protein_site = int(peptide_site) + start_pos - 1
|
|
1896
1994
|
protein_mods.append([str(protein_site), mod])
|
|
1897
|
-
|
|
1995
|
+
protein_mod_string = ";".join([f"{pos}:{mod}" for pos, mod in protein_mods])
|
|
1898
1996
|
else:
|
|
1899
|
-
|
|
1900
|
-
protein_modification_entries.append(
|
|
1997
|
+
protein_mod_string = ""
|
|
1998
|
+
protein_modification_entries.append(protein_mod_string)
|
|
1901
1999
|
table["Protein modifications"] = protein_modification_entries
|
|
1902
2000
|
|
|
1903
2001
|
|
|
@@ -2074,7 +2172,7 @@ def _process_protein_entries(
|
|
|
2074
2172
|
A dataframe containing the columns "Protein reported by software",
|
|
2075
2173
|
"Leading proteins", "Representative protein", and "Potential contaminant".
|
|
2076
2174
|
"""
|
|
2077
|
-
new_entries = {
|
|
2175
|
+
new_entries: dict[str, list[str | bool]] = {
|
|
2078
2176
|
"Protein reported by software": [],
|
|
2079
2177
|
"Representative protein": [],
|
|
2080
2178
|
"Potential contaminant": [],
|
|
@@ -2147,6 +2245,57 @@ def _generate_modification_entries(
|
|
|
2147
2245
|
return entries
|
|
2148
2246
|
|
|
2149
2247
|
|
|
2248
|
+
def _generate_modification_entries_from_assigned_modifications(
    sequences: Iterable[str],
    assigned_modifications: Iterable[str],
) -> dict[str, list[str]]:
    """Generate modification entries from sequences and assigned modifications.

    The two iterables are processed pairwise; iteration stops at the end of the
    shorter one.

    Args:
        sequences: Plain peptide sequences.
        assigned_modifications: One FragPipe assigned modifications entry per
            sequence, for example "N-term(42.0106),8C(57.0215)".

    Returns:
        A dictionary with the keys "Modified sequence" and "Modifications", each
        holding one entry per processed pair. "Modified sequence" entries are the
        results of helper.modify_peptide() for the sequence and its extracted
        modifications. "Modifications" entries are strings of ";"-joined
        "position:modification" items, or an empty string if there are none.
    """
    entries: dict[str, list[str]] = {"Modified sequence": [], "Modifications": []}
    for peptide, assigned in zip(sequences, assigned_modifications):
        parsed = _extract_fragpipe_assigned_modifications(assigned, peptide)
        entries["Modified sequence"].append(helper.modify_peptide(peptide, parsed))
        entries["Modifications"].append(
            ";".join([f"{site}:{tag}" for site, tag in parsed])
        )
    return entries
|
|
2268
|
+
|
|
2269
|
+
|
|
2270
|
+
def _extract_fragpipe_assigned_modifications(
|
|
2271
|
+
modifications_entry: str,
|
|
2272
|
+
sequence: str,
|
|
2273
|
+
) -> list[tuple[int, str]]:
|
|
2274
|
+
"""Extracts modifications from a FragPipe "Modifications" entry.
|
|
2275
|
+
|
|
2276
|
+
Example for a modification entry: "N-term(42.0106),8C(57.0215)"
|
|
2277
|
+
|
|
2278
|
+
Returns:
|
|
2279
|
+
A list of tuples, where each tuple contains the position of the modification and
|
|
2280
|
+
the modification text. The position is one-indexed, meaning that the first amino
|
|
2281
|
+
acid position is 1. N-term and C-term are represented as 0 and len(sequence)
|
|
2282
|
+
respectively.
|
|
2283
|
+
"""
|
|
2284
|
+
if modifications_entry == "":
|
|
2285
|
+
return []
|
|
2286
|
+
modifications = []
|
|
2287
|
+
for mod_entry in modifications_entry.split(","):
|
|
2288
|
+
position_entry, modification = mod_entry.split(")")[0].split("(")
|
|
2289
|
+
if position_entry == "N-term":
|
|
2290
|
+
position = 0
|
|
2291
|
+
elif position_entry == "C-term":
|
|
2292
|
+
position = len(sequence)
|
|
2293
|
+
else:
|
|
2294
|
+
position = int(position_entry[:-1])
|
|
2295
|
+
modifications.append((position, modification))
|
|
2296
|
+
return modifications
|
|
2297
|
+
|
|
2298
|
+
|
|
2150
2299
|
def extract_maxquant_localization_probabilities(localization_entry: str) -> dict:
|
|
2151
2300
|
"""Extract localization probabilites from a MaxQuant "Probabilities" entry.
|
|
2152
2301
|
|
|
@@ -2189,7 +2338,7 @@ def extract_fragpipe_localization_probabilities(localization_entry: str) -> dict
|
|
|
2189
2338
|
... )
|
|
2190
2339
|
{'15.9949': {3: 1.0}, '79.9663': {4: 0.334, 6: 0.666}}
|
|
2191
2340
|
"""
|
|
2192
|
-
modification_probabilities = {}
|
|
2341
|
+
modification_probabilities: dict[str, dict[int, float]] = {}
|
|
2193
2342
|
for modification_entry in filter(None, localization_entry.split(";")):
|
|
2194
2343
|
specified_modification, probability_sequence = modification_entry.split("@")
|
|
2195
2344
|
_, modification = specified_modification.split(":")
|
|
@@ -2247,7 +2396,7 @@ def _create_protein_annotations_from_db(
|
|
|
2247
2396
|
protein_db: ProteinDatabase,
|
|
2248
2397
|
query_function: Callable,
|
|
2249
2398
|
default_value: Any,
|
|
2250
|
-
) -> list[
|
|
2399
|
+
) -> list[Any]:
|
|
2251
2400
|
"""Returns a list of multi protein entry annotations.
|
|
2252
2401
|
|
|
2253
2402
|
Used to generate protein annotations for protein entries. For each protein id an
|
|
@@ -2274,9 +2423,9 @@ def _create_protein_annotations_from_db(
|
|
|
2274
2423
|
if protein_id in protein_db:
|
|
2275
2424
|
db_entry = protein_db[protein_id]
|
|
2276
2425
|
query_result = query_function(db_entry, default_value)
|
|
2426
|
+
annotation_values.append(query_result)
|
|
2277
2427
|
else:
|
|
2278
|
-
|
|
2279
|
-
annotation_values.append(query_result)
|
|
2428
|
+
annotation_values.append(default_value)
|
|
2280
2429
|
return annotation_values
|
|
2281
2430
|
|
|
2282
2431
|
|
msreport/rinterface/__init__.py
CHANGED
|
@@ -1,3 +1,16 @@
|
|
|
1
|
-
"""
|
|
2
|
-
|
|
3
|
-
from .
|
|
1
|
+
"""Python interface to custome R scripts."""
|
|
2
|
+
|
|
3
|
+
from msreport.errors import OptionalDependencyError
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
from .limma import multi_group_limma, two_group_limma
|
|
7
|
+
from .rinstaller import r_package_version
|
|
8
|
+
except ImportError as err:
|
|
9
|
+
raise OptionalDependencyError(
|
|
10
|
+
"R integration is not available. R must be installed and configured before "
|
|
11
|
+
"installing optional R dependencies using 'pip install msreport[R]'. For "
|
|
12
|
+
"more information, see: https://github.com/hollenstein/msreport"
|
|
13
|
+
) from err
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
__all__ = ["multi_group_limma", "two_group_limma", "r_package_version"]
|
msreport/rinterface/limma.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
from rpy2.robjects.packages import importr
|
|
2
|
-
import rpy2.robjects.packages as rpackages
|
|
3
1
|
import rpy2.robjects as robjects
|
|
2
|
+
import rpy2.robjects.packages as rpackages
|
|
3
|
+
from rpy2.robjects.packages import importr
|
|
4
4
|
|
|
5
5
|
|
|
6
|
-
def r_package_version(package_name: str) ->
|
|
6
|
+
def r_package_version(package_name: str) -> str:
|
|
7
7
|
"""Returns the version number of an installed R package."""
|
|
8
8
|
with robjects.conversion.localconverter(robjects.default_converter):
|
|
9
9
|
utils = importr("utils")
|