msreport 0.0.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- msreport/__init__.py +13 -0
- msreport/aggregate/__init__.py +0 -0
- msreport/aggregate/condense.py +163 -0
- msreport/aggregate/pivot.py +132 -0
- msreport/aggregate/summarize.py +281 -0
- msreport/analyze.py +586 -0
- msreport/errors.py +10 -0
- msreport/export.py +526 -0
- msreport/fasta.py +28 -0
- msreport/helper/__init__.py +23 -0
- msreport/helper/calc.py +120 -0
- msreport/helper/maxlfq.py +339 -0
- msreport/helper/table.py +267 -0
- msreport/helper/temp.py +99 -0
- msreport/impute.py +275 -0
- msreport/isobar.py +161 -0
- msreport/normalize.py +496 -0
- msreport/peptidoform.py +283 -0
- msreport/plot.py +1129 -0
- msreport/qtable.py +537 -0
- msreport/reader.py +2357 -0
- msreport/rinterface/__init__.py +3 -0
- msreport/rinterface/limma.py +126 -0
- msreport/rinterface/rinstaller.py +35 -0
- msreport/rinterface/rscripts/limma.R +104 -0
- msreport-0.0.24.dist-info/METADATA +128 -0
- msreport-0.0.24.dist-info/RECORD +30 -0
- msreport-0.0.24.dist-info/WHEEL +5 -0
- msreport-0.0.24.dist-info/licenses/LICENSE.txt +202 -0
- msreport-0.0.24.dist-info/top_level.txt +1 -0
msreport/peptidoform.py
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
from collections import defaultdict as ddict
|
|
2
|
+
from typing import Optional, Union
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Peptide:
    """Representation of a peptide sequence identified by mass spectrometry.

    Attributes:
        plain_sequence: The peptide sequence with all modification tags removed.
        modified_sequence: The sequence exactly as passed to the constructor, with
            modification tags enclosed in square brackets, e.g. "PEPT[phospho]IDE".
        localization_probabilities: Optional dictionary in the form
            {"modification tag": {peptide position: probability}}, or None.
        protein_position: Optional protein position that corresponds to peptide
            position 1; used to convert between peptide and protein positions.
        modification_positions: Maps each modification tag to the list of peptide
            positions carrying that tag.
        modified_residues: Maps each modified peptide position to its modification
            tag. If several tags share one position, the last parsed tag is kept.
    """

    def __init__(
        self,
        modified_sequence: str,
        localization_probabilities: Optional[dict] = None,
        protein_position: Optional[int] = None,
    ):
        """Parses the modified sequence and stores the modification lookups.

        Args:
            modified_sequence: Peptide sequence in which modifications are written
                as tags in square brackets, e.g. "PEPT[phospho]IDE".
            localization_probabilities: Optional dictionary in the form
                {"modification tag": {peptide position: probability}}.
            protein_position: Optional protein position that corresponds to
                peptide position 1.
        """
        plain_sequence, modifications = parse_modified_sequence(
            modified_sequence, "[", "]"
        )

        self.plain_sequence = plain_sequence
        self.modified_sequence = modified_sequence
        self.localization_probabilities = localization_probabilities
        self.protein_position = protein_position

        self.modification_positions = ddict(list)
        self.modified_residues = {}
        for position, mod_tag in modifications:
            self.modification_positions[mod_tag].append(position)
            self.modified_residues[position] = mod_tag

    def make_modified_sequence(self, include: Optional[list] = None) -> str:
        """Returns a modified sequence string.

        Args:
            include: Optional, list of modifications that are included in the modified
                sequence string. By default all modifications are added.

        Returns:
            A modified sequence string where modified amino acids are indicated by
            square brackets containing a modification tag. For example
            "PEPT[phospho]IDE"
        """
        if include is None:
            return self.modified_sequence

        selected_modifications = [
            (position, mod_tag)
            for position, mod_tag in self.modified_residues.items()
            if mod_tag in include
        ]
        return modify_peptide(self.plain_sequence, selected_modifications)

    def count_modification(self, modification: str) -> int:
        """Returns how often the specified modification occurs."""
        # The membership test avoids creating an empty entry in the defaultdict.
        if modification not in self.modification_positions:
            return 0
        return len(self.modification_positions[modification])

    def isoform_probability(self, modification: str) -> Union[float, None]:
        """Calculates the isoform probability for a given modification.

        Returns:
            The isoform probability for the combination of the assigned modification
            sites, calculated as the product of the single modification localization
            probabilities. None is returned if any site carrying the specified
            'modification' has no localization probability. If the peptide does not
            carry the modification at all, the empty product 1.0 is returned.
        """
        probabilities = []
        for site in self.list_modified_peptide_sites(modification):
            probability = self.get_peptide_site_probability(site)
            if probability is None:
                # A missing site probability makes the product undefined; passing
                # None on to np.prod would raise a TypeError for multiple sites.
                return None
            probabilities.append(probability)
        return np.prod(probabilities)

    def get_peptide_site_probability(self, position: int) -> Optional[float]:
        """Return the modification localization probability of the peptide position.

        Args:
            position: Peptide position which modification localization probability is
                returned.

        Returns:
            The stored localization probability (expected to lie between 0 and 1).
            Returns None if the specified position does not contain a modification
            or if no localization probability is available.
        """
        return self._get_site_probability(position, is_protein_position=False)

    def get_protein_site_probability(self, position: int) -> Optional[float]:
        """Return the modification localization probability of the protein position.

        Args:
            position: Protein position which modification localization probability is
                returned.

        Returns:
            The stored localization probability (expected to lie between 0 and 1).
            Returns None if the specified position does not contain a modification
            or if no localization probability is available.
        """
        return self._get_site_probability(position, is_protein_position=True)

    def list_modified_peptide_sites(self, modification: str) -> list[int]:
        """Returns a list of peptide positions containing the specified modification."""
        return self._list_modified_sites(modification, use_protein_position=False)

    def list_modified_protein_sites(self, modification: str) -> list[int]:
        """Returns a list of protein positions containing the specified modification."""
        return self._list_modified_sites(modification, use_protein_position=True)

    def _get_site_probability(
        self, position: int, is_protein_position: bool
    ) -> Optional[float]:
        """Return the modification localization probability of a position.

        Args:
            position: Position which modification localization probability is returned.
            is_protein_position: If True, the specified position is a protein position,
                if False it is a peptide position. When no protein position is stored
                for the peptide, the position is used unchanged.

        Returns:
            The stored localization probability (expected to lie between 0 and 1).
            Returns None if the specified position does not contain a modification
            or if no localization probability is available.
        """
        if is_protein_position and self.protein_position is not None:
            # Convert the protein position into the matching peptide position.
            position = position - self.protein_position + 1

        if self.localization_probabilities is None:
            return None
        if position not in self.modified_residues:
            return None

        modification = self.modified_residues[position]
        try:
            probability = self.localization_probabilities[modification][position]
        except KeyError:
            # Either the modification or the position has no stored probability.
            probability = None
        return probability

    def _list_modified_sites(
        self, modification: str, use_protein_position: bool
    ) -> list[int]:
        """Returns a list of positions containing the specified modification.

        Args:
            modification: Sites containing this modification are extracted.
            use_protein_position: If True, the returned sites are protein positions and
                if False, peptide positions are returned. When no protein position is
                stored for the peptide, peptide positions are returned in both cases.

        Returns:
            A new list of modified positions; empty if the modification is absent.
        """
        if modification not in self.modification_positions:
            return []

        peptide_sites = self.modification_positions[modification]
        if use_protein_position and self.protein_position is not None:
            return [site + self.protein_position - 1 for site in peptide_sites]
        # Return a copy so that callers cannot mutate the internal position list.
        return list(peptide_sites)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def parse_modified_sequence(
    modified_sequence: str,
    tag_open: str,
    tag_close: str,
) -> tuple[str, list]:
    """Splits a modified sequence into its plain sequence and modifications.

    Nested tags are treated as part of the outermost tag, so the text of the
    outermost tag is reported including any inner tag symbols.

    Args:
        modified_sequence: Peptide sequence containing modifications.
        tag_open: Symbol that indicates the beginning of a modification tag, e.g. "[".
        tag_close: Symbol that indicates the end of a modification tag, e.g. "]".

    Returns:
        A tuple containing the plain sequence as a string and a sorted list of
        modification tuples, each containing the position and modification tag
        (excluding the tag_open and tag_close symbols). The position is the number
        of plain sequence characters that precede the tag.
    """
    depth = 0
    residues = []
    tag_spans = []
    for index, symbol in enumerate(modified_sequence):
        if symbol == tag_open:
            depth += 1
            if depth == 1:
                # Remember where the outermost tag begins.
                outer_start = index
        elif symbol == tag_close:
            depth -= 1
            if depth == 0:
                tag_spans.append((outer_start, index))
        elif depth == 0:
            # Only characters outside of any tag belong to the plain sequence.
            residues.append(symbol)

    modifications = []
    tag_characters_seen = 0
    for span_start, span_end in tag_spans:
        # Subtracting the length of all preceding tags converts the index in the
        # modified sequence into a position in the plain sequence.
        plain_position = span_start - tag_characters_seen
        tag_text = modified_sequence[span_start + 1 : span_end]
        modifications.append((plain_position, tag_text))
        tag_characters_seen += span_end - span_start + 1
    modifications.sort()
    return "".join(residues), modifications
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def modify_peptide(
    sequence: str,
    modifications: list[tuple[int, str]],
    tag_open: str = "[",
    tag_close: str = "]",
) -> str:
    """Inserts modification tags into a plain peptide sequence.

    Args:
        sequence: Plain peptide sequence without modification tags.
        modifications: List of (position, modification tag) tuples. Each tag is
            inserted after the first 'position' characters of the sequence.
        tag_open: Symbol placed before each modification tag, default "[".
        tag_close: Symbol placed after each modification tag, default "]".

    Returns:
        Modified sequence. For example "PEPT[phospho]IDE", for sequence = "PEPTIDE" and
        modifications = [(4, "phospho")]
    """
    pieces = []
    cursor = 0
    for position, tag_text in sorted(modifications):
        # Emit the untagged stretch up to this modification, then the tag itself.
        pieces.append(sequence[cursor:position])
        pieces.append(tag_open + tag_text + tag_close)
        cursor = position
    pieces.append(sequence[cursor:])
    return "".join(pieces)
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def make_localization_string(
    localization_probabilities: dict, decimal_places: int = 3
) -> str:
    """Builds a site localization probability string from a dictionary.

    Args:
        localization_probabilities: A dictionary in the form
            {"modification tag": {position: probability}}, where positions are integers
            and probabilities are floats ranging from 0 to 1.
        decimal_places: Number of decimal places used for the probabilities, default 3.

    Returns:
        A site localization probability string according to the MsReport convention.
        Multiple modification entries are separated by ";". Each modification entry
        consists of a modification tag and site probabilities, separated by "@". The
        site probability entries consist of f"{position}:{probability}" strings, and
        multiple probability entries are separated by ",".

        For example "15.9949@11:1.000;79.9663@3:0.200,4:0.800"
    """
    entries = []
    for mod_tag, site_probabilities in localization_probabilities.items():
        sites_text = ",".join(
            f"{site}:{value:.{decimal_places}f}"
            for site, value in site_probabilities.items()
        )
        entries.append(f"{mod_tag}@{sites_text}")
    return ";".join(entries)
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def read_localization_string(localization_string: str) -> dict:
    """Parses a site localization probability string into a dictionary.

    Args:
        localization_string: A site localization probability string according to the
            MsReport convention. Can contain information about multiple modifications,
            which are separated by ";". Each modification entry consists of a
            modification tag and site probabilities, separated by "@". The site
            probability entries consist of
            f"{peptide position}:{localization probability}" strings, and multiple
            entries are separated by ",".
            For example "15.9949@11:1.000;79.9663@3:0.200,4:0.800"

    Returns:
        A dictionary in the form {"modification tag": {position: probability}}, where
        positions are integers and probabilities are floats. An empty string yields
        an empty dictionary.
    """
    if localization_string == "":
        return {}

    parsed = {}
    for entry in localization_string.split(";"):
        mod_tag, sites_text = entry.split("@")
        site_pairs = (item.split(":") for item in sites_text.split(","))
        parsed[mod_tag] = {int(site): float(value) for site, value in site_pairs}
    return parsed
|