msreport 0.0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,283 @@
1
+ from collections import defaultdict as ddict
2
+ from typing import Optional, Union
3
+
4
+ import numpy as np
5
+
6
+
7
class Peptide:
    """Representation of a peptide sequence identified by mass spectrometry.

    Attributes:
        plain_sequence: Amino acid sequence without modification tags.
        modified_sequence: Sequence string as passed to the constructor, with
            modification tags enclosed in square brackets.
        localization_probabilities: Optional dictionary in the form
            {"modification tag": {peptide position: probability}}.
        protein_position: Optional protein position corresponding to peptide
            position 1; used to convert between peptide and protein positions.
        modification_positions: Maps each modification tag to the list of peptide
            positions carrying that modification.
        modified_residues: Maps a peptide position to its modification tag. If
            several tags share a position, only the last parsed tag is kept.
    """

    def __init__(
        self,
        modified_sequence: str,
        localization_probabilities: Optional[dict] = None,
        protein_position: Optional[int] = None,
    ):
        """Parses the modified sequence and stores the modification sites.

        Args:
            modified_sequence: Peptide sequence with modifications written as
                square bracket tags, for example "PEPT[phospho]IDE".
            localization_probabilities: Optional dictionary in the form
                {"modification tag": {peptide position: probability}}.
            protein_position: Optional protein position of peptide position 1.
        """
        plain_sequence, modifications = parse_modified_sequence(
            modified_sequence, "[", "]"
        )

        self.plain_sequence = plain_sequence
        self.modified_sequence = modified_sequence
        self.localization_probabilities = localization_probabilities
        self.protein_position = protein_position

        self.modification_positions = ddict(list)
        self.modified_residues = {}
        for position, mod_tag in modifications:
            self.modification_positions[mod_tag].append(position)
            self.modified_residues[position] = mod_tag

    def make_modified_sequence(self, include: Optional[list] = None) -> str:
        """Returns a modified sequence string.

        Args:
            include: Optional, list of modifications that are included in the modified
                sequence string. By default all modifications are added.

        Returns:
            A modified sequence string where modified amino acids are indicated by
            square brackets containing a modification tag. For example
            "PEPT[phospho]IDE"
        """
        if include is None:
            return self.modified_sequence

        selected_modifications = [
            (position, mod_tag)
            for position, mod_tag in self.modified_residues.items()
            if mod_tag in include
        ]
        return modify_peptide(self.plain_sequence, selected_modifications)

    def count_modification(self, modification: str) -> int:
        """Returns how often the specified modification occurs."""
        # Use .get() so that the defaultdict does not create an empty entry for
        # modifications that are not present.
        return len(self.modification_positions.get(modification, ()))

    def isoform_probability(self, modification: str) -> Optional[float]:
        """Calculates the isoform probability for a given modification.

        Args:
            modification: Modification tag for which the probability is calculated.

        Returns:
            The isoform probability for the combination of the assigned modification
            sites. Calculated as the product of the single modification localization
            probabilities. None is returned if the peptide does not contain the
            specified 'modification' or if a localization probability is missing for
            any of its modified sites.
        """
        sites = self.list_modified_peptide_sites(modification)
        if not sites:
            # The product of an empty list would be 1.0, which would wrongly
            # suggest a confident localization of an absent modification.
            return None

        probabilities = [self.get_peptide_site_probability(site) for site in sites]
        if any(probability is None for probability in probabilities):
            return None
        return np.prod(probabilities)

    def get_peptide_site_probability(self, position: int) -> Optional[float]:
        """Return the modification localization probability of the peptide position.

        Args:
            position: Peptide position which modification localization probability is
                returned.

        Returns:
            The stored localization probability. Returns None if the specified
            position does not contain a modification or if no localization probability
            is available.
        """
        return self._get_site_probability(position, is_protein_position=False)

    def get_protein_site_probability(self, position: int) -> Optional[float]:
        """Return the modification localization probability of the protein position.

        Args:
            position: Protein position which modification localization probability is
                returned.

        Returns:
            The stored localization probability. Returns None if the specified
            position does not contain a modification or if no localization probability
            is available.
        """
        return self._get_site_probability(position, is_protein_position=True)

    def list_modified_peptide_sites(self, modification: str) -> list[int]:
        """Returns a list of peptide positions containing the specified modification."""
        return self._list_modified_sites(modification, use_protein_position=False)

    def list_modified_protein_sites(self, modification: str) -> list[int]:
        """Returns a list of protein positions containing the specified modification."""
        return self._list_modified_sites(modification, use_protein_position=True)

    def _get_site_probability(
        self, position: int, is_protein_position: bool
    ) -> Optional[float]:
        """Return the modification localization probability of a position.

        Args:
            position: Position which modification localization probability is returned.
            is_protein_position: If True, the specified position is a protein position,
                if False it is a peptide position. If the peptide has no protein
                position, the specified position is always used as a peptide position.

        Returns:
            The stored localization probability. Returns None if the specified
            position does not contain a modification or if no localization probability
            is available.
        """
        if is_protein_position and self.protein_position is not None:
            # Convert the protein position into a peptide position.
            position = position - self.protein_position + 1

        if self.localization_probabilities is None:
            return None
        if position not in self.modified_residues:
            return None

        modification = self.modified_residues[position]
        try:
            probability = self.localization_probabilities[modification][position]
        except KeyError:
            probability = None
        return probability

    def _list_modified_sites(
        self, modification: str, use_protein_position: bool
    ) -> list[int]:
        """Returns a list of positions containing the specified modification.

        Args:
            modification: Sites containing this modification are extracted.
            use_protein_position: If True, the returned sites are protein positions and
                if False, peptide positions are returned. If the peptide has no protein
                position, peptide positions are returned in both cases.

        Returns:
            A new list of modified positions; changing it does not affect the peptide.
        """
        if modification not in self.modification_positions:
            return []

        peptide_sites = self.modification_positions[modification]
        if use_protein_position and self.protein_position is not None:
            return [site + self.protein_position - 1 for site in peptide_sites]
        # Return a copy so that callers cannot alter the internal position list.
        return list(peptide_sites)
158
+
159
+
160
def parse_modified_sequence(
    modified_sequence: str,
    tag_open: str,
    tag_close: str,
) -> tuple[str, list]:
    """Returns the plain sequence and a list of modification positions and tags.

    Tags may be nested; a nested tag is kept as part of the outermost tag's text.
    The tags are not validated: characters that follow an unmatched 'tag_open' or
    'tag_close' symbol are not added to the plain sequence.

    Args:
        modified_sequence: Peptide sequence containing modifications.
        tag_open: Symbol that indicates the beginning of a modification tag, e.g. "[".
        tag_close: Symbol that indicates the end of a modification tag, e.g. "]".

    Returns:
        A tuple containing the plain sequence as a string and a sorted list of
        modification tuples, each containing the position and modification tag
        (excluding the tag_open and tag_close symbols). The position is the number
        of plain sequence characters preceding the tag, i.e. a tag written before
        the first amino acid has position 0.
    """
    nesting_depth = 0
    tag_start = 0
    tag_spans = []
    plain_characters = []
    for index, char in enumerate(modified_sequence):
        if char == tag_open:
            nesting_depth += 1
            if nesting_depth == 1:
                tag_start = index
        elif char == tag_close:
            nesting_depth -= 1
            if nesting_depth == 0:
                tag_spans.append((tag_start, index))
        elif nesting_depth == 0:
            plain_characters.append(char)
    # Join once instead of growing a string character by character.
    plain_sequence = "".join(plain_characters)

    modifications = []
    # Number of characters occupied by all tags preceding the current tag; these
    # are subtracted to convert an index in the modified sequence into a position
    # in the plain sequence.
    preceding_tag_length = 0
    for start, end in tag_spans:
        mod_position = start - preceding_tag_length
        modification = modified_sequence[start + 1 : end]
        modifications.append((mod_position, modification))
        preceding_tag_length += end - start + 1
    return plain_sequence, sorted(modifications)
200
+
201
+
202
def modify_peptide(
    sequence: str,
    modifications: list[tuple[int, str]],
    tag_open: str = "[",
    tag_close: str = "]",
) -> str:
    """Returns a string containing the modifications within the peptide sequence.

    Args:
        sequence: Plain peptide sequence without modification tags.
        modifications: List of (position, modification tag) tuples. A tag is
            inserted after the first 'position' characters of the sequence, so
            position 0 places the tag before the first amino acid. The list does
            not need to be sorted.
        tag_open: Symbol written before each modification tag, default "[".
        tag_close: Symbol written after each modification tag, default "]".

    Returns:
        Modified sequence. For example "PEPT[phospho]IDE", for sequence = "PEPTIDE" and
        modifications = [(4, "phospho")]
    """
    segments = []
    previous_position = 0
    for position, modification in sorted(modifications):
        segments.append(sequence[previous_position:position])
        segments.append(f"{tag_open}{modification}{tag_close}")
        previous_position = position
    segments.append(sequence[previous_position:])
    # Join once instead of repeatedly concatenating strings in the loop.
    return "".join(segments)
222
+
223
+
224
def make_localization_string(
    localization_probabilities: dict, decimal_places: int = 3
) -> str:
    """Generates a site localization probability string.

    Args:
        localization_probabilities: A dictionary in the form
            {"modification tag": {position: probability}}, where positions are integers
            and probabilities are floats ranging from 0 to 1.
        decimal_places: Number of decimal places used for the probabilities, default 3.

    Returns:
        A site localization probability string according to the MsReport convention.
        Multiple modification entries are separated by ";". Each modification entry
        consists of a modification tag and site probabilities, separated by "@". The
        site probability entries consist of f"{position}:{probability}" strings, and
        multiple probability entries are separated by ",".

        For example "15.9949@11:1.000;79.9663@3:0.200,4:0.800"
    """
    modification_entries = []
    for mod_tag, site_probabilities in localization_probabilities.items():
        site_entries = ",".join(
            f"{site}:{value:.{decimal_places}f}"
            for site, value in site_probabilities.items()
        )
        modification_entries.append(f"{mod_tag}@{site_entries}")
    return ";".join(modification_entries)
254
+
255
+
256
def read_localization_string(localization_string: str) -> dict:
    """Converts a site localization probability string into a dictionary.

    Args:
        localization_string: A site localization probability string according to the
            MsReport convention. Can contain information about multiple modifications,
            which are separated by ";". Each modification entry consists of a
            modification tag and site probabilities, separated by "@". The site
            probability entries consist of
            f"{peptide position}:{localization probability}" strings, and multiple
            entries are separated by ",".
            For example "15.9949@11:1.000;79.9663@3:0.200,4:0.800"

    Returns:
        A dictionary in the form {"modification tag": {position: probability}}, where
        positions are converted to integers and probabilities to floats. An empty
        string results in an empty dictionary.
    """
    if localization_string == "":
        return {}

    parsed = {}
    for entry in localization_string.split(";"):
        mod_tag, sites_text = entry.split("@")
        site_pairs = (site.split(":") for site in sites_text.split(","))
        parsed[mod_tag] = {int(site): float(value) for site, value in site_pairs}
    return parsed