weirdo 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
weirdo/__init__.py ADDED
@@ -0,0 +1,104 @@
1
+ from .amino_acid_alphabet import (
2
+ AminoAcid,
3
+ canonical_amino_acids,
4
+ canonical_amino_acid_letters,
5
+ extended_amino_acids,
6
+ extended_amino_acid_letters,
7
+ amino_acid_letter_indices,
8
+ amino_acid_name_indices,
9
+ )
10
+ from .peptide_vectorizer import PeptideVectorizer
11
+ from .distances import hamming
12
+
13
+ # High-level scoring API
14
+ from .api import (
15
+ score_peptide,
16
+ score_peptides,
17
+ create_scorer,
18
+ clear_cache,
19
+ get_available_presets,
20
+ get_preset_info,
21
+ # Model management
22
+ list_models,
23
+ load_model,
24
+ save_model,
25
+ get_available_scorers,
26
+ )
27
+
28
+ # Data management
29
+ from .data_manager import (
30
+ DataManager,
31
+ get_data_manager,
32
+ ensure_data_available,
33
+ )
34
+
35
+ # Model management
36
+ from .model_manager import (
37
+ ModelManager,
38
+ get_model_manager,
39
+ ModelInfo,
40
+ )
41
+
42
+ # Scorer classes (for advanced usage)
43
+ from .scorers import (
44
+ BaseScorer,
45
+ BatchScorer,
46
+ BaseReference,
47
+ StreamingReference,
48
+ TrainableScorer,
49
+ SwissProtReference,
50
+ ScorerConfig,
51
+ register_scorer,
52
+ register_reference,
53
+ )
54
+
55
+ # ML scorer
56
+ from .scorers import MLPScorer
57
+
58
+ __version__ = "2.1.0"
59
+
60
+ __all__ = [
61
+ # Amino acid data
62
+ "AminoAcid",
63
+ "canonical_amino_acids",
64
+ "canonical_amino_acid_letters",
65
+ "extended_amino_acids",
66
+ "extended_amino_acid_letters",
67
+ "amino_acid_letter_indices",
68
+ "amino_acid_name_indices",
69
+ # Vectorization
70
+ "PeptideVectorizer",
71
+ # Distances
72
+ "hamming",
73
+ # High-level scoring API
74
+ "score_peptide",
75
+ "score_peptides",
76
+ "create_scorer",
77
+ "clear_cache",
78
+ "get_available_presets",
79
+ "get_preset_info",
80
+ "get_available_scorers",
81
+ # Model management
82
+ "list_models",
83
+ "load_model",
84
+ "save_model",
85
+ "ModelManager",
86
+ "get_model_manager",
87
+ "ModelInfo",
88
+ # Scorer classes
89
+ "BaseScorer",
90
+ "BatchScorer",
91
+ "BaseReference",
92
+ "StreamingReference",
93
+ "TrainableScorer",
94
+ "SwissProtReference",
95
+ "ScorerConfig",
96
+ "register_scorer",
97
+ "register_reference",
98
+ # ML scorer
99
+ "MLPScorer",
100
+ # Data management
101
+ "DataManager",
102
+ "get_data_manager",
103
+ "ensure_data_available",
104
+ ]
weirdo/amino_acid.py ADDED
@@ -0,0 +1,33 @@
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
class AminoAcid(object):
    """
    A single amino acid (or amino-acid-like token) of an alphabet.

    Parameters
    ----------
    full_name : str
        Full name, e.g. "Alanine".
    short_name : str
        Short name, e.g. "Ala".
    letter : str
        Single-letter code, e.g. "A". Equality and hashing depend only
        on this attribute.
    contains : collection of str, optional
        Letters this token stands for. When omitted (or empty) it
        defaults to ``[letter]``.
    """

    def __init__(
            self, full_name, short_name, letter, contains=None):
        self.letter = letter
        self.full_name = full_name
        self.short_name = short_name
        if not contains:
            contains = [letter]
        self.contains = contains

    def __str__(self):
        # The argument order must match the order of the placeholders
        # in the format string: full_name, short_name, letter, contains.
        return (
            "AminoAcid(full_name='%s', short_name='%s', letter='%s', "
            "contains=%s)") % (
                self.full_name, self.short_name, self.letter, self.contains)

    def __repr__(self):
        return str(self)

    def __eq__(self, other):
        # Only exact AminoAcid instances compare equal, and only by letter.
        return other.__class__ is AminoAcid and self.letter == other.letter

    def __hash__(self):
        # Defining __eq__ alone makes a Python 3 class unhashable; hash on
        # the same field equality uses so equal objects hash equally.
        return hash(self.letter)
@@ -0,0 +1,158 @@
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+
14
+ """
15
+ Amino acid alphabets (canonical, modified, wildcard and rare residues) and
+ helpers for converting between letters, names and positional indices
16
+ """
17
+
18
+ import numpy as np
19
+
20
+ from .amino_acid import AminoAcid
21
+
22
# The 20 canonical amino acids, given as (full name, short name, letter).
# The order of this list is significant: positional indices elsewhere in
# this module are derived from it.
canonical_amino_acids = [
    AminoAcid(full_name, short_name, letter)
    for (full_name, short_name, letter) in (
        ("Alanine", "Ala", "A"),
        ("Arginine", "Arg", "R"),
        ("Asparagine", "Asn", "N"),
        ("Aspartic Acid", "Asp", "D"),
        ("Cysteine", "Cys", "C"),
        ("Glutamic Acid", "Glu", "E"),
        ("Glutamine", "Gln", "Q"),
        ("Glycine", "Gly", "G"),
        ("Histidine", "His", "H"),
        ("Isoleucine", "Ile", "I"),
        ("Leucine", "Leu", "L"),
        ("Lysine", "Lys", "K"),
        ("Methionine", "Met", "M"),
        ("Phenylalanine", "Phe", "F"),
        ("Proline", "Pro", "P"),
        ("Serine", "Ser", "S"),
        ("Threonine", "Thr", "T"),
        ("Tryptophan", "Trp", "W"),
        ("Tyrosine", "Tyr", "Y"),
        ("Valine", "Val", "V"),
    )
]

# Single-letter codes in the same order as canonical_amino_acids.
canonical_amino_acid_letters = [
    amino_acid.letter for amino_acid in canonical_amino_acids
]
46
+
47
+ ###
48
+ # Post-translation modifications commonly detected by mass-spec
49
+ ###
50
+
51
+ # TODO: figure out three letter codes for modified AAs
52
+
53
# Modified residues, identified by lower-case letters. Short names that
# have not been worked out yet are left as the "???" placeholder.
modified_amino_acids = [
    AminoAcid(full_name, short_name, letter)
    for (full_name, short_name, letter) in (
        ("Phospho-Serine", "Sep", "s"),
        ("Phospho-Threonine", "???", "t"),
        ("Phospho-Tyrosine", "???", "y"),
        ("Cystine", "???", "c"),
        ("Methionine sulfoxide", "???", "m"),
        ("Pyroglutamate", "???", "q"),
        ("Pyroglutamic acid", "???", "n"),
    )
]
62
+
63
+ ###
64
+ # Amino acid tokens which represent multiple canonical amino acids
65
+ ###
66
# Ambiguity tokens: each entry lists the set of canonical letters it may
# stand for, passed through as the `contains` argument.
wildcard_amino_acids = [
    AminoAcid(full_name, short_name, letter, contains=members)
    for (full_name, short_name, letter, members) in (
        ("Unknown", "Xaa", "X", set(canonical_amino_acid_letters)),
        ("Asparagine-or-Aspartic-Acid", "Asx", "B", {"D", "N"}),
        ("Glutamine-or-Glutamic-Acid", "Glx", "Z", {"E", "Q"}),
        ("Leucine-or-Isoleucine", "Xle", "J", {"I", "L"}),
    )
]
72
+
73
+ ###
74
+ # Canonical amino acids + wildcard tokens
75
+ ###
76
+
77
# New list: the canonical amino acids followed by the wildcard tokens.
canonical_amino_acids_with_unknown = list(canonical_amino_acids)
canonical_amino_acids_with_unknown.extend(wildcard_amino_acids)
78
+
79
+
80
+ ###
81
+ # Rare amino acids which aren't considered part of the core 20 "canonical"
82
+ ###
83
+
84
# Amino acids outside the canonical 20, as (full name, short name, letter).
rare_amino_acids = [
    AminoAcid(full_name, short_name, letter)
    for (full_name, short_name, letter) in (
        ("Selenocysteine", "Sec", "U"),
        ("Pyrrolysine", "Pyl", "O"),
    )
]
88
+
89
+ ###
90
+ # Extended amino acids + wildcard tokens
91
+ ###
92
+
93
# Extended alphabet: canonical, then rare, then wildcard entries. The
# position of each entry in this list is its index.
extended_amino_acids = list(canonical_amino_acids)
extended_amino_acids.extend(rare_amino_acids)
extended_amino_acids.extend(wildcard_amino_acids)

# Letters and full names, in the same order as extended_amino_acids.
extended_amino_acid_letters = [
    amino_acid.letter for amino_acid in extended_amino_acids
]
extended_amino_acids_with_unknown_names = [
    amino_acid.full_name for amino_acid in extended_amino_acids
]
100
+
101
+
102
# Map each letter of the extended alphabet to its position in that alphabet.
amino_acid_letter_indices = dict(
    (letter, position)
    for (position, letter) in enumerate(extended_amino_acid_letters)
)
106
+
107
+
108
# Every ordered two-letter combination over the extended alphabet.
# The pairs are built from the single-letter codes (not from the AminoAcid
# objects, whose string form is a long "AminoAcid(...)" description).
# The first letter of the pair varies fastest, the second slowest.
amino_acid_letter_pairs = [
    "%s%s" % (x, y)
    for y in extended_amino_acid_letters
    for x in extended_amino_acid_letters
]
113
+
114
+
115
# Map each full amino acid name to its position in the extended alphabet.
amino_acid_name_indices = dict(
    (full_name, position)
    for (position, full_name)
    in enumerate(extended_amino_acids_with_unknown_names)
)

# Map each entry of amino_acid_letter_pairs to its position in that list.
amino_acid_pair_positions = dict(
    (letter_pair, position)
    for (position, letter_pair) in enumerate(amino_acid_letter_pairs)
)
123
+
124
def index_to_full_name(idx):
    """
    Return the full name of the entry at position `idx` of the
    extended amino acid alphabet.
    """
    amino_acid = extended_amino_acids[idx]
    return amino_acid.full_name
126
+
127
def index_to_short_name(idx):
    """
    Return the short name of the entry at position `idx` of the
    extended amino acid alphabet.
    """
    amino_acid = extended_amino_acids[idx]
    return amino_acid.short_name
129
+
130
def index_to_letter(idx):
    """
    Return the single-letter code of the entry at position `idx` of the
    extended amino acid alphabet.
    """
    # Return the letter itself, not the AminoAcid object, so this is the
    # inverse of letter_to_index.
    return extended_amino_acids[idx].letter
132
+
133
def letter_to_index(x):
    """
    Look up the position index of an amino acid given its letter code.

    Fails an assertion ("Unknown amino acid: ...") when the letter is not
    a key of amino_acid_letter_indices.
    """
    is_known = x in amino_acid_letter_indices
    assert is_known, "Unknown amino acid: %s" % x
    position = amino_acid_letter_indices[x]
    return position
139
+
140
def peptide_to_indices(xs):
    """
    Convert an iterable of amino acid letters into a list of their
    position indices. Raises KeyError for a letter that is not in
    amino_acid_letter_indices.
    """
    return list(map(amino_acid_letter_indices.__getitem__, xs))
142
+
143
def letter_to_short_name(x):
    """
    Return the short name for an amino acid letter by first resolving
    the letter to its position index.
    """
    position = letter_to_index(x)
    return index_to_short_name(position)
145
+
146
def peptide_to_short_amino_acid_names(xs):
    """
    Convert an iterable of amino acid letters into a list of their
    short names.

    Raises KeyError for a letter that is not in amino_acid_letter_indices.
    """
    # Resolve each letter to its alphabet entry and take the short name;
    # returning the bare index here would duplicate peptide_to_indices.
    return [
        extended_amino_acids[amino_acid_letter_indices[x]].short_name
        for x in xs
    ]
148
+
149
def dict_to_amino_acid_matrix(d, alphabet=canonical_amino_acids):
    """
    Convert a nested dictionary of pairwise values into a square matrix.

    Parameters
    ----------
    d : dict
        Maps a row letter to a dict that maps a column letter to a number,
        i.e. the value for a pair is read as ``d[row_letter][col_letter]``.
    alphabet : sequence of AminoAcid
        Entries whose ``letter`` attributes define the row/column order.

    Returns
    -------
    numpy array of dtype float32 with shape (len(alphabet), len(alphabet)).

    Raises KeyError if `d` lacks a letter of the alphabet.
    """
    # Size the matrix by the alphabet being iterated, not by len(d):
    # `d` may hold extra keys that are not part of the alphabet.
    n_aa = len(alphabet)
    result_matrix = np.zeros((n_aa, n_aa), dtype="float32")
    for i, aa_row in enumerate(alphabet):
        d_row = d[aa_row.letter]
        for j, aa_col in enumerate(alphabet):
            result_matrix[i, j] = d_row[aa_col.letter]
    return result_matrix
158
+
@@ -0,0 +1,358 @@
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """
14
+ Quantify amino acids by their physical/chemical properties
15
+ """
16
+
17
+ from .amino_acid_alphabet import letter_to_index
18
+
19
+
20
+
21
def aa_dict_to_positional_list(aa_property_dict):
    """
    Arrange per-letter property values into a 20-element list ordered by
    each letter's position index.

    Every index produced by letter_to_index must fall in [0, 20), and all
    20 slots must end up filled; otherwise an assertion fails.
    """
    positional_values = [None] * 20
    for letter in aa_property_dict:
        position = letter_to_index(letter)
        assert position >= 0
        assert position < 20
        positional_values[position] = aa_property_dict[letter]
    has_gap = any(entry is None for entry in positional_values)
    assert not has_gap, \
        "Missing amino acids in:\n%s" % aa_property_dict.keys()
    return positional_values
31
+
32
def parse_property_table(table_string):
    """
    Parse a space-separated property table into a dictionary.

    Each non-blank line must start with a numeric value followed by a
    letter; any further columns are ignored. Returns a dict mapping each
    letter to its value as a float. A line with fewer than two columns,
    or a repeated letter, fails an assertion.
    """
    parsed = {}
    for raw_line in table_string.splitlines():
        stripped = raw_line.strip()
        if stripped:
            # Runs of spaces produce empty tokens; drop them.
            tokens = [tok for tok in stripped.split(" ") if tok.strip()]
            assert len(tokens) >= 2
            value, letter = tokens[:2]
            assert letter not in parsed, "Repeated amino acid " + stripped
            parsed[letter] = float(value)
    return parsed
45
+
46
+
47
+ """
48
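+ Amino acid property tables copied from the CRASP website.
+
+ NOTE(review): in the `mass` table below, the entries for A (70.079) and
+ P (97.177) differ from the commonly tabulated average residue masses
+ (about 71.079 and 97.117) -- confirm against the original source.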
49
+ """
50
+
51
+ hydropathy = parse_property_table("""
52
+ 1.80000 A ALA
53
+ -4.5000 R ARG
54
+ -3.5000 N ASN
55
+ -3.5000 D ASP
56
+ 2.50000 C CYS
57
+ -3.5000 Q GLN
58
+ -3.5000 E GLU
59
+ -0.4000 G GLY
60
+ -3.2000 H HIS
61
+ 4.50000 I ILE
62
+ 3.80000 L LEU
63
+ -3.9000 K LYS
64
+ 1.90000 M MET
65
+ 2.80000 F PHE
66
+ -1.6000 P PRO
67
+ -0.8000 S SER
68
+ -0.7000 T THR
69
+ -0.9000 W TRP
70
+ -1.3000 Y TYR
71
+ 4.20000 V VAL
72
+ """)
73
+
74
+ volume = parse_property_table("""
75
+ 91.5000 A ALA
76
+ 202.0000 R ARG
77
+ 135.2000 N ASN
78
+ 124.5000 D ASP
79
+ 118.0000 C CYS
80
+ 161.1000 Q GLN
81
+ 155.1000 E GLU
82
+ 66.40000 G GLY
83
+ 167.3000 H HIS
84
+ 168.8000 I ILE
85
+ 167.9000 L LEU
86
+ 171.3000 K LYS
87
+ 170.8000 M MET
88
+ 203.4000 F PHE
89
+ 129.3000 P PRO
90
+ 99.10000 S SER
91
+ 122.1000 T THR
92
+ 237.6000 W TRP
93
+ 203.6000 Y TYR
94
+ 141.7000 V VAL
95
+ """)
96
+
97
+ polarity = parse_property_table("""
98
+ 0.0000 A ALA
99
+ 52.000 R ARG
100
+ 3.3800 N ASN
101
+ 40.700 D ASP
102
+ 1.4800 C CYS
103
+ 3.5300 Q GLN
104
+ 49.910 E GLU
105
+ 0.0000 G GLY
106
+ 51.600 H HIS
107
+ 0.1500 I ILE
108
+ 0.4500 L LEU
109
+ 49.500 K LYS
110
+ 1.4300 M MET
111
+ 0.3500 F PHE
112
+ 1.5800 P PRO
113
+ 1.6700 S SER
114
+ 1.6600 T THR
115
+ 2.1000 W TRP
116
+ 1.6100 Y TYR
117
+ 0.1300 V VAL
118
+ """)
119
+
120
+ pK_side_chain = parse_property_table("""
121
+ 0.0000 A ALA
122
+ 12.480 R ARG
123
+ 0.0000 N ASN
124
+ 3.6500 D ASP
125
+ 8.1800 C CYS
126
+ 0.0000 Q GLN
127
+ 4.2500 E GLU
128
+ 0.0000 G GLY
129
+ 6.0000 H HIS
130
+ 0.0000 I ILE
131
+ 0.0000 L LEU
132
+ 10.530 K LYS
133
+ 0.0000 M MET
134
+ 0.0000 F PHE
135
+ 0.0000 P PRO
136
+ 0.0000 S SER
137
+ 0.0000 T THR
138
+ 0.0000 W TRP
139
+ 10.700 Y TYR
140
+ 0.0000 V VAL
141
+ """)
142
+
143
+ prct_exposed_residues = parse_property_table("""
144
+ 15.0000 A ALA
145
+ 67.0000 R ARG
146
+ 49.0000 N ASN
147
+ 50.0000 D ASP
148
+ 5.00000 C CYS
149
+ 56.0000 Q GLN
150
+ 55.0000 E GLU
151
+ 10.0000 G GLY
152
+ 34.0000 H HIS
153
+ 13.0000 I ILE
154
+ 16.0000 L LEU
155
+ 85.0000 K LYS
156
+ 20.0000 M MET
157
+ 10.0000 F PHE
158
+ 45.0000 P PRO
159
+ 32.0000 S SER
160
+ 32.0000 T THR
161
+ 17.0000 W TRP
162
+ 41.0000 Y TYR
163
+ 14.0000 V VAL
164
+ """)
165
+
166
+ hydrophilicity = parse_property_table("""
167
+ -0.5000 A ALA
168
+ 3.00000 R ARG
169
+ 0.20000 N ASN
170
+ 3.00000 D ASP
171
+ -1.0000 C CYS
172
+ 0.20000 Q GLN
173
+ 3.00000 E GLU
174
+ 0.00000 G GLY
175
+ -0.5000 H HIS
176
+ -1.8000 I ILE
177
+ -1.8000 L LEU
178
+ 3.00000 K LYS
179
+ -1.3000 M MET
180
+ -2.5000 F PHE
181
+ 0.00000 P PRO
182
+ 0.30000 S SER
183
+ -0.4000 T THR
184
+ -3.4000 W TRP
185
+ -2.3000 Y TYR
186
+ -1.5000 V VAL
187
+ """)
188
+
189
+ accessible_surface_area = parse_property_table("""
190
+ 27.8000 A ALA
191
+ 94.7000 R ARG
192
+ 60.1000 N ASN
193
+ 60.6000 D ASP
194
+ 15.5000 C CYS
195
+ 68.7000 Q GLN
196
+ 68.2000 E GLU
197
+ 24.5000 G GLY
198
+ 50.7000 H HIS
199
+ 22.8000 I ILE
200
+ 27.6000 L LEU
201
+ 103.000 K LYS
202
+ 33.5000 M MET
203
+ 25.5000 F PHE
204
+ 51.5000 P PRO
205
+ 42.0000 S SER
206
+ 45.0000 T THR
207
+ 34.7000 W TRP
208
+ 55.2000 Y TYR
209
+ 23.7000 V VAL
210
+ """)
211
+
212
+ local_flexibility = parse_property_table("""
213
+ 705.42000 A ALA
214
+ 1484.2800 R ARG
215
+ 513.46010 N ASN
216
+ 34.960000 D ASP
217
+ 2412.5601 C CYS
218
+ 1087.8300 Q GLN
219
+ 1158.6600 E GLU
220
+ 33.180000 G GLY
221
+ 1637.1300 H HIS
222
+ 5979.3701 I ILE
223
+ 4985.7300 L LEU
224
+ 699.69000 K LYS
225
+ 4491.6602 M MET
226
+ 5203.8599 F PHE
227
+ 431.96000 P PRO
228
+ 174.76000 S SER
229
+ 601.88000 T THR
230
+ 6374.0698 W TRP
231
+ 4291.1001 Y TYR
232
+ 4474.4199 V VAL
233
+ """)
234
+
235
+ accessible_surface_area_folded = parse_property_table("""
236
+ 31.5000 A ALA
237
+ 93.8000 R ARG
238
+ 62.2000 N ASN
239
+ 60.9000 D ASP
240
+ 13.9000 C CYS
241
+ 74.0000 Q GLN
242
+ 72.3000 E GLU
243
+ 25.2000 G GLY
244
+ 46.7000 H HIS
245
+ 23.0000 I ILE
246
+ 29.0000 L LEU
247
+ 110.300 K LYS
248
+ 30.5000 M MET
249
+ 28.7000 F PHE
250
+ 53.7000 P PRO
251
+ 44.2000 S SER
252
+ 46.0000 T THR
253
+ 41.7000 W TRP
254
+ 59.1000 Y TYR
255
+ 23.5000 V VAL
256
+ """)
257
+
258
+ refractivity = parse_property_table("""
259
+ 4.34000 A ALA
260
+ 26.6600 R ARG
261
+ 13.2800 N ASN
262
+ 12.0000 D ASP
263
+ 35.7700 C CYS
264
+ 17.5600 Q GLN
265
+ 17.2600 E GLU
266
+ 0.00000 G GLY
267
+ 21.8100 H HIS
268
+ 19.0600 I ILE
269
+ 18.7800 L LEU
270
+ 21.2900 K LYS
271
+ 21.6400 M MET
272
+ 29.4000 F PHE
273
+ 10.9300 P PRO
274
+ 6.35000 S SER
275
+ 11.0100 T THR
276
+ 42.5300 W TRP
277
+ 31.5300 Y TYR
278
+ 13.9200 V VAL
279
+ """)
280
+
281
+
282
+ mass = parse_property_table("""
283
+ 70.079 A ALA
284
+ 156.188 R ARG
285
+ 114.104 N ASN
286
+ 115.089 D ASP
287
+ 103.144 C CYS
288
+ 128.131 Q GLN
289
+ 129.116 E GLU
290
+ 57.052 G GLY
291
+ 137.142 H HIS
292
+ 113.160 I ILE
293
+ 113.160 L LEU
294
+ 128.174 K LYS
295
+ 131.198 M MET
296
+ 147.177 F PHE
297
+ 97.177 P PRO
298
+ 87.078 S SER
299
+ 101.105 T THR
300
+ 186.213 W TRP
301
+ 163.170 Y TYR
302
+ 99.133 V VAL
303
+ """)
304
+
305
+ ###
306
+ # Values copied from:
307
+ # "Solvent accessibility of AA in known protein structures"
308
+ # http://prowl.rockefeller.edu/aainfo/access.htm
309
+ ###
310
+ """
311
+ Solvent accessibility of AA in known protein structures
312
+
313
+ Figure 1.
314
+
315
+ S 0.70 0.20 0.10
316
+ T 0.71 0.16 0.13
317
+ A 0.48 0.35 0.17
318
+ G 0.51 0.36 0.13
319
+ P 0.78 0.13 0.09
320
+ C 0.32 0.54 0.14
321
+ D 0.81 0.09 0.10
322
+ E 0.93 0.04 0.03
323
+ Q 0.81 0.10 0.09
324
+ N 0.82 0.10 0.08
325
+ L 0.41 0.49 0.10
326
+ I 0.39 0.47 0.14
327
+ V 0.40 0.50 0.10
328
+ M 0.44 0.20 0.36
329
+ F 0.42 0.42 0.16
330
+ Y 0.67 0.20 0.13
331
+ W 0.49 0.44 0.07
332
+ K 0.93 0.02 0.05
333
+ R 0.84 0.05 0.11
334
+ H 0.66 0.19 0.15
335
+ """
336
+
337
# First numeric column of the "Solvent accessibility" table quoted above,
# keyed by single-letter amino acid code (same row order as that table).
solvent_exposed_area = {
    "S": 0.70,
    "T": 0.71,
    "A": 0.48,
    "G": 0.51,
    "P": 0.78,
    "C": 0.32,
    "D": 0.81,
    "E": 0.93,
    "Q": 0.81,
    "N": 0.82,
    "L": 0.41,
    "I": 0.39,
    "V": 0.40,
    "M": 0.44,
    "F": 0.42,
    "Y": 0.67,
    "W": 0.49,
    "K": 0.93,
    "R": 0.84,
    "H": 0.66,
}