weirdo 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/__init__.py +25 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/__pycache__/__init__.cpython-38.pyc +0 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/__pycache__/amino_acid.cpython-38.pyc +0 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/__pycache__/amino_acid_alphabet.cpython-38.pyc +0 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/__pycache__/amino_acid_properties.cpython-38.pyc +0 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/__pycache__/blosum.cpython-38.pyc +0 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/__pycache__/chou_fasman.cpython-38.pyc +0 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/__pycache__/common.cpython-38.pyc +0 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/__pycache__/distances.cpython-38.pyc +0 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/__pycache__/peptide_vectorizer.cpython-38.pyc +0 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/__pycache__/pmbec.cpython-38.pyc +0 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/__pycache__/reduced_alphabet.cpython-38.pyc +0 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/__pycache__/residue_contact_energies.cpython-38.pyc +0 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/__pycache__/static_data.cpython-38.pyc +0 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/amino_acid.py +33 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/amino_acid_alphabet.py +158 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/amino_acid_properties.py +358 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/blosum.py +74 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/chou_fasman.py +74 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/common.py +22 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/distances.py +16 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/matrices/__init__.py +0 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/matrices/__pycache__/__init__.cpython-38.pyc +0 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/peptide_vectorizer.py +80 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/pmbec.py +87 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/reduced_alphabet.py +57 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/residue_contact_energies.py +74 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/static_data.py +17 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo-1.0.0-py3.8.egg-info/PKG-INFO +66 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo-1.0.0-py3.8.egg-info/SOURCES.txt +27 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo-1.0.0-py3.8.egg-info/dependency_links.txt +1 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo-1.0.0-py3.8.egg-info/requires.txt +3 -0
- Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo-1.0.0-py3.8.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from .amino_acid_alphabet import (
|
|
2
|
+
AminoAcid,
|
|
3
|
+
canonical_amino_acids,
|
|
4
|
+
canonical_amino_acid_letters,
|
|
5
|
+
extended_amino_acids,
|
|
6
|
+
extended_amino_acid_letters,
|
|
7
|
+
amino_acid_letter_indices,
|
|
8
|
+
amino_acid_name_indices,
|
|
9
|
+
)
|
|
10
|
+
from .peptide_vectorizer import PeptideVectorizer
|
|
11
|
+
from .distances import hamming
|
|
12
|
+
|
|
13
|
+
__version__ = "1.0.0"
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"AminoAcid",
|
|
17
|
+
"canonical_amino_acids",
|
|
18
|
+
"canonical_amino_acid_letters",
|
|
19
|
+
"extended_amino_acids",
|
|
20
|
+
"extended_amino_acid_letters",
|
|
21
|
+
"amino_acid_letter_indices",
|
|
22
|
+
"amino_acid_name_indices",
|
|
23
|
+
"PeptideVectorizer",
|
|
24
|
+
"hamming",
|
|
25
|
+
]
|
Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file
|
Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/__pycache__/amino_acid.cpython-38.pyc
ADDED
|
Binary file
|
|
Binary file
|
|
Binary file
|
Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/__pycache__/blosum.cpython-38.pyc
ADDED
|
Binary file
|
Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/__pycache__/chou_fasman.cpython-38.pyc
ADDED
|
Binary file
|
Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/__pycache__/common.cpython-38.pyc
ADDED
|
Binary file
|
Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/__pycache__/distances.cpython-38.pyc
ADDED
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo/__pycache__/static_data.cpython-38.pyc
ADDED
|
Binary file
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
2
|
+
# you may not use this file except in compliance with the License.
|
|
3
|
+
# You may obtain a copy of the License at
|
|
4
|
+
#
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
#
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
class AminoAcid(object):
    """
    A single amino acid token.

    Parameters
    ----------
    full_name : str
        Full name, e.g. "Alanine".
    short_name : str
        Short code, e.g. "Ala".
    letter : str
        Letter code, e.g. "A".
    contains : collection of str, optional
        Letters this token stands for. When omitted (or empty) the token
        contains only its own letter.
    """
    def __init__(
            self, full_name, short_name, letter, contains=None):
        self.letter = letter
        self.full_name = full_name
        self.short_name = short_name
        if not contains:
            contains = [letter]
        self.contains = contains

    def __str__(self):
        # Arguments must follow the order of the placeholders in the
        # template: full_name, short_name, letter, contains.
        return (
            ("AminoAcid(full_name='%s', short_name='%s', letter='%s', "
             "contains=%s)") % (
                self.full_name, self.short_name, self.letter, self.contains))

    def __repr__(self):
        return str(self)

    def __eq__(self, other):
        # Two tokens are equal when both are exactly AminoAcid instances
        # and share the same letter code.
        return other.__class__ is AminoAcid and self.letter == other.letter

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        # Defining __eq__ alone makes instances unhashable on Python 3;
        # hash on the same field that equality compares.
        return hash(self.letter)
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
2
|
+
# you may not use this file except in compliance with the License.
|
|
3
|
+
# You may obtain a copy of the License at
|
|
4
|
+
#
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
#
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
"""
|
|
15
|
+
Amino acid alphabets (canonical, modified, wildcard and rare tokens) and helpers for converting between letters, names and position indices
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
|
|
20
|
+
from .amino_acid import AminoAcid
|
|
21
|
+
|
|
22
|
+
canonical_amino_acids = [
|
|
23
|
+
AminoAcid("Alanine", "Ala", "A"),
|
|
24
|
+
AminoAcid("Arginine", "Arg", "R"),
|
|
25
|
+
AminoAcid("Asparagine","Asn", "N"),
|
|
26
|
+
AminoAcid("Aspartic Acid", "Asp", "D"),
|
|
27
|
+
AminoAcid("Cysteine", "Cys", "C"),
|
|
28
|
+
AminoAcid("Glutamic Acid", "Glu", "E"),
|
|
29
|
+
AminoAcid("Glutamine", "Gln", "Q"),
|
|
30
|
+
AminoAcid("Glycine", "Gly", "G"),
|
|
31
|
+
AminoAcid("Histidine", "His", "H"),
|
|
32
|
+
AminoAcid("Isoleucine", "Ile", "I"),
|
|
33
|
+
AminoAcid("Leucine", "Leu", "L"),
|
|
34
|
+
AminoAcid("Lysine", "Lys", "K"),
|
|
35
|
+
AminoAcid("Methionine", "Met", "M"),
|
|
36
|
+
AminoAcid("Phenylalanine", "Phe", "F"),
|
|
37
|
+
AminoAcid("Proline", "Pro", "P"),
|
|
38
|
+
AminoAcid("Serine", "Ser", "S"),
|
|
39
|
+
AminoAcid("Threonine", "Thr", "T"),
|
|
40
|
+
AminoAcid("Tryptophan", "Trp", "W"),
|
|
41
|
+
AminoAcid("Tyrosine", "Tyr", "Y"),
|
|
42
|
+
AminoAcid("Valine", "Val", "V")
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
canonical_amino_acid_letters = [aa.letter for aa in canonical_amino_acids]
|
|
46
|
+
|
|
47
|
+
###
|
|
48
|
+
# Post-translation modifications commonly detected by mass-spec
|
|
49
|
+
###
|
|
50
|
+
|
|
51
|
+
# TODO: figure out three letter codes for modified AAs
|
|
52
|
+
|
|
53
|
+
modified_amino_acids = [
|
|
54
|
+
AminoAcid("Phospho-Serine", "Sep", "s"),
|
|
55
|
+
AminoAcid("Phospho-Threonine", "???", "t"),
|
|
56
|
+
AminoAcid("Phospho-Tyrosine", "???", "y"),
|
|
57
|
+
AminoAcid("Cystine", "???", "c"),
|
|
58
|
+
AminoAcid("Methionine sulfoxide", "???", "m"),
|
|
59
|
+
AminoAcid("Pyroglutamate", "???", "q"),
|
|
60
|
+
AminoAcid("Pyroglutamic acid", "???", "n"),
|
|
61
|
+
]
|
|
62
|
+
|
|
63
|
+
###
|
|
64
|
+
# Amino acid tokens which represent multiple canonical amino acids
|
|
65
|
+
###
|
|
66
|
+
wildcard_amino_acids = [
|
|
67
|
+
AminoAcid("Unknown", "Xaa", "X", contains=set(canonical_amino_acid_letters)),
|
|
68
|
+
AminoAcid("Asparagine-or-Aspartic-Acid", "Asx", "B", contains={"D", "N"}),
|
|
69
|
+
AminoAcid("Glutamine-or-Glutamic-Acid", "Glx", "Z", contains={"E", "Q"}),
|
|
70
|
+
AminoAcid("Leucine-or-Isoleucine", "Xle", "J", contains={"I", "L"})
|
|
71
|
+
]
|
|
72
|
+
|
|
73
|
+
###
|
|
74
|
+
# Canonical amino acids + wildcard tokens
|
|
75
|
+
###
|
|
76
|
+
|
|
77
|
+
canonical_amino_acids_with_unknown = canonical_amino_acids + wildcard_amino_acids
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
###
|
|
81
|
+
# Rare amino acids which aren't considered part of the core 20 "canonical"
|
|
82
|
+
###
|
|
83
|
+
|
|
84
|
+
rare_amino_acids = [
|
|
85
|
+
AminoAcid("Selenocysteine", "Sec", "U"),
|
|
86
|
+
AminoAcid("Pyrrolysine", "Pyl", "O"),
|
|
87
|
+
]
|
|
88
|
+
|
|
89
|
+
###
|
|
90
|
+
# Extended amino acids + wildcard tokens
|
|
91
|
+
###
|
|
92
|
+
|
|
93
|
+
extended_amino_acids = canonical_amino_acids + rare_amino_acids + wildcard_amino_acids
|
|
94
|
+
extended_amino_acid_letters = [
|
|
95
|
+
aa.letter for aa in extended_amino_acids
|
|
96
|
+
]
|
|
97
|
+
extended_amino_acids_with_unknown_names = [
|
|
98
|
+
aa.full_name for aa in extended_amino_acids
|
|
99
|
+
]
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
amino_acid_letter_indices = {
|
|
103
|
+
c: i for (i, c) in
|
|
104
|
+
enumerate(extended_amino_acid_letters)
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# Every ordered two-letter combination of the extended alphabet.
# Built from the letter codes (not the AminoAcid objects, whose "%s"
# formatting is their full repr). The second letter varies slowest,
# matching the original loop nesting.
amino_acid_letter_pairs = [
    "%s%s" % (x, y)
    for y in extended_amino_acid_letters
    for x in extended_amino_acid_letters
]
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
amino_acid_name_indices = {
|
|
116
|
+
aa_name: i for (i, aa_name)
|
|
117
|
+
in enumerate(extended_amino_acids_with_unknown_names)
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
amino_acid_pair_positions = {
|
|
121
|
+
pair: i for (i, pair) in enumerate(amino_acid_letter_pairs)
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
def index_to_full_name(idx):
    """
    Look up the full name of the amino acid at position `idx` of
    `extended_amino_acids`
    """
    amino_acid = extended_amino_acids[idx]
    return amino_acid.full_name
|
|
126
|
+
|
|
127
|
+
def index_to_short_name(idx):
    """
    Look up the short name of the amino acid at position `idx` of
    `extended_amino_acids`
    """
    amino_acid = extended_amino_acids[idx]
    return amino_acid.short_name
|
|
129
|
+
|
|
130
|
+
def index_to_letter(idx):
    """
    Look up the letter code of the amino acid at position `idx` of
    `extended_amino_acids`
    """
    # Return the letter itself, mirroring index_to_full_name and
    # index_to_short_name, rather than the whole AminoAcid object.
    return extended_amino_acids[idx].letter
|
|
132
|
+
|
|
133
|
+
def letter_to_index(x):
    """
    Map an amino acid letter code to its position in
    `amino_acid_letter_indices`; fails an assertion for unknown letters
    """
    is_known = x in amino_acid_letter_indices
    assert is_known, "Unknown amino acid: %s" % x
    position = amino_acid_letter_indices[x]
    return position
|
|
139
|
+
|
|
140
|
+
def peptide_to_indices(xs):
    """
    Convert each letter of the sequence `xs` to its position index,
    returning the indices as a list
    """
    indices = []
    for letter in xs:
        indices.append(amino_acid_letter_indices[letter])
    return indices
|
|
142
|
+
|
|
143
|
+
def letter_to_short_name(x):
    """
    Convert an amino acid letter code to its short name by way of its
    position index
    """
    position = letter_to_index(x)
    return index_to_short_name(position)
|
|
145
|
+
|
|
146
|
+
def peptide_to_short_amino_acid_names(xs):
    """
    Convert each letter of the sequence `xs` to the short name of the
    corresponding amino acid, returning the names as a list.

    Raises KeyError for a letter missing from `amino_acid_letter_indices`.
    """
    # Previously this returned the position indices (a duplicate of
    # peptide_to_indices); resolve each index to its short name instead.
    return [
        extended_amino_acids[amino_acid_letter_indices[x]].short_name
        for x in xs
    ]
|
|
148
|
+
|
|
149
|
+
def dict_to_amino_acid_matrix(d, alphabet=canonical_amino_acids):
    """
    Convert a nested dictionary of pairwise coefficients into a square
    float32 matrix ordered like `alphabet`.

    Parameters
    ----------
    d : dict
        Maps row letter -> dict mapping column letter -> value.
    alphabet : list of AminoAcid
        Determines which letters are used and the row/column order.

    Returns a numpy array where entry [i, j] is
    d[alphabet[i].letter][alphabet[j].letter].

    Raises KeyError if `d` lacks an entry for a letter of `alphabet`.
    """
    # Size the matrix by the alphabet that indexes it; `d` may carry
    # extra keys which would otherwise leave trailing all-zero rows
    # and columns.
    n_aa = len(alphabet)
    result_matrix = np.zeros((n_aa, n_aa), dtype="float32")
    for i, aa_row in enumerate(alphabet):
        d_row = d[aa_row.letter]
        for j, aa_col in enumerate(alphabet):
            result_matrix[i, j] = d_row[aa_col.letter]
    return result_matrix
|
|
158
|
+
|
|
@@ -0,0 +1,358 @@
|
|
|
1
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
2
|
+
# you may not use this file except in compliance with the License.
|
|
3
|
+
# You may obtain a copy of the License at
|
|
4
|
+
#
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
#
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
"""
|
|
14
|
+
Quantify amino acids by their physical/chemical properties
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from .amino_acid_alphabet import letter_to_index
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def aa_dict_to_positional_list(aa_property_dict):
    """
    Arrange the values of a letter -> value dictionary into a list of
    20 entries, placing each value at the position given by
    `letter_to_index`. Fails an assertion if an index falls outside
    0..19 or if any of the 20 slots is left unfilled.
    """
    value_list = [None for _ in range(20)]
    for letter, value in aa_property_dict.items():
        position = letter_to_index(letter)
        assert 0 <= position < 20
        value_list[position] = value
    assert not any(elt is None for elt in value_list), \
        "Missing amino acids in:\n%s" % aa_property_dict.keys()
    return value_list
|
|
31
|
+
|
|
32
|
+
def parse_property_table(table_string):
    """
    Parse a space-separated property table into a dictionary.

    Each non-blank line must start with a numeric value followed by a
    letter code; any further fields on the line are ignored. Returns a
    dict mapping letter -> float value. Fails an assertion when a line
    has fewer than two fields or a letter appears twice.
    """
    parsed = {}
    for raw_line in table_string.splitlines():
        stripped = raw_line.strip()
        if stripped:
            # Runs of spaces produce empty tokens, which are discarded.
            tokens = [tok for tok in stripped.split(" ") if tok.strip()]
            assert len(tokens) >= 2
            value, letter = tokens[0], tokens[1]
            assert letter not in parsed, "Repeated amino acid " + stripped
            parsed[letter] = float(value)
    return parsed
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
"""
|
|
48
|
+
Amino acids property tables copied from CRASP website
|
|
49
|
+
"""
|
|
50
|
+
|
|
51
|
+
hydropathy = parse_property_table("""
|
|
52
|
+
1.80000 A ALA
|
|
53
|
+
-4.5000 R ARG
|
|
54
|
+
-3.5000 N ASN
|
|
55
|
+
-3.5000 D ASP
|
|
56
|
+
2.50000 C CYS
|
|
57
|
+
-3.5000 Q GLN
|
|
58
|
+
-3.5000 E GLU
|
|
59
|
+
-0.4000 G GLY
|
|
60
|
+
-3.2000 H HIS
|
|
61
|
+
4.50000 I ILE
|
|
62
|
+
3.80000 L LEU
|
|
63
|
+
-3.9000 K LYS
|
|
64
|
+
1.90000 M MET
|
|
65
|
+
2.80000 F PHE
|
|
66
|
+
-1.6000 P PRO
|
|
67
|
+
-0.8000 S SER
|
|
68
|
+
-0.7000 T THR
|
|
69
|
+
-0.9000 W TRP
|
|
70
|
+
-1.3000 Y TYR
|
|
71
|
+
4.20000 V VAL
|
|
72
|
+
""")
|
|
73
|
+
|
|
74
|
+
volume = parse_property_table("""
|
|
75
|
+
91.5000 A ALA
|
|
76
|
+
202.0000 R ARG
|
|
77
|
+
135.2000 N ASN
|
|
78
|
+
124.5000 D ASP
|
|
79
|
+
118.0000 C CYS
|
|
80
|
+
161.1000 Q GLN
|
|
81
|
+
155.1000 E GLU
|
|
82
|
+
66.40000 G GLY
|
|
83
|
+
167.3000 H HIS
|
|
84
|
+
168.8000 I ILE
|
|
85
|
+
167.9000 L LEU
|
|
86
|
+
171.3000 K LYS
|
|
87
|
+
170.8000 M MET
|
|
88
|
+
203.4000 F PHE
|
|
89
|
+
129.3000 P PRO
|
|
90
|
+
99.10000 S SER
|
|
91
|
+
122.1000 T THR
|
|
92
|
+
237.6000 W TRP
|
|
93
|
+
203.6000 Y TYR
|
|
94
|
+
141.7000 V VAL
|
|
95
|
+
""")
|
|
96
|
+
|
|
97
|
+
polarity = parse_property_table("""
|
|
98
|
+
0.0000 A ALA
|
|
99
|
+
52.000 R ARG
|
|
100
|
+
3.3800 N ASN
|
|
101
|
+
40.700 D ASP
|
|
102
|
+
1.4800 C CYS
|
|
103
|
+
3.5300 Q GLN
|
|
104
|
+
49.910 E GLU
|
|
105
|
+
0.0000 G GLY
|
|
106
|
+
51.600 H HIS
|
|
107
|
+
0.1500 I ILE
|
|
108
|
+
0.4500 L LEU
|
|
109
|
+
49.500 K LYS
|
|
110
|
+
1.4300 M MET
|
|
111
|
+
0.3500 F PHE
|
|
112
|
+
1.5800 P PRO
|
|
113
|
+
1.6700 S SER
|
|
114
|
+
1.6600 T THR
|
|
115
|
+
2.1000 W TRP
|
|
116
|
+
1.6100 Y TYR
|
|
117
|
+
0.1300 V VAL
|
|
118
|
+
""")
|
|
119
|
+
|
|
120
|
+
pK_side_chain = parse_property_table("""
|
|
121
|
+
0.0000 A ALA
|
|
122
|
+
12.480 R ARG
|
|
123
|
+
0.0000 N ASN
|
|
124
|
+
3.6500 D ASP
|
|
125
|
+
8.1800 C CYS
|
|
126
|
+
0.0000 Q GLN
|
|
127
|
+
4.2500 E GLU
|
|
128
|
+
0.0000 G GLY
|
|
129
|
+
6.0000 H HIS
|
|
130
|
+
0.0000 I ILE
|
|
131
|
+
0.0000 L LEU
|
|
132
|
+
10.530 K LYS
|
|
133
|
+
0.0000 M MET
|
|
134
|
+
0.0000 F PHE
|
|
135
|
+
0.0000 P PRO
|
|
136
|
+
0.0000 S SER
|
|
137
|
+
0.0000 T THR
|
|
138
|
+
0.0000 W TRP
|
|
139
|
+
10.700 Y TYR
|
|
140
|
+
0.0000 V VAL
|
|
141
|
+
""")
|
|
142
|
+
|
|
143
|
+
prct_exposed_residues = parse_property_table("""
|
|
144
|
+
15.0000 A ALA
|
|
145
|
+
67.0000 R ARG
|
|
146
|
+
49.0000 N ASN
|
|
147
|
+
50.0000 D ASP
|
|
148
|
+
5.00000 C CYS
|
|
149
|
+
56.0000 Q GLN
|
|
150
|
+
55.0000 E GLU
|
|
151
|
+
10.0000 G GLY
|
|
152
|
+
34.0000 H HIS
|
|
153
|
+
13.0000 I ILE
|
|
154
|
+
16.0000 L LEU
|
|
155
|
+
85.0000 K LYS
|
|
156
|
+
20.0000 M MET
|
|
157
|
+
10.0000 F PHE
|
|
158
|
+
45.0000 P PRO
|
|
159
|
+
32.0000 S SER
|
|
160
|
+
32.0000 T THR
|
|
161
|
+
17.0000 W TRP
|
|
162
|
+
41.0000 Y TYR
|
|
163
|
+
14.0000 V VAL
|
|
164
|
+
""")
|
|
165
|
+
|
|
166
|
+
hydrophilicity = parse_property_table("""
|
|
167
|
+
-0.5000 A ALA
|
|
168
|
+
3.00000 R ARG
|
|
169
|
+
0.20000 N ASN
|
|
170
|
+
3.00000 D ASP
|
|
171
|
+
-1.0000 C CYS
|
|
172
|
+
0.20000 Q GLN
|
|
173
|
+
3.00000 E GLU
|
|
174
|
+
0.00000 G GLY
|
|
175
|
+
-0.5000 H HIS
|
|
176
|
+
-1.8000 I ILE
|
|
177
|
+
-1.8000 L LEU
|
|
178
|
+
3.00000 K LYS
|
|
179
|
+
-1.3000 M MET
|
|
180
|
+
-2.5000 F PHE
|
|
181
|
+
0.00000 P PRO
|
|
182
|
+
0.30000 S SER
|
|
183
|
+
-0.4000 T THR
|
|
184
|
+
-3.4000 W TRP
|
|
185
|
+
-2.3000 Y TYR
|
|
186
|
+
-1.5000 V VAL
|
|
187
|
+
""")
|
|
188
|
+
|
|
189
|
+
accessible_surface_area = parse_property_table("""
|
|
190
|
+
27.8000 A ALA
|
|
191
|
+
94.7000 R ARG
|
|
192
|
+
60.1000 N ASN
|
|
193
|
+
60.6000 D ASP
|
|
194
|
+
15.5000 C CYS
|
|
195
|
+
68.7000 Q GLN
|
|
196
|
+
68.2000 E GLU
|
|
197
|
+
24.5000 G GLY
|
|
198
|
+
50.7000 H HIS
|
|
199
|
+
22.8000 I ILE
|
|
200
|
+
27.6000 L LEU
|
|
201
|
+
103.000 K LYS
|
|
202
|
+
33.5000 M MET
|
|
203
|
+
25.5000 F PHE
|
|
204
|
+
51.5000 P PRO
|
|
205
|
+
42.0000 S SER
|
|
206
|
+
45.0000 T THR
|
|
207
|
+
34.7000 W TRP
|
|
208
|
+
55.2000 Y TYR
|
|
209
|
+
23.7000 V VAL
|
|
210
|
+
""")
|
|
211
|
+
|
|
212
|
+
local_flexibility = parse_property_table("""
|
|
213
|
+
705.42000 A ALA
|
|
214
|
+
1484.2800 R ARG
|
|
215
|
+
513.46010 N ASN
|
|
216
|
+
34.960000 D ASP
|
|
217
|
+
2412.5601 C CYS
|
|
218
|
+
1087.8300 Q GLN
|
|
219
|
+
1158.6600 E GLU
|
|
220
|
+
33.180000 G GLY
|
|
221
|
+
1637.1300 H HIS
|
|
222
|
+
5979.3701 I ILE
|
|
223
|
+
4985.7300 L LEU
|
|
224
|
+
699.69000 K LYS
|
|
225
|
+
4491.6602 M MET
|
|
226
|
+
5203.8599 F PHE
|
|
227
|
+
431.96000 P PRO
|
|
228
|
+
174.76000 S SER
|
|
229
|
+
601.88000 T THR
|
|
230
|
+
6374.0698 W TRP
|
|
231
|
+
4291.1001 Y TYR
|
|
232
|
+
4474.4199 V VAL
|
|
233
|
+
""")
|
|
234
|
+
|
|
235
|
+
accessible_surface_area_folded = parse_property_table("""
|
|
236
|
+
31.5000 A ALA
|
|
237
|
+
93.8000 R ARG
|
|
238
|
+
62.2000 N ASN
|
|
239
|
+
60.9000 D ASP
|
|
240
|
+
13.9000 C CYS
|
|
241
|
+
74.0000 Q GLN
|
|
242
|
+
72.3000 E GLU
|
|
243
|
+
25.2000 G GLY
|
|
244
|
+
46.7000 H HIS
|
|
245
|
+
23.0000 I ILE
|
|
246
|
+
29.0000 L LEU
|
|
247
|
+
110.300 K LYS
|
|
248
|
+
30.5000 M MET
|
|
249
|
+
28.7000 F PHE
|
|
250
|
+
53.7000 P PRO
|
|
251
|
+
44.2000 S SER
|
|
252
|
+
46.0000 T THR
|
|
253
|
+
41.7000 W TRP
|
|
254
|
+
59.1000 Y TYR
|
|
255
|
+
23.5000 V VAL
|
|
256
|
+
""")
|
|
257
|
+
|
|
258
|
+
refractivity = parse_property_table("""
|
|
259
|
+
4.34000 A ALA
|
|
260
|
+
26.6600 R ARG
|
|
261
|
+
13.2800 N ASN
|
|
262
|
+
12.0000 D ASP
|
|
263
|
+
35.7700 C CYS
|
|
264
|
+
17.5600 Q GLN
|
|
265
|
+
17.2600 E GLU
|
|
266
|
+
0.00000 G GLY
|
|
267
|
+
21.8100 H HIS
|
|
268
|
+
19.0600 I ILE
|
|
269
|
+
18.7800 L LEU
|
|
270
|
+
21.2900 K LYS
|
|
271
|
+
21.6400 M MET
|
|
272
|
+
29.4000 F PHE
|
|
273
|
+
10.9300 P PRO
|
|
274
|
+
6.35000 S SER
|
|
275
|
+
11.0100 T THR
|
|
276
|
+
42.5300 W TRP
|
|
277
|
+
31.5300 Y TYR
|
|
278
|
+
13.9200 V VAL
|
|
279
|
+
""")
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
# Average residue masses (Da), keyed by one-letter code.
# NOTE(review): the alanine and proline entries were 70.079 and 97.177,
# which look like digit typos; they are set here to the standard average
# residue masses 71.079 (C3H5NO) and 97.117 (C5H7NO). Confirm against
# the upstream table if exact provenance matters.
mass = parse_property_table("""
71.079 A ALA
156.188 R ARG
114.104 N ASN
115.089 D ASP
103.144 C CYS
128.131 Q GLN
129.116 E GLU
57.052 G GLY
137.142 H HIS
113.160 I ILE
113.160 L LEU
128.174 K LYS
131.198 M MET
147.177 F PHE
97.117 P PRO
87.078 S SER
101.105 T THR
186.213 W TRP
163.170 Y TYR
99.133 V VAL
""")
|
|
304
|
+
|
|
305
|
+
###
|
|
306
|
+
# Values copied from:
|
|
307
|
+
# "Solvent accessibility of AA in known protein structures"
|
|
308
|
+
# http://prowl.rockefeller.edu/aainfo/access.htm
|
|
309
|
+
###
|
|
310
|
+
"""
|
|
311
|
+
Solvent accessibility of AA in known protein structures
|
|
312
|
+
|
|
313
|
+
Figure 1.
|
|
314
|
+
|
|
315
|
+
S 0.70 0.20 0.10
|
|
316
|
+
T 0.71 0.16 0.13
|
|
317
|
+
A 0.48 0.35 0.17
|
|
318
|
+
G 0.51 0.36 0.13
|
|
319
|
+
P 0.78 0.13 0.09
|
|
320
|
+
C 0.32 0.54 0.14
|
|
321
|
+
D 0.81 0.09 0.10
|
|
322
|
+
E 0.93 0.04 0.03
|
|
323
|
+
Q 0.81 0.10 0.09
|
|
324
|
+
N 0.82 0.10 0.08
|
|
325
|
+
L 0.41 0.49 0.10
|
|
326
|
+
I 0.39 0.47 0.14
|
|
327
|
+
V 0.40 0.50 0.10
|
|
328
|
+
M 0.44 0.20 0.36
|
|
329
|
+
F 0.42 0.42 0.16
|
|
330
|
+
Y 0.67 0.20 0.13
|
|
331
|
+
W 0.49 0.44 0.07
|
|
332
|
+
K 0.93 0.02 0.05
|
|
333
|
+
R 0.84 0.05 0.11
|
|
334
|
+
H 0.66 0.19 0.15
|
|
335
|
+
"""
|
|
336
|
+
|
|
337
|
+
solvent_exposed_area = dict(
|
|
338
|
+
S=0.70,
|
|
339
|
+
T=0.71,
|
|
340
|
+
A=0.48,
|
|
341
|
+
G=0.51,
|
|
342
|
+
P=0.78,
|
|
343
|
+
C=0.32,
|
|
344
|
+
D=0.81,
|
|
345
|
+
E=0.93,
|
|
346
|
+
Q=0.81,
|
|
347
|
+
N=0.82,
|
|
348
|
+
L=0.41,
|
|
349
|
+
I=0.39,
|
|
350
|
+
V=0.40,
|
|
351
|
+
M=0.44,
|
|
352
|
+
F=0.42,
|
|
353
|
+
Y=0.67,
|
|
354
|
+
W=0.49,
|
|
355
|
+
K=0.93,
|
|
356
|
+
R=0.84,
|
|
357
|
+
H=0.66,
|
|
358
|
+
)
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
2
|
+
# you may not use this file except in compliance with the License.
|
|
3
|
+
# You may obtain a copy of the License at
|
|
4
|
+
#
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
#
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
from os.path import join
|
|
15
|
+
|
|
16
|
+
from .static_data import MATRIX_DIR
|
|
17
|
+
|
|
18
|
+
from .amino_acid_alphabet import dict_to_amino_acid_matrix
|
|
19
|
+
|
|
20
|
+
def parse_blosum_table(table, coeff_type=int, key_type='row'):
    """
    Parse a whitespace-delimited table of pairwise amino acid
    coefficients (e.g. BLOSUM50).

    The first non-comment, non-empty line holds the column labels; each
    following line holds a row label followed by one coefficient per
    column.

    Parameters
    ----------
    table : str
        Full text of the table. Lines starting with "#" are ignored.
    coeff_type : callable
        Applied to each coefficient string (default: int).
    key_type : str
        'row' gives {row: {col: coeff}}, 'pair' gives {(row, col): coeff},
        'pair_string' gives {row + col: coeff}.

    Raises ValueError if the header has fewer than 20 labels; fails an
    assertion if a data line has fewer than 21 fields or `key_type` is
    not one of the three known values.
    """
    # Comment lines are dropped first, then carriage returns are removed
    # from what remains, and finally empty lines are discarded.
    cleaned = [
        raw.replace("\r", "")
        for raw in table.split("\n")
        if not raw.startswith("#")
    ]
    rows = [row for row in cleaned if row]

    header = rows[0]
    labels = header.split()

    if len(labels) < 20:
        raise ValueError(
            "Expected 20+ amino acids but first line '%s' has %d fields" % (
                header,
                len(labels)))

    coeffs = {}
    for row in rows[1:]:
        fields = row.split()
        assert len(fields) >= 21, \
            "Expected AA and 20+ coefficients but '%s' has %d fields" % (
                row, len(fields))
        row_label = fields[0]
        for col, coeff_str in enumerate(fields[1:]):
            col_label = labels[col]
            coeff = coeff_type(coeff_str)
            if key_type == 'pair':
                coeffs[(row_label, col_label)] = coeff
                continue
            if key_type == 'pair_string':
                coeffs[row_label + col_label] = coeff
                continue
            assert key_type == 'row', "Unknown key type: %s" % key_type
            coeffs.setdefault(row_label, {})[col_label] = coeff
    return coeffs
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# Load each bundled BLOSUM table from MATRIX_DIR at import time. For
# every table two module-level objects are created: the nested
# dictionary returned by parse_blosum_table and the matrix built from
# it by dict_to_amino_acid_matrix (default alphabet).
with open(join(MATRIX_DIR, 'BLOSUM30'), 'r') as f:
    blosum30_dict = parse_blosum_table(f.read())
    blosum30_matrix = dict_to_amino_acid_matrix(blosum30_dict)

with open(join(MATRIX_DIR, 'BLOSUM50'), 'r') as f:
    blosum50_dict = parse_blosum_table(f.read())
    blosum50_matrix = dict_to_amino_acid_matrix(blosum50_dict)

with open(join(MATRIX_DIR, 'BLOSUM62'), 'r') as f:
    blosum62_dict = parse_blosum_table(f.read())
    blosum62_matrix = dict_to_amino_acid_matrix(blosum62_dict)
|
|
74
|
+
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
2
|
+
# you may not use this file except in compliance with the License.
|
|
3
|
+
# You may obtain a copy of the License at
|
|
4
|
+
#
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
#
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
from __future__ import print_function, division, absolute_import
|
|
14
|
+
|
|
15
|
+
from .amino_acid_alphabet import amino_acid_name_indices
|
|
16
|
+
|
|
17
|
+
# Chou-Fasman table of structural properties from
|
|
18
|
+
# http://prowl.rockefeller.edu/aainfo/chou.htm
|
|
19
|
+
chou_fasman_table = """
|
|
20
|
+
Alanine 142 83 66 0.06 0.076 0.035 0.058
|
|
21
|
+
Arginine 98 93 95 0.070 0.106 0.099 0.085
|
|
22
|
+
Aspartic Acid 101 54 146 0.147 0.110 0.179 0.081
|
|
23
|
+
Asparagine 67 89 156 0.161 0.083 0.191 0.091
|
|
24
|
+
Cysteine 70 119 119 0.149 0.050 0.117 0.128
|
|
25
|
+
Glutamic Acid 151 037 74 0.056 0.060 0.077 0.064
|
|
26
|
+
Glutamine 111 110 98 0.074 0.098 0.037 0.098
|
|
27
|
+
Glycine 57 75 156 0.102 0.085 0.190 0.152
|
|
28
|
+
Histidine 100 87 95 0.140 0.047 0.093 0.054
|
|
29
|
+
Isoleucine 108 160 47 0.043 0.034 0.013 0.056
|
|
30
|
+
Leucine 121 130 59 0.061 0.025 0.036 0.070
|
|
31
|
+
Lysine 114 74 101 0.055 0.115 0.072 0.095
|
|
32
|
+
Methionine 145 105 60 0.068 0.082 0.014 0.055
|
|
33
|
+
Phenylalanine 113 138 60 0.059 0.041 0.065 0.065
|
|
34
|
+
Proline 57 55 152 0.102 0.301 0.034 0.068
|
|
35
|
+
Serine 77 75 143 0.120 0.139 0.125 0.106
|
|
36
|
+
Threonine 83 119 96 0.086 0.108 0.065 0.079
|
|
37
|
+
Tryptophan 108 137 96 0.077 0.013 0.064 0.167
|
|
38
|
+
Tyrosine 69 147 114 0.082 0.065 0.114 0.125
|
|
39
|
+
Valine 106 170 50 0.062 0.048 0.028 0.053
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def parse_chou_fasman(table):
    """
    Parse a Chou-Fasman table whose rows start with an amino acid full
    name followed by numeric columns; the first three numeric columns
    are read as integer alpha-helix, beta-sheet and turn scores.

    Returns three dictionaries (alpha, beta, turn). Their keys are the
    values stored in `amino_acid_name_indices` for each amino acid name.
    Fails an assertion on an unknown name or if any dictionary does not
    end up with exactly 20 entries.
    """
    alpha_helix_score_dict = {}
    beta_sheet_score_dict = {}
    turn_score_dict = {}

    for row in table.split("\n"):
        tokens = [tok for tok in row.split(" ") if tok.strip()]
        if not tokens:
            continue

        name = tokens[0]
        if tokens[1] == 'Acid':
            # Two-word names ("Aspartic Acid", "Glutamic Acid"): merge
            # the words and shift so the scores again start at index 1.
            name = tokens[0] + " " + tokens[1]
            tokens = tokens[1:]

        assert name in amino_acid_name_indices, "Invalid amino acid name %s" % name
        aa_key = amino_acid_name_indices[name]
        alpha = int(tokens[1])
        beta = int(tokens[2])
        turn = int(tokens[3])
        alpha_helix_score_dict[aa_key] = alpha
        beta_sheet_score_dict[aa_key] = beta
        turn_score_dict[aa_key] = turn

    for score_dict in (
            alpha_helix_score_dict, beta_sheet_score_dict, turn_score_dict):
        assert len(score_dict) == 20
    return alpha_helix_score_dict, beta_sheet_score_dict, turn_score_dict
|
|
72
|
+
|
|
73
|
+
# Parse the embedded table once at import time and expose the three
# resulting score dictionaries as module-level names.
alpha_helix_score, beta_sheet_score, turn_score = \
    parse_chou_fasman(chou_fasman_table)
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
2
|
+
# you may not use this file except in compliance with the License.
|
|
3
|
+
# You may obtain a copy of the License at
|
|
4
|
+
#
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
#
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
|
|
15
|
+
def transform_peptide(peptide, property_dict):
    """
    Look up every element of `peptide` in `property_dict` and return
    the looked-up values as a numpy array
    """
    values = []
    for amino_acid in peptide:
        values.append(property_dict[amino_acid])
    return np.array(values)
|
|
17
|
+
|
|
18
|
+
def transform_peptides(peptides, property_dict):
    """
    Look up every element of every peptide in `property_dict` and
    return the values as a numpy array with one row per peptide
    """
    rows = []
    for peptide in peptides:
        rows.append([property_dict[aa] for aa in peptide])
    return np.array(rows)
|
|
22
|
+
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
2
|
+
# you may not use this file except in compliance with the License.
|
|
3
|
+
# You may obtain a copy of the License at
|
|
4
|
+
#
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
#
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
def hamming(p1, p2):
    """
    Count the positions at which `p1` and `p2` differ.

    Only the overlapping prefix is compared: when the inputs differ in
    length, the extra trailing elements of the longer one are ignored.
    Returns an int (0 for empty input).
    """
    # zip stops at the shorter input, which is the same truncation the
    # explicit min(len(...)) index loop performed.
    return sum(a != b for a, b in zip(p1, p2))
|
|
16
|
+
|
|
File without changes
|
|
Binary file
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# Copyright (c) 2014-2016. Mount Sinai School of Medicine
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
from sklearn.feature_extraction.text import CountVectorizer
|
|
17
|
+
from sklearn.preprocessing import normalize
|
|
18
|
+
|
|
19
|
+
class _AlphabetReducer(object):
    """
    Callable that rewrites a string by mapping each character through a
    reduced alphabet dictionary.

    Implemented as a module-level class rather than a lambda so that a
    CountVectorizer holding it as its preprocessor can be pickled.
    """
    def __init__(self, reduced_alphabet):
        self.reduced_alphabet = reduced_alphabet

    def __call__(self, s):
        # Raises KeyError for characters missing from the alphabet
        return "".join([self.reduced_alphabet[si] for si in s])


def make_count_vectorizer(reduced_alphabet, max_ngram):
    """
    Create a character n-gram CountVectorizer for amino acid strings.

    Parameters
    ------------

    reduced_alphabet : dict or None
        Optional mapping from each amino acid character to the character
        that replaces it before counting; None disables the replacement.

    max_ngram : int
        Longest n-gram to count; n-grams of length 1..max_ngram are used.

    Returns an unfitted CountVectorizer producing floating point counts.
    """
    if reduced_alphabet is None:
        preprocessor = None
    else:
        preprocessor = _AlphabetReducer(reduced_alphabet)

    return CountVectorizer(
        analyzer='char',
        ngram_range=(1, max_ngram),
        # `np.float` was only an alias of the builtin float and has been
        # removed from NumPy; float64 is the equivalent explicit dtype.
        dtype=np.float64,
        preprocessor=preprocessor)
|
|
30
|
+
|
|
31
|
+
class PeptideVectorizer(object):
    """
    Make n-gram frequency vectors from peptide sequences

    Parameters
    ------------

    max_ngram : int
        Longest character n-gram to count

    normalize_row : bool
        If True, rescale every row of the count matrix to unit L1 norm

    reduced_alphabet : dict or None
        Optional mapping applied to every character before counting

    training_already_reduced : bool
        If True, the strings given to `fit`/`fit_transform` are counted
        without applying `reduced_alphabet`, while strings given to
        `transform` are still passed through it.
    """
    def __init__(
            self,
            max_ngram=1,
            normalize_row=True,
            reduced_alphabet=None,
            training_already_reduced=False):
        self.reduced_alphabet = reduced_alphabet
        self.max_ngram = max_ngram
        self.normalize_row = normalize_row
        self.training_already_reduced = training_already_reduced
        # Created by fit_transform; None means "not fitted yet"
        self.count_vectorizer = None

    def __getstate__(self):
        return {
            'reduced_alphabet': self.reduced_alphabet,
            'count_vectorizer': self.count_vectorizer,
            'training_already_reduced': self.training_already_reduced,
            'normalize_row': self.normalize_row,
            'max_ngram': self.max_ngram,
        }

    def _normalize_if_requested(self, X):
        """Apply L1 row normalization when `normalize_row` is set."""
        if self.normalize_row:
            # `todense()` yields np.matrix, which scikit-learn's input
            # validation no longer accepts; pass a plain ndarray instead.
            X = normalize(np.asarray(X), norm='l1')
        return X

    def fit_transform(self, amino_acid_strings):
        """
        Learn the n-gram vocabulary from `amino_acid_strings` and return
        their (optionally row-normalized) dense count matrix.
        """
        self.count_vectorizer = \
            make_count_vectorizer(self.reduced_alphabet, self.max_ngram)

        if self.training_already_reduced:
            # Training data is already in the reduced alphabet: count it
            # without the preprocessor, then hand the learned vocabulary
            # to the vectorizer that will be used by `transform`.
            c = make_count_vectorizer(None, self.max_ngram)
            X = c.fit_transform(amino_acid_strings).todense()
            self.count_vectorizer.vocabulary_ = c.vocabulary_
        else:
            c = self.count_vectorizer
            X = c.fit_transform(amino_acid_strings).todense()

        return self._normalize_if_requested(X)

    def fit(self, amino_acid_strings):
        """Learn the n-gram vocabulary; returns self to allow chaining."""
        self.fit_transform(amino_acid_strings)
        return self

    def transform(self, amino_acid_strings):
        """
        Return the (optionally row-normalized) dense count matrix of
        `amino_acid_strings` using the previously learned vocabulary.

        Raises AssertionError if called before `fit`/`fit_transform`.
        """
        assert self.count_vectorizer is not None, \
            "Must call 'fit' before 'transform'"
        X = self.count_vectorizer.transform(amino_acid_strings).todense()
        return self._normalize_if_requested(X)
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# Copyright (c) 2014-2016. Mount Sinai School of Medicine
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from os.path import join
|
|
16
|
+
|
|
17
|
+
from .static_data import MATRIX_DIR
|
|
18
|
+
|
|
19
|
+
from .amino_acid_alphabet import dict_to_amino_acid_matrix
|
|
20
|
+
|
|
21
|
+
def read_pmbec_coefficients(
        key_type='row',
        verbose=True,
        filename=join(MATRIX_DIR, 'pmbec.mat')):
    """
    Parse a whitespace-delimited PMBEC coefficient matrix.

    The first non-blank line is a header naming the 20 column residues;
    each following non-blank line holds a row letter and 20 values.

    Parameters
    ------------

    filename : str
        Location of PMBEC coefficient matrix

    key_type : str
        'row' : every key is a single amino acid,
            which maps to a dictionary for that row
        'pair' : every key is a tuple of amino acids
        'pair_string' : every key is a string of two amino acid characters

    verbose : bool
        Print the header line and the list of residues parsed from it

    Returns a dictionary of float coefficients keyed according to
    `key_type`.

    Raises AssertionError for an unknown `key_type`, a header that does
    not contain exactly 20 single-character residues, or a row that does
    not contain a letter followed by 20 values.
    """
    d = {}
    if key_type == 'row':
        def add_pair(row_letter, col_letter, value):
            d.setdefault(row_letter, {})[col_letter] = value
    elif key_type == 'pair':
        def add_pair(row_letter, col_letter, value):
            d[(row_letter, col_letter)] = value
    else:
        assert key_type == 'pair_string', \
            "Invalid dictionary key type: %s" % key_type

        def add_pair(row_letter, col_letter, value):
            d["%s%s" % (row_letter, col_letter)] = value

    with open(filename, 'r') as f:
        # Ignore empty and whitespace-only lines
        lines = [line for line in f.read().split('\n') if line.strip()]

    header = lines[0]
    if verbose:
        print(header)
    # str.split() with no argument splits on any run of whitespace, so
    # the tokens can never themselves be spaces or tabs.
    residues = [x for x in header.split() if len(x) == 1]
    assert len(residues) == 20
    if verbose:
        print(residues)
    for line in lines[1:]:
        # Split rows the same way as the header so tab-delimited or
        # multiply-spaced rows are accepted too.
        cols = line.split()
        assert len(cols) == 21, "Expected 20 values + letter, got %s" % cols
        row_letter = cols[0]
        for col_letter, col in zip(residues, cols[1:]):
            add_pair(row_letter, col_letter, float(col))
    return d
|
|
84
|
+
|
|
85
|
+
# dictionary of PMBEC coefficient accessed like pmbec_dict["V"]["R"]
# NOTE(review): read_pmbec_coefficients defaults to verbose=True, so this
# call prints the header and residue list whenever the module is imported.
pmbec_dict = read_pmbec_coefficients(key_type="row")
# Same coefficients converted by dict_to_amino_acid_matrix
pmbec_matrix = dict_to_amino_acid_matrix(pmbec_dict)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# Copyright (c) 2014-2018. Mount Sinai School of Medicine
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""
|
|
16
|
+
Amino acid groupings from
|
|
17
|
+
'Reduced amino acid alphabets improve the sensitivity...' by
|
|
18
|
+
Peterson, Kondev, et al.
|
|
19
|
+
http://www.rpgroup.caltech.edu/publications/Peterson2008.pdf
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def dict_from_list(groups):
    """
    Build a reduced-alphabet mapping from groups of amino acids.

    Parameters
    ------------

    groups : list of str
        Each string lists the amino acid letters belonging to one group

    Returns a dictionary mapping every letter to the first letter of its
    group, which serves as the group's representative. If a letter occurs
    in several groups, the last group wins. Empty groups contribute
    nothing.
    """
    return {aa: group[0] for group in groups for aa in group}
|
|
29
|
+
|
|
30
|
+
# Each alphabet below maps every amino acid letter to the first letter of
# the group it belongs to (see dict_from_list).

gbmr4 = dict_from_list([
    "ADKERNTSQ",
    "YFLIVMCWH",
    "G",
    "P",
])

sdm12 = dict_from_list([
    "A", "D", "KER", "N", "TSQ", "YF",
    "LIVM", "C", "W", "H", "G", "P",
])

hsdm17 = dict_from_list([
    "A", "D", "KE", "R", "N", "T", "S", "Q", "Y",
    "F", "LIV", "M", "C", "W", "H", "G", "P",
])

# Other alphabets from
# http://bio.math-inf.uni-greifswald.de/viscose/html/alphabets.html

# hydrophilic vs. hydrophobic
hp2 = dict_from_list([
    "AGTSNQDEHRKP",
    "CMFILVWY",
])

murphy10 = dict_from_list([
    "LVIM", "C", "A", "G", "ST",
    "P", "FYW", "EDNQ", "KR", "H",
])

alex6 = dict_from_list([
    "C", "G", "P",
    "FYW", "AVILM", "STNQRHKDE",
])

aromatic2 = dict_from_list([
    "FHWY",
    "ADKERNTSQLIVMCGP",
])

hp_vs_aromatic = dict_from_list([
    "H",
    "CMILV",
    "FWY",
    "ADKERNTSQGP",
])
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
2
|
+
# you may not use this file except in compliance with the License.
|
|
3
|
+
# You may obtain a copy of the License at
|
|
4
|
+
#
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
#
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
from os.path import join
|
|
14
|
+
|
|
15
|
+
from .amino_acid_alphabet import canonical_amino_acid_letters, dict_to_amino_acid_matrix
|
|
16
|
+
from .static_data import MATRIX_DIR
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def parse_interaction_table(table, amino_acid_order="ARNDCQEGHILKMFPSTWYV"):
    """
    Parse a 20x20 table of pairwise amino acid coefficients.

    Parameters
    ------------

    table : str
        Text with one row of 20 whitespace-separated numbers per line.
        Blank lines and lines starting with '#' are ignored.

    amino_acid_order : str
        Amino acid letters labelling the rows (top to bottom) and the
        columns (left to right) of the table

    Returns a nested dictionary such that d[x][y] is the float found in
    the row of amino acid x and the column of amino acid y.

    Raises AssertionError if the table does not have exactly 20 data
    rows or if any row does not have exactly 20 values.
    """
    lines = [line.strip() for line in table.split("\n")]
    lines = [line for line in lines if line and not line.startswith("#")]
    assert len(lines) == 20, "Malformed amino acid interaction table"
    d = {}
    for i, line in enumerate(lines):
        # split() with no argument treats any run of spaces or tabs as a
        # single delimiter, so no manual whitespace collapsing is needed
        coeff_strings = line.split()
        assert len(coeff_strings) == 20, \
            "Malformed row in amino acid interaction table"
        x = amino_acid_order[i]
        d[x] = {
            amino_acid_order[j]: float(coeff_str)
            for j, coeff_str in enumerate(coeff_strings)
        }
    return d
|
|
39
|
+
|
|
40
|
+
def transpose_interaction_dict(d):
    """
    Swap the row and column roles of a nested interaction dictionary.

    The result satisfies transposed[x][y] == d[y][x] for every pair of
    letters in `canonical_amino_acid_letters`; `d` itself is not modified.
    Raises KeyError if `d` lacks an entry for any such pair.
    """
    return {
        row: {col: d[col][row] for col in canonical_amino_acid_letters}
        for row in canonical_amino_acid_letters
    }
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# Each table is parsed from its text file in MATRIX_DIR; the matrix form
# and the transposed ("B vs. A") variants are then derived from the
# parsed dictionary.

# Strand vs. Coil
with open(join(MATRIX_DIR, 'strand_vs_coil.txt'), 'r') as f:
    strand_vs_coil_dict = parse_interaction_table(f.read())
strand_vs_coil_array = dict_to_amino_acid_matrix(strand_vs_coil_dict)

# Coil vs. Strand
coil_vs_strand_dict = transpose_interaction_dict(strand_vs_coil_dict)
coil_vs_strand_array = dict_to_amino_acid_matrix(coil_vs_strand_dict)

# Helix vs. Strand
with open(join(MATRIX_DIR, 'helix_vs_strand.txt'), 'r') as f:
    helix_vs_strand_dict = parse_interaction_table(f.read())
helix_vs_strand_array = dict_to_amino_acid_matrix(helix_vs_strand_dict)

# Strand vs. Helix
strand_vs_helix_dict = transpose_interaction_dict(helix_vs_strand_dict)
strand_vs_helix_array = dict_to_amino_acid_matrix(strand_vs_helix_dict)

# Helix vs. Coil
with open(join(MATRIX_DIR, 'helix_vs_coil.txt'), 'r') as f:
    helix_vs_coil_dict = parse_interaction_table(f.read())
helix_vs_coil_array = dict_to_amino_acid_matrix(helix_vs_coil_dict)

# Coil vs. Helix
coil_vs_helix_dict = transpose_interaction_dict(helix_vs_coil_dict)
coil_vs_helix_array = dict_to_amino_acid_matrix(coil_vs_helix_dict)
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
2
|
+
# you may not use this file except in compliance with the License.
|
|
3
|
+
# You may obtain a copy of the License at
|
|
4
|
+
#
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
#
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
from os.path import dirname, realpath, join
|
|
15
|
+
|
|
16
|
+
# Directory containing this module, with symbolic links resolved
PACKAGE_DIR = dirname(realpath(__file__))
# The `matrices` subdirectory of the package directory
MATRIX_DIR = join(PACKAGE_DIR, 'matrices')
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: weirdo
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Peptide similarity measures, distance functions, and attempts to quantify the 'self' proteome
|
|
5
|
+
Home-page: https://github.com/pirl-unc/weirdo
|
|
6
|
+
Author: Alex Rubinsteyn
|
|
7
|
+
Author-email: alex.rubinsteyn@unc.edu
|
|
8
|
+
License: http://www.apache.org/licenses/LICENSE-2.0.html
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Environment :: Console
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
14
|
+
Classifier: Programming Language :: Python
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
|
|
19
|
+
<a href="https://travis-ci.org/openvax/weirdo">
|
|
20
|
+
<img src="https://travis-ci.org/openvax/weirdo.svg?branch=master" alt="Build Status" />
|
|
21
|
+
</a>
|
|
22
|
+
<a href="https://coveralls.io/github/openvax/weirdo?branch=master">
|
|
23
|
+
<img src="https://coveralls.io/repos/openvax/weirdo/badge.svg?branch=master&service=github" alt="Coverage Status" />
|
|
24
|
+
</a>
|
|
25
|
+
<a href="https://pypi.python.org/pypi/weirdo/">
|
|
26
|
+
<img src="https://img.shields.io/pypi/v/weirdo.svg?maxAge=1000" alt="PyPI" />
|
|
27
|
+
</a>
|
|
28
|
+
|
|
29
|
+
# weirdo
|
|
30
|
+
|
|
31
|
+
Metrics of immunological foreignness for candidate T-cell epitopes. An extension of the [pepdata](https://github.com/openvax/pepdata) library.
|
|
32
|
+
|
|
33
|
+
**Amino Acid Properties**
|
|
34
|
+
|
|
35
|
+
The `amino_acid` module contains a variety of physical/chemical properties for both single amino acid residues and interactions between pairs of residues.
|
|
36
|
+
|
|
37
|
+
Single residue feature tables are parsed into `StringTransformer` objects, which can be treated as dictionaries or will vectorize a string when you call their method `transform_string`.
|
|
38
|
+
|
|
39
|
+
Examples of single residue features:
|
|
40
|
+
|
|
41
|
+
- `hydropathy`
|
|
42
|
+
- `volume`
|
|
43
|
+
- `polarity`
|
|
44
|
+
- `pK_side_chain`
|
|
45
|
+
- `prct_exposed_residues`
|
|
46
|
+
- `hydrophilicity`
|
|
47
|
+
- `accessible_surface_area`
|
|
48
|
+
- `refractivity`
|
|
49
|
+
- `local_flexibility`
|
|
50
|
+
- `accessible_surface_area_folded`
|
|
51
|
+
- `alpha_helix_score` (Chou-Fasman)
|
|
52
|
+
- `beta_sheet_score` (Chou-Fasman)
|
|
53
|
+
- `turn_score` (Chou-Fasman)
|
|
54
|
+
|
|
55
|
+
Pairwise interaction tables are parsed into nested dictionaries, so that the interaction between amino acids `x` and `y` can be determined from `d[x][y]`.
|
|
56
|
+
|
|
57
|
+
Pairwise interaction dictionaries:
|
|
58
|
+
|
|
59
|
+
- `strand_vs_coil` (and its transpose `coil_vs_strand`)
|
|
60
|
+
- `helix_vs_strand` (and its transpose `strand_vs_helix`)
|
|
61
|
+
- `helix_vs_coil` (and its transpose `coil_vs_helix`)
|
|
62
|
+
- `blosum30`
|
|
63
|
+
- `blosum50`
|
|
64
|
+
- `blosum62`
|
|
65
|
+
|
|
66
|
+
There is also a function to parse the coefficients of the [PMBEC similarity matrix](http://www.biomedcentral.com/1471-2105/10/394), though this currently lives in the separate `pmbec` module.
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
setup.py
|
|
4
|
+
test/test_amino_acids.py
|
|
5
|
+
test/test_blosum.py
|
|
6
|
+
test/test_hamming.py
|
|
7
|
+
test/test_ngram.py
|
|
8
|
+
test/test_pmbec.py
|
|
9
|
+
weirdo/__init__.py
|
|
10
|
+
weirdo/amino_acid.py
|
|
11
|
+
weirdo/amino_acid_alphabet.py
|
|
12
|
+
weirdo/amino_acid_properties.py
|
|
13
|
+
weirdo/blosum.py
|
|
14
|
+
weirdo/chou_fasman.py
|
|
15
|
+
weirdo/common.py
|
|
16
|
+
weirdo/distances.py
|
|
17
|
+
weirdo/peptide_vectorizer.py
|
|
18
|
+
weirdo/pmbec.py
|
|
19
|
+
weirdo/reduced_alphabet.py
|
|
20
|
+
weirdo/residue_contact_energies.py
|
|
21
|
+
weirdo/static_data.py
|
|
22
|
+
weirdo.egg-info/PKG-INFO
|
|
23
|
+
weirdo.egg-info/SOURCES.txt
|
|
24
|
+
weirdo.egg-info/dependency_links.txt
|
|
25
|
+
weirdo.egg-info/requires.txt
|
|
26
|
+
weirdo.egg-info/top_level.txt
|
|
27
|
+
weirdo/matrices/__init__.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
Users/iskander/miniconda3/lib/python3.8/site-packages/weirdo-1.0.0-py3.8.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
weirdo
|