stcrpy 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +0 -0
- examples/egnn.py +425 -0
- stcrpy/__init__.py +5 -0
- stcrpy/tcr_datasets/__init__.py +0 -0
- stcrpy/tcr_datasets/tcr_graph_dataset.py +499 -0
- stcrpy/tcr_datasets/tcr_selector.py +0 -0
- stcrpy/tcr_datasets/tcr_structure_dataset.py +0 -0
- stcrpy/tcr_datasets/utils.py +350 -0
- stcrpy/tcr_formats/__init__.py +0 -0
- stcrpy/tcr_formats/tcr_formats.py +114 -0
- stcrpy/tcr_formats/tcr_haddock.py +556 -0
- stcrpy/tcr_geometry/TCRCoM.py +350 -0
- stcrpy/tcr_geometry/TCRCoM_LICENCE +168 -0
- stcrpy/tcr_geometry/TCRDock.py +261 -0
- stcrpy/tcr_geometry/TCRGeom.py +450 -0
- stcrpy/tcr_geometry/TCRGeomFiltering.py +273 -0
- stcrpy/tcr_geometry/__init__.py +0 -0
- stcrpy/tcr_geometry/reference_data/__init__.py +0 -0
- stcrpy/tcr_geometry/reference_data/dock_reference_1_imgt_numbered.pdb +6549 -0
- stcrpy/tcr_geometry/reference_data/dock_reference_2_imgt_numbered.pdb +6495 -0
- stcrpy/tcr_geometry/reference_data/reference_A.pdb +31 -0
- stcrpy/tcr_geometry/reference_data/reference_B.pdb +31 -0
- stcrpy/tcr_geometry/reference_data/reference_D.pdb +31 -0
- stcrpy/tcr_geometry/reference_data/reference_G.pdb +31 -0
- stcrpy/tcr_geometry/reference_data/reference_data.py +104 -0
- stcrpy/tcr_interactions/PLIPParser.py +147 -0
- stcrpy/tcr_interactions/TCRInteractionProfiler.py +433 -0
- stcrpy/tcr_interactions/TCRpMHC_PLIP_Model_Parser.py +133 -0
- stcrpy/tcr_interactions/__init__.py +0 -0
- stcrpy/tcr_interactions/utils.py +170 -0
- stcrpy/tcr_methods/__init__.py +0 -0
- stcrpy/tcr_methods/tcr_batch_operations.py +223 -0
- stcrpy/tcr_methods/tcr_methods.py +150 -0
- stcrpy/tcr_methods/tcr_reformatting.py +18 -0
- stcrpy/tcr_metrics/__init__.py +2 -0
- stcrpy/tcr_metrics/constants.py +39 -0
- stcrpy/tcr_metrics/tcr_interface_rmsd.py +237 -0
- stcrpy/tcr_metrics/tcr_rmsd.py +179 -0
- stcrpy/tcr_ml/__init__.py +0 -0
- stcrpy/tcr_ml/geometry_predictor.py +3 -0
- stcrpy/tcr_processing/AGchain.py +89 -0
- stcrpy/tcr_processing/Chemical_components.py +48915 -0
- stcrpy/tcr_processing/Entity.py +301 -0
- stcrpy/tcr_processing/Fragment.py +58 -0
- stcrpy/tcr_processing/Holder.py +24 -0
- stcrpy/tcr_processing/MHC.py +449 -0
- stcrpy/tcr_processing/MHCchain.py +149 -0
- stcrpy/tcr_processing/Model.py +37 -0
- stcrpy/tcr_processing/Select.py +145 -0
- stcrpy/tcr_processing/TCR.py +532 -0
- stcrpy/tcr_processing/TCRIO.py +47 -0
- stcrpy/tcr_processing/TCRParser.py +1230 -0
- stcrpy/tcr_processing/TCRStructure.py +148 -0
- stcrpy/tcr_processing/TCRchain.py +160 -0
- stcrpy/tcr_processing/__init__.py +3 -0
- stcrpy/tcr_processing/annotate.py +480 -0
- stcrpy/tcr_processing/utils/__init__.py +0 -0
- stcrpy/tcr_processing/utils/common.py +67 -0
- stcrpy/tcr_processing/utils/constants.py +367 -0
- stcrpy/tcr_processing/utils/region_definitions.py +782 -0
- stcrpy/utils/__init__.py +0 -0
- stcrpy/utils/error_stream.py +12 -0
- stcrpy-1.0.0.dist-info/METADATA +173 -0
- stcrpy-1.0.0.dist-info/RECORD +68 -0
- stcrpy-1.0.0.dist-info/WHEEL +5 -0
- stcrpy-1.0.0.dist-info/licenses/LICENCE +28 -0
- stcrpy-1.0.0.dist-info/licenses/stcrpy/tcr_geometry/TCRCoM_LICENCE +168 -0
- stcrpy-1.0.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,480 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Created on 10 May 2017
|
|
3
|
+
@author: leem
|
|
4
|
+
|
|
5
|
+
Implementation to call anarci (built-in to STCRDab) to annotate structures.
|
|
6
|
+
"""
|
|
7
|
+
import sys
|
|
8
|
+
import warnings
|
|
9
|
+
|
|
10
|
+
from Bio.PDB.Polypeptide import aa1, aa3 # to allow me to return "X" if not found.
|
|
11
|
+
|
|
12
|
+
# Lookup table from three-letter residue names (aa3) to one-letter codes (aa1).
to_one_letter_code = dict(zip(aa3, aa1))
|
|
13
|
+
|
|
14
|
+
# Import TCRDB's constants and common functions.
|
|
15
|
+
from .utils.constants import TCR_CHAINS
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def call_anarci(
    seq,
    allow={
        "B",
        "A",
        "D",
        "G",
        "GA1",
        "GA2",
        "GA1L",
        "GA2L",
        "GA",
        "GB",
        "B2M",
        "MH1",
        "MR1",
        "MR2",
    },
):
    """
    Number a sequence with ANARCI and strip gap positions from the result.

    @param seq: An amino acid sequence that you wish to number.
    @type seq: C{str}
    @param allow: Chain-type labels forwarded to ANARCI's ``allow`` argument; the
                  returned chain type is also checked against this collection.

    @return: ``(numbering, chain_type, germline_info)``.
             - chain type in ``allow`` and not containing "MR": ``numbering`` is a
               list of ``(position, aa)`` pairs with "-" entries removed.
             - chain type one of "BA", "GD", "AB", "DG": ``numbering`` is a list
               of such lists, one per element of ANARCI's numbering.
             - anything else (including no numbering): ``(False, False, False)``.
    """
    # Imported lazily so that the module can be loaded without ANARCI installed.
    from anarci import number as anarci_number

    def _without_gaps(numbered):
        # Keep only positions that carry a residue.
        return [(position, aa) for position, aa in numbered if aa != "-"]

    numbering, chain_type, germline_info = anarci_number(
        seq, allow=allow, assign_germline=True
    )

    if not numbering:
        return False, False, False

    if "MR" not in chain_type and chain_type in allow:
        return _without_gaps(numbering), chain_type, germline_info

    if chain_type in ("BA", "GD", "AB", "DG"):
        per_domain = [_without_gaps(domain) for domain in numbering]
        return per_domain, chain_type, germline_info

    return False, False, False
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def annotate(chain):
    """
    Number the residues of a chain object and map the numbering onto its residue ids.

    The chain's sequence is extracted with extract_sequence, numbered with
    call_anarci, and then aligned back to the original residue ids.

    @param chain: chain object accepted by extract_sequence.

    @return: ``(aligned_numbering, chain_type, germline_info, scTCR)`` where
             - ``aligned_numbering`` is whatever the alignment helper returned:
               a two-element list of dictionaries for chain types "BA"/"GD",
               otherwise a dictionary keyed by original residue id, or False
               when there was no numbering;
             - ``chain_type`` and ``germline_info`` are passed through from
               call_anarci (False when numbering failed);
             - ``scTCR`` is True only for chain types "BA"/"GD".
    """
    sequence_list, sequence_str = extract_sequence(chain)
    numbering, chain_type, germline_info = call_anarci(sequence_str)

    # Canonicalise the label by sorting its characters in descending order,
    # e.g. "AB" -> "BA", "CD1" -> "DC1", "MR1" -> "RM1".
    chtype = "".join(sorted(chain_type, reverse=True)) if chain_type else False

    scTCR = chtype in ("BA", "GD")
    if scTCR:
        # Two domains in one chain: keep one numbering dictionary per domain.
        aligned_numbering = align_scTCR_numbering(
            numbering, sequence_list, sequence_str
        )
    elif chtype in ("DC1", "RM1"):
        # CD1/MR1: reuse the two-domain alignment, merge both dictionaries
        # into one, then fill the numbering gaps between them.
        domain_numberings = align_scTCR_numbering(
            numbering, sequence_list, sequence_str
        )
        domain_numberings[0].update(domain_numberings[1])
        aligned_numbering = cleanup_scTCR_numbering(
            domain_numberings[0], sequence_list
        )
    else:
        # Single domain: align the original residue ids to the numbering.
        aligned_numbering = align_numbering(numbering, sequence_list)

    return aligned_numbering, chain_type, germline_info, scTCR
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def extract_sequence(
    chain, selection=False, return_warnings=False, ignore_hets=False, backbone=False
):
    """
    Get the amino acid sequence of the chain.

    Residues are read in the order returned by ``chain.get_list()``.

    Hetero residues (``residue.id[0] != " "``) are handled as follows:
      - residue name not in to_one_letter_code: skipped silently;
      - residue name known and ``ignore_hets`` set: skipped, and a warning is
        either collected (``return_warnings``) or written to stderr;
      - residue name known and ``ignore_hets`` not set: kept in the sequence.

    @param chain: object providing ``get_list()`` over its residues.
    @param selection: optional object with an ``accept(residue)`` method; residues
                      it rejects are left out.
    @param return_warnings: Flag to return the list of collected warnings as a third value.
    @param ignore_hets: Flag to drop hetero residues that map to a standard amino acid.
    @param backbone: When set, residues lacking any of the N, C, CA, O atoms are
                     written as "-" instead of their one-letter code.
    @return: ``(sequence_list, sequence_str)`` where ``sequence_list`` holds
             ``(residue.id, letter)`` tuples and ``sequence_str`` is the joined
             letters; with ``return_warnings`` a third element, the warning
             list, is appended.
    """
    sequence_list = []
    collected_warnings = []

    for residue in chain.get_list():
        if residue.id[0] != " ":
            # Hetero residue: only amino acids we can translate are considered.
            if residue.get_resname() not in to_one_letter_code:
                continue
            if ignore_hets:
                details = (
                    residue.get_resname(),
                    str(residue.id[1]) + residue.id[2].strip(),
                    residue.parent.id,
                )
                if return_warnings:
                    collected_warnings.append(
                        "Warning: HETATM residue %s at position %s (PDB numbering) found in chain %s.\n"
                        "Not including it in structure's sequence." % details
                    )
                else:
                    sys.stderr.write(
                        "Warning: HETATM residue %s position %s (PDB numbering) found in chain %s.\n"
                        "Not including it in structure's sequence.\n" % details
                    )
                continue

        if selection and not selection.accept(residue):
            continue

        atom_names = list(residue.child_dict.keys())
        has_full_backbone = all(
            atom in atom_names for atom in ("N", "C", "CA", "O")
        )

        if backbone and not has_full_backbone:
            # Backbone requested but incomplete: mark the position as a gap.
            letter = "-"
        else:
            # Either backbone is complete or we do not care: use the one-letter
            # code, falling back to "X" for unknown residue names.
            letter = to_one_letter_code.get(residue.get_resname(), "X")
        sequence_list.append((residue.id, letter))

    sequence_str = "".join([entry[1] for entry in sequence_list])
    if return_warnings:
        return sequence_list, sequence_str, collected_warnings
    return sequence_list, sequence_str
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def interpret(x):
    """
    Split an annotation string into ``(number, insertion_code)``.

    The first character must be a member of TCR_CHAINS. The remainder is either
    a plain integer (insertion code " ") or an integer followed by a single
    insertion character, e.g. "<chain>100A" -> ``(100, "A")``.

    @raise AssertionError: if the first character is not in TCR_CHAINS.
    @raise ValueError: if the numeric part cannot be parsed either way.
    """
    assert x[0] in TCR_CHAINS, x
    remainder = x[1:]
    try:
        return (int(remainder), " ")
    except ValueError:
        # Last character is taken as the insertion code.
        return (int(remainder[:-1]), remainder[-1])
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def align_numbering(numbering, sequence_list, alignment_dict=None):
    """
    Align the sequence that has been numbered to the sequence you input.

    The numbered sequence should be "in" the input sequence. If it is not,
    supply an alignment dictionary (align the sequences and use
    get_alignment_dict(ali1, ali2)).

    @param numbering: list of ``(position, aa)`` pairs; ``position[0]`` of the
                      last entry is used to continue numbering past the end.
    @param sequence_list: list of ``(residue_id, aa)`` pairs from the structure.
    @param alignment_dict: optional mapping from index in ``sequence_list`` to
                           index in ``numbering``. When omitted or empty it is
                           computed with pairwise_alignment / get_alignment_dict.

    @return: dictionary keyed by original residue id. Residues before the first
             aligned one map to "", aligned residues map to their numbering
             position, and residues after the aligned stretch map to
             ``(n, " ")`` with ``n`` counting up from the last numbered
             position + 1. Returns False when ``numbering`` is falsy.

    @raise Exception: if the alignment could not be computed (the underlying
                      error is chained as the cause).
    @raise AssertionError: if an aligned residue follows an unaligned trailing
                           residue, or aligned letters disagree.
    """
    if not numbering:
        return False

    numbered_sequence = "".join(r[1] for r in numbering)
    input_sequence = "".join(r[1] for r in sequence_list)

    if not alignment_dict:
        try:
            numbered_sequence_ali, input_sequence_ali = pairwise_alignment(
                numbered_sequence, input_sequence
            )
            alignment_dict = get_alignment_dict(
                input_sequence_ali, numbered_sequence_ali
            )
        except Exception as err:
            # Keep the original failure attached so it is not lost on re-raise.
            raise Exception(
                "Could not align numbered sequence to aligned sequence:"
                + " "
                + str(numbered_sequence)
                + " "
                + str(input_sequence)
            ) from err

    aligned_numbering = {}
    n = -1  # next number to hand out after the aligned stretch; -1 = not started
    after_flag = False  # set once we have stepped past the aligned stretch
    for i, letter in enumerate(input_sequence):
        if i in alignment_dict:
            # Inside the numbered stretch.
            if after_flag:
                raise AssertionError(
                    "Extra residue in structure than expected from provided sequence"
                )
            numbered_index = alignment_dict[i]
            if letter != numbered_sequence[numbered_index]:
                raise AssertionError("alignment dictionary failed")
            aligned_numbering[sequence_list[i][0]] = numbering[numbered_index][0]
            n = numbering[-1][0][0] + 1
        elif n > -1:
            # After the numbered stretch: continue counting upwards.
            after_flag = True
            aligned_numbering[sequence_list[i][0]] = (n, " ")
            n += 1
        else:
            # Before the numbered stretch: no number assigned.
            aligned_numbering[sequence_list[i][0]] = ""

    return aligned_numbering
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def align_scTCR_numbering(numbering, sequence_list, sequence_str):
    """
    Align the sequence that has been numbered to a scTCR structure.

    Each element of ``numbering`` is aligned separately against the full
    structure sequence, and only the residues that align are recorded.

    @param numbering: numbered list of residues; this is usually a two-element
                      list/tuple, each element a list of ``(position, aa)`` pairs.
    @param sequence_list: list of residues (e.g. from a structure) in its original numbering
    @param sequence_str: string form of sequence_list

    @return: a two-element list of dictionaries, one per numbered domain, each
             mapping original residue id -> numbering position. Returns False
             when ``numbering`` is falsy.

    @raise Exception: if a domain could not be aligned (the underlying error is
                      chained as the cause).
    @raise AssertionError: if aligned letters disagree.
    """
    if not numbering:
        return False

    numbered_sequence = ["".join(r[1] for r in n) for n in numbering]
    input_sequence = sequence_str

    # One dictionary per domain; only two domains are supported.
    aligned_numbering = [{}, {}]

    for ii, a_sequence in enumerate(numbered_sequence):
        # Align this domain's sequence into the target structure sequence.
        try:
            a_sequence_ali, input_sequence_ali = pairwise_alignment(
                a_sequence, input_sequence
            )
            alignment_dict = get_alignment_dict(input_sequence_ali, a_sequence_ali)
        except Exception as err:
            raise Exception(
                "Could not align numbered sequence to aligned sequence"
                + "\n"
                + str(numbered_sequence)
                + "\n"
                + str(input_sequence)
            ) from err

        # Only aligned positions are visited, so residues outside this domain
        # are simply left out of its dictionary.
        for i, numbered_index in alignment_dict.items():
            if input_sequence[i] != numbered_sequence[ii][numbered_index]:
                raise AssertionError("alignment dictionary failed")
            aligned_numbering[ii][sequence_list[i][0]] = numbering[ii][
                numbered_index
            ][0]

    return aligned_numbering
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def cleanup_scTCR_numbering(numbering_dict, sequence_list):
    """
    Close numbering gaps that lie between two numbered residues.

    The scTCR numbering method, while useful for sequences with two domains,
    can leave unnumbered residues in between (e.g. CD1 molecule of 4lhu).
    Those residues are given consecutive numbers continuing from the nearest
    numbered residue on their left, so that they don't move around during
    structural parsing.

    Residues that have no numbered residue to their left are left unnumbered.
    The search for a left neighbour stops at the start of the sequence; it
    must not wrap around to the end of the list via negative indexing.

    @param numbering_dict: numbered dictionary from align_scTCR_numbering
                           (original residue id -> numbering position). It is
                           modified in place.
    @param sequence_list : sequence list from the structure for alignment.

    @return: the same ``numbering_dict`` object, with gaps filled.
    """
    if not numbering_dict:
        # Nothing is numbered, so there are no gaps to bound.
        return numbering_dict

    positions = [p[0] for p in sequence_list]

    # Index in sequence_list of the largest key in numbering_dict; nothing
    # beyond it is touched.
    lastidx = positions.index(max(numbering_dict))

    for index in range(1, lastidx + 1):
        if positions[index] in numbering_dict:
            continue

        # Nearest numbered residue to the left, searching no further than index 0.
        lidx = next(
            (i for i in range(index - 1, -1, -1) if positions[i] in numbering_dict),
            None,
        )
        if lidx is None:
            # Leading unnumbered residue: not a gap between domains.
            continue
        lval = numbering_dict[positions[lidx]]

        # Nearest numbered residue to the right; positions[lastidx] is numbered,
        # so this always succeeds.
        ridx = next(
            i for i in range(index + 1, lastidx + 1) if positions[i] in numbering_dict
        )

        # Fill every key strictly between the two bounds.
        for k, missing_key in enumerate(positions[lidx + 1 : ridx]):
            numbering_dict[missing_key] = (lval[0] + k + 1, " ")

    return numbering_dict
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def get_alignment_dict(ali1, ali2):
    """
    Get a dictionary which tells you the index in sequence 2 that should align with the index in sequence 1 (key)

    ali1:  ----bcde-f---       seq1: bcdef
    ali2:  ---abcd--f---       seq2: abcdf

    alignment_dict={
        0:1,
        1:2,
        2:3,
        4:4
        }

    If the index is aligned with a gap do not include in the dictionary.
    e.g  1 in alignment_dict  --> True
    e.g  3 in alignment_dict  --> False

    @param ali1: first aligned sequence, "-" marking gaps.
    @param ali2: second aligned sequence, same length as ``ali1``.
    @return: dictionary mapping ungapped index in sequence 1 to ungapped index
             in sequence 2 for every column where both have a residue.
    @raise AssertionError: if the two aligned sequences differ in length.
    """
    # Raised explicitly so the check is not stripped when running with -O.
    if len(ali1) != len(ali2):
        raise AssertionError(
            "aligned sequences must be same lengths (including gaps)"
        )

    alignment_dict = {}
    p1 = -1  # index of the last residue seen in sequence 1
    p2 = -1  # index of the last residue seen in sequence 2
    for char1, char2 in zip(ali1, ali2):
        has1 = char1 != "-"
        has2 = char2 != "-"
        if has1:
            p1 += 1
        if has2:
            p2 += 1
        if has1 and has2:
            alignment_dict[p1] = p2
    return alignment_dict
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def pairwise_alignment(seq1, seq2, exact=False):
    """
    Align two sequences, trying a substring match first and Biopython second.

    @param seq1: first sequence.
    @param seq2: second sequence.
    @param exact: when set, use ``globalms`` with explicit scores instead of
                  ``globalxx`` for the Biopython fallback.
    @return: the two aligned sequences as a pair, or ``(False, False)`` if
             Biopython returned no alignment.
    """
    # Importing Bio.pairwise2 emits a deprecation warning; silence it.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        from Bio.pairwise2 import align

    # Cheap path: one sequence is contained in the other.
    quick_1, quick_2 = easy_alignment(seq1, seq2)
    if quick_1:
        return quick_1, quick_2

    if exact:
        # Positional arguments are match=1, mismatch=0, then -1 and -1.001 for
        # the gap penalties.
        # NOTE(review): the earlier comment described the gap penalties as
        # "opening -1.001, extension -1", i.e. the other way round from the
        # values passed here -- confirm which was intended.
        candidates = align.globalms(seq1, seq2, 1, 0, -1, -1.001)
    else:
        candidates = align.globalxx(seq1, seq2)

    if not candidates:
        return False, False

    # Use the first alignment reported.
    best = candidates[0]
    return best[0], best[1]
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
def easy_alignment(seq1, seq2):
    """
    Function to align two sequences by checking if one is in the other.
    This function will conserve gaps.

    @param seq1: first sequence (string).
    @param seq2: second sequence (string).
    @return: ``(seq1_aligned, seq2_aligned)`` where the shorter sequence is
             padded with "-" on both sides to the length of the longer one, or
             ``(False, False)`` when neither sequence contains the other.
    @raise AssertionError: if either argument is not a string.
    """
    # Raised explicitly so the check is not stripped when running with -O.
    if not (isinstance(seq1, str) and isinstance(seq2, str)):
        raise AssertionError("Sequences must be strings for easy_alignment")

    start = seq2.find(seq1)
    if start != -1:
        # seq1 sits inside seq2: pad seq1 to seq2's length.
        seq1_ali = "-" * start + seq1 + "-" * (len(seq2) - start - len(seq1))
        return seq1_ali, seq2

    start = seq1.find(seq2)
    if start != -1:
        # seq2 sits inside seq1: pad seq2 to seq1's length.
        seq2_ali = "-" * start + seq2 + "-" * (len(seq1) - start - len(seq2))
        return seq1, seq2_ali

    # Neither contains the other; the caller falls back to a real alignment.
    return False, False
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
def validate_sequence(seq):
    """
    Check that a sequence looks like a protein sequence.

    @param seq: sequence string to check.
    @return: True when the sequence is at most 10000 characters long and every
             upper-cased character is found in ``aa1``.
    @raise AssertionError: if the sequence is too long or contains a character
                           not in ``aa1``.
    """
    if len(seq) > 10000:
        raise AssertionError("Sequence too long.")

    upper_seq = seq.upper()
    for letter in upper_seq:
        if letter not in aa1:
            raise AssertionError(
                "Unknown amino acid letter found in sequence: " + upper_seq
            )
    return True
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
if __name__ == "__main__":
    # Running this module directly does nothing; it only provides functions.
    pass
|
|
File without changes
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import math
|
|
2
|
+
import warnings
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def fastcross(v, w):
    """
    Cross product of two 3-component vectors, written out component by component.

    The original author notes this as faster than NumPy's own cross product.
    Only indices 0, 1 and 2 of each argument are read.

    @return: ``np.array`` holding the three components of ``v x w``.
    """
    x_component = v[1] * w[2] - v[2] * w[1]
    y_component = v[2] * w[0] - v[0] * w[2]
    z_component = v[0] * w[1] - v[1] * w[0]
    return np.array([x_component, y_component, z_component])
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def fastnorm(A):
    """
    Euclidean norm of an iterable of numbers, computed with plain Python arithmetic.

    @return: ``math.sqrt`` of the sum of squared components (0.0 for an empty input).
    """
    squared_total = 0
    for component in A:
        squared_total = squared_total + component**2
    return math.sqrt(squared_total)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def identity(seq1, seq2, positions=None):
    """
    Find the matched sequence identity between two aligned sequences.

    Positions where either sequence has "-" are ignored, as are positions
    missing from either dictionary. The result is matches divided by the
    number of positions actually compared.

    @param seq1: Dictionary with key as the position and value as the single letter amino acid code. or an aligned list or string
    @param seq2: Dictionary with key as the position and value as the single letter amino acid code. or an aligned list or string
    @param positions: optional positions to compare. Only used when both
                      sequences are dictionaries; defaults to the union of
                      their keys. For lists/strings every index is compared and
                      this argument is ignored.
    @return: fraction of compared positions that match, or 0 when no position
             could be compared.
    @raise AssertionError: if non-dictionary inputs differ in length.
    """
    if isinstance(seq1, dict) and isinstance(seq2, dict):
        if not positions:
            positions = set(seq1.keys()) | set(seq2.keys())
    else:
        # Raised explicitly so the check is not stripped when running with -O.
        if len(seq1) != len(seq2):
            raise AssertionError("Use two aligned sequences.")
        positions = range(len(seq1))

    compared = 0
    matched = 0
    for p in positions:
        try:
            residue1 = seq1[p]
            residue2 = seq2[p]
        except KeyError:
            # Position absent from one of the dictionaries.
            continue

        if residue1 == "-" or residue2 == "-":
            continue

        compared += 1
        if residue1 == residue2:
            matched += 1

    if compared == 0:
        return 0
    return float(matched) / compared
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def angle(v1, v2):
    """
    Return the angle between two vectors, in radians.

    The cosine is computed as dot(v1, v2) / (|v1| * |v2|) and clamped to
    [-1, 1] before taking the arccosine. Rounding can push the ratio slightly
    outside that interval for (anti)parallel vectors; clamping makes parallel
    vectors give 0 and antiparallel vectors give pi instead of an invalid
    arccos input.

    @param v1: first vector (sequence of numbers).
    @param v2: second vector, same length as ``v1``.
    @return: angle as returned by ``np.arccos``.
    """
    num = np.dot(v1, v2)
    denom = fastnorm(v1) * fastnorm(v2)
    cosine = np.clip(num / denom, -1.0, 1.0)
    return np.arccos(cosine)
|