EntDetect 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- EntDetect/Jwalk/GridTools.py +567 -0
- EntDetect/Jwalk/PDBTools.py +532 -0
- EntDetect/Jwalk/SASDTools.py +543 -0
- EntDetect/Jwalk/SurfaceTools.py +150 -0
- EntDetect/Jwalk/__init__.py +19 -0
- EntDetect/Jwalk/naccess.config.txt +255 -0
- EntDetect/__init__.py +10 -0
- EntDetect/_logging.py +71 -0
- EntDetect/change_resolution.py +2361 -0
- EntDetect/clustering.py +2626 -0
- EntDetect/compare_sim2exp.py +1927 -0
- EntDetect/entanglement_features.py +478 -0
- EntDetect/gaussian_entanglement.py +2067 -0
- EntDetect/order_params.py +1048 -0
- EntDetect/resources/__init__.py +11 -0
- EntDetect/resources/__pycache__/__init__.cpython-311.pyc +0 -0
- EntDetect/resources/calc_K.pl +712 -0
- EntDetect/resources/calc_Q.pl +962 -0
- EntDetect/resources/pulchra +0 -0
- EntDetect/resources/shared_files/__init__.py +2 -0
- EntDetect/resources/shared_files/bt_contact_potential.dat +22 -0
- EntDetect/resources/shared_files/karanicolas_dihe_parm.dat +1600 -0
- EntDetect/resources/shared_files/kgs_contact_potential.dat +22 -0
- EntDetect/resources/shared_files/mj_contact_potential.dat +22 -0
- EntDetect/resources/stride +0 -0
- EntDetect/statistics.py +1344 -0
- EntDetect/utilities.py +201 -0
- entdetect-1.2.0.dist-info/METADATA +26 -0
- entdetect-1.2.0.dist-info/RECORD +45 -0
- entdetect-1.2.0.dist-info/WHEEL +5 -0
- entdetect-1.2.0.dist-info/entry_points.txt +11 -0
- entdetect-1.2.0.dist-info/licenses/LICENSE +674 -0
- entdetect-1.2.0.dist-info/top_level.txt +2 -0
- scripts/__init__.py +5 -0
- scripts/convert_cor_psf_to_pdb.py +103 -0
- scripts/run_Foldingpathway.py +162 -0
- scripts/run_MSM.py +152 -0
- scripts/run_OP_on_simulation_traj.py +194 -0
- scripts/run_change_resolution.py +63 -0
- scripts/run_compare_sim2exp.py +215 -0
- scripts/run_montecarlo.py +158 -0
- scripts/run_nativeNCLE.py +179 -0
- scripts/run_nonnative_entanglement_clustering.py +110 -0
- scripts/run_population_modeling.py +117 -0
- scripts/run_workflow4_nativeNCLE_batch.py +412 -0
|
@@ -0,0 +1,543 @@
|
|
|
1
|
+
# ===============================================================================
|
|
2
|
+
# This file is part of Jwalk (Python 3).
|
|
3
|
+
#
|
|
4
|
+
# Jwalk - A tool to calculate the solvent accessible surface distance (SASD)
|
|
5
|
+
# between crosslinked residues.
|
|
6
|
+
#
|
|
7
|
+
# Copyright 2016 Josh Bullock and Birkbeck College University of London.
|
|
8
|
+
#
|
|
9
|
+
# Jwalk is available under Public Licence.
|
|
10
|
+
# This software is made available under GPL V3
|
|
11
|
+
#
|
|
12
|
+
# Please cite your use of Jwalk in published work:
|
|
13
|
+
#
|
|
14
|
+
# J.Bullock, J. Schwab, K. Thalassinos, M. Topf (2016)
|
|
15
|
+
# The importance of non-accessible crosslinks and solvent accessible surface distance
|
|
16
|
+
# in modelling proteins with restraints from crosslinking mass spectrometry.
|
|
17
|
+
# Molecular and Cellular Proteomics (15) pp.2491-2500
|
|
18
|
+
#
|
|
19
|
+
# ===============================================================================
|
|
20
|
+
|
|
21
|
+
import math
|
|
22
|
+
import itertools
|
|
23
|
+
from collections import deque
|
|
24
|
+
from multiprocessing import Pool, freeze_support
|
|
25
|
+
|
|
26
|
+
def calculate_specific_SASD(single_crosslink, aa1_voxels, aa2_voxels, dens_map, aa1_CA, aa2_CA,
                            max_dist, vox):
    '''
    Breadth First Search of grid for one crosslink. For general info on algorithm see:
    https://en.wikipedia.org/wiki/Breadth-first_search

    Returns dictionary containing the solvent accessible surface distance between a
    specific starting res and a specific ending res. The dictionary is empty when
    none of the ending voxels was reached by the search.

    {start res, end res, length : voxel path of sasd}

    The path starts with the C-alpha voxel of the start residue and ends with the
    C-alpha voxel of the end residue. The searched part of the length is built from
    steps of *vox*; the two C-alpha legs are added as plain grid-coordinate
    distances (they are not multiplied by *vox*).

    Arguments:

       *single_crosslink*
           start and end residue.
           start is key of aa1_voxels. aa1_voxels[start_residue] = all the starting
           voxels for that residue
       *aa1_voxels*
           dictionary containing starting voxels {start_residue : starting voxels}
       *aa2_voxels*
           dictionary containing ending voxels {end_residue : ending voxels}
       *dens_map*
           grid with solvent accessible surface; must provide x_size(), y_size(),
           z_size() and a fullMap indexed as fullMap[z][y][x]
       *aa1_CA*
           dictionary containing voxel of C-alpha
       *aa2_CA*
           dictionary containing voxel of C-alpha
       *max_dist*
           maximum distance BFS will search until
       *vox*
           number of angstroms per voxel

    '''

    start_residue = single_crosslink[0]
    end_residue = single_crosslink[1]

    specific_xl = {}

    # order of voxels to search: axis moves, then 2d diagonals, then 3d diagonals
    comb = [(+1, +0, +0), (-1, +0, +0),
            (+0, +1, +0), (+0, -1, +0),
            (+0, +0, +1), (+0, +0, -1),
            (+1, +0, +1), (-1, +0, +1),
            (+0, +1, +1), (+0, -1, +1),
            (+1, -1, +0), (-1, -1, +0),
            (+1, +1, +0), (-1, +1, +0),
            (+1, +0, -1), (-1, +0, -1),
            (+0, +1, -1), (+0, -1, -1),
            (+1, +1, +1), (+1, -1, +1),
            (-1, +1, +1), (-1, -1, +1),
            (+1, +1, -1), (+1, -1, -1),
            (-1, +1, -1), (-1, -1, -1)]

    # distance of diagonal steps
    diag1 = math.sqrt((vox ** 2) * 2)  # 2d diagonal
    diag2 = math.sqrt((vox ** 2) * 3)  # 3d diagonal

    queue = deque()  # voxels waiting to be expanded (deque gives O(1) pops from the left)
    visited = {}     # voxel -> path (list of voxels) leading to it from a starting voxel
    distance = {}    # voxel -> distance from the starting voxel of its path

    # place starting voxels into queue and initialise visited and distance
    for j in aa1_voxels[start_residue]:
        queue.append([j[0], j[1], j[2]])
        visited[j[0], j[1], j[2]] = [[j[0], j[1], j[2]]]
        distance[j[0], j[1], j[2]] = 0

    x_size = dens_map.x_size()
    y_size = dens_map.y_size()
    z_size = dens_map.z_size()

    while queue:
        x_n, y_n, z_n = queue.popleft()
        if distance[x_n, y_n, z_n] > max_dist:
            continue  # voxels beyond max_dist are not expanded any further
        for c in comb:
            x_temp = x_n + c[0]
            y_temp = y_n + c[1]
            z_temp = z_n + c[2]
            if (x_temp, y_temp, z_temp) in visited:
                continue
            if not (0 <= x_temp < x_size and 0 <= y_temp < y_size and 0 <= z_temp < z_size):
                continue

            # the path to the new voxel is the path to its parent plus itself
            visited[x_temp, y_temp, z_temp] = visited[x_n, y_n, z_n] + [[x_temp, y_temp, z_temp]]

            if dens_map.fullMap[z_temp][y_temp][x_temp] <= 0:  # if the voxel is in empty space
                queue.append([x_temp, y_temp, z_temp])

            # step length depends on how many axes changed
            axes_moved = abs(c[0]) + abs(c[1]) + abs(c[2])
            if axes_moved == 3:
                step = diag2
            elif axes_moved == 2:
                step = diag1
            else:
                step = vox
            distance[x_temp, y_temp, z_temp] = distance[x_n, y_n, z_n] + step

    def _leg(p, q):
        # straight-line distance between two voxels, in grid coordinates
        return math.sqrt((p[0] - q[0]) ** 2 + (p[1] - q[1]) ** 2 + (p[2] - q[2]) ** 2)

    # now we have a full set of paths into empty space starting from start_residue
    # all stored in visited. Now need to extract the path to the specific residue
    shortest_distance = 9999  # sentinel: nothing found yet
    shortest_path = None

    for j in aa2_voxels[end_residue]:

        (x, y, z) = j

        if (x, y, z) not in visited:
            continue

        # Build a NEW list instead of editing visited[...] in place, so a voxel that
        # is listed more than once does not pick up the C-alpha voxels repeatedly.
        path = [aa1_CA[start_residue]] + visited[(x, y, z)] + [aa2_CA[end_residue]]

        # add the distance between starting/ending residue CA voxel and start/end voxel in path
        total = distance[(x, y, z)]
        total += _leg(path[0], path[1])
        total += _leg(path[-2], path[-1])

        # among equally long candidates the one listed last is kept
        if total <= shortest_distance:
            shortest_path = path
        if total < shortest_distance:
            shortest_distance = total

    # now adding shortest xl to the final list
    if shortest_distance != 9999:
        # start res, end res, length of xl = path of xl
        specific_xl[start_residue, end_residue, shortest_distance] = shortest_path

    return specific_xl
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def calculate_SASDs(start_residue, aa1_voxels, aa2_voxels, dens_map, aa1_CA, aa2_CA,
                    max_dist, vox):
    """
    Breadth First Search of grid. For general info on algorithm see:
    https://en.wikipedia.org/wiki/Breadth-first_search

    Returns dictionary containing solvent accessible surface distances between
    starting res and all possible ending res (every key of aa2_voxels other than
    start_residue that the search reached).

    {start res, end res, length : voxel path of sasd}

    The two residues in a key are ordered by chain, and by residue number when the
    chains are equal. Each path starts with the C-alpha voxel of start_residue and
    ends with the C-alpha voxel of the end residue. The searched part of the length
    is built from steps of *vox*; the two C-alpha legs are added as plain
    grid-coordinate distances (they are not multiplied by *vox*).

    Arguments:

       *start_residue*
           key of aa1_voxels. aa1_voxels[start_residue] = all the starting voxels
           for that residue
       *aa1_voxels*
           dictionary containing starting voxels {start_residue : starting voxels}
       *aa2_voxels*
           dictionary containing ending voxels {end_residue : ending voxels}
       *dens_map*
           grid with solvent accessible surface; must provide x_size(), y_size(),
           z_size() and a fullMap indexed as fullMap[z][y][x]
       *aa1_CA*
           dictionary containing voxel of C-alpha
       *aa2_CA*
           dictionary containing voxel of C-alpha
       *max_dist*
           maximum distance BFS will search until
       *vox*
           number of angstroms per voxel

    """

    sasds = {}

    # order of voxels to search: axis moves, then 2d diagonals, then 3d diagonals
    comb = [(+1, +0, +0), (-1, +0, +0),
            (+0, +1, +0), (+0, -1, +0),
            (+0, +0, +1), (+0, +0, -1),
            (+1, +0, +1), (-1, +0, +1),
            (+0, +1, +1), (+0, -1, +1),
            (+1, -1, +0), (-1, -1, +0),
            (+1, +1, +0), (-1, +1, +0),
            (+1, +0, -1), (-1, +0, -1),
            (+0, +1, -1), (+0, -1, -1),
            (+1, +1, +1), (+1, -1, +1),
            (-1, +1, +1), (-1, -1, +1),
            (+1, +1, -1), (+1, -1, -1),
            (-1, +1, -1), (-1, -1, -1)]

    # distance of diagonal steps
    diag1 = math.sqrt((vox ** 2) * 2)  # 2d diagonal
    diag2 = math.sqrt((vox ** 2) * 3)  # 3d diagonal

    queue = deque()  # voxels waiting to be expanded (deque gives O(1) pops from the left)
    visited = {}     # voxel -> path (list of voxels) leading to it from a starting voxel
    distance = {}    # voxel -> distance from the starting voxel of its path

    # place starting voxels into queue and initialise visited and distance
    for j in aa1_voxels[start_residue]:
        queue.append([j[0], j[1], j[2]])
        visited[j[0], j[1], j[2]] = [[j[0], j[1], j[2]]]
        distance[j[0], j[1], j[2]] = 0

    x_size = dens_map.x_size()
    y_size = dens_map.y_size()
    z_size = dens_map.z_size()

    # grid is searched until queue is empty
    while queue:
        x_n, y_n, z_n = queue.popleft()  # take first voxel in queue
        if distance[x_n, y_n, z_n] > max_dist:
            continue  # voxels beyond max_dist are not expanded any further
        for c in comb:  # expand in all directions from voxel - in order of comb.
            x_temp = x_n + c[0]
            y_temp = y_n + c[1]
            z_temp = z_n + c[2]
            # check voxel hasn't already been searched
            if (x_temp, y_temp, z_temp) in visited:
                continue
            # check that voxel is within bounds of the grid
            if not (0 <= x_temp < x_size and 0 <= y_temp < y_size and 0 <= z_temp < z_size):
                continue

            # the path to the new voxel is the path to its parent plus itself
            visited[x_temp, y_temp, z_temp] = visited[x_n, y_n, z_n] + [[x_temp, y_temp, z_temp]]

            if dens_map.fullMap[z_temp][y_temp][x_temp] <= 0:  # if the voxel is in empty space
                queue.append([x_temp, y_temp, z_temp])  # add to queue for later searching

            # step length depends on how many axes changed
            axes_moved = abs(c[0]) + abs(c[1]) + abs(c[2])
            if axes_moved == 3:
                step = diag2
            elif axes_moved == 2:
                step = diag1
            else:
                step = vox
            distance[x_temp, y_temp, z_temp] = distance[x_n, y_n, z_n] + step

    def _leg(p, q):
        # straight-line distance between two voxels, in grid coordinates
        return math.sqrt((p[0] - q[0]) ** 2 + (p[1] - q[1]) ** 2 + (p[2] - q[2]) ** 2)

    # now we have a full set of paths into empty space starting from start_residue
    # all stored in visited. Now need to extract paths to specific residues
    for end_residue in aa2_voxels:
        if start_residue != end_residue:
            shortest_distance = 9999  # sentinel: nothing found yet
            shortest_path = None

            # cycling through possible end coords of end_residue to get shortest sasd
            for j in aa2_voxels[end_residue]:

                (x, y, z) = j

                if (x, y, z) not in visited:
                    continue

                # Build a NEW list and a local total instead of editing visited[...]
                # and distance[...] in place: a surface voxel can belong to more than
                # one end residue, and in-place edits would leak one residue's
                # C-alpha leg into the path and length reported for the next one.
                path = [aa1_CA[start_residue]] + visited[(x, y, z)] + [aa2_CA[end_residue]]

                # add the distance between starting/ending residue CA voxel and
                # start/end voxel in path
                total = distance[(x, y, z)]
                total += _leg(path[0], path[1])
                total += _leg(path[-2], path[-1])

                # among equally long candidates the one listed last is kept
                if total <= shortest_distance:
                    shortest_path = path
                if total < shortest_distance:
                    shortest_distance = total

            # add shortest distance sasd to output dictionary
            if shortest_distance != 9999:
                if start_residue[1] < end_residue[1]:  # order the key so that chain goes alphabetically
                    sasds[start_residue, end_residue, shortest_distance] = shortest_path
                elif end_residue[1] < start_residue[1]:
                    sasds[end_residue, start_residue, shortest_distance] = shortest_path
                # if both on the same chain, then ordered to go numerically
                elif start_residue[0] < end_residue[0]:
                    sasds[start_residue, end_residue, shortest_distance] = shortest_path
                else:
                    sasds[end_residue, start_residue, shortest_distance] = shortest_path

    return sasds
|
|
309
|
+
|
|
310
|
+
def calculate_SASDs_star(a_b):
    """Spread the single packed argument *a_b* into a positional `calculate_SASDs` call.

    `Pool.map` passes one object per task, so the arguments travel bundled
    together and are unpacked here, i.e. `f([1,2])` becomes `f(1,2)`.
    """
    packed_args = a_b
    return calculate_SASDs(*packed_args)
|
|
313
|
+
|
|
314
|
+
def calculate_specific_SASD_star(a_b):
    """Spread the single packed argument *a_b* into a positional `calculate_specific_SASD` call.

    Turns `f([1,2])` into `f(1,2)` so the function can be driven by an API
    that supplies exactly one object per task.
    """
    packed_args = a_b
    return calculate_specific_SASD(*packed_args)
|
|
317
|
+
|
|
318
|
+
# ---------------------------------------------------------------------------
|
|
319
|
+
# Fast BFS helpers (deque queue, no full-path storage, grouped by start residue)
|
|
320
|
+
# ---------------------------------------------------------------------------
|
|
321
|
+
|
|
322
|
+
# Neighbour offsets (dx, dy, dz) in the order they are tried from a voxel:
# the 6 single-axis moves, then the 12 two-axis (2d diagonal) moves, then the
# 8 three-axis (3d diagonal) moves.
_COMB = (
    # one axis changes
    (+1, +0, +0), (-1, +0, +0),
    (+0, +1, +0), (+0, -1, +0),
    (+0, +0, +1), (+0, +0, -1),
    # two axes change
    (+1, +0, +1), (-1, +0, +1),
    (+0, +1, +1), (+0, -1, +1),
    (+1, -1, +0), (-1, -1, +0),
    (+1, +1, +0), (-1, +1, +0),
    (+1, +0, -1), (-1, +0, -1),
    (+0, +1, -1), (+0, -1, -1),
    # three axes change
    (+1, +1, +1), (+1, -1, +1),
    (-1, +1, +1), (-1, -1, +1),
    (+1, +1, -1), (+1, -1, -1),
    (-1, +1, -1), (-1, -1, -1),
)
# number of non-zero components per move (determines step size)
_COMB_N = tuple(sum(map(abs, move)) for move in _COMB)
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def _bfs_fast(start_residue, aa1_voxels, dens_map, max_dist, vox):
    """
    Breadth first search from the starting voxels of *start_residue* that keeps
    only a distance and an originating start voxel per voxel (no full paths).

    Every in-bounds neighbour gets a distance the first time it is seen; only
    neighbours whose fullMap value is <= 0 are queued for further expansion,
    and voxels whose distance exceeds *max_dist* are not expanded.

    Returns:
        distance    : dict {(x,y,z): float} - accumulated step length from a
                      starting voxel to each voxel that was seen.
        start_origin: dict {(x,y,z): (sx,sy,sz)} - the starting voxel whose
                      search front first reached each voxel (used by the caller
                      for the CA-to-surface correction).
    """
    # step length keyed by how many axes a move changes
    step_for = {1: vox, 2: math.sqrt(vox * vox * 2), 3: math.sqrt(vox * vox * 3)}
    moves = [(c[0], c[1], c[2], step_for[n]) for c, n in zip(_COMB, _COMB_N)]

    frontier = deque()
    distance = {}       # doubles as the "already seen" record
    start_origin = {}

    for seed in aa1_voxels[start_residue]:
        seed_key = (seed[0], seed[1], seed[2])
        if seed_key in distance:
            continue
        frontier.append(seed_key)
        distance[seed_key] = 0.0
        start_origin[seed_key] = seed_key

    nx = dens_map.x_size()
    ny = dens_map.y_size()
    nz = dens_map.z_size()
    grid = dens_map.fullMap

    while frontier:
        cx, cy, cz = frontier.popleft()
        here = distance[cx, cy, cz]
        if here > max_dist:
            continue
        seed_key = start_origin[cx, cy, cz]
        for dx, dy, dz, step in moves:
            tx = cx + dx
            ty = cy + dy
            tz = cz + dz
            target = (tx, ty, tz)
            if target in distance:
                continue
            if not (0 <= tx < nx and 0 <= ty < ny and 0 <= tz < nz):
                continue
            distance[target] = here + step
            start_origin[target] = seed_key
            if grid[tz][ty][tx] <= 0:
                frontier.append(target)

    return distance, start_origin
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def calculate_SASDs_for_start_fast(args):
    """
    Run a single `_bfs_fast` search from *start_residue* and read off the shortest
    distance to each residue in *end_residues*, so that one search serves every
    crosslink sharing that start residue.

    End residues equal to the start residue, or missing from *aa2_voxels*, are
    skipped; end residues with no reached voxel produce no entry.

    Args: one tuple (start_residue, end_residues, aa1_voxels, aa2_voxels, dens_map,
          aa1_CA, aa2_CA, max_dist, vox)

    Returns: dict {(res a, res b, distance): []}. The residues in the key are
    ordered by chain, then by residue number; the value is always an empty list
    because no voxel path is recorded here.
    """
    (start_residue, end_residues, aa1_voxels, aa2_voxels, dens_map,
     aa1_CA, aa2_CA, max_dist, vox) = args

    reach, seeds = _bfs_fast(start_residue, aa1_voxels, dens_map, max_dist, vox)

    start_ca = aa1_CA[start_residue]  # C-alpha voxel of the start residue
    found = {}

    for end_residue in end_residues:
        if end_residue == start_residue or end_residue not in aa2_voxels:
            continue

        end_ca = aa2_CA[end_residue]
        best = 9999.0  # sentinel: nothing reached yet

        for surf in aa2_voxels[end_residue]:
            cell = (surf[0], surf[1], surf[2])
            if cell not in reach:
                continue
            seed = seeds[cell]
            total = reach[cell]
            # leg 1: start C-alpha to the starting voxel this search front came from
            total += math.sqrt((start_ca[0] - seed[0]) ** 2
                               + (start_ca[1] - seed[1]) ** 2
                               + (start_ca[2] - seed[2]) ** 2)
            # leg 2: reached end voxel to the end C-alpha
            total += math.sqrt((surf[0] - end_ca[0]) ** 2
                               + (surf[1] - end_ca[1]) ** 2
                               + (surf[2] - end_ca[2]) ** 2)
            best = min(best, total)

        if not best < 9999.0:
            continue

        # chain-alphabetical first, residue-numerical within the same chain
        start_first = (start_residue[1] < end_residue[1]
                       or (not end_residue[1] < start_residue[1]
                           and start_residue[0] < end_residue[0]))
        if start_first:
            found[(start_residue, end_residue, best)] = []
        else:
            found[(end_residue, start_residue, best)] = []

    return found
|
|
437
|
+
|
|
438
|
+
# ---------------------------------------------------------------------------
|
|
439
|
+
|
|
440
|
+
def parallel_BFS(aa1_voxels, aa2_voxels, dens_map, aa1_CA, aa2_CA, crosslink_pairs,
                 max_dist, vox, ncpus, xl_list):
    """
    Parallelised Breadth First Search of grid.

    Returns dictionary containing all solvent accessible surface distances
    {start res, end res, length : voxel path of sasd}

    When xl_list is provided (anything other than the string "NULL"), the pairs in
    crosslink_pairs are grouped by start residue so that only ONE BFS is run per
    unique start residue (instead of one BFS per pair). On this route the search is
    done by calculate_SASDs_for_start_fast, which stores an empty list in place of
    the voxel path.

    When xl_list is "NULL", calculate_SASDs is run once for every key of
    aa1_voxels and crosslink_pairs is not used.

    Arguments:

       *aa1_voxels*, *aa2_voxels*
           dictionaries {residue : voxels} for the start and end residues
       *dens_map*
           grid passed through to the search functions
       *aa1_CA*, *aa2_CA*
           dictionaries containing voxel of C-alpha
       *crosslink_pairs*
           sequence of (start residue, end residue) pairs
       *max_dist*
           maximum distance BFS will search until
       *vox*
           number of angstroms per voxel
       *ncpus*
           worker processes to use; a Pool is only created when this is > 1
       *xl_list*
           "NULL" when no crosslink list was supplied

    """

    freeze_support()
    final_XL = {}

    if xl_list != "NULL":
        # --- grouped fast path: one BFS per unique start residue ---
        pairs_by_start = {}
        for pair in crosslink_pairs:
            pairs_by_start.setdefault(pair[0], []).append(pair[1])

        tasks = [
            (start, ends, aa1_voxels, aa2_voxels, dens_map, aa1_CA, aa2_CA, max_dist, vox)
            for start, ends in pairs_by_start.items()
        ]

        if ncpus > 1:
            pool = Pool(ncpus)
            try:
                xl_dictionaries = pool.map(calculate_SASDs_for_start_fast, tasks)
            finally:
                # always release the worker processes, even when a task raised
                pool.close()
                pool.join()
        else:
            xl_dictionaries = [calculate_SASDs_for_start_fast(t) for t in tasks]

        for c in xl_dictionaries:
            final_XL.update(c)

    else:
        if ncpus > 1:
            tasks = [
                (start_residue, aa1_voxels, aa2_voxels, dens_map, aa1_CA, aa2_CA, max_dist, vox)
                for start_residue in aa1_voxels
            ]

            pool = Pool(ncpus)
            try:
                xl_dictionaries = pool.map(calculate_SASDs_star, tasks)
            finally:
                # always release the worker processes, even when a task raised
                pool.close()
                pool.join()

            for c in xl_dictionaries:
                final_XL.update(c)

        else:
            # alternative call to allow single cpu running on Windows machines
            for start_residue in aa1_voxels:
                final_XL.update(calculate_SASDs(start_residue, aa1_voxels, aa2_voxels,
                                                dens_map, aa1_CA, aa2_CA, max_dist, vox))

    return final_XL
|
|
510
|
+
|
|
511
|
+
def calculate_distance(cords):
    ''' Calculates the distance between two points in 3d, input e.g. [[x1,y1,z1],[x2,y2,z2]] '''
    first, second = cords[0], cords[1]
    dx = first[0] - second[0]
    dy = first[1] - second[1]
    dz = first[2] - second[2]
    return math.sqrt(dx ** 2 + dy ** 2 + dz ** 2)
|
|
514
|
+
|
|
515
|
+
def get_euclidean_distances(sasds, pdb, aa1, aa2):
    """
    Attach the C-alpha to C-alpha Euclidean distance to every entry of *sasds*.

    The C-alpha coordinates are read from the ATOM records of the file *pdb*
    (fixed columns: atom name 13-16, chain 22, residue number 23-26, x/y/z 31-54).
    A blank chain column is stored as " ". When the same residue number and chain
    occur more than once, the last record read is used.

    Arguments:

       *sasds*
           dictionary {(res a, res b, sasd) : path}; each residue is indexed as
           residue[0] = residue number and residue[1] = chain
       *pdb*
           path of the PDB file to read
       *aa1*, *aa2*
           not used; kept so existing callers keep working

    Returns dictionary {(res a, res b, sasd, euclidean distance) : path}. Entries
    whose two residues are the same residue, or for which a C-alpha was not found
    in the file, are left out.

    Raises ValueError when a C-alpha ATOM record carries a residue number or
    coordinate that cannot be parsed.
    """
    ca_coords = {}
    with open(pdb) as inf:
        for line in inf:
            if line.startswith('ATOM') and (line[12:16].strip() == 'CA'):
                chain = line[21:22].strip() or " "
                ca_coords[int(line[22:26].strip()), chain] = [float(line[30:38].strip()),
                                                              float(line[38:46].strip()),
                                                              float(line[46:54].strip())]

    sasds_and_eucs = {}

    # Only the residue pairs present in sasds are ever needed, so look them up
    # directly instead of tabulating the distance between every pair of C-alphas.
    for s, path in sasds.items():
        first = (s[0][0], s[0][1])
        second = (s[1][0], s[1][1])
        if first == second or first not in ca_coords or second not in ca_coords:
            continue
        euc = calculate_distance([ca_coords[first], ca_coords[second]])
        sasds_and_eucs[s[0], s[1], s[2], euc] = path

    return sasds_and_eucs
|
|
543
|
+
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
# ===============================================================================
|
|
2
|
+
# This file is part of Jwalk (Python 3).
|
|
3
|
+
#
|
|
4
|
+
# Jwalk - A tool to calculate the solvent accessible surface distance (SASD)
|
|
5
|
+
# between crosslinked residues.
|
|
6
|
+
#
|
|
7
|
+
# Copyright 2016 Josh Bullock and Birkbeck College University of London.
|
|
8
|
+
#
|
|
9
|
+
# Jwalk is available under Public Licence.
|
|
10
|
+
# This software is made available under GPL V3
|
|
11
|
+
#
|
|
12
|
+
# Please cite your use of Jwalk in published work:
|
|
13
|
+
#
|
|
14
|
+
# J.Bullock, J. Schwab, K. Thalassinos, M. Topf (2016)
|
|
15
|
+
# The importance of non-accessible crosslinks and solvent accessible surface distance
|
|
16
|
+
# in modelling proteins with restraints from crosslinking mass spectrometry.
|
|
17
|
+
# Molecular and Cellular Proteomics (15) pp.2491-2500
|
|
18
|
+
#
|
|
19
|
+
# ===============================================================================
|
|
20
|
+
|
|
21
|
+
import os
|
|
22
|
+
import freesasa
|
|
23
|
+
|
|
24
|
+
def update_crosslink_pairs(crosslink_pairs, aa1_CA, aa2_CA, remove_aa1, remove_aa2):
    '''
    Removes crosslinks with an unusable residue from crosslink_pairs.

    A pair (residue 1, residue 2) is dropped when residue 1 is not a key of
    aa1_CA, residue 2 is not a key of aa2_CA, [number, chain] of residue 1 is in
    remove_aa1, or [number, chain] of residue 2 is in remove_aa2. The dropped
    pairs are printed; the kept pairs are returned in their original order.

    Arguments:

       *crosslink_pairs*
           sequence of (residue 1, residue 2); each residue is indexed as
           residue[0] = number, residue[1] = chain, residue[2] = residue name
       *aa1_CA*, *aa2_CA*
           dictionaries keyed by residue
       *remove_aa1*, *remove_aa2*
           collections of [number, chain] lists to exclude
    '''

    crosslink_pairs_final = []
    no_sasd_possible = []

    # single pass: decide per pair, no index bookkeeping needed
    for pair in crosslink_pairs:
        xl_pair_1, xl_pair_2 = pair

        unusable = (xl_pair_1 not in aa1_CA
                    or xl_pair_2 not in aa2_CA
                    or [xl_pair_1[0], xl_pair_1[1]] in remove_aa1
                    or [xl_pair_2[0], xl_pair_2[1]] in remove_aa2)

        if unusable:
            no_sasd_possible.append(pair)
        else:
            crosslink_pairs_final.append(pair)

    if no_sasd_possible:
        print("the following crosslinks cannot be calculated:")
        for s in no_sasd_possible:
            print("{}-{}-{} - {}-{}-{}".format(s[0][2], s[0][0], s[0][1], s[1][2], s[1][0], s[1][1]))

    return crosslink_pairs_final
|
|
72
|
+
|
|
73
|
+
def check_solvent_accessibility_freesasa(prot, aa_CA, xl_list, aa_dict, ncpus):
    """
    Keep only the residues of *aa_CA* that freesasa reports as solvent accessible.

    freesasa is run on the structure file *prot* with the classifier read from
    naccess.config.txt located next to this module. A residue counts as
    accessible when its total area is greater than 7.0. Every residue of *aa_CA*
    that is not accessible is reported with a printed "is buried" line.

    Arguments:

       *prot*
           path of the structure file
       *aa_CA*
           dictionary keyed by (residue number, chain, residue name)
       *xl_list*
           "NULL" when no crosslink list was supplied; only then is the summary
           line printed
       *aa_dict*
           dictionary residue name -> label used in the summary line
       *ncpus*
           value handed to setNThreads (see the review note below)

    Returns dictionary with the accessible entries of *aa_CA* (same keys and values).
    """

    # NOTE(review): both settings are applied to throw-away Parameters objects and
    # no Parameters object is passed to freesasa.calc() below, so they presumably
    # do not influence the calculation -- confirm against the freesasa API. Left
    # unchanged because wiring them in would alter the computed areas.
    freesasa.Parameters().setNSlices(50)
    freesasa.Parameters().setNThreads(ncpus)

    pt = os.path.dirname(os.path.realpath(__file__))
    classifier = freesasa.Classifier(os.path.join(pt, "naccess.config.txt"))
    structure = freesasa.Structure(os.path.normpath(prot), classifier)
    result = freesasa.calc(structure)

    # (residue number, chain, residue type) of every residue with total SASA > 7.0
    solv_access_residue = set()
    for chain, residue in result.residueAreas().items():
        for res_sasa_info in residue.values():
            if res_sasa_info.total > 7.0:
                solv_access_residue.add(
                    (int(res_sasa_info.residueNumber), chain, res_sasa_info.residueType))

    surface_solv_access_residue = {}
    sd_res = None  # residue name of the last entry seen; stays None for an empty aa_CA

    for res_num, chain, res_name in aa_CA:
        if (res_num, chain, res_name) in solv_access_residue:
            surface_solv_access_residue[(res_num, chain, res_name)] = aa_CA[(res_num, chain, res_name)]
        else:
            print("Residue {}-{}-{} is buried".format(res_num, chain, res_name))
        sd_res = res_name

    # inform user on buried residues
    if xl_list != "NULL":
        pass
    elif sd_res is None:
        # empty aa_CA: there is no residue name to summarise (previously this
        # case failed with UnboundLocalError)
        pass
    elif sd_res == "LYS":
        print("{} {} and 1 N-terminus of which {} are on the surface".format(len(aa_CA)-1, aa_dict[sd_res], len(surface_solv_access_residue)))
    else:
        print("{} {} of which {} are on the surface".format(len(aa_CA), aa_dict[sd_res], len(surface_solv_access_residue)))

    return surface_solv_access_residue
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def check_solvent_accessibility_freesasa_both(prot, aa1_CA, aa2_CA, xl_list, aa_dict, ncpus):
    """
    Run freesasa a single time on *prot* and filter both residue dictionaries
    against that one result, instead of computing the surface once per set.

    A residue counts as accessible when its total area is greater than 7.0.
    Residues that are not accessible are reported with a printed "is buried"
    line. When *xl_list* is "NULL" and at least one residue of a set is
    accessible, a summary line is printed for that set.

    Returns: (surface_aa1_CA, surface_aa2_CA) - the accessible entries of
    *aa1_CA* and *aa2_CA*, with their original keys and values.
    """
    # NOTE(review): these settings go to throw-away Parameters objects that are
    # never passed to freesasa.calc() -- presumably without effect; confirm.
    freesasa.Parameters().setNSlices(50)
    freesasa.Parameters().setNThreads(ncpus)

    module_dir = os.path.dirname(os.path.realpath(__file__))
    config_path = os.path.join(module_dir, "naccess.config.txt")
    classifier = freesasa.Classifier(config_path)
    structure = freesasa.Structure(os.path.normpath(prot), classifier)
    result = freesasa.calc(structure)

    # (residue number, chain, residue type) of every residue with total SASA > 7.0
    exposed = {
        (int(info.residueNumber), chain, info.residueType)
        for chain, per_residue in result.residueAreas().items()
        for info in per_residue.values()
        if info.total > 7.0
    }

    def _surface_only(aa_CA):
        # keep the accessible entries of one residue dictionary, reporting the rest
        kept = {}
        for residue_key in aa_CA:
            res_num, chain, res_name = residue_key
            if (res_num, chain, res_name) not in exposed:
                print("Residue {}-{}-{} is buried".format(res_num, chain, res_name))
                continue
            kept[(res_num, chain, res_name)] = aa_CA[(res_num, chain, res_name)]

        if xl_list == "NULL" and kept:
            # the residue name of the first kept entry labels the summary line
            sd_res = next(iter(kept))[2]
            if sd_res == "LYS":
                summary = "{} {} and 1 N-terminus of which {} are on the surface".format(
                    len(aa_CA) - 1, aa_dict[sd_res], len(kept))
            else:
                summary = "{} {} of which {} are on the surface".format(
                    len(aa_CA), aa_dict[sd_res], len(kept))
            print(summary)
        return kept

    surface_aa1_CA = _surface_only(aa1_CA)
    surface_aa2_CA = _surface_only(aa2_CA)
    return surface_aa1_CA, surface_aa2_CA
|
|
150
|
+
|