pycpet 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,402 @@
1
+ from CPET.source.calculator import calculator
2
+ from CPET.source.cluster import cluster
3
+ from CPET.source.pca import pca_pycpet
4
+
5
+ import CPET.utils.visualize as visualize
6
+ from CPET.utils.io import save_numpy_as_dat, default_options_initializer
7
+ from CPET.utils.calculator import report_inside_box
8
+
9
+ from glob import glob
10
+ from random import choice
11
+ import os
12
+ import numpy as np
13
+ import warnings
14
+ import logging
15
+
16
+
17
class CPET:
    """Top-level driver: reads an options dict and dispatches one CPET method.

    Attributes set here (``m``, ``inputpath``, ``outputpath``, ``step_size``,
    ``dimensions``) are read by the per-method runners below.
    """

    def __init__(self, options):
        """Initialize the driver from a raw options dict.

        Parameters
        ----------
        options : dict
            User options; completed with defaults by
            ``default_options_initializer``. Must provide ``CPET_method``,
            ``inputpath``, ``outputpath``, ``step_size`` and ``dimensions``.
        """
        # Logistics
        self.options = default_options_initializer(options)
        self.logger = logging.getLogger(__name__)  # Inherit logger from cpet.py
        self.m = self.options["CPET_method"]
        self.logger.info("Instantiating CPET, running method: {}".format(self.m))
        self.inputpath = self.options["inputpath"]
        self.outputpath = self.options["outputpath"]
        self.step_size = self.options["step_size"]
        if not os.path.exists(self.outputpath):
            print(
                "Output directory does not exist in current directory, creating: \n{}".format(
                    self.outputpath
                )
            )
            # exist_ok=True closes the race between the existence check above
            # and this call (e.g. two CPET processes sharing one output dir).
            os.makedirs(self.outputpath, exist_ok=True)

        # Calculation-specific settings
        self.dimensions = self.options["dimensions"]
37
+
38
def run(self):
    """Dispatch to the runner matching ``CPET_method`` (``self.m``).

    Unknown methods print a notice and terminate the process, exactly as
    the original if/elif chain did.
    """
    # Table-driven dispatch: several method names share one runner.
    runners = {
        "topo": self.run_topo,
        "topo_GPU": self.run_topo_GPU,
        "volume": self.run_volume,
        "volume_ESP": self.run_volume_ESP,
        "point_field": self.run_point_field,
        "point_mag": self.run_point_mag,
        "cluster": self.run_cluster,
        "cluster_volume": self.run_cluster,
        "cluster_volume_tensor": self.run_cluster,
        "cluster_volume_esp_tensor": self.run_cluster,
        "box_check": self.run_box_check,
        "visualize_field": self.run_visualize_efield,
        "visualize_esp": self.run_visualize_efield,
        "pca": self.run_pca,
        "pca_compare": self.run_pca,
    }
    runner = runners.get(self.m)
    if runner is None:
        print(
            "You have reached the limit of this package's capabilities at the moment, we do not support the function called as of yet"
        )
        exit()
    runner()
69
+
70
def run_topo(self, num=100000, benchmarking=False):
    """Compute streamline topology histograms for randomly chosen PDBs.

    Draws up to ``num`` files (without replacement) from ``inputpath`` and
    writes ``<protein>.top`` into ``outputpath``; proteins that already have
    a ``.top`` file are skipped with a message. In benchmarking mode the
    output name additionally encodes n_samples, step_size and replica.
    """
    files_input = glob(self.inputpath + "/*.pdb")
    if len(files_input) == 0:
        raise ValueError("No pdb files found in the input directory")
    if len(files_input) == 1:
        warnings.warn("Only one pdb file found in the input directory")
    for _ in range(num):
        if not files_input:
            print("No more files to process!")
            break
        file = choice(files_input)
        files_input.remove(file)
        protein = file.split("/")[-1].split(".")[0]
        print("protein file: {}".format(protein))
        # Re-scan the output directory each pass so results written by this
        # very loop are seen.
        files_done = [
            x for x in os.listdir(self.outputpath) if x.split(".")[-1] == "top"
        ]
        if protein + ".top" in files_done:
            print("Already done for protein: {}, skipping...".format(protein))
            continue
        self.calculator = calculator(self.options, path_to_pdb=file)
        hist = self.calculator.compute_topo_complete_c_shared()
        if not benchmarking:
            np.savetxt(self.outputpath + "/{}.top".format(protein), hist)
        else:
            # NOTE(review): self.replica is never set in __init__; the
            # benchmarking path appears to rely on the caller assigning it
            # beforehand — confirm.
            np.savetxt(
                self.outputpath
                + "/{}_{}_{}_{}.top".format(
                    protein,
                    self.calculator.n_samples,
                    str(self.calculator.step_size)[2:],
                    self.replica,
                ),
                hist,
            )
106
+
107
def run_topo_GPU(self, num=100000, benchmarking=False):
    """GPU-batched counterpart of :meth:`run_topo`.

    Same file selection, skip logic and output naming as ``run_topo``, but
    the histogram comes from ``compute_topo_GPU_batch_filter``.
    """
    files_input = glob(self.inputpath + "/*.pdb")
    if len(files_input) == 0:
        raise ValueError("No pdb files found in the input directory")
    if len(files_input) == 1:
        warnings.warn("Only one pdb file found in the input directory")
    for i in range(num):
        if len(files_input) != 0:
            file = choice(files_input)
        else:
            # Consistency fix: mirror run_topo's exhaustion message.
            print("No more files to process!")
            break
        files_input.remove(file)
        # Derive the protein name from the chosen path directly (it is the
        # same string passed to the calculator as path_to_pdb), so the
        # calculator need not be built just to name the file.
        protein = file.split("/")[-1].split(".")[0]
        print("protein file: {}".format(protein))
        files_done = [
            x for x in os.listdir(self.outputpath) if x.split(".")[-1] == "top"
        ]
        if protein + ".top" not in files_done:
            # Perf fix: construct the calculator (which parses the PDB) only
            # when there is work to do; the original built it before the
            # already-done check and wasted the parse on skipped proteins.
            self.calculator = calculator(self.options, path_to_pdb=file)
            hist = self.calculator.compute_topo_GPU_batch_filter()
            if not benchmarking:
                np.savetxt(self.outputpath + "/{}.top".format(protein), hist)
            if benchmarking:
                # NOTE(review): self.replica is never set in __init__ —
                # benchmarking mode relies on the caller assigning it; confirm.
                np.savetxt(
                    self.outputpath
                    + "/{}_{}_{}_{}.top".format(
                        protein,
                        self.calculator.n_samples,
                        str(self.calculator.step_size)[2:],
                        self.replica,
                    ),
                    hist,
                )
        else:
            # Consistency fix: run_topo reports skips; do the same here.
            print("Already done for protein: {}, skipping...".format(protein))
140
+
141
def run_volume(self, num=100000):
    """
    Get the electric fields along a grid of points in the box.

    For each input PDB (drawn randomly, without replacement, up to ``num``),
    writes ``<protein>_efield.dat`` — the field volume plus grid metadata —
    into the output directory, skipping proteins whose output exists.
    """

    files_input = glob(self.inputpath + "/*.pdb")
    if len(files_input) == 0:
        raise ValueError("No pdb files found in the input directory")

    if len(files_input) == 1:
        warnings.warn("Only one pdb file found in the input directory")

    for i in range(num):
        if len(files_input) != 0:
            file = choice(files_input)
        else:
            print("No more files to process!")
            break
        files_input.remove(file)
        # Protein name straight from the path (identical to what the
        # calculator would report as path_to_pdb).
        protein = file.split("/")[-1].split(".")[0]
        print("protein file: {}".format(protein))
        # endswith() is the clearer spelling of the original 11-char slice
        # comparison ("_efield.dat" is exactly 11 characters).
        files_done = [
            x for x in os.listdir(self.outputpath) if x.endswith("_efield.dat")
        ]

        if protein + "_efield.dat" not in files_done:
            # Perf fix: build the calculator (PDB parse) only when the
            # output is missing; the original built it before the check.
            self.calculator = calculator(self.options, path_to_pdb=file)
            field_box, mesh_shape = self.calculator.compute_box()
            print(field_box.shape)
            meta_data = {
                "dimensions": self.dimensions,
                "step_size": [self.step_size, self.step_size, self.step_size],
                "num_steps": [mesh_shape[0], mesh_shape[1], mesh_shape[2]],
                "transformation_matrix": self.calculator.transformation_matrix,
                "center": self.calculator.center,
            }

            save_numpy_as_dat(
                name=self.outputpath + "/{}_efield.dat".format(protein),
                volume=field_box,
                meta_data=meta_data,
            )
183
+
184
def run_point_field(self):
    """Compute the field at the probe point for every input PDB.

    Appends one ``protein:value`` line per structure to
    ``<outputpath>/point_field.dat`` (file is overwritten each run).
    """
    pdb_files = glob(self.inputpath + "/*.pdb")
    if not pdb_files:
        raise ValueError("No pdb files found in the input directory")
    if len(pdb_files) == 1:
        warnings.warn("Only one pdb file found in the input directory")
    outfile = self.outputpath + "/point_field.dat"
    with open(outfile, "w") as handle:
        for pdb in pdb_files:
            self.calculator = calculator(self.options, path_to_pdb=pdb)
            protein = pdb.split("/")[-1].split(".")[0]
            print("protein file: {}".format(protein))
            point_field = self.calculator.compute_point_field()
            handle.write("{}:{}\n".format(protein, point_field))
198
+
199
def run_point_mag(self):
    """Compute the field magnitude at the probe point for every input PDB.

    Appends one ``protein:value`` line per structure to
    ``<outputpath>/point_mag.dat`` (file is overwritten each run).
    """
    pdb_files = glob(self.inputpath + "/*.pdb")
    if not pdb_files:
        raise ValueError("No pdb files found in the input directory")
    if len(pdb_files) == 1:
        warnings.warn("Only one pdb file found in the input directory")
    outfile = self.outputpath + "/point_mag.dat"
    with open(outfile, "w") as handle:
        for pdb in pdb_files:
            self.calculator = calculator(self.options, path_to_pdb=pdb)
            protein = pdb.split("/")[-1].split(".")[0]
            print("protein file: {}".format(protein))
            point_mag = self.calculator.compute_point_mag()
            handle.write("{}:{}\n".format(protein, point_mag))
213
+
214
def run_volume_ESP(self, num=100000):
    """Compute the electrostatic potential on a grid for each input PDB.

    Writes ``<protein>_esp.dat`` (volume + grid metadata) into the output
    directory, skipping proteins whose output already exists.
    """
    files_input = glob(self.inputpath + "/*.pdb")
    if len(files_input) == 0:
        raise ValueError("No pdb files found in the input directory")
    if len(files_input) == 1:
        warnings.warn("Only one pdb file found in the input directory")
    for i in range(num):
        if len(files_input) != 0:
            file = choice(files_input)
        else:
            print("No more files to process!")
            break
        self.calculator = calculator(self.options, path_to_pdb=file)
        protein = self.calculator.path_to_pdb.split("/")[-1].split(".")[0]
        files_input.remove(file)
        print("protein file: {}".format(protein))
        # BUG FIX: the original compared the last 11 characters of each file
        # name (x[-11:]) against the 8-character suffix "_esp.dat", which is
        # never equal for normal-length names — so the already-done check
        # never fired and every ESP volume was recomputed on each run.
        files_done = [
            x for x in os.listdir(self.outputpath) if x.endswith("_esp.dat")
        ]
        if protein + "_esp.dat" not in files_done:
            esp_box, mesh_shape = self.calculator.compute_box_ESP()
            print(esp_box.shape)
            meta_data = {
                "dimensions": self.dimensions,
                "step_size": [self.step_size, self.step_size, self.step_size],
                "num_steps": [mesh_shape[0], mesh_shape[1], mesh_shape[2]],
                "transformation_matrix": self.calculator.transformation_matrix,
                "center": self.calculator.center,
            }
            save_numpy_as_dat(
                name=self.outputpath + "/{}_esp.dat".format(protein),
                volume=esp_box,
                meta_data=meta_data,
            )
248
+
249
def run_box_check(self, num=100000):
    """Report which atoms fall inside the sampling box for every input PDB.

    ``num`` is kept for interface compatibility but is not used: every
    input file is checked. Raises ValueError if filter_radius/filter_resnum
    is set, since those filters would invalidate the check.
    """
    files_input = glob(self.inputpath + "/*.pdb")
    if len(files_input) == 0:
        raise ValueError("No pdb files found in the input directory")
    if len(files_input) == 1:
        warnings.warn("Only one pdb file found in the input directory")
    # These conditions don't depend on the file, so validate once up front
    # instead of re-testing on every loop iteration as before.
    if "filter_radius" in self.options or "filter_resnum" in self.options:
        # Error out, radius not compatible
        raise ValueError(
            "filter_radius/filter_resnum is not compatible with box_check. Please remove from options"
        )
    # Need to not filter in box to check, but can filter all else.
    # NOTE(review): this mutates the shared options dict for the rest of the
    # CPET instance's lifetime — confirm that is intended.
    self.options["filter_in_box"] = False
    for file in files_input:
        self.calculator = calculator(self.options, path_to_pdb=file)
        protein = self.calculator.path_to_pdb.split("/")[-1].split(".")[0]
        print("protein file: {}".format(protein))
        report_inside_box(self.calculator)
    print("No more files to process!")
268
+
269
def run_cluster(self):
    """Build a ``cluster`` object from the options and run its analysis."""
    print("Running the cluster analysis. Method type: {}".format(self.m))
    self.cluster = cluster(self.options)
    self.cluster.Cluster()
273
+
274
def run_visualize_efield(self):
    """Generate ChimeraX ``.bild`` files for precomputed field/ESP volumes.

    For every input PDB a matching ``*_efield.dat`` (method
    ``visualize_field``) or ``*_esp.dat`` (method ``visualize_esp``) must
    already exist in the input directory; otherwise ValueError is raised.
    """
    print(
        "Visualizing the electric field. This module will load a ChimeraX session with the first protein and the electric field, and requires the electric field to be computed first."
    )
    files_input_pdb = glob(self.inputpath + "/*.pdb")
    if self.m == "visualize_field":
        files_input_efield = glob(self.inputpath + "/*_efield.dat")
    elif self.m == "visualize_esp":
        files_input_esp = glob(self.inputpath + "/*_esp.dat")
    if len(files_input_pdb) == 0:
        raise ValueError("No pdb files found in the input directory")
    if len(files_input_pdb) > 1:
        warnings.warn(
            "More than one pdb file found in the input directory. Only the first will be visualized, .bild files will be generated for all of them though."
        )

    # Sort list of pdbs and efields
    files_input_pdb.sort()

    # Perf fix: normalize the field/ESP paths to bare protein names ONCE.
    # The original re-ran this comprehension inside the per-PDB loop on
    # every iteration; the transform is idempotent (a stripped name contains
    # no "/" or "_efield"/"_esp"), so hoisting preserves behavior exactly.
    if self.m == "visualize_field":
        files_input_efield = [
            efield.split("/")[-1].split("_efield")[0]
            for efield in files_input_efield
        ]
    elif self.m == "visualize_esp":
        files_input_esp = [
            esp.split("/")[-1].split("_esp")[0] for esp in files_input_esp
        ]

    # Check each pdb has a matching data file, then emit its .bild files.
    for i in range(len(files_input_pdb)):
        pdb_name = files_input_pdb[i].split("/")[-1]
        stem = pdb_name.split(".")[0]
        if self.m == "visualize_field":
            # Name lists are unsorted; substring-match against all of them.
            if not any(stem in efield for efield in files_input_efield):
                raise ValueError(
                    "No electric field file found for protein: {}".format(pdb_name)
                )
            print("Generating .bild file for the protein: {}".format(pdb_name))
            visualize.visualize_field(
                path_to_pdb=files_input_pdb[i],
                path_to_efield=self.inputpath + "/" + stem + "_efield.dat",
                outputpath=self.outputpath,
                options=self.options,
            )
        elif self.m == "visualize_esp":
            if not any(stem in esp for esp in files_input_esp):
                raise ValueError(
                    "No ESP file found for protein: {}".format(pdb_name)
                )
            print("Generating .bild file for the protein: {}".format(pdb_name))
            visualize.visualize_esp(
                path_to_pdb=files_input_pdb[i],
                path_to_esp=self.inputpath + "/" + stem + "_esp.dat",
                outputpath=self.outputpath,
                options=self.options,
            )
    # To-do: automatically visualize the electric field for the first protein, in dev mode for now
355
+
356
def run_pca(self):
    """Run PCA on computed fields.

    Method "pca" fits a single directory; "pca_compare" runs per-variant
    PCA over every directory in ``inputpath_list`` (the combined-only
    branch pulls all fields together but is still unfinished/TBD).
    """
    if self.m == "pca":
        self.pca = pca_pycpet(self.options)
        self.pca.fit_and_transform()
    elif self.m == "pca_compare":
        # Comparison mode requires an explicit list of variant directories.
        if "inputpath_list" not in self.options:
            raise ValueError(
                "No inputpath_list provided for PCA comparison mode. Please provide a list of directories that contain field files in the output file, or use the 'pca' method instead."
            )
        if "outputpath_list" not in self.options:
            warnings.warn(
                "No outputpath_list provided. Using default outputpath_list based on inputpath_list"
            )
            # Default each output directory to "<input>/pca_out".
            self.options["outputpath_list"] = [
                variant_dir + "/pca_out"
                for variant_dir in self.options["inputpath_list"]
            ]
        if self.options["pca_combined_only"] == False:
            # Per-variant PCA: point the shared options at each directory
            # pair in turn and fit independently.
            pairs = zip(
                self.options["inputpath_list"], self.options["outputpath_list"]
            )
            for variant_in, variant_out in pairs:
                self.options["inputpath"] = variant_in
                self.options["outputpath"] = variant_out
                print(
                    "Running PCA for variant: {}".format(variant_in.split("/")[-1])
                )
                self.pca = pca_pycpet(self.options)
                self.pca.fit_and_transform()
        else:
            from CPET.utils.io import pull_mats_from_MD_folder

            # Gather field matrices from every variant directory.
            all_field_files = []
            for variant_dir in self.options["inputpath_list"]:
                all_field_files.extend(pull_mats_from_MD_folder(variant_dir))
            all_fields = np.concatenate(all_field_files, axis=0)

            # Make a directory called 'pca_combined' in the current directory
            if not os.path.exists("pca_combined"):
                os.makedirs("pca_combined")
            self.options["outputpath"] = "./pca_combined"
            # PCA for combined set of variants
            # TBD
@@ -0,0 +1,10 @@
1
+ '''
2
+ PyCPET (c) is licensed under a MIT License.
3
+
4
+ You should have received a copy of the license along with this
5
+ work. If not, see <https://choosealicense.com/licenses/mit/>.
6
+
7
+
8
+ PYCPET INITIALISATION FILE
9
+ This file is needed for the PyCPET package initialisation.
10
+ '''
@@ -0,0 +1,48 @@
1
+ import pandas as pd
2
+ import matplotlib.pyplot as plt
3
+ import seaborn as sns
4
+
5
+
6
def gen_param_dist_mat(dist_mat, topo_file_list):
    """Average a pairwise distance matrix over replica groups and plot it.

    ``dist_mat`` is an N x N distance matrix whose rows/columns correspond
    to the ``.top`` files in ``topo_file_list``. File names are stripped of
    a shared three-token prefix, grouped by their last-but-two and
    last-but-one "_"-separated tokens, and distances are averaged within
    each group (then symmetrized). Shows a seaborn heatmap as a side
    effect and returns the averaged DataFrame.
    """
    distances = pd.DataFrame(dist_mat)
    # Shared prefix: first three "_"-separated tokens of the first file
    # name, with a trailing underscore.
    first_base = topo_file_list[0].split("/")[-1]
    tokens = first_base.split("_")
    name = tokens[0] + "_" + tokens[1] + "_" + tokens[2] + "_"

    # Strip extension, directory, and the shared prefix from every label.
    labels = [
        path.replace(".top", "").split("/")[-1].replace(name, "")
        for path in topo_file_list
    ]

    # Map each label to its replica group.
    group_map = {}
    for label in labels:
        parts = label.split("_")
        group_map[label] = parts[-3] + "_" + parts[-2]
    grouped_labels = [group_map[label] for label in labels]
    print(group_map)
    print(grouped_labels)
    # Relabel rows and columns by group so groupby can aggregate them.
    distances.columns = grouped_labels
    distances.index = grouped_labels

    # Mean within each group, along rows first and then columns.
    row_means = distances.groupby(level=0).mean()
    averaged_distances = row_means.T.groupby(level=0).mean()

    # Symmetrize (row/column averaging order can differ slightly).
    averaged_distances = (averaged_distances + averaged_distances.T) / 2

    # (Optional) Plot the distance matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(averaged_distances, cmap="Greens_r", annot=True, linewidths=0.1)
    plt.title("Averaged Distance Matrix")
    plt.show()

    return averaged_distances