pycpet 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycpet-0.0.1/CPET/__init__.py +1 -0
- pycpet-0.0.1/CPET/source/CPET.py +402 -0
- pycpet-0.0.1/CPET/source/__init__.py +10 -0
- pycpet-0.0.1/CPET/source/benchmark.py +48 -0
- pycpet-0.0.1/CPET/source/calculator.py +899 -0
- pycpet-0.0.1/CPET/source/cluster.py +503 -0
- pycpet-0.0.1/CPET/source/pca.py +228 -0
- pycpet-0.0.1/CPET/source/scripts/benchmark_radius_convergence.py +135 -0
- pycpet-0.0.1/CPET/source/scripts/benchmark_sample_step.py +86 -0
- pycpet-0.0.1/CPET/source/scripts/cpet.py +85 -0
- pycpet-0.0.1/LICENSE +21 -0
- pycpet-0.0.1/PKG-INFO +5 -0
- pycpet-0.0.1/README.md +85 -0
- pycpet-0.0.1/pycpet.egg-info/PKG-INFO +5 -0
- pycpet-0.0.1/pycpet.egg-info/SOURCES.txt +29 -0
- pycpet-0.0.1/pycpet.egg-info/dependency_links.txt +1 -0
- pycpet-0.0.1/pycpet.egg-info/top_level.txt +1 -0
- pycpet-0.0.1/setup.cfg +4 -0
- pycpet-0.0.1/setup.py +14 -0
- pycpet-0.0.1/tests/test_charge_convergence.py +102 -0
- pycpet-0.0.1/tests/test_clustering.py +0 -0
- pycpet-0.0.1/tests/test_e_field_calcs.py +138 -0
- pycpet-0.0.1/tests/test_filter.py +280 -0
- pycpet-0.0.1/tests/test_io.py +59 -0
- pycpet-0.0.1/tests/test_time_grid_field_calcs.py +233 -0
- pycpet-0.0.1/tests/test_time_point_field_calcs.py +136 -0
- pycpet-0.0.1/tests/test_topo_ops.py +114 -0
- pycpet-0.0.1/tests/test_topology.py +117 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,402 @@
|
|
|
1
|
+
from CPET.source.calculator import calculator
|
|
2
|
+
from CPET.source.cluster import cluster
|
|
3
|
+
from CPET.source.pca import pca_pycpet
|
|
4
|
+
|
|
5
|
+
import CPET.utils.visualize as visualize
|
|
6
|
+
from CPET.utils.io import save_numpy_as_dat, default_options_initializer
|
|
7
|
+
from CPET.utils.calculator import report_inside_box
|
|
8
|
+
|
|
9
|
+
from glob import glob
|
|
10
|
+
from random import choice
|
|
11
|
+
import os
|
|
12
|
+
import numpy as np
|
|
13
|
+
import warnings
|
|
14
|
+
import logging
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class CPET:
    """
    Top-level driver for PyCPET calculations.

    Normalizes the user-supplied options via ``default_options_initializer``,
    makes sure the output directory exists, and dispatches to the calculation
    selected by ``options["CPET_method"]``: streamline topologies (CPU/GPU),
    3D field/ESP volumes, single-point fields/magnitudes, clustering,
    visualization, box sanity checks, or PCA.
    """

    def __init__(self, options):
        """
        Parameters
        ----------
        options : dict
            Raw user options. Filled with defaults by
            ``default_options_initializer``; must provide at least
            ``CPET_method``, ``inputpath``, ``outputpath``, ``step_size``
            and ``dimensions``.
        """
        # Logistics
        self.options = default_options_initializer(options)
        self.logger = logging.getLogger(__name__)  # Inherit logger from cpet.py
        self.m = self.options["CPET_method"]
        self.logger.info("Instantiating CPET, running method: {}".format(self.m))
        self.inputpath = self.options["inputpath"]
        self.outputpath = self.options["outputpath"]
        self.step_size = self.options["step_size"]
        if not os.path.exists(self.outputpath):
            print(
                "Output directory does not exist in current directory, creating: \n{}".format(
                    self.outputpath
                )
            )
            os.makedirs(self.outputpath)

        # Calculation-specific settings
        self.dimensions = self.options["dimensions"]

    def run(self):
        """Dispatch to the handler for ``options["CPET_method"]``."""
        if self.m == "topo":
            self.run_topo()
        elif self.m == "topo_GPU":
            self.run_topo_GPU()
        elif self.m == "volume":
            self.run_volume()
        elif self.m == "volume_ESP":
            self.run_volume_ESP()
        elif self.m == "point_field":
            self.run_point_field()
        elif self.m == "point_mag":
            self.run_point_mag()
        elif self.m in (
            "cluster",
            "cluster_volume",
            "cluster_volume_tensor",
            "cluster_volume_esp_tensor",
        ):
            self.run_cluster()
        elif self.m == "box_check":
            self.run_box_check()
        elif self.m in ("visualize_field", "visualize_esp"):
            self.run_visualize_efield()
        elif self.m in ("pca", "pca_compare"):
            self.run_pca()
        else:
            print(
                "You have reached the limit of this package's capabilities at the moment, we do not support the function called as of yet"
            )
            exit()

    def run_topo(self, num=100000, benchmarking=False):
        """
        Compute streamline topology histograms (CPU path) for up to ``num``
        randomly chosen pdb files from the input directory, writing one
        ``<protein>.top`` file per protein. Files already present in the
        output directory are skipped.

        Parameters
        ----------
        num : int
            Maximum number of pdb files to process.
        benchmarking : bool
            If True, encode n_samples / step_size / replica in the output
            file name instead of the plain ``<protein>.top``.
        """
        files_input = glob(self.inputpath + "/*.pdb")
        if len(files_input) == 0:
            raise ValueError("No pdb files found in the input directory")
        if len(files_input) == 1:
            warnings.warn("Only one pdb file found in the input directory")
        for i in range(num):
            # Draw without replacement until the pool is exhausted.
            if len(files_input) != 0:
                file = choice(files_input)
            else:
                print("No more files to process!")
                break
            files_input.remove(file)
            protein = file.split("/")[-1].split(".")[0]
            print("protein file: {}".format(protein))
            files_done = [
                x for x in os.listdir(self.outputpath) if x.split(".")[-1] == "top"
            ]
            if protein + ".top" not in files_done:
                self.calculator = calculator(self.options, path_to_pdb=file)
                hist = self.calculator.compute_topo_complete_c_shared()
                if not benchmarking:
                    np.savetxt(self.outputpath + "/{}.top".format(protein), hist)
                if benchmarking:
                    np.savetxt(
                        self.outputpath
                        + "/{}_{}_{}_{}.top".format(
                            protein,
                            self.calculator.n_samples,
                            # Drop the leading "0." of the step size for the name.
                            str(self.calculator.step_size)[2:],
                            # BUG FIX: self.replica is never assigned in __init__,
                            # so the original raised AttributeError whenever
                            # benchmarking=True. Default to 0 unless a caller
                            # injected a replica index externally.
                            getattr(self, "replica", 0),
                        ),
                        hist,
                    )
            else:
                print("Already done for protein: {}, skipping...".format(protein))

    def run_topo_GPU(self, num=100000, benchmarking=False):
        """
        GPU variant of :meth:`run_topo`: same file selection and naming, but
        topologies come from ``compute_topo_GPU_batch_filter``.

        Parameters
        ----------
        num : int
            Maximum number of pdb files to process.
        benchmarking : bool
            If True, encode n_samples / step_size / replica in the output
            file name.
        """
        files_input = glob(self.inputpath + "/*.pdb")
        if len(files_input) == 0:
            raise ValueError("No pdb files found in the input directory")
        if len(files_input) == 1:
            warnings.warn("Only one pdb file found in the input directory")
        for i in range(num):
            if len(files_input) != 0:
                file = choice(files_input)
            else:
                break
            self.calculator = calculator(self.options, path_to_pdb=file)
            protein = self.calculator.path_to_pdb.split("/")[-1].split(".")[0]
            files_input.remove(file)
            print("protein file: {}".format(protein))
            files_done = [
                x for x in os.listdir(self.outputpath) if x.split(".")[-1] == "top"
            ]
            if protein + ".top" not in files_done:
                hist = self.calculator.compute_topo_GPU_batch_filter()
                if not benchmarking:
                    np.savetxt(self.outputpath + "/{}.top".format(protein), hist)
                if benchmarking:
                    np.savetxt(
                        self.outputpath
                        + "/{}_{}_{}_{}.top".format(
                            protein,
                            self.calculator.n_samples,
                            str(self.calculator.step_size)[2:],
                            # BUG FIX: guard the never-initialized replica
                            # attribute (see run_topo).
                            getattr(self, "replica", 0),
                        ),
                        hist,
                    )

    def run_volume(self, num=100000):
        """
        Get the electric fields along a grid of points in the box for up to
        ``num`` randomly chosen pdb files, saving each as
        ``<protein>_efield.dat`` with grid metadata. Proteins whose output
        already exists are skipped.
        """
        files_input = glob(self.inputpath + "/*.pdb")
        if len(files_input) == 0:
            raise ValueError("No pdb files found in the input directory")

        if len(files_input) == 1:
            warnings.warn("Only one pdb file found in the input directory")

        for i in range(num):
            if len(files_input) != 0:
                file = choice(files_input)
            else:
                print("No more files to process!")
                break
            self.calculator = calculator(self.options, path_to_pdb=file)
            protein = self.calculator.path_to_pdb.split("/")[-1].split(".")[0]
            files_input.remove(file)
            print("protein file: {}".format(protein))
            files_done = [
                x for x in os.listdir(self.outputpath) if x.endswith("_efield.dat")
            ]

            if protein + "_efield.dat" not in files_done:
                field_box, mesh_shape = self.calculator.compute_box()
                print(field_box.shape)
                # Grid metadata stored alongside the raw field values so the
                # volume can be reconstructed/visualized later.
                meta_data = {
                    "dimensions": self.dimensions,
                    "step_size": [self.step_size, self.step_size, self.step_size],
                    "num_steps": [mesh_shape[0], mesh_shape[1], mesh_shape[2]],
                    "transformation_matrix": self.calculator.transformation_matrix,
                    "center": self.calculator.center,
                }

                save_numpy_as_dat(
                    name=self.outputpath + "/{}_efield.dat".format(protein),
                    volume=field_box,
                    meta_data=meta_data,
                )

    def run_point_field(self):
        """
        Compute the electric field at a single point for every pdb file and
        append one ``protein:field`` line per protein to
        ``<outputpath>/point_field.dat``.
        """
        files_input = glob(self.inputpath + "/*.pdb")
        if len(files_input) == 0:
            raise ValueError("No pdb files found in the input directory")
        if len(files_input) == 1:
            warnings.warn("Only one pdb file found in the input directory")
        outfile = self.outputpath + "/point_field.dat"
        with open(outfile, "w") as f:
            for file in files_input:
                self.calculator = calculator(self.options, path_to_pdb=file)
                protein = file.split("/")[-1].split(".")[0]
                print("protein file: {}".format(protein))
                point_field = self.calculator.compute_point_field()
                f.write("{}:{}\n".format(protein, point_field))

    def run_point_mag(self):
        """
        Compute the electric-field magnitude at a single point for every pdb
        file and append one ``protein:magnitude`` line per protein to
        ``<outputpath>/point_mag.dat``.
        """
        files_input = glob(self.inputpath + "/*.pdb")
        if len(files_input) == 0:
            raise ValueError("No pdb files found in the input directory")
        if len(files_input) == 1:
            warnings.warn("Only one pdb file found in the input directory")
        outfile = self.outputpath + "/point_mag.dat"
        with open(outfile, "w") as f:
            for file in files_input:
                self.calculator = calculator(self.options, path_to_pdb=file)
                protein = file.split("/")[-1].split(".")[0]
                print("protein file: {}".format(protein))
                point_field = self.calculator.compute_point_mag()
                f.write("{}:{}\n".format(protein, point_field))

    def run_volume_ESP(self, num=100000):
        """
        Compute the electrostatic potential on a grid for up to ``num``
        randomly chosen pdb files, saving each as ``<protein>_esp.dat`` with
        grid metadata. Already-finished proteins are skipped.
        """
        files_input = glob(self.inputpath + "/*.pdb")
        if len(files_input) == 0:
            raise ValueError("No pdb files found in the input directory")
        if len(files_input) == 1:
            warnings.warn("Only one pdb file found in the input directory")
        for i in range(num):
            if len(files_input) != 0:
                file = choice(files_input)
            else:
                print("No more files to process!")
                break
            self.calculator = calculator(self.options, path_to_pdb=file)
            protein = self.calculator.path_to_pdb.split("/")[-1].split(".")[0]
            files_input.remove(file)
            print("protein file: {}".format(protein))
            # BUG FIX: the original tested x[-11:] == "_esp.dat" — an 11-char
            # slice against an 8-char suffix — which can never be True, so
            # completed ESP volumes were always recomputed.
            files_done = [
                x for x in os.listdir(self.outputpath) if x.endswith("_esp.dat")
            ]
            if protein + "_esp.dat" not in files_done:
                esp_box, mesh_shape = self.calculator.compute_box_ESP()
                print(esp_box.shape)
                meta_data = {
                    "dimensions": self.dimensions,
                    "step_size": [self.step_size, self.step_size, self.step_size],
                    "num_steps": [mesh_shape[0], mesh_shape[1], mesh_shape[2]],
                    "transformation_matrix": self.calculator.transformation_matrix,
                    "center": self.calculator.center,
                }
                save_numpy_as_dat(
                    name=self.outputpath + "/{}_esp.dat".format(protein),
                    volume=esp_box,
                    meta_data=meta_data,
                )

    def run_box_check(self, num=100000):
        """
        Report which atoms fall inside the sampling box for every pdb file.
        Incompatible with radius/resnum filtering, and the in-box filter is
        disabled so the check sees all atoms.
        """
        files_input = glob(self.inputpath + "/*.pdb")
        if len(files_input) == 0:
            raise ValueError("No pdb files found in the input directory")
        if len(files_input) == 1:
            warnings.warn("Only one pdb file found in the input directory")
        for file in files_input:
            if "filter_radius" in self.options or "filter_resnum" in self.options:
                # Error out, radius not compatible
                raise ValueError(
                    "filter_radius/filter_resnum is not compatible with box_check. Please remove from options"
                )
            # Need to not filter in box to check, but can filter all else
            self.options["filter_in_box"] = False
            self.calculator = calculator(self.options, path_to_pdb=file)
            protein = self.calculator.path_to_pdb.split("/")[-1].split(".")[0]
            print("protein file: {}".format(protein))
            report_inside_box(self.calculator)
        print("No more files to process!")

    def run_cluster(self):
        """Run the cluster analysis for the selected cluster-type method."""
        print("Running the cluster analysis. Method type: {}".format(self.m))
        self.cluster = cluster(self.options)
        self.cluster.Cluster()

    def run_visualize_efield(self):
        """
        Generate ChimeraX ``.bild`` visualizations of precomputed field
        (``*_efield.dat``) or ESP (``*_esp.dat``) volumes, one per pdb file.
        Requires the corresponding volume files to exist in the input path.
        """
        print(
            "Visualizing the electric field. This module will load a ChimeraX session with the first protein and the electric field, and requires the electric field to be computed first."
        )
        files_input_pdb = glob(self.inputpath + "/*.pdb")
        if self.m == "visualize_field":
            files_input_efield = glob(self.inputpath + "/*_efield.dat")
        elif self.m == "visualize_esp":
            files_input_esp = glob(self.inputpath + "/*_esp.dat")
        if len(files_input_pdb) == 0:
            raise ValueError("No pdb files found in the input directory")
        if len(files_input_pdb) > 1:
            warnings.warn(
                "More than one pdb file found in the input directory. Only the first will be visualized, .bild files will be generated for all of them though."
            )

        # Sort list of pdbs and efields
        files_input_pdb.sort()

        # Check to make sure each pdb file has a corresponding electric field file in the input path while visualizing fields
        for i in range(len(files_input_pdb)):
            if self.m == "visualize_field":
                # Modify efield file list to just have file name, not _efield.dat
                files_input_efield = [
                    efield.split("/")[-1].split("_efield")[0]
                    for efield in files_input_efield
                ]
                # Efield list is unsorted, so just check if the protein file is anywhere in the efield list
                if not any(
                    files_input_pdb[i].split("/")[-1].split(".")[0] in efield
                    for efield in files_input_efield
                ):
                    raise ValueError(
                        "No electric field file found for protein: {}".format(
                            files_input_pdb[i].split("/")[-1]
                        )
                    )
                print(
                    "Generating .bild file for the protein: {}".format(
                        files_input_pdb[i].split("/")[-1]
                    )
                )
                visualize.visualize_field(
                    path_to_pdb=files_input_pdb[i],
                    path_to_efield=self.inputpath
                    + "/"
                    + files_input_pdb[i].split("/")[-1].split(".")[0]
                    + "_efield.dat",
                    outputpath=self.outputpath,
                    options=self.options,
                )
            elif self.m == "visualize_esp":
                # Modify esp file list to just have file name, not _esp.dat
                files_input_esp = [
                    esp.split("/")[-1].split("_esp")[0] for esp in files_input_esp
                ]
                # Esp list is unsorted, so just check if the protein file is anywhere in the esp list
                if not any(
                    files_input_pdb[i].split("/")[-1].split(".")[0] in esp
                    for esp in files_input_esp
                ):
                    raise ValueError(
                        "No ESP file found for protein: {}".format(
                            files_input_pdb[i].split("/")[-1]
                        )
                    )
                print(
                    "Generating .bild file for the protein: {}".format(
                        files_input_pdb[i].split("/")[-1]
                    )
                )
                visualize.visualize_esp(
                    path_to_pdb=files_input_pdb[i],
                    path_to_esp=self.inputpath
                    + "/"
                    + files_input_pdb[i].split("/")[-1].split(".")[0]
                    + "_esp.dat",
                    outputpath=self.outputpath,
                    options=self.options,
                )
        # To-do: automatically visualize the electric field for the first protein, in dev mode for now

    def run_pca(self):
        """
        Run PCA on computed fields. ``pca`` handles a single directory;
        ``pca_compare`` runs PCA per variant directory from
        ``inputpath_list`` (combined-set PCA is not implemented yet).
        """
        if self.m == "pca":
            self.pca = pca_pycpet(self.options)
            self.pca.fit_and_transform()
        elif self.m == "pca_compare":
            # Check for provided directories list for comparison
            if "inputpath_list" not in self.options:
                raise ValueError(
                    "No inputpath_list provided for PCA comparison mode. Please provide a list of directories that contain field files in the output file, or use the 'pca' method instead."
                )
            if "outputpath_list" not in self.options:
                warnings.warn(
                    "No outputpath_list provided. Using default outputpath_list based on inputpath_list"
                )
                # Add 'pca_out' to the end of each input path
                self.options["outputpath_list"] = [
                    path + "/pca_out" for path in self.options["inputpath_list"]
                ]
            if self.options["pca_combined_only"] == False:
                # Run PCA for each individual variant
                for inputpath, outputpath in zip(
                    self.options["inputpath_list"], self.options["outputpath_list"]
                ):
                    self.options["inputpath"] = inputpath
                    self.options["outputpath"] = outputpath
                    print(
                        "Running PCA for variant: {}".format(inputpath.split("/")[-1])
                    )
                    self.pca = pca_pycpet(self.options)
                    self.pca.fit_and_transform()
            else:
                from CPET.utils.io import pull_mats_from_MD_folder

                # Pull all field files from all variants
                all_field_files = []
                for i in range(len(self.options["inputpath_list"])):
                    all_field_files.extend(
                        pull_mats_from_MD_folder(self.options["inputpath_list"][i])
                    )
                all_fields = np.concatenate(all_field_files, axis=0)

                # Make a directory called 'pca_combined' in the current directory
                if not os.path.exists("pca_combined"):
                    os.makedirs("pca_combined")
                self.options["outputpath"] = "./pca_combined"
                # PCA for combined set of variants
                # TBD
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
'''
|
|
2
|
+
PyCPET (c) is licensed under a MIT License.
|
|
3
|
+
|
|
4
|
+
You should have received a copy of the license along with this
|
|
5
|
+
work. If not, see <https://choosealicense.com/licenses/mit/>.
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
PYCPET INITIALISATION FILE
|
|
9
|
+
This file is needed for the PyCPET package initialisation.
|
|
10
|
+
'''
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import matplotlib.pyplot as plt
|
|
3
|
+
import seaborn as sns
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def gen_param_dist_mat(dist_mat, topo_file_list):
    """
    Average a pairwise distance matrix over parameter groups derived from
    topology file names, plot the result as a heatmap, and return it.

    The common ``<a>_<b>_<c>_`` prefix of the first file name is stripped
    from every label; each remaining label is grouped by its third- and
    second-to-last underscore-separated fields. Row and column entries that
    share a group are mean-aggregated and the result is symmetrized.

    Parameters
    ----------
    dist_mat : array-like
        Square pairwise distance matrix aligned with ``topo_file_list``.
    topo_file_list : list[str]
        Paths to the ``.top`` files, one per row/column of ``dist_mat``.

    Returns
    -------
    pandas.DataFrame
        The symmetrized, group-averaged distance matrix.
    """
    frame = pd.DataFrame(dist_mat)

    # Common prefix shared by all file names: first three "_" fields of the
    # first file's basename, with a trailing underscore.
    head_fields = topo_file_list[0].split("/")[-1].split("_")
    prefix = "{}_{}_{}_".format(head_fields[0], head_fields[1], head_fields[2])

    # Strip extension, directory, and the shared prefix from each file name.
    tags = []
    for path in topo_file_list:
        tags.append(path.replace(".top", "").split("/")[-1].replace(prefix, ""))

    # Each tag's group is its third- and second-to-last "_" fields joined.
    group_map = {}
    for tag in tags:
        pieces = tag.split("_")
        group_map[tag] = pieces[-3] + "_" + pieces[-2]
    grouped_labels = [group_map[tag] for tag in tags]
    print(group_map)
    print(grouped_labels)

    # Relabel both axes with the group names so groupby can aggregate.
    frame.columns = grouped_labels
    frame.index = grouped_labels

    # Mean within each group along rows, then along columns.
    row_means = frame.groupby(level=0).mean()
    averaged_distances = row_means.T.groupby(level=0).mean()

    # Force exact symmetry.
    averaged_distances = (averaged_distances + averaged_distances.T) / 2

    # (Optional) Plot the distance matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(averaged_distances, cmap="Greens_r", annot=True, linewidths=0.1)
    plt.title("Averaged Distance Matrix")
    plt.show()

    return averaged_distances
|