cohere-ui 4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cohere_ui/__init__.py +0 -0
- cohere_ui/api/__init__.py +0 -0
- cohere_ui/api/auto_data.py +245 -0
- cohere_ui/api/balancer.py +254 -0
- cohere_ui/api/common.py +131 -0
- cohere_ui/api/convertconfig.py +314 -0
- cohere_ui/api/host_utils.py +42 -0
- cohere_ui/api/mpi_cmd.py +92 -0
- cohere_ui/api/multipeak.py +343 -0
- cohere_ui/api/postprocess_utils.py +383 -0
- cohere_ui/api/reconstruction_ga.py +315 -0
- cohere_ui/api/reconstruction_populous.py +180 -0
- cohere_ui/api/reconstruction_populous_ga.py +265 -0
- cohere_ui/api/symdata.py +57 -0
- cohere_ui/api/te_preprocess.py +168 -0
- cohere_ui/api/te_rec.py +34 -0
- cohere_ui/api/view_reconstruction.py +67 -0
- cohere_ui/beamline_preprocess.py +173 -0
- cohere_ui/beamline_visualization.py +334 -0
- cohere_ui/cohere_gui.py +3435 -0
- cohere_ui/copy_setup.py +131 -0
- cohere_ui/create_aps34idc_experiment.py +222 -0
- cohere_ui/everything.py +73 -0
- cohere_ui/run_reconstruction.py +392 -0
- cohere_ui/simple_phasing.py +151 -0
- cohere_ui/standard_preprocess.py +111 -0
- cohere_ui-4.3.dist-info/METADATA +54 -0
- cohere_ui-4.3.dist-info/RECORD +32 -0
- cohere_ui-4.3.dist-info/WHEEL +5 -0
- cohere_ui-4.3.dist-info/entry_points.txt +10 -0
- cohere_ui-4.3.dist-info/licenses/LICENSE +18 -0
- cohere_ui-4.3.dist-info/top_level.txt +1 -0
cohere_ui/__init__.py
ADDED
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
|
|
3
|
+
# #########################################################################
|
|
4
|
+
# Copyright (c) , UChicago Argonne, LLC. All rights reserved. #
|
|
5
|
+
# #
|
|
6
|
+
# See LICENSE file. #
|
|
7
|
+
# #########################################################################
|
|
8
|
+
|
|
9
|
+
"""
|
|
10
|
+
This file contains a suite of scripts related to the auto_data setting.
|
|
11
|
+
While preprocessing with auto option the scripts determine which scans are outliers, i.e. have
|
|
12
|
+
the greatest correlation error with relation to all other scans in the set.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
__author__ = "Paul Frosik"
|
|
16
|
+
__docformat__ = 'restructuredtext en'
|
|
17
|
+
__all__ = ['get_ref_correlation_err',
|
|
18
|
+
'find_outliers_in_batch',
|
|
19
|
+
'find_outlier_scans']
|
|
20
|
+
|
|
21
|
+
import os
|
|
22
|
+
import importlib
|
|
23
|
+
import cohere_core.utilities as ut
|
|
24
|
+
import cohere_core.utilities.dvc_utils as dvut
|
|
25
|
+
import shutil
|
|
26
|
+
from multiprocessing import Queue, Process, Pool
|
|
27
|
+
from functools import partial
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def set_lib(pkg):
    """
    Sets the module level device library and propagates the choice to dvc_utils.

    :param pkg: str
        'cp' selects the cupy based library, any other value selects the numpy based library
    """
    global devlib
    # the library object is an attribute named after the module that carries it
    if pkg != 'cp':
        lib_module, lib_attr = 'cohere_core.lib.nplib', 'nplib'
    else:
        lib_module, lib_attr = 'cohere_core.lib.cplib', 'cplib'
    devlib = getattr(importlib.import_module(lib_module), lib_attr)
    dvut.set_lib_from_pkg(pkg)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def get_ref_correlation_err(experiment_dir, scans, scan):
    """
    Calculates the mean of correlation errors between the given scan and all other scans in the batch.

    :param experiment_dir: str
        path to cohere experiment
    :param scans: list of int
        list of scans included in the batch
    :param scan: int
        scan number the mean correlation error is calculated for
    :return: tuple
        (mean of all correlation errors, scan)
    """
    def load_scan(scan_no):
        # read the preprocessed data of one scan and pass it to the device library
        tif_file = ut.join(experiment_dir, f'scan_{str(scan_no)}', 'preprocessed_data', 'prep_data.tif')
        return devlib.from_numpy(ut.read_tif(tif_file))

    reference = load_scan(scan)
    total_err = 0
    for other in scans:
        if other != scan:
            total_err += dvut.correlation_err(reference, load_scan(other))
    # the reference scan is not compared with itself, hence one less than the batch size
    return (total_err / (len(scans) - 1), scan)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def find_outliers_in_batch(experiment_dir, scans, q, no_processes):
    """
    Used by auto-data. Finds the outlier scans in a batch, i.e. in the scans that are added together into a data file.

    For each scan the mean correlation error against all other scans of the batch is obtained with
    get_ref_correlation_err. A scan is an outlier when its error is greater than the mean of all the
    errors plus their population standard deviation. The list of outlier scans is put on the queue,
    where it is consumed by the calling process.

    :param experiment_dir: str
        path to the cohere experiment
    :param scans: list
        list of scans in the batch
    :param q: Queue
        a queue used to pass the outlier scans calculated for this batch
    :param no_processes: int
        number of processes allocated to this computing
    :return:
    """
    from statistics import mean, pstdev

    if no_processes > 1:
        # several processes are allowed, spread the scans over a pool
        worker = partial(get_ref_correlation_err, experiment_dir, scans)
        with Pool(processes=no_processes) as pool:
            pending = pool.map_async(worker, scans)
            pool.close()
            pool.join()
            err_scan = list(pending.get())
    else:
        # a single process, go through the scans one by one
        err_scan = [get_ref_correlation_err(experiment_dir, scans, s) for s in scans]

    # the statistics are calculated on plain python numbers
    errors = [pair[0].item() for pair in err_scan]
    threshold = mean(errors) + pstdev(errors)
    q.put([s for (value, s) in err_scan if value > threshold])
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def find_outlier_scans(experiment_dir, scans_datainfo, separate_ranges):
    """
    Finds outlier scans in batches that contain more than 3 scans.

    The preprocessed data of each scan is expected in the scan directories of the experiment.
    The function finds available resources and starts a process for each batch to find its outliers.
    The outlier scans are received through a queue from each process. The individual scan
    directories are removed before the function returns.

    :param experiment_dir: str
        path to the cohere experiment
    :param scans_datainfo: list
        list of batches; each batch is a list of entries whose first element is the scan number
    :param separate_ranges: boolean
        if True each batch is evaluated separately and only batches with more than 3 scans are
        considered; otherwise all scans are merged into a single batch that must hold more than 3 scans
    :return: list of int
        sorted list of outlier scans, empty if no batch qualified
    """
    def remove_scan_dirs():
        # remove individual scan directories
        for scan_dir in os.listdir(experiment_dir):
            if scan_dir.startswith('scan'):
                shutil.rmtree(ut.join(experiment_dir, scan_dir))

    if separate_ranges:
        auto_batches = [batch for batch in scans_datainfo if len(batch) > 3]
        if len(auto_batches) == 0:
            remove_scan_dirs()
            return []
    else:
        auto_batches = [s_d for batch in scans_datainfo for s_d in batch]
        if len(auto_batches) <= 3:
            remove_scan_dirs()
            return []
        # make it a single sub-list
        auto_batches = [auto_batches]

    print('finding outliers')

    # find all (scan, directory) tuples in auto_batches
    single_scans_dinfo = [s_d for batch in auto_batches for s_d in batch]

    # this code determines which library to use and how many scans can be processed concurrently
    gpu_processes = 0
    try:
        import cupy
        first_scan = single_scans_dinfo[0][0]
        data_size = ut.read_tif(ut.join(experiment_dir, f'scan_{str(first_scan)}', 'preprocessed_data', 'prep_data.tif')).size
        job_size = data_size * 67 / 1000000. + 84  # empirically found constants
        # use the first GPU
        # NOTE(review): the arguments are passed as (job_size, [0]); confirm this matches the
        # parameter order of ut.get_avail_gpu_runs - a mismatch raises here and silently selects numpy
        avail_devs_dict = ut.get_avail_gpu_runs(job_size, [0])
        # the run counts may be floats (result of floor division), so convert before adding up
        gpu_processes = sum(int(v) for v in avail_devs_dict.values())
    except Exception:
        # best effort: any failure while probing the GPU falls back to numpy
        gpu_processes = 0

    if gpu_processes > 0:
        pkg = 'cp'
        available_processes = gpu_processes
    else:
        # no usable GPU capacity; without this fallback the chunk size below would be 0
        # and the chunking loop would never consume the batches
        pkg = 'np'
        available_processes = (os.cpu_count() or 1) * 2
    set_lib(pkg)

    # the available processes will be distributed among processes for each batch, i.e. scan range
    no_concurrent = available_processes // len(auto_batches)
    # in case when number of batches is greater than available processes
    # the chunking will handle all batches

    # find outliers in each batch
    q = Queue()

    outliers = []
    chunk_size = max(1, available_processes)
    while auto_batches:
        chunk, auto_batches = auto_batches[:chunk_size], auto_batches[chunk_size:]

        processes = []
        for batch in chunk:
            scans_in_batch = [s_d[0] for s_d in batch]
            p = Process(target=find_outliers_in_batch, args=(experiment_dir, scans_in_batch, q, no_concurrent))
            processes.append(p)
            p.start()

        # each process puts exactly one list on the queue; drain it before joining
        for _ in processes:
            outliers.extend(q.get())

        for p in processes:
            p.join()

    # remove individual scan directories
    remove_scan_dirs()

    outliers.sort()
    return outliers
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
# def auto_separate_scans(experiment_dir, prep_obj, no_auto_batches):
|
|
203
|
+
# # this code determines which library to use and how many scans can be processed concurrently
|
|
204
|
+
# try:
|
|
205
|
+
# import cupy
|
|
206
|
+
# lib = 'cp'
|
|
207
|
+
# no_concurrent = 1
|
|
208
|
+
# except:
|
|
209
|
+
# lib = 'np'
|
|
210
|
+
# # the available processes will be distributed among processes for each batch, i.e. scan range
|
|
211
|
+
# no_concurrent = os.cpu_count() * 2 // no_auto_batches
|
|
212
|
+
# set_lib(lib)
|
|
213
|
+
#
|
|
214
|
+
# print('finding outliers')
|
|
215
|
+
# dirs = []
|
|
216
|
+
# scans = []
|
|
217
|
+
# arrs = []
|
|
218
|
+
# batches = prep_obj.get_batches()
|
|
219
|
+
# for batch in batches:
|
|
220
|
+
# dirs.extend(batch[0])
|
|
221
|
+
# scans.extend(batch[1])
|
|
222
|
+
# # for dir in dirs:
|
|
223
|
+
# # arr = devlib.from_numpy(prep_obj.read_scan(dir))
|
|
224
|
+
# # arrs.append(arr)
|
|
225
|
+
# arr = devlib.from_numpy(prep_obj.read_scan(dirs[0]))
|
|
226
|
+
# arrs.append((arr))
|
|
227
|
+
# arr1 = devlib.from_numpy(prep_obj.read_scan(dirs[1]))
|
|
228
|
+
# arrs.append((arr1))
|
|
229
|
+
# # # save scans
|
|
230
|
+
# # process_separate_scans(prep_obj, dirs, scans, experiment_dir)
|
|
231
|
+
# print(scans)
|
|
232
|
+
# errs = []
|
|
233
|
+
# refarr = prep_obj.read_scan(dirs[0])
|
|
234
|
+
# for dir in dirs[1:]:
|
|
235
|
+
# arr = prep_obj.read_scan(dir)
|
|
236
|
+
# errs.append(dvut.correlation_err(devlib.from_numpy(refarr), devlib.from_numpy(arr)))
|
|
237
|
+
# refarr = arr
|
|
238
|
+
# errs = [e.item() for e in errs]
|
|
239
|
+
# for i in range(0,len(errs)):
|
|
240
|
+
# print(scans[i+1], errs[i])
|
|
241
|
+
# refarr = prep_obj.read_scan(dirs[3])
|
|
242
|
+
# arr = prep_obj.read_scan(dirs[7])
|
|
243
|
+
# print(errs)
|
|
244
|
+
#
|
|
245
|
+
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
# #########################################################################
|
|
2
|
+
# Copyright (c) , UChicago Argonne, LLC. All rights reserved. #
|
|
3
|
+
# #
|
|
4
|
+
# See LICENSE file. #
|
|
5
|
+
# #########################################################################
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
cohere_core.utils
|
|
9
|
+
=================
|
|
10
|
+
|
|
11
|
+
This module returns available, balanced devices suited for given job.
|
|
12
|
+
"""
|
|
13
|
+
import os
|
|
14
|
+
import ast
|
|
15
|
+
import GPUtil
|
|
16
|
+
from functools import reduce
|
|
17
|
+
import cohere_core.utilities as ut
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
__author__ = "Barbara Frosik"
|
|
21
|
+
__copyright__ = "Copyright (c), UChicago Argonne, LLC."
|
|
22
|
+
__docformat__ = 'restructuredtext en'
|
|
23
|
+
__all__ = [
|
|
24
|
+
'estimate_no_proc',
|
|
25
|
+
'get_avail_gpu_runs',
|
|
26
|
+
'get_gpu_use',
|
|
27
|
+
'get_one_dev',
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
def estimate_no_proc(arr_size, factor):
    """
    Estimates the number of processes the prep can be run on. The number is limited by the number
    of cpus and by the currently available memory in relation to the size of array.

    Parameters
    ----------
    arr_size : int
        size of array
    factor : int
        an estimate of how much memory is required to process comparing to array size
    Returns
    -------
    int
        number of processes
    """
    import psutil
    from multiprocessing import cpu_count

    cpu_bound = cpu_count()
    # how many arrays, scaled by factor, fit in the available memory
    mem_bound = psutil.virtual_memory().available / (factor * arr_size)
    # the tighter of the two limits wins
    return cpu_bound if mem_bound > cpu_bound else int(mem_bound)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def get_avail_gpu_runs(devices, run_mem):
    """
    Finds how many jobs of run_mem size can run on each of the configured GPUs on local host.

    :param devices: list or string
        list of GPU IDs or 'all' if configured to use all available GPUs
    :param run_mem: int
        size of GPU memory (in MB) needed for one job
    :return: dict
        pairs of GPU ID, number of jobs that fit in the free memory of that GPU
    """
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    # keep only the configured GPUs; free memory floor-divided by job size gives the job count
    return {
        gpu.id: gpu.memoryFree // run_mem
        for gpu in GPUtil.getGPUs()
        if devices == 'all' or gpu.id in devices
    }
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def get_avail_hosts_gpu_runs(devices, run_mem):
    """
    This function is called in a cluster configuration case, i.e. devices parameter is configured as dictionary of hostnames and GPU IDs (either list of the IDs or 'all' for all GPUs per host).
    It starts mpi subprocess that runs the host_utils.py script, located next to this module, on each of the configured hosts. Each line of the subprocess output is parsed as a python literal; its first element is used as a key (hostname) and its second element as the value (available GPUs) of the returned dictionary.

    :param devices: dict
        configured devices, keys are hostnames
    :param run_mem: int
        size of GPU memory needed for one job, passed to the host_utils.py script
    :return: dict
        keys are the first elements and values are the second elements of the entries printed by the subprocess
    """
    # subprocess is not imported at the top of this module, import it here so the call does not raise NameError
    import subprocess

    hosts = ','.join(devices.keys())
    script = ut.join(os.path.realpath(os.path.dirname(__file__)), 'host_utils.py')
    command = ['mpiexec', '-n', str(len(devices)), '--host', hosts, 'python', script, str(devices), str(run_mem)]
    result = subprocess.run(command, stdout=subprocess.PIPE, text=True).stdout
    mem_map = {}
    for entry in result.splitlines():
        if not entry.strip():
            # blank lines carry no data and cannot be parsed as a literal
            continue
        host_devs = ast.literal_eval(entry)
        mem_map[host_devs[0]] = host_devs[1]
    return mem_map
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def get_balanced_load(avail_runs, runs):
    """
    This function distributes the runs proportionally to the GPUs availability.
    If number of available runs is less or equal to the requested runs, the input parameter avail_runs becomes load.
    The function also returns number of available runs.
    The input dictionary is not modified.

    :param avail_runs: dict
        keys are GPU IDs, and values are available runs
        for cluster configuration the keys are prepended with the hostnames
    :param runs: int
        number of requested jobs
    :return: dict, int
        a dictionary with the same structure as avail_runs input parameter, but with values indicating runs modified to achieve balanced distribution,
        and the number of runs that were allocated. An empty avail_runs results in ({}, 0).
    """
    if len(avail_runs) == 0:
        # keep the (load, number of runs) shape that the callers unpack
        return {}, 0

    # if total number of available runs is less or equal runs, return the avail_runs,
    # and total number of available jobs
    total_available = sum(avail_runs.values())
    if total_available <= runs:
        return avail_runs, total_available

    # initialize variables for calculations; work on a copy so the caller's dict stays intact
    remaining = dict(avail_runs)
    need_runs = runs
    available = total_available
    load = {}

    # add one run from each available
    for k, v in avail_runs.items():
        if v > 0:
            load[k] = 1
            remaining[k] = v - 1
            need_runs -= 1
            if need_runs == 0:
                return load, runs
            available -= 1

    # use proportionally from available
    distributed = 0
    ratio = need_runs / available
    for k, v in remaining.items():
        if v > 0:
            share = int(v * ratio)
            load[k] += share
            remaining[k] = v - share
            distributed += share
    need_runs -= distributed

    if need_runs > 0:
        # need to add the few remaining, that were lost to rounding down of the shares
        for k, v in remaining.items():
            if v > 0:
                load[k] += 1
                need_runs -= 1
                if need_runs == 0:
                    break

    return load, runs
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def get_gpu_use(devices, no_jobs, job_size):
    """
    Determines available GPUs that match configured devices, and selects the optimal distribution of jobs on available devices. If devices is configured as dict (i.e. cluster configuration) then a file "hostfile_<pid>" is created in the running directory. This file contains hosts names and number of jobs to run on that host.
    Parameters
    ----------
    devices : list or dict or 'all'
        Configured parameter. list of GPU ids to use for jobs or 'all' if all GPUs should be used. If cluster configuration, then
        it is dict with keys being host names.
    no_jobs : int
        wanted number of jobs
    job_size : float
        a GPU memory requirement to run one job
    Returns
    -------
    picked_devs : list or list of lists(if cluster conf)
        list of GPU ids that were selected for the jobs
    available jobs : int
        number of jobs allocated on all GPUs
    hostfile_name : str or None
        name of the created hosts file if cluster configuration, None otherwise
    """

    def unpack_load(load):
        # repeat each GPU id as many times as the number of jobs assigned to it
        picked = []
        for dev_id, count in load.items():
            picked.extend([dev_id] * int(count))
        return picked

    def balance(avail):
        # nothing is available, so there is nothing to balance
        if not avail:
            return {}, 0
        return get_balanced_load(avail, no_jobs)

    if not isinstance(devices, dict):    # a configuration for local host
        hostfile_name = None
        avail_jobs = get_avail_gpu_runs(devices, job_size)
        balanced_load, avail_jobs_no = balance(avail_jobs)
        picked_devs = unpack_load(balanced_load)
    else:   # cluster configuration
        hosts_avail_jobs = get_avail_hosts_gpu_runs(devices, job_size)
        avail_jobs = {}
        # collapse the host dict into one dict by adding hostname in front of key (gpu id)
        for k, v in hosts_avail_jobs.items():
            host_runs = {(f'{k}_{str(kv)}'): vv for kv, vv in v.items()}
            avail_jobs.update(host_runs)
        balanced_load, avail_jobs_no = balance(avail_jobs)

        # un-collapse the balanced load by hosts
        host_balanced_load = {}
        for k, v in balanced_load.items():
            # the gpu id follows the last underscore, the hostname itself may contain underscores
            idx = k.rfind('_')
            host = k[:idx]
            if host not in host_balanced_load:
                host_balanced_load[host] = {}
            host_balanced_load[host].update({int(k[idx + 1:]): v})

        # create hosts file and return corresponding picked devices
        hosts_picked_devs = [(k, unpack_load(v)) for k, v in host_balanced_load.items()]

        picked_devs = []
        hostfile_name = f'hostfile_{os.getpid()}'
        # text mode translates '\n' to the platform line separator; the context manager
        # closes the file even if writing fails
        with open(hostfile_name, mode='w') as host_file:
            for h, ds in hosts_picked_devs:
                host_file.write(f'{h}:{len(ds)}\n')
                picked_devs.append(ds)

    return picked_devs, int(min(avail_jobs_no, no_jobs)), hostfile_name
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def get_one_dev(ids):
    """
    Returns ID of the GPU that is included in the configuration, is on a local node, and has the most free memory.

    :param ids: list or string or dict
        list of gpu ids, or string 'all' indicating all GPUs included, or dict by hostname
    :return: int
        selected GPU ID, or -1 if no configured GPU with free memory was found
    """
    import socket

    if issubclass(type(ids), dict):
        # cluster configuration, narrow down to the devices configured for the local host
        ids = ids[socket.gethostname()]

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

    best_dev, best_mem = -1, 0
    for gpu in GPUtil.getGPUs():
        if ids != 'all' and gpu.id not in ids:
            # not a configured device
            continue
        free_mem = gpu.memoryFree
        # strict comparison, the first GPU found wins a tie
        if free_mem > best_mem:
            best_dev, best_mem = gpu.id, free_mem
    return best_dev
|
cohere_ui/api/common.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# #########################################################################
|
|
2
|
+
# Copyright (c) , UChicago Argonne, LLC. All rights reserved. #
|
|
3
|
+
# #
|
|
4
|
+
# See LICENSE file. #
|
|
5
|
+
# #########################################################################
|
|
6
|
+
|
|
7
|
+
import sys
|
|
8
|
+
import os
|
|
9
|
+
import cohere_core.utilities as ut
|
|
10
|
+
import cohere_ui.api.convertconfig as conv
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def get_config_maps(experiment_dir, configs, **kwargs):
    """
    Reads the main configuration file and the configuration files included in configs list, and returns them as dictionaries.
    The main config is verified and checked for converter version. If needed the configuration
    files are converted to the latest version. Requested configuration files that do not exist are skipped.

    :param experiment_dir: str
        directory where the experiment files are located
    :param configs: list str
        list of configuration files key names requested by calling function
        The main config is always processed.
    :param kwargs: var parameters
        may contain:
        - rec_id : reconstruction id, pointing to alternate config
        - no_verify : boolean switch; if True a verification error does not raise exception
    :return: tuple
        dictionary of configuration dictionaries keyed by configuration name ('config' for the main one),
        boolean value telling if conversion happened
    :raises ValueError: if the main config is missing, if beamline is not configured when a beamline
        specific configuration is requested, or if verification failed and no_verify is not set
    """
    ignore_verify_errors = kwargs.pop('no_verify', False)

    # always get main config
    conf_dir = ut.join(experiment_dir, 'conf')
    main_conf = ut.join(conf_dir, 'config')
    if not os.path.isfile(main_conf):
        raise ValueError('no main config, exiting.')
    main_config_map = ut.read_config(main_conf)

    msg = ut.verify('config', main_config_map)
    if len(msg) > 0 and not ignore_verify_errors:
        raise ValueError(msg)

    # convert configuration files if different converter version
    needs_conversion = (
        'converter_ver' not in main_config_map
        or conv.get_version() is None
        or conv.get_version() > main_config_map['converter_ver']
    )
    converted = False
    if needs_conversion:
        conv.convert(conf_dir)
        # read again, as the conversion was run on the files in conf_dir
        main_config_map = ut.read_config(main_conf)
        converted = True

    maps = {'config': main_config_map}

    beam_ver = None
    if 'config_instr' in configs or 'config_mp' in configs:
        # the configuration file applies to specific beamline and the verifier needs to be imported
        beamline = main_config_map.get('beamline', None)
        if beamline is None:
            raise ValueError(f'cannot import cohere_ui.beamlines.{beamline} module, exiting.')
        import importlib
        beam_ver = importlib.import_module(f'cohere_beamlines.{beamline}.beam_verifier')

    # general configs are verified by utilities, beamline specific ones by the imported verifier
    verifier_map = dict.fromkeys(('config_data', 'config_rec', 'config_prep', 'config_disp'), ut)
    verifier_map.update(config_instr=beam_ver, config_mp=beam_ver)

    rec_id = kwargs.get('rec_id')
    for conf in configs:
        # special case for rec_id, the alternate reconstruction config has the id appended
        use_alternate = rec_id is not None and conf == 'config_rec'
        file_name = f'{conf}_{rec_id}' if use_alternate else conf
        conf_file = ut.join(experiment_dir, 'conf', file_name)
        if os.path.isfile(conf_file):
            config_map = ut.read_config(conf_file)
            msg = verifier_map[conf].verify(conf, config_map)
            if len(msg) > 0 and not ignore_verify_errors:
                raise ValueError(msg)
            maps[conf] = config_map

    return maps, converted
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def get_pkg(proc, dev):
    """
    Resolves the configured "processing" value to the key of the package used for processing.

    :param proc: str
        configured processing: 'auto', 'cp', 'torch', or 'np'
    :param dev: list
        configured devices; the value [-1] is treated as "no device defined"
    :return: str
        'cp', 'torch', or 'np'. For 'auto' the first usable option is selected in this order:
        'cp' if cupy can be imported and a device is defined, 'torch' if torch can be imported,
        otherwise 'np'.
    :raises ValueError: if the explicitly requested package cannot be imported, if cupy is requested
        on Mac or without a defined device, or if proc is not one of the supported values
    """
    if proc == 'auto':
        # cupy is only usable when a device is defined; otherwise go on to the next option
        # instead of leaving 'cp' selected without a device
        if dev != [-1]:
            try:
                import cupy
                return 'cp'
            except Exception:
                pass    # best effort probe, try the next option
        try:
            import torch
            return 'torch'
        except Exception:
            return 'np'

    if proc == 'cp':
        if sys.platform == 'darwin':
            raise ValueError('cupy is not supported by Mac, select different processing')
        try:
            import cupy
        except Exception as exc:
            raise ValueError('cupy is not installed, select different processing') from exc
        # checked outside of the try block so this error is not reported as a missing cupy
        if dev == [-1]:
            raise ValueError('when using cupy processing, define device')
        return 'cp'

    if proc == 'torch':
        try:
            import torch
        except Exception as exc:
            raise ValueError('torch is not installed, select different processing') from exc
        return 'torch'

    if proc == 'np':
        return 'np'

    err_msg = f'invalid "processing" value, {proc} is not supported'
    raise ValueError(err_msg)
|