graphviper 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graphviper/__init__.py +0 -0
- graphviper/dask/__init__.py +1 -0
- graphviper/dask/_scheduler.py +208 -0
- graphviper/dask/_worker.py +71 -0
- graphviper/dask/client.py +248 -0
- graphviper/graph_tools/__init__.py +8 -0
- graphviper/graph_tools/append.py +0 -0
- graphviper/graph_tools/coordinate_utils.py +243 -0
- graphviper/graph_tools/map.py +143 -0
- graphviper/graph_tools/reduce.py +34 -0
- graphviper/logger/__init__.py +1 -0
- graphviper/logger/logger.py +86 -0
- graphviper/parameter_checking/__init__.py +2 -0
- graphviper/parameter_checking/check_logger_parms.py +56 -0
- graphviper/parameter_checking/check_parms.py +335 -0
- graphviper/utils/__init__.py +1 -0
- graphviper/utils/display.py +10 -0
- graphviper-0.0.1.dist-info/LICENSE.txt +28 -0
- graphviper-0.0.1.dist-info/METADATA +78 -0
- graphviper-0.0.1.dist-info/RECORD +22 -0
- graphviper-0.0.1.dist-info/WHEEL +5 -0
- graphviper-0.0.1.dist-info/top_level.txt +1 -0
graphviper/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .client import local_client, slurm_cluster_client
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
"""
|
|
2
|
+
MIT License
|
|
3
|
+
|
|
4
|
+
Copyright (c) 2022 Jonathan Simon Kenyon
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in all
|
|
14
|
+
copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
22
|
+
SOFTWARE.
|
|
23
|
+
"""
|
|
24
|
+
from collections import defaultdict
|
|
25
|
+
from distributed import SchedulerPlugin
|
|
26
|
+
from dask.core import reverse_dict
|
|
27
|
+
from dask.base import tokenize
|
|
28
|
+
from dask.order import graph_metrics, ndependencies
|
|
29
|
+
import click
|
|
30
|
+
from distributed.diagnostics.plugin import SchedulerPlugin
|
|
31
|
+
import numpy as np
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def unravel_deps(hlg_deps, name, unravelled_deps=None):
    """Collect every direct and transitive dependency of a task.

    The graph is walked iteratively with a visited set, so a node shared by
    several paths is expanded only once and arbitrarily deep graphs do not hit
    the interpreter recursion limit.

    Parameters
    ----------
    hlg_deps : mapping
        Maps each key to an iterable of the keys it directly depends on.
        Every key reached during the walk must be present in the mapping.
    name : hashable
        Key whose dependencies are collected. ``name`` itself is only part of
        the result if the graph contains a cycle leading back to it.
    unravelled_deps : set, optional
        Accumulator that is updated in place and returned. A new set is
        created when omitted.

    Returns
    -------
    set
        ``unravelled_deps`` extended with all dependencies of ``name``.

    Raises
    ------
    KeyError
        If a reached key has no entry in ``hlg_deps``.
    """

    if unravelled_deps is None:
        unravelled_deps = set()

    # Track visits separately from the accumulator so that a pre-populated
    # accumulator cannot hide part of the graph from the walk.
    visited = set()
    pending = [name]

    while pending:
        current = pending.pop()
        for dep in hlg_deps[current]:
            if dep not in visited:
                visited.add(dep)
                pending.append(dep)

    unravelled_deps |= visited

    return unravelled_deps
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def get_node_depths(dependencies, root_nodes, metrics):
    """Compute a depth value for every key in ``dependencies``.

    For each key, the root nodes among its transitive dependencies are looked
    up. The depth is the largest difference between the last metrics entry of
    such a root and the last metrics entry of the key itself. Keys that reach
    no root node get depth 0.

    Parameters
    ----------
    dependencies : dict
        Maps each key to the set of keys it directly depends on.
    root_nodes : set
        Keys that have no dependencies.
    metrics : mapping
        Maps each key to a sequence; only its last element is used
        (presumably the max-height entry of ``dask.order.graph_metrics`` -
        confirm against the installed dask version).

    Returns
    -------
    dict
        Maps each key of ``dependencies`` to its depth.
    """
    depths = {}

    for node in dependencies:
        reachable_roots = root_nodes & unravel_deps(dependencies, node)
        base = metrics[node][-1]

        if not reachable_roots:
            depths[node] = 0
            continue

        depths[node] = max(metrics[root][-1] - base for root in reachable_roots)

    return depths
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class schedular(SchedulerPlugin):
    """Scheduler plugin offering per-node resource labels and task pinning.

    Two independent features are controlled by the constructor flags:

    * ``local_cache`` - every worker that joins gets a resource named after
      the host part of its address (see ``add_worker``).
    * ``autorestrictor`` - on every graph update, tasks are grouped by the
      root nodes they depend on and each group is restricted to one worker
      (see ``update_graph``).
    """

    def __init__(self, autorestrictor, local_cache):
        """Store the feature switches.

        Parameters
        ----------
        autorestrictor : bool
            Enable worker restrictions in ``update_graph``.
        local_cache : bool
            Enable per-host resource labels in ``add_worker``.
        """
        self.autorestrictor = autorestrictor
        self.local_cache = local_cache
        super().__init__()

    def add_worker(self, scheduler, worker):
        """Register a resource named after the worker's host.

        ``worker`` is sliced between its last "/" and its last ":"
        (presumably an address such as ``tcp://10.0.0.1:1234`` - confirm), and
        the resulting text is added as a resource with quantity 1, so that
        tasks that require a specific node can be assigned to the correct
        worker.
        """
        if self.local_cache:
            ip = worker[worker.rfind("/") + 1 : worker.rfind(":")]
            scheduler.add_resources(worker=worker, resources={ip: 1})

    def update_graph(self, scheduler, dsk=None, keys=None, restrictions=None, **kw):
        """Processes dependencies to assign tasks to specific workers.

        Does nothing unless ``autorestrictor`` is enabled and
        ``kw["dependencies"]`` is non-empty. Otherwise the graph is split into
        groups of tasks that share root nodes; each group is handed to the
        currently least loaded worker by adding that worker to the
        ``worker_restrictions`` of every task in the group and clearing
        ``loose_restrictions``.

        NOTE(review): the block nesting below was reconstructed from a
        flattened listing; in particular the fall-back ``return`` is placed
        inside the ``while`` loop - confirm against the packaged file.
        """
        if not self.autorestrictor:
            return

        print("Using autorestrictor")
        workers = list(scheduler.workers.keys())
        n_worker = len(workers)

        tasks = scheduler.tasks
        dependencies = kw["dependencies"]

        if not dependencies:
            return

        dependents = reverse_dict(dependencies)

        _, total_dependencies = ndependencies(dependencies, dependents)
        # TODO: Avoid calling graph metrics.
        metrics = graph_metrics(dependencies, dependents, total_dependencies)

        # Terminal nodes have no dependents, root nodes have no dependencies.
        # Horizontal partition nodes are initialized as the terminal nodes.
        part_nodes = {k for (k, v) in dependents.items() if not v}
        root_nodes = {k for (k, v) in dependencies.items() if not v}

        # Figure out the depth of every task. Depth is defined as maximum
        # distance from a root node. TODO: Optimize get_node_depths.
        node_depths = get_node_depths(dependencies, root_nodes, metrics)
        max_depth = max(node_depths.values())

        # If we have fewer partition nodes than workers, we cannot utilise all
        # the workers and are likely dealing with a reduction. We work our way
        # back through the graph, starting at the deepest terminal nodes, and
        # try to find a depth at which there was enough work to utilise all
        # workers.
        while len(part_nodes) < n_worker and max_depth > 0:
            for pn in part_nodes.copy():
                if node_depths[pn] == max_depth:
                    # Replace the partition node by its direct dependencies.
                    part_nodes.discard(pn)
                    part_nodes |= dependencies[pn]
            max_depth -= 1
            if max_depth <= 0:
                return  # In this case, there is nothing we can do - fall back.

        part_roots = {}
        part_dependencies = {}
        part_dependents = {}

        for pn in part_nodes:
            # Get dependencies per partition node.
            part_dependencies[pn] = unravel_deps(dependencies, pn)
            # Get dependents per partition node.
            part_dependents[pn] = unravel_deps(dependents, pn)
            # Associate partition nodes with root nodes.
            part_roots[pn] = root_nodes & part_dependencies[pn]

        # Create a unique token for each set of partition roots. TODO: This is
        # very strict. What about nodes with very similar roots? Tokenization
        # may be overkill too.
        root_tokens = {tokenize(*sorted(v)): v for v in part_roots.values()}

        hash_map = defaultdict(set)
        group_offset = 0

        # Associate partition roots with a specific group if they are not a
        # subset of another, larger root set.
        for k, v in root_tokens.items():
            if any(v < vv for vv in root_tokens.values()):  # Strict subset.
                continue
            hash_map[k].add(group_offset)
            group_offset += 1

        # If roots were a subset, they should share the group of their
        # superset/s.
        for k, v in root_tokens.items():
            if not v:  # Special case - no dependencies. Handled below.
                continue
            shared_roots = [kk for kk, vv in root_tokens.items() if v < vv]
            if shared_roots:
                hash_map[k] = set().union(*[hash_map[kk] for kk in shared_roots])

        task_groups = defaultdict(set)

        for pn in part_nodes:
            pdp = part_dependencies[pn]
            pdn = part_dependents[pn]

            if pdp:
                groups = hash_map[tokenize(*sorted(part_roots[pn]))]
            else:  # Special case - no dependencies.
                groups = {group_offset}
                group_offset += 1

            for g in groups:
                task_groups[g] |= pdp | pdn | {pn}

        worker_loads = {wkr: 0 for wkr in workers}

        for task_group in task_groups.values():
            # Greedy balancing: the group goes to the least loaded worker.
            assignee = min(worker_loads, key=worker_loads.get)
            worker_loads[assignee] += len(task_group)

            for task_name in task_group:
                try:
                    task = tasks[task_name]
                except KeyError:  # Keys may not have an associated task.
                    continue

                if task.worker_restrictions is None:
                    task.worker_restrictions = set()
                task.worker_restrictions |= {assignee}
                task.loose_restrictions = False
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
@click.command()
@click.option("--autorestrictor", default=False)
@click.option("--local_cache", default=False)
def dask_setup(scheduler, autorestrictor, local_cache):
    """Preload entry point: attach a ``schedular`` plugin to the scheduler.

    Parameters
    ----------
    scheduler : object
        Scheduler instance; its ``add_plugin`` method is called.
    autorestrictor : bool
        Value of the ``--autorestrictor`` option, forwarded to the plugin.
    local_cache : bool
        Value of the ``--local_cache`` option, forwarded to the plugin.
    """
    scheduler.add_plugin(schedular(autorestrictor, local_cache))
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import click
|
|
2
|
+
|
|
3
|
+
from graphviper._logger import _setup_worker_logger
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class _worker:
|
|
7
|
+
def __init__(self, local_cache, log_parms):
|
|
8
|
+
# print("init local cache")
|
|
9
|
+
self.local_cache = local_cache
|
|
10
|
+
|
|
11
|
+
print("log_parms", log_parms)
|
|
12
|
+
# /.lustre/aoc/projects/ngvla/viper/ngvla_sim/viper_
|
|
13
|
+
self.log_to_term = log_parms["log_to_term"]
|
|
14
|
+
self.log_to_file = log_parms["log_to_file"]
|
|
15
|
+
self.log_file = log_parms["log_file"]
|
|
16
|
+
self.log_level = log_parms["log_level"]
|
|
17
|
+
|
|
18
|
+
def get_logger(self):
|
|
19
|
+
return self.logger
|
|
20
|
+
|
|
21
|
+
def setup(self, worker):
|
|
22
|
+
"""
|
|
23
|
+
Run when the plugin is attached to a worker. This happens when the plugin is registered
|
|
24
|
+
and attached to existing workers, or when a worker is created after the plugin has been
|
|
25
|
+
registered.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
self.logger = _setup_worker_logger(
|
|
29
|
+
self.log_to_term,
|
|
30
|
+
self.log_to_file,
|
|
31
|
+
self.log_file,
|
|
32
|
+
self.log_level,
|
|
33
|
+
str(worker.id),
|
|
34
|
+
)
|
|
35
|
+
self.logger.debug(
|
|
36
|
+
"Logger created on worker " + str(worker.id) + ",*," + str(worker.address)
|
|
37
|
+
)
|
|
38
|
+
# Documentation https://distributed.dask.org/en/stable/worker.html#distributed.worker.Worker
|
|
39
|
+
self.worker = worker
|
|
40
|
+
|
|
41
|
+
if self.local_cache:
|
|
42
|
+
ip = worker.address[
|
|
43
|
+
worker.address.rfind("/") + 1 : worker.address.rfind(":")
|
|
44
|
+
]
|
|
45
|
+
self.logger.debug(str(worker.id) + ",*," + ip)
|
|
46
|
+
worker.state.available_resources = {
|
|
47
|
+
**worker.state.available_resources,
|
|
48
|
+
**{ip: 1},
|
|
49
|
+
}
|
|
50
|
+
# print(worker.state.available_resources)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# https://github.com/dask/distributed/issues/4169
|
|
54
|
+
@click.command()
@click.option("--local_cache", default=False)
@click.option("--log_to_term", default=True)
@click.option("--log_to_file", default=False)
@click.option("--log_file", default="viper_")
@click.option("--log_level", default="INFO")
async def dask_setup(
    worker, local_cache, log_to_term, log_to_file, log_file, log_level
):
    """Preload entry point: register a ``_worker`` plugin via the worker's client.

    The four ``--log_*`` options are packed into the ``log_parms`` dictionary
    expected by ``_worker``; the plugin is registered under the name
    ``viper_worker``.
    """
    worker_plugin = _worker(
        local_cache,
        {
            "log_to_term": log_to_term,
            "log_to_file": log_to_file,
            "log_file": log_file,
            "log_level": log_level,
        },
    )
    await worker.client.register_worker_plugin(worker_plugin, name="viper_worker")
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
import warnings, time, os, psutil, multiprocessing, re
|
|
2
|
+
import dask
|
|
3
|
+
import copy
|
|
4
|
+
import os
|
|
5
|
+
import logging
|
|
6
|
+
import graphviper
|
|
7
|
+
import distributed
|
|
8
|
+
from graphviper.parameter_checking.check_logger_parms import (
|
|
9
|
+
check_logger_parms,
|
|
10
|
+
check_worker_logger_parms,
|
|
11
|
+
)
|
|
12
|
+
from graphviper.logger import setup_logger, get_logger
|
|
13
|
+
from graphviper.dask._worker import (
|
|
14
|
+
_worker,
|
|
15
|
+
) # _worker_logger_plugin
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def local_client(
    cores=None,
    memory_limit=None,
    autorestrictor=False,
    dask_local_dir=None,
    local_dir=None,
    wait_for_workers=True,
    log_parms={},
    worker_log_parms={},
):
    """Start a ``distributed.LocalCluster`` and return a client connected to it.

    The ``local_dir`` setting is only useful for testing since this function
    creates a local cluster. ``slurm_cluster_client`` should be used for a
    multinode cluster. See also https://github.com/dask/dask/issues/5577.

    Parameters
    ----------
    cores : int, optional
        Number of single-threaded worker processes. Defaults to
        ``multiprocessing.cpu_count()``.
    memory_limit : str, optional
        Memory limit per worker. Defaults to the currently available memory
        divided by ``cores``, expressed in MB.
    autorestrictor : bool
        Forwarded to the scheduler preload as ``--autorestrictor``.
    dask_local_dir : str, optional
        Used as Dask's ``temporary_directory`` when given.
    local_dir : str, optional
        When given, it is exported as ``VIPER_LOCAL_DIR`` and the local cache
        is enabled.
    wait_for_workers : bool
        Block until all ``cores`` workers are up. Waiting always happens when
        the local cache is enabled.
    log_parms : dict
        Settings for the client-side logger; validated by
        ``check_logger_parms`` and then passed to ``setup_logger`` as keyword
        arguments.
    worker_log_parms : dict or None
        Settings for the worker loggers; validated by
        ``check_worker_logger_parms`` unless ``None``.

    Returns
    -------
    distributed.Client

    Raises
    ------
    AssertionError
        If either parameter check reports failure.
    """

    # The mutable defaults are safe here: both dictionaries are deep-copied
    # before anything is done with them.
    _log_parms = copy.deepcopy(log_parms)
    _worker_log_parms = copy.deepcopy(worker_log_parms)

    assert check_logger_parms(
        _log_parms
    ), "######### ERROR: initialize_processing log_parms checking failed."

    if _worker_log_parms is not None:
        assert check_worker_logger_parms(
            _worker_log_parms
        ), "######### ERROR: initialize_processing log_parms checking failed."

    if local_dir:
        os.environ["VIPER_LOCAL_DIR"] = local_dir
        local_cache = True
    else:
        local_cache = False

    setup_logger(**_log_parms)
    logger = get_logger()

    _set_up_dask(dask_local_dir)

    viper_path = graphviper.__path__[0]
    if local_cache or autorestrictor:
        # The scheduler plugin module is shipped as graphviper/dask/_scheduler.py.
        dask.config.set(
            {
                "distributed.scheduler.preload": os.path.join(
                    viper_path, "dask", "_scheduler.py"
                )
            }
        )
        dask.config.set(
            {
                "distributed.scheduler.preload-argv": [
                    "--local_cache",
                    local_cache,
                    "--autorestrictor",
                    autorestrictor,
                ]
            }
        )

    # Assigning the worker plugin through the "distributed.worker.preload"
    # configuration does not seem to work when using dask_jobqueue.
    # Consequently client.register_worker_plugin is used below so that the
    # method of assigning a worker plugin is the same for local_client and
    # slurm_cluster_client.

    # Set up the distributed based multiprocessing environment.
    if cores is None:
        cores = multiprocessing.cpu_count()
    if memory_limit is None:
        memory_limit = (
            str(round(((psutil.virtual_memory().available / (1024**2))) / cores))
            + "MB"
        )
    cluster = distributed.LocalCluster(
        n_workers=cores, threads_per_worker=1, processes=True, memory_limit=memory_limit
    )
    client = distributed.Client(cluster)
    client.get_versions(check=True)

    # When constructing a graph that has local cache enabled all workers need
    # to be up and running.
    if local_cache or wait_for_workers:
        client.wait_for_workers(n_workers=cores)

    if local_cache or _worker_log_parms:
        plugin = _worker(local_cache, _worker_log_parms)
        client.register_worker_plugin(plugin, name="viper_worker")

    logger.info("Created client " + str(client))

    return client
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def slurm_cluster_client(
    workers_per_node,
    cores_per_node,
    memory_per_node,
    number_of_nodes,
    queue,
    interface,
    python_env_dir,
    dask_local_dir,
    dask_log_dir,
    exclude_nodes="nmpost090",
    dashboard_port=9000,
    local_dir=None,
    autorestrictor=False,
    wait_for_workers=True,
    log_parms={},
    worker_log_parms={},
):
    """Start a ``dask_jobqueue.SLURMCluster`` and return a client connected to it.

    See also https://github.com/dask/dask/issues/5577.

    Parameters
    ----------
    workers_per_node : int
        Worker processes per SLURM job (``processes`` of ``SLURMCluster``).
    cores_per_node : int
        Cores per SLURM job.
    memory_per_node : str
        Memory per SLURM job.
    number_of_nodes : int
        The cluster is scaled to ``workers_per_node * number_of_nodes`` workers.
    queue : str
        SLURM queue to submit to.
    interface : str
        Network interface, for example ``"eth0"`` or ``"ib0"``.
    python_env_dir : str
        Python executable used by the jobs, for example
        ``"/mnt/condor/jsteeb/viper_py/bin/python"``.
    dask_local_dir : str
        Worker local directory, for example ``"/mnt/condor/jsteeb"``; also
        used as Dask's ``temporary_directory``.
    dask_log_dir : str
        Job log directory, for example
        ``"/.lustre/aoc/projects/ngvla/viper/ngvla_sim"``.
    exclude_nodes : str
        Value appended to the ``--exclude=`` job directive.
    dashboard_port : int
        Port of the scheduler dashboard.
    local_dir : str, optional
        When given, it is exported as ``VIPER_LOCAL_DIR`` and the local cache
        is enabled.
    autorestrictor : bool
        Forwarded to the scheduler preload as ``--autorestrictor``.
    wait_for_workers : bool
        Block until all requested workers are up. Waiting always happens when
        the local cache is enabled.
    log_parms : dict
        Settings for the client-side logger; validated by
        ``check_logger_parms`` and then passed to ``setup_logger`` as keyword
        arguments.
    worker_log_parms : dict
        Settings for the worker loggers; validated by
        ``check_worker_logger_parms``.

    Returns
    -------
    distributed.Client

    Raises
    ------
    AssertionError
        If either parameter check reports failure.
    """

    from dask_jobqueue import SLURMCluster
    from distributed import Client

    # The mutable defaults are safe here: both dictionaries are deep-copied
    # before anything is done with them.
    _log_parms = copy.deepcopy(log_parms)
    _worker_log_parms = copy.deepcopy(worker_log_parms)

    assert check_logger_parms(
        _log_parms
    ), "######### ERROR: initialize_processing log_parms checking failed."
    assert check_worker_logger_parms(
        _worker_log_parms
    ), "######### ERROR: initialize_processing log_parms checking failed."

    if local_dir:
        os.environ["VIPER_LOCAL_DIR"] = local_dir
        local_cache = True
    else:
        local_cache = False

    # Viper logger for code that is not part of the Dask graph. The worker
    # logger is set up in the _worker plugin.
    setup_logger(**_log_parms)
    logger = get_logger()

    _set_up_dask(dask_local_dir)

    viper_path = graphviper.__path__[0]
    if local_cache or autorestrictor:
        # The scheduler plugin module is shipped as graphviper/dask/_scheduler.py.
        dask.config.set(
            {
                "distributed.scheduler.preload": os.path.join(
                    viper_path, "dask", "_scheduler.py"
                )
            }
        )
        dask.config.set(
            {
                "distributed.scheduler.preload-argv": [
                    "--local_cache",
                    local_cache,
                    "--autorestrictor",
                    autorestrictor,
                ]
            }
        )

    # Assigning the worker plugin through the "distributed.worker.preload"
    # configuration does not seem to work when using dask_jobqueue.
    # Consequently client.register_worker_plugin is used below so that the
    # method of assigning a worker plugin is the same for local_client and
    # slurm_cluster_client.

    cluster = SLURMCluster(
        processes=workers_per_node,
        cores=cores_per_node,
        interface=interface,
        memory=memory_per_node,
        walltime="24:00:00",
        queue=queue,
        name="viper",
        python=python_env_dir,
        local_directory=dask_local_dir,
        log_directory=dask_log_dir,
        job_extra_directives=["--exclude=" + exclude_nodes],
        scheduler_options={"dashboard_address": ":" + str(dashboard_port)},
    )

    client = Client(cluster)

    cluster.scale(workers_per_node * number_of_nodes)

    # When constructing a graph that has local cache enabled all workers need
    # to be up and running.
    if local_cache or wait_for_workers:
        client.wait_for_workers(n_workers=workers_per_node * number_of_nodes)

    if local_cache or _worker_log_parms:
        plugin = _worker(local_cache, _worker_log_parms)
        client.register_worker_plugin(plugin, name="viper_worker")

    logger.info("Created client " + str(client))

    return client
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def _set_up_dask(local_directory):
    """Apply the Dask configuration shared by all client factories.

    Parameters
    ----------
    local_directory : str or None
        When truthy, it is set as Dask's ``temporary_directory`` before the
        fixed settings are applied.
    """
    if local_directory:
        dask.config.set({"temporary_directory": local_directory})

    # Applied one by one, in this order.
    # ("distributed.worker.memory.recent-to-old-time" is deliberately not set.)
    fixed_settings = (
        ("distributed.scheduler.allowed-failures", 10),
        ("distributed.scheduler.work-stealing", True),
        ("distributed.scheduler.unknown-task-duration", "99m"),
        ("distributed.worker.memory.pause", False),
        ("distributed.worker.memory.terminate", False),
        ("distributed.comm.timeouts.connect", "3600s"),
        ("distributed.comm.timeouts.tcp", "3600s"),
        ("distributed.nanny.environ.OMP_NUM_THREADS", 1),
        ("distributed.nanny.environ.MKL_NUM_THREADS", 1),
    )
    for option, value in fixed_settings:
        dask.config.set({option: value})
|
|
248
|
+
# https://docs.dask.org/en/stable/how-to/customize-initialization.html
|
|
File without changes
|