dpdispatcher 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dpdispatcher/__init__.py +2 -0
- dpdispatcher/_version.py +2 -2
- dpdispatcher/base_context.py +0 -3
- dpdispatcher/distributed_shell.py +6 -7
- dpdispatcher/dp_cloud_server.py +3 -1
- dpdispatcher/dp_cloud_server_context.py +0 -3
- dpdispatcher/dpcloudserver/client.py +1 -1
- dpdispatcher/fugaku.py +94 -0
- dpdispatcher/hdfs_context.py +0 -3
- dpdispatcher/lazy_local_context.py +0 -4
- dpdispatcher/local_context.py +0 -4
- dpdispatcher/lsf.py +12 -2
- dpdispatcher/machine.py +18 -2
- dpdispatcher/pbs.py +14 -2
- dpdispatcher/shell.py +14 -3
- dpdispatcher/slurm.py +69 -16
- dpdispatcher/ssh_context.py +21 -17
- dpdispatcher/submission.py +158 -41
- {dpdispatcher-0.5.6.dist-info → dpdispatcher-0.5.8.dist-info}/METADATA +14 -5
- dpdispatcher-0.5.8.dist-info/RECORD +34 -0
- dpdispatcher-0.5.6.dist-info/RECORD +0 -33
- {dpdispatcher-0.5.6.dist-info → dpdispatcher-0.5.8.dist-info}/LICENSE +0 -0
- {dpdispatcher-0.5.6.dist-info → dpdispatcher-0.5.8.dist-info}/WHEEL +0 -0
- {dpdispatcher-0.5.6.dist-info → dpdispatcher-0.5.8.dist-info}/entry_points.txt +0 -0
- {dpdispatcher-0.5.6.dist-info → dpdispatcher-0.5.8.dist-info}/top_level.txt +0 -0
dpdispatcher/__init__.py
CHANGED
@@ -43,6 +43,7 @@ except ImportError:
 from .distributed_shell import DistributedShell
 from .dp_cloud_server import DpCloudServer, Lebesgue
 from .dp_cloud_server_context import DpCloudServerContext, LebesgueContext
+from .fugaku import Fugaku
 from .hdfs_context import HDFSContext
 from .lazy_local_context import LazyLocalContext
 from .local_context import LocalContext
@@ -85,6 +86,7 @@ __all__ = [
     "PBS",
     "Shell",
     "Slurm",
+    "Fugaku",
     "SSHContext",
     "Submission",
     "Task",
dpdispatcher/_version.py
CHANGED
dpdispatcher/base_context.py
CHANGED
@@ -70,9 +70,6 @@ class BaseContext(metaclass=ABCMeta):
     def read_file(self, fname):
         raise NotImplementedError("abstract method")
 
-    def kill(self, proc):
-        raise NotImplementedError("abstract method")
-
     def check_finish(self, proc):
         raise NotImplementedError("abstract method")
 
dpdispatcher/distributed_shell.py
CHANGED
@@ -136,17 +136,16 @@ class DistributedShell(Machine):
 
         resources = job.resources
         submit_command = (
-            "hadoop jar %s/hadoop-yarn-applications-distributedshell-*.jar "
+            "hadoop jar {}/hadoop-yarn-applications-distributedshell-*.jar "
             "org.apache.hadoop.yarn.applications.distributedshell.Client "
-            "-jar %s/hadoop-yarn-applications-distributedshell-*.jar "
-            '-queue %s -appname "distributedshell_dpgen_%s" '
+            "-jar {}/hadoop-yarn-applications-distributedshell-*.jar "
+            '-queue {} -appname "distributedshell_dpgen_{}" '
             "-shell_env YARN_CONTAINER_RUNTIME_TYPE=docker "
-            "-shell_env YARN_CONTAINER_RUNTIME_DOCKER_IMAGE=%s "
+            "-shell_env YARN_CONTAINER_RUNTIME_DOCKER_IMAGE={} "
             "-shell_env ENV_DOCKER_CONTAINER_SHM_SIZE='600m' "
             "-master_memory 1024 -master_vcores 2 -num_containers 1 "
-            "-container_resources memory-mb=%s,vcores=%s "
-            "-shell_script /tmp/%s"
-            % (
+            "-container_resources memory-mb={},vcores={} "
+            "-shell_script /tmp/{}".format(
                 resources.kwargs.get("yarn_path", ""),
                 resources.kwargs.get("yarn_path", ""),
                 resources.queue_name,
dpdispatcher/dp_cloud_server.py
CHANGED
@@ -106,7 +106,9 @@ class Bohrium(Machine):
 
         input_data = self.input_data.copy()
 
-        input_data["job_resources"] = job_resources
+        if not input_data.get("job_resources"):
+            input_data["job_resources"] = []
+        input_data["job_resources"].append(job_resources)
         input_data["command"] = f"bash {job.script_file_name}"
         if not input_data.get("backward_files"):
             input_data["backward_files"] = self._gen_backward_files_list(job)
dpdispatcher/dp_cloud_server_context.py
CHANGED
@@ -270,9 +270,6 @@ class BohriumContext(BaseContext):
         # retcode = cmd_pipes['stdout'].channel.recv_exit_status()
         # return retcode, cmd_pipes['stdout'], cmd_pipes['stderr']
 
-    def kill(self, cmd_pipes):
-        pass
-
     @classmethod
     def machine_subfields(cls) -> List[Argument]:
         """Generate the machine subfields.
dpdispatcher/fugaku.py
ADDED
@@ -0,0 +1,94 @@
+import shlex
+
+from dpdispatcher import dlog
+from dpdispatcher.JobStatus import JobStatus
+from dpdispatcher.machine import Machine
+
+fugaku_script_header_template = """\
+{queue_name_line}
+{fugaku_node_number_line}
+{fugaku_ntasks_per_node_line}
+"""
+
+
+class Fugaku(Machine):
+    def gen_script(self, job):
+        fugaku_script = super().gen_script(job)
+        return fugaku_script
+
+    def gen_script_header(self, job):
+        resources = job.resources
+        fugaku_script_header_dict = {}
+        fugaku_script_header_dict[
+            "fugaku_node_number_line"
+        ] = f'#PJM -L "node={resources.number_node}" '
+        fugaku_script_header_dict[
+            "fugaku_ntasks_per_node_line"
+        ] = '#PJM --mpi "max-proc-per-node={cpu_per_node}"'.format(
+            cpu_per_node=resources.cpu_per_node
+        )
+        fugaku_script_header_dict[
+            "queue_name_line"
+        ] = f'#PJM -L "rscgrp={resources.queue_name}"'
+        fugaku_script_header = fugaku_script_header_template.format(
+            **fugaku_script_header_dict
+        )
+        return fugaku_script_header
+
+    def do_submit(self, job):
+        script_file_name = job.script_file_name
+        script_str = self.gen_script(job)
+        job_id_name = job.job_hash + "_job_id"
+        # script_str = self.sub_script(job_dirs, cmd, args=args, resources=resources, outlog=outlog, errlog=errlog)
+        self.context.write_file(fname=script_file_name, write_str=script_str)
+        # self.context.write_file(fname=os.path.join(self.context.submission.work_base, script_file_name), write_str=script_str)
+        # script_file_dir = os.path.join(self.context.submission.work_base)
+        script_file_dir = self.context.remote_root
+        # stdin, stdout, stderr = self.context.block_checkcall('cd %s && %s %s' % (self.context.remote_root, 'pjsub', script_file_name))
+
+        stdin, stdout, stderr = self.context.block_checkcall(
+            "cd {} && {} {}".format(
+                shlex.quote(script_file_dir), "pjsub", shlex.quote(script_file_name)
+            )
+        )
+        subret = stdout.readlines()
+        job_id = subret[0].split()[5]
+        self.context.write_file(job_id_name, job_id)
+        return job_id
+
+    def default_resources(self, resources):
+        pass
+
+    def check_status(self, job):
+        job_id = job.job_id
+        if job_id == "":
+            return JobStatus.unsubmitted
+        ret, stdin, stdout, stderr = self.context.block_call("pjstat " + job_id)
+        err_str = stderr.read().decode("utf-8")
+        try:
+            status_line = stdout.read().decode("utf-8").split("\n")[-2]
+            # pjstat only retrun 0 if the job is not waiting or running
+        except Exception:
+            ret, stdin, stdout, stderr = self.context.block_call("pjstat -H " + job_id)
+            status_line = stdout.read().decode("utf-8").split("\n")[-2]
+            status_word = status_line.split()[3]
+            if status_word in ["EXT", "CCL", "ERR"]:
+                if self.check_finish_tag(job):
+                    dlog.info(f"job: {job.job_hash} {job.job_id} finished")
+                    return JobStatus.finished
+                else:
+                    return JobStatus.terminated
+            else:
+                return JobStatus.unknown
+        status_word = status_line.split()[3]
+        # dlog.info (status_word)
+        if status_word in ["QUE", "HLD", "RNA", "SPD"]:
+            return JobStatus.waiting
+        elif status_word in ["RUN", "RNE"]:
+            return JobStatus.running
+        else:
+            return JobStatus.unknown
+
+    def check_finish_tag(self, job):
+        job_tag_finished = job.job_hash + "_job_tag_finished"
+        return self.context.check_file_exists(job_tag_finished)
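
The `Fugaku` class above maps `resources.queue_name`, `number_node`, and `cpu_per_node` onto `#PJM` directives and submits with `pjsub`. As a rough sketch of how this new batch type would be selected, here is a hypothetical machine/resources pair; the paths, queue name, and context choice are illustrative assumptions, not values taken from this diff:

```python
# Hypothetical configuration exercising the new Fugaku batch type.
# All paths and the "small" queue name are made-up placeholders.
machine = {
    "batch_type": "Fugaku",        # selects the class added above
    "context_type": "SSHContext",  # assumes SSH access to a login node
    "local_root": "./work",
    "remote_root": "/home/user/work",
}
resources = {
    "number_node": 1,       # -> '#PJM -L "node=1"'
    "cpu_per_node": 48,     # -> '#PJM --mpi "max-proc-per-node=48"'
    "gpu_per_node": 0,
    "queue_name": "small",  # -> '#PJM -L "rscgrp=small"'
    "group_size": 1,
}
```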
dpdispatcher/hdfs_context.py
CHANGED
@@ -1,5 +1,4 @@
 import os
-import signal
 import subprocess as sp
 
 from dpdispatcher.base_context import BaseContext
dpdispatcher/lazy_local_context.py
CHANGED
@@ -167,9 +166,6 @@ class LazyLocalContext(BaseContext):
         )
         return proc
 
-    def kill(self, job_id):
-        os.kill(job_id, signal.SIGTERM)
-
     def check_finish(self, proc):
         return proc.poll() is not None
 
dpdispatcher/local_context.py
CHANGED
@@ -1,7 +1,6 @@
 import hashlib
 import os
 import shutil
-import signal
 import subprocess as sp
 from glob import glob
 from subprocess import TimeoutExpired
@@ -291,9 +290,6 @@ class LocalContext(BaseContext):
         )
         return proc
 
-    def kill(self, job_id):
-        os.kill(job_id, signal.SIGTERM)
-
     def check_finish(self, proc):
         return proc.poll() is not None
 
dpdispatcher/lsf.py
CHANGED
@@ -83,8 +83,7 @@ class LSF(Machine):
 
         try:
             stdin, stdout, stderr = self.context.block_checkcall(
-                "cd %s && %s %s"
-                % (
+                "cd {} && {} {}".format(
                     shlex.quote(self.context.remote_root),
                     "bsub < ",
                     shlex.quote(script_file_name),
@@ -211,3 +210,14 @@ class LSF(Machine):
                 doc="Extra arguments.",
             )
         ]
+
+    def kill(self, job):
+        """Kill the job.
+
+        Parameters
+        ----------
+        job : Job
+            job
+        """
+        job_id = job.job_id
+        ret, stdin, stdout, stderr = self.context.block_call("bkill " + str(job_id))
dpdispatcher/machine.py
CHANGED
@@ -377,8 +377,12 @@ class Machine(metaclass=ABCMeta):
         machine_args = [
             Argument("batch_type", str, optional=False, doc=doc_batch_type),
             # TODO: add default to local_root and remote_root after refactor the code
-            Argument("local_root", str, optional=False, doc=doc_local_root),
-            Argument("remote_root", str, optional=True, doc=doc_remote_root),
+            Argument(
+                "local_root", [str, type(None)], optional=False, doc=doc_local_root
+            ),
+            Argument(
+                "remote_root", [str, type(None)], optional=True, doc=doc_remote_root
+            ),
             Argument(
                 "clean_asynchronously",
                 bool,
@@ -439,3 +443,15 @@ class Machine(metaclass=ABCMeta):
                 "kwargs", dict, optional=True, doc="This field is empty for this batch."
             )
         ]
+
+    def kill(self, job):
+        """Kill the job.
+
+        If not implemented, pass and let the user manually kill it.
+
+        Parameters
+        ----------
+        job : Job
+            job
+        """
+        dlog.warning("Job %s should be manually killed" % job.job_id)
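
Each scheduler class in this release overrides the new base `Machine.kill(job)` with its native cancel command (`bkill`, `qdel`, `kill -9`, and `scancel -Q` in the sections that follow), while the base implementation above only logs a warning. A minimal sketch of what a third-party batch plugin would override; `MyBatch` and `mycancel` are hypothetical names:

```python
from dpdispatcher.machine import Machine


class MyBatch(Machine):
    # Partial sketch: a real plugin must also implement do_submit,
    # check_status, etc.; "mycancel" stands in for bkill/qdel/scancel.
    def kill(self, job):
        """Cancel the job through the scheduler instead of only warning."""
        ret, stdin, stdout, stderr = self.context.block_call(
            "mycancel " + str(job.job_id)
        )
```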
dpdispatcher/pbs.py
CHANGED
@@ -46,8 +46,9 @@ class PBS(Machine):
         script_file_dir = self.context.remote_root
         # stdin, stdout, stderr = self.context.block_checkcall('cd %s && %s %s' % (self.context.remote_root, 'qsub', script_file_name))
         stdin, stdout, stderr = self.context.block_checkcall(
-            "cd %s && %s %s"
-            % (script_file_dir, "qsub", script_file_name)
+            "cd {} && {} {}".format(
+                shlex.quote(script_file_dir), "qsub", shlex.quote(script_file_name)
+            )
         )
         subret = stdout.readlines()
         job_id = subret[0].split()[0]
@@ -94,6 +95,17 @@ class PBS(Machine):
         job_tag_finished = job.job_hash + "_job_tag_finished"
         return self.context.check_file_exists(job_tag_finished)
 
+    def kill(self, job):
+        """Kill the job.
+
+        Parameters
+        ----------
+        job : Job
+            job
+        """
+        job_id = job.job_id
+        ret, stdin, stdout, stderr = self.context.block_call("qdel " + str(job_id))
+
 
 class Torque(PBS):
     def check_status(self, job):
dpdispatcher/shell.py
CHANGED
@@ -25,8 +25,7 @@ class Shell(Machine):
         output_name = job.job_hash + ".out"
         self.context.write_file(fname=script_file_name, write_str=script_str)
         ret, stdin, stdout, stderr = self.context.block_call(
-            "cd %s && { nohup bash %s 1>>%s 2>>%s & } && echo $!"
-            % (
+            "cd {} && {{ nohup bash {} 1>>{} 2>>{} & }} && echo $!".format(
                 shlex.quote(self.context.remote_root),
                 script_file_name,
                 output_name,
@@ -66,7 +65,7 @@ class Shell(Machine):
 
         # mark defunct process as terminated
         ret, stdin, stdout, stderr = self.context.block_call(
-            f"if ps -p {job_id} > /dev/null && ! (ps -p {job_id} | grep defunct >/dev/null) ; then echo 1; fi"
+            f"if ps -p {job_id} > /dev/null && ! (ps -o command -p {job_id} | grep defunct >/dev/null) ; then echo 1; fi"
         )
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
@@ -101,3 +100,15 @@ class Shell(Machine):
         job_tag_finished = job.job_hash + "_job_tag_finished"
         # print('job finished: ',job.job_id, job_tag_finished)
         return self.context.check_file_exists(job_tag_finished)
+
+    def kill(self, job):
+        """Kill the job.
+
+        Parameters
+        ----------
+        job : Job
+            job
+        """
+        job_id = job.job_id
+        # 9 means exit, cannot be blocked
+        ret, stdin, stdout, stderr = self.context.block_call("kill -9 " + str(job_id))
dpdispatcher/slurm.py
CHANGED
@@ -1,3 +1,4 @@
+import math
 import pathlib
 import shlex
 from typing import List
@@ -45,9 +46,12 @@ class Slurm(Machine):
             )
         else:
             script_header_dict["slurm_number_gpu_line"] = custom_gpu_line
-        script_header_dict[
-            "slurm_partition_line"
-        ] = f"#SBATCH --partition {resources.queue_name}"
+        if resources.queue_name != "":
+            script_header_dict[
+                "slurm_partition_line"
+            ] = f"#SBATCH --partition {resources.queue_name}"
+        else:
+            script_header_dict["slurm_partition_line"] = ""
         slurm_script_header = slurm_script_header_template.format(**script_header_dict)
         return slurm_script_header
 
@@ -60,8 +64,7 @@ class Slurm(Machine):
         self.context.write_file(fname=script_file_name, write_str=script_str)
         # self.context.write_file(fname=os.path.join(self.context.submission.work_base, script_file_name), write_str=script_str)
         ret, stdin, stdout, stderr = self.context.block_call(
-            "cd %s && %s %s"
-            % (
+            "cd {} && {} {}".format(
                 shlex.quote(self.context.remote_root),
                 "sbatch",
                 shlex.quote(script_file_name),
@@ -78,7 +81,12 @@ class Slurm(Machine):
                 "Get error code %d in submitting through ssh with job: %s . message: %s"
                 % (ret, job.job_hash, err_str)
             )
-        elif "Job violates accounting/QOS policy" in err_str:
+        elif (
+            "Job violates accounting/QOS policy" in err_str
+            # the number of jobs exceeds DEFAULT_MAX_JOB_COUNT (by default 10000)
+            or "Slurm temporarily unable to accept job, sleeping and retrying"
+            in err_str
+        ):
             # job number exceeds, skip the submitting
             return ""
         raise RuntimeError(
@@ -115,6 +123,7 @@ class Slurm(Machine):
         elif (
             "Socket timed out on send/recv operation" in err_str
             or "Unable to contact slurm controller" in err_str
+            or "Invalid user for SlurmUser" in err_str
         ):
             # retry 3 times
             raise RetrySignal(
@@ -194,30 +203,47 @@ class Slurm(Machine):
             )
         ]
 
+    def kill(self, job):
+        """Kill the job.
+
+        Parameters
+        ----------
+        job : Job
+            job
+        """
+        job_id = job.job_id
+        # -Q Do not report an error if the specified job is already completed.
+        ret, stdin, stdout, stderr = self.context.block_call(
+            "scancel -Q " + str(job_id)
+        )
+        # we do not need to stop here if scancel failed; just continue
+
 
 class SlurmJobArray(Slurm):
     """Slurm with job array enabled for multiple tasks in a job."""
 
     def gen_script_header(self, job):
+        slurm_job_size = job.resources.kwargs.get("slurm_job_size", 1)
         if job.fail_count > 0:
             # resubmit jobs, check if some of tasks have been finished
-            job_array = []
+            job_array = set()
             for ii, task in enumerate(job.job_task_list):
                 task_tag_finished = (
                     pathlib.PurePath(task.task_work_path)
                     / (task.task_hash + "_task_tag_finished")
                 ).as_posix()
                 if not self.context.check_file_exists(task_tag_finished):
-                    job_array.append(ii)
+                    job_array.add(ii // slurm_job_size)
             return super().gen_script_header(job) + "\n#SBATCH --array=%s" % (
                 ",".join(map(str, job_array))
            )
         return super().gen_script_header(job) + "\n#SBATCH --array=0-%d" % (
-            len(job.job_task_list) - 1
+            math.ceil(len(job.job_task_list) / slurm_job_size) - 1
         )
 
     def gen_script_command(self, job):
         resources = job.resources
+        slurm_job_size = resources.kwargs.get("slurm_job_size", 1)
         # SLURM_ARRAY_TASK_ID: 0 ~ n_jobs-1
         script_command = "case $SLURM_ARRAY_TASK_ID in\n"
         for ii, task in enumerate(job.job_task_list):
@@ -243,10 +269,16 @@ class SlurmJobArray(Slurm):
                 task_tag_finished=task_tag_finished,
                 log_err_part=log_err_part,
             )
-            script_command += f"{ii})\n"
+            if ii % slurm_job_size == 0:
+                script_command += f"{ii // slurm_job_size})\n"
             script_command += single_script_command
             script_command += self.gen_script_wait(resources=resources)
-            script_command += "\n;;\n"
+            script_command += "\n"
+            if (
+                ii % slurm_job_size == slurm_job_size - 1
+                or ii == len(job.job_task_list) - 1
+            ):
+                script_command += ";;\n"
         script_command += "*)\nexit 1\n;;\nesac\n"
         return script_command
 
@@ -337,9 +369,30 @@ class SlurmJobArray(Slurm):
     def check_finish_tag(self, job):
         results = []
         for task in job.job_task_list:
-            task_tag_finished = (
-                pathlib.PurePath(task.task_work_path)
-                / (task.task_hash + "_task_tag_finished")
-            ).as_posix()
-            results.append(self.context.check_file_exists(task_tag_finished))
+            task.get_task_state(self.context)
+            results.append(task.task_state == JobStatus.finished)
         return all(results)
+
+    @classmethod
+    def resources_subfields(cls) -> List[Argument]:
+        """Generate the resources subfields.
+
+        Returns
+        -------
+        list[Argument]
+            resources subfields
+        """
+        doc_slurm_job_size = "Number of tasks in a Slurm job"
+        arg = super().resources_subfields()[0]
+        arg.extend_subfields(
+            [
+                Argument(
+                    "slurm_job_size",
+                    int,
+                    optional=True,
+                    default=1,
+                    doc=doc_slurm_job_size,
+                ),
+            ]
+        )
        return [arg]
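
The `slurm_job_size` machinery above packs consecutive tasks into one array element: task `ii` goes to case branch `ii // slurm_job_size`, and a branch closes after its group's last task. A standalone sketch of that arithmetic, with hypothetical task names standing in for `job.job_task_list`:

```python
import math

# Standalone sketch of the array grouping used by SlurmJobArray above;
# the task names are hypothetical stand-ins for job.job_task_list entries.
tasks = ["task_a", "task_b", "task_c", "task_d", "task_e"]
slurm_job_size = 2  # as in resources.kwargs.get("slurm_job_size", 1)

n_elements = math.ceil(len(tasks) / slurm_job_size)
header = f"#SBATCH --array=0-{n_elements - 1}"  # -> #SBATCH --array=0-2

script = "case $SLURM_ARRAY_TASK_ID in\n"
for ii, task in enumerate(tasks):
    if ii % slurm_job_size == 0:
        script += f"{ii // slurm_job_size})\n"  # open the branch for this group
    script += f"echo {task}\n"  # stand-in for the real per-task command block
    if ii % slurm_job_size == slurm_job_size - 1 or ii == len(tasks) - 1:
        script += ";;\n"  # close the branch after the group's last task
script += "*)\nexit 1\n;;\nesac\n"
print(header)
print(script)
```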
dpdispatcher/ssh_context.py
CHANGED
@@ -116,7 +116,7 @@ class SSHSession:
         # transport = self.ssh.get_transport()
         # transport.set_keepalive(60)
 
-    @retry(max_retry=3, sleep=1)
+    @retry(max_retry=6, sleep=1)
     def _setup_ssh(self):
         # machine = self.machine
         self.ssh = paramiko.SSHClient()
@@ -199,7 +199,7 @@ class SSHSession:
                 ts.auth_interactive(self.username, self.inter_handler)
             except paramiko.ssh_exception.AuthenticationException:
                 # since the asynchrony of interactive authentication, one addtional try is added
-                # retry for up to 3 times
+                # retry for up to 6 times
                 raise RetrySignal("Authentication failed")
         elif key_ok:
             pass
@@ -213,7 +213,12 @@ class SSHSession:
             raise RuntimeError("Please provide at least one form of authentication")
         assert ts.is_active()
         # Opening a session creates a channel along the socket to the server
-        ts.open_session(timeout=self.timeout)
+        try:
+            ts.open_session(timeout=self.timeout)
+        except paramiko.ssh_exception.SSHException:
+            # retry for up to 6 times
+            # ref: https://github.com/paramiko/paramiko/issues/1508
+            raise RetrySignal("Opening session failed")
         ts.set_keepalive(60)
         self.ssh._transport = ts  # type: ignore
         # reset sftp
@@ -323,14 +328,14 @@ class SSHSession:
             Argument("port", int, optional=True, default=22, doc=doc_port),
             Argument(
                 "key_filename",
-                [str, None],
+                [str, type(None)],
                 optional=True,
                 default=None,
                 doc=doc_key_filename,
             ),
             Argument(
                 "passphrase",
-                [str, None],
+                [str, type(None)],
                 optional=True,
                 default=None,
                 doc=doc_passphrase,
@@ -497,6 +502,14 @@ class SSHContext(BaseContext):
             self.block_checkcall(
                 f"mv {shlex.quote(old_remote_root)} {shlex.quote(self.remote_root)}"
             )
+        elif (
+            old_remote_root is not None
+            and old_remote_root != self.remote_root
+            and self.check_file_exists(old_remote_root)
+            and not len(self.ssh_session.sftp.listdir(old_remote_root))
+        ):
+            # if the new directory exists and the old directory does not contain files, then move the old directory
+            self._rmtree(old_remote_root)
 
         sftp = self.ssh_session.ssh.open_sftp()
         try:
@@ -762,12 +775,6 @@ class SSHContext(BaseContext):
         retcode = cmd_pipes["stdout"].channel.recv_exit_status()
         return retcode, cmd_pipes["stdout"], cmd_pipes["stderr"]
 
-    def kill(self, cmd_pipes):
-        raise RuntimeError(
-            "dose not work! we do not know how to kill proc through paramiko.SSHClient"
-        )
-        # self.block_checkcall('kill -15 %s' % cmd_pipes['pid'])
-
     def _rmtree(self, remotepath, verbose=False):
         """Remove the remote path."""
         # The original implementation method removes files one by one using sftp.
@@ -847,8 +854,7 @@ class SSHContext(BaseContext):
                 self.ssh_session.put(from_f, to_f)
             except FileNotFoundError:
                 raise FileNotFoundError(
-                    "from %s to %s @ %s : %s Error!"
-                    % (from_f, self.ssh_session.username, self.ssh_session.hostname, to_f)
+                    f"from {from_f} to {self.ssh_session.username} @ {self.ssh_session.hostname} : {to_f} Error!"
                 )
         # remote extract
         self.block_checkcall("tar xf %s" % of)
@@ -877,8 +883,7 @@ class SSHContext(BaseContext):
         ntar = len(files) // per_nfile + 1
         if ntar <= 1:
             self.block_checkcall(
-                "tar %s %s %s"
-                % (
+                "tar {} {} {}".format(
                     tar_command,
                     shlex.quote(of),
                     " ".join([shlex.quote(file) for file in files]),
@@ -890,8 +895,7 @@ class SSHContext(BaseContext):
             )
             self.write_file(file_list_file, "\n".join(files))
             self.block_checkcall(
-                "tar %s %s -T %s"
-                % (tar_command, shlex.quote(of), shlex.quote(file_list_file))
+                f"tar {tar_command} {shlex.quote(of)} -T {shlex.quote(file_list_file)}"
             )
         # trans
         from_f = pathlib.PurePath(os.path.join(self.remote_root, of)).as_posix()
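
The changes above lean on the `RetrySignal`/`@retry` pair: `_setup_ssh` now allows 6 attempts, and a paramiko `SSHException` during `open_session` is converted into a `RetrySignal` so the decorator re-runs the whole setup. A minimal illustrative reimplementation of that pattern, assuming semantics similar to (but not necessarily identical with) `dpdispatcher.utils.retry`:

```python
import functools
import time


class RetrySignal(Exception):
    """Raised by the wrapped function to request another attempt."""


def retry(max_retry=3, sleep=60):
    # Illustrative sketch only; the real dpdispatcher.utils.retry may differ
    # in logging, backoff, and the exception raised after the last attempt.
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            last_error = None
            for _ in range(max_retry):
                try:
                    return func(*args, **kwargs)
                except RetrySignal as e:
                    last_error = e
                    time.sleep(sleep)
            raise RuntimeError(f"Failed after {max_retry} attempts") from last_error
        return wrapper
    return decorator
```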
dpdispatcher/submission.py
CHANGED
@@ -1,7 +1,10 @@
 # %%
+import asyncio
 import copy
+import functools
 import json
 import os
+import pathlib
 import random
 import time
 import uuid
@@ -198,7 +201,9 @@ class Submission:
         self.local_root = machine.context.temp_local_root
         return self
 
-    def run_submission(self, *, dry_run=False, exit_on_submit=False, clean=True):
+    def run_submission(
+        self, *, dry_run=False, exit_on_submit=False, clean=True, check_interval=30
+    ):
         """Main method to execute the submission.
         First, check whether old Submission exists on the remote machine, and try to recover from it.
         Second, upload the local files to the remote machine where the tasks to be executed.
@@ -235,11 +240,11 @@ class Submission:
                 dlog.info(f"at {self.machine.context.remote_root}")
                 return self.serialize()
             if ratio_unfinished > 0.0 and self.check_ratio_unfinished(ratio_unfinished):
-                self.remove_unfinished_jobs()
+                self.remove_unfinished_tasks()
                 break
 
             try:
-                time.sleep(30)
+                time.sleep(check_interval)
             except (Exception, KeyboardInterrupt, SystemExit) as e:
                 self.submission_to_json()
                 dlog.exception(e)
@@ -253,12 +258,72 @@ class Submission:
         finally:
             pass
         self.handle_unexpected_submission_state()
-        self.download_jobs()
+        self.try_download_result()
         self.submission_to_json()
         if clean:
             self.clean_jobs()
         return self.serialize()
 
+    def try_download_result(self):
+        start_time = time.time()
+        retry_interval = 60  # retry once per minute
+        success = False
+        while not success:
+            try:
+                self.download_jobs()
+                success = True
+            except (EOFError, Exception) as e:
+                dlog.exception(e)
+                elapsed_time = time.time() - start_time
+                if elapsed_time < 3600:  # within the first hour
+                    dlog.info("Retrying in 1 minute...")
+                    time.sleep(retry_interval)
+                elif elapsed_time < 86400:  # after 1 hour, but within 24 hours
+                    retry_interval = 600  # retry once every 10 minutes
+                    dlog.info("Retrying in 10 minutes...")
+                    time.sleep(retry_interval)
+                else:  # more than 24 hours
+                    dlog.info("Maximum retries time reached. Exiting.")
+                    break
+
+    async def async_run_submission(self, **kwargs):
+        """Async interface of run_submission.
+
+        Examples
+        --------
+        >>> import asyncio
+        >>> from dpdispacher import Machine, Resource, Submission
+        >>> async def run_jobs():
+        ...     backgroud_task = set()
+        ...     # task1
+        ...     task1 = Task(...)
+        ...     submission1 = Submission(..., task_list=[task1])
+        ...     background_task = asyncio.create_task(
+        ...         submission1.async_run_submission(check_interval=2, clean=False)
+        ...     )
+        ...     # task2
+        ...     task2 = Task(...)
+        ...     submission2 = Submission(..., task_list=[task1])
+        ...     background_task = asyncio.create_task(
+        ...         submission2.async_run_submission(check_interval=2, clean=False)
+        ...     )
+        ...     background_tasks.add(background_task)
+        ...     result = await asyncio.gather(*background_tasks)
+        ...     return result
+        >>> run_jobs()
+
+        May raise Error if pass `clean=True` explicitly when submit to pbs or slurm.
+        """
+        kwargs = {**{"clean": False}, **kwargs}
+        if kwargs["clean"]:
+            dlog.warning(
+                "Using async submission with `clean=True`, "
+                "job may fail in queue system"
+            )
+        loop = asyncio.get_event_loop()
+        wrapped_submission = functools.partial(self.run_submission, **kwargs)
+        return await loop.run_in_executor(None, wrapped_submission)
+
     def update_submission_state(self):
         """Check whether all the jobs in the submission.
 
@@ -306,41 +371,53 @@ class Submission:
 
         # def update_submi
 
-    def check_ratio_unfinished(self, ratio_unfinished):
-        status_list = [job.job_state for job in self.belonging_jobs]
-        finished_num = status_list.count(JobStatus.finished)
-        if finished_num / len(self.belonging_jobs) < (1 - ratio_unfinished):
-            return False
+    def check_ratio_unfinished(self, ratio_unfinished: float) -> bool:
+        """Calculate the ratio of unfinished tasks in the submission.
+
+        Parameters
+        ----------
+        ratio_unfinished : float
+            the ratio of unfinished tasks in the submission
+
+        Returns
+        -------
+        bool
+            whether the ratio of unfinished tasks in the submission is larger than ratio_unfinished
+        """
+        assert self.resources is not None
+        if self.resources.group_size == 1:
+            # if group size is 1, calculate job state is enough and faster
+            status_list = [job.job_state for job in self.belonging_jobs]
         else:
-            return True
+            # get task state is more accurate
+            status_list = []
+            for task in self.belonging_tasks:
+                task.get_task_state(self.machine.context)
+                status_list.append(task.task_state)
+        finished_num = status_list.count(JobStatus.finished)
+        return finished_num / len(self.belonging_tasks) >= (1 - ratio_unfinished)
 
-    def remove_unfinished_jobs(self):
-
-
-
-            if job.job_state
-
-
-
-        ]
-        for
-
-
-
-
-
-
-
-
-
-
-
-                shutil.rmtree(
-                    os.path.join(self.machine.context.local_root, task.task_work_path),
-                    ignore_errors=True,
-                )
-            self.belonging_tasks = [
-                task for task in self.belonging_tasks if task not in job.job_task_list
+    def remove_unfinished_tasks(self):
+        dlog.info("Remove unfinished tasks")
+        # kill all jobs and mark them as finished
+        for job in self.belonging_jobs:
+            if job.job_state != JobStatus.finished:
+                self.machine.kill(job)
+                job.job_state = JobStatus.finished
+        # remove all unfinished tasks
+        finished_tasks = []
+        for task in self.belonging_tasks:
+            if task.task_state == JobStatus.finished:
+                finished_tasks.append(task)
+        # there is no need to remove actual remote directory
+        # as it should be cleaned anyway
+        self.belonging_tasks = finished_tasks
+        # clean removed tasks in jobs - although this should not be necessary
+        for job in self.belonging_jobs:
+            job.job_task_list = [
+                task
+                for task in job.job_task_list
+                if task.task_state == JobStatus.finished
            ]
 
     def check_all_finished(self):
@@ -463,6 +540,9 @@ class Submission:
         submission.bind_machine(machine=self.machine)
         if self == submission:
             self.belonging_jobs = submission.belonging_jobs
+            self.belonging_tasks = [
+                task for job in self.belonging_jobs for task in job.job_task_list
+            ]
             self.bind_machine(machine=self.machine)
             dlog.info(
                 f"Find old submission; recover submission from json file;"
@@ -518,6 +598,7 @@ class Task:
         self.task_hash = self.get_hash()
         # self.task_need_resources="<to be completed in the future>"
         # self.uuid =
+        self.task_state = JobStatus.unsubmitted
 
     def __repr__(self):
         return str(self.serialize())
@@ -602,15 +683,44 @@ class Task:
                 default=[],
             ),
             Argument(
-                "outlog", str, optional=False, doc=doc_outlog, default="log"
+                "outlog",
+                [type(None), str],
+                optional=False,
+                doc=doc_outlog,
+                default="log",
             ),
             Argument(
-                "errlog", str, optional=False, doc=doc_errlog, default="err"
+                "errlog",
+                [type(None), str],
+                optional=False,
+                doc=doc_errlog,
+                default="err",
             ),
         ]
         task_format = Argument("task", dict, task_args)
         return task_format
 
+    def get_task_state(self, context):
+        """Get the task state by checking the tag file.
+
+        Parameters
+        ----------
+        context : Context
+            the context of the task
+        """
+        if self.task_state in (JobStatus.finished, JobStatus.unsubmitted):
+            # finished task should always be finished
+            # unsubmitted task do not need to check tag
+            return
+        # check tag
+        task_tag_finished = (
+            pathlib.PurePath(self.task_work_path)
+            / (self.task_hash + "_task_tag_finished")
+        ).as_posix()
+        result = context.check_file_exists(task_tag_finished)
+        if result:
+            self.task_state = JobStatus.finished
+
 
 class Job:
     """Job is generated by Submission automatically.
@@ -700,6 +810,8 @@ class Job:
         job.job_id = job_dict[job_hash]["job_id"]
         job.fail_count = job_dict[job_hash]["fail_count"]
         # job.job_uuid = job_dict[job_hash]['job_uuid']
+        for task in job.job_task_list:
+            task.task_state = job.job_state
         return job
 
     def get_job_state(self):
@@ -715,6 +827,11 @@ class Job:
         assert self.machine is not None
         job_state = self.machine.check_status(self)
         self.job_state = job_state
+        # update general task_state, which should be faster than checking tags
+        for task in self.job_task_list:
+            # only update if the task is not finished
+            if task.task_state != JobStatus.finished:
+                task.task_state = job_state
 
     def handle_unexpected_job_state(self):
         job_state = self.job_state
@@ -838,7 +955,7 @@ class Resources:
         If true, dpdispatcher will manually export environment variable CUDA_VISIBLE_DEVICES to different task.
         Usually, this option will be used with Task.task_need_resources variable simultaneously.
     ratio_unfinished : float
-        The ratio of `jobs` that can be unfinished.
+        The ratio of `task` that can be unfinished.
     para_deg : int
         Decide how many tasks will be run in parallel.
         Usually run with `strategy['if_cuda_multi_devices']`
@@ -1010,7 +1127,7 @@ class Resources:
             "If true, dpdispatcher will manually export environment variable CUDA_VISIBLE_DEVICES to different task."
            "Usually, this option will be used with Task.task_need_resources variable simultaneously."
         )
-        doc_ratio_unfinished = "The ratio of `jobs` that can be unfinished."
+        doc_ratio_unfinished = "The ratio of `tasks` that can be unfinished."
 
         strategy_args = [
             Argument(
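
`run_submission` is now keyword-only and exposes the polling interval that used to be hard-coded as `time.sleep(30)`. A hedged usage sketch; the machine and resources dictionaries below are hypothetical examples for a local shell run, not values taken from this diff:

```python
from dpdispatcher import Machine, Resources, Submission, Task

# Hypothetical local setup; a real use would substitute its own
# machine/resources dictionaries and task commands.
machine = Machine.load_from_dict(
    {
        "batch_type": "Shell",
        "context_type": "LocalContext",
        "local_root": "./",
        "remote_root": "/tmp/dpdispatcher_work",  # hypothetical path
    }
)
resources = Resources.load_from_dict(
    {"number_node": 1, "cpu_per_node": 1, "gpu_per_node": 0,
     "queue_name": "", "group_size": 1}
)
task = Task(command="echo hello", task_work_path="./")
submission = Submission(
    work_base="./", machine=machine, resources=resources, task_list=[task]
)
# poll every 10 s instead of the former hard-coded 30 s; keep remote files
submission.run_submission(check_interval=10, clean=False)
```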
{dpdispatcher-0.5.6.dist-info → dpdispatcher-0.5.8.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dpdispatcher
-Version: 0.5.6
+Version: 0.5.8
 Summary: Generate HPC scheduler systems jobs input scripts, submit these scripts to HPC systems, and poke until they finish
 Author: DeepModeling
 License: GNU LESSER GENERAL PUBLIC LICENSE
@@ -204,15 +204,20 @@ Provides-Extra: test
 
 # DPDispatcher
 
-DPDispatcher is a python package used to generate HPC (High Performance Computing) scheduler systems (Slurm/PBS/LSF/dpcloudserver) jobs input scripts and submit these scripts to HPC systems and poke until they finish
+[](https://anaconda.org/conda-forge/dpdispatcher)
+[](https://pypi.org/project/dpdispatcher)
+[](https://hub.docker.com/r/dptechnology/dpdispatcher)
+[](https://dpdispatcher.readthedocs.io/)
+
+DPDispatcher is a Python package used to generate HPC (High-Performance Computing) scheduler systems (Slurm/PBS/LSF/Bohrium) jobs input scripts, submit them to HPC systems, and poke until they finish.
 
-DPDispatcher will monitor (poke) until these jobs finish and download the results files (if these jobs
+DPDispatcher will monitor (poke) until these jobs finish and download the results files (if these jobs are running on remote systems connected by SSH).
 
 For more information, check the [documentation](https://dpdispatcher.readthedocs.io/).
 
 ## Installation
 
-DPDispatcher can installed by `pip`:
+DPDispatcher can be installed by `pip`:
 
 ```bash
 pip install dpdispatcher
@@ -224,5 +229,9 @@ See [Getting Started](https://dpdispatcher.readthedocs.io/en/latest/getting-star
 
 ## Contributing
 
-DPDispatcher is maintained by Deep Modeling's developers and
+DPDispatcher is maintained by Deep Modeling's developers and welcomes other people.
 See [Contributing Guide](CONTRIBUTING.md) to become a contributor! 🤓
+
+## References
+
+DPDispatcher is derivated from the [DP-GEN](https://github.com/deepmodeling/dpgen) package. To mention DPDispatcher in a scholarly publication, please read Section 3.3 in the [DP-GEN paper](https://doi.org/10.1016/j.cpc.2020.107206).
dpdispatcher-0.5.8.dist-info/RECORD
ADDED
@@ -0,0 +1,34 @@
+dpdispatcher/JobStatus.py,sha256=Eszs4TPLfszCuf6zLaFonf25feXDUguF28spYOjJpQE,233
+dpdispatcher/__init__.py,sha256=2GIz4niyzHTbxros1G7Mi4uBJbD3AMSnTPxXSJMJmUs,2907
+dpdispatcher/_version.py,sha256=iqWtoISytDDNpYe-atC8Kl-rZhTojPnDQKAEcFNtIhg,160
+dpdispatcher/arginfo.py,sha256=pNaxYIE6ahBidpR7OCKZdw8iGt003uTXGSlVzwiuvRg,188
+dpdispatcher/base_context.py,sha256=Hfri0x41XC4MRUjxc0-WMiZB_E4NvLp94ZYaHfYCWHM,3610
+dpdispatcher/distributed_shell.py,sha256=XMcXt8g1f2DY5HYhhyiN5ehV2ihKULY5ng-sB0B7YaI,6933
+dpdispatcher/dp_cloud_server.py,sha256=xVpDI0exBwHNSZECLJdfrQsvBzeUn5a0gx5Bzt9UAdU,9857
+dpdispatcher/dp_cloud_server_context.py,sha256=VfRRo4ruorWC8NVjW19EjmxQ0Rbz6XzxrHrJKl4cCZk,11255
+dpdispatcher/dpdisp.py,sha256=_dyH8xEgUR-s2xKkB20D9FIYhSHUCmzc2PxWgo9ildQ,94
+dpdispatcher/fugaku.py,sha256=wSjY0XB3TNNWAPKHgMpoPl5jyYJIlijBcEkYXp6nrZQ,3733
+dpdispatcher/hdfs_cli.py,sha256=9Vrf7Kz_kJgXP2xEdZqNVNxRGbui5RrtnLtEjxfcq9A,6047
+dpdispatcher/hdfs_context.py,sha256=1jT1nzx7VGJFJ42MHTXoFWhfEu4KBkMBJO84klRAnPI,8938
+dpdispatcher/lazy_local_context.py,sha256=ZdWNqK3QF8SsoqnCjpFt3ZDRCIagjzJNlKPUYutRUC8,5692
+dpdispatcher/local_context.py,sha256=anYJqQASOnkcAhfckUcFD8_DcjNUZ1KE0GuksxR5Mxw,11772
+dpdispatcher/lsf.py,sha256=zy-WEnC7f2Dy5hJGnRBl5jpjYZ_H3-KMcE0lxDG6ejo,7790
+dpdispatcher/machine.py,sha256=31xG5ksN8mBVwD8taLsk5KXLhjM0ZTjlHlbbPgiig1c,15296
+dpdispatcher/pbs.py,sha256=LiULEKNDuisrKmOpZyB1af6sGDQ35xrAhMh7VMwpFbY,6327
+dpdispatcher/shell.py,sha256=kEP7za-qN71y_21p0uBNkopZ5s63Adq54904hjUHv48,4141
+dpdispatcher/slurm.py,sha256=krlyjzxK8gIhSsqcKHFvNiUwVE7411wTUwuW9xGzS-E,14648
+dpdispatcher/ssh_context.py,sha256=7Xrm8biVA7tAEDJ6YJZzC3nbdQrVBr_5UOhQNQ7qJ2g,35032
+dpdispatcher/submission.py,sha256=r_F05nHTpN86b2os8RZAjZsCILNarDko2BjAEUYSntw,46643
+dpdispatcher/utils.py,sha256=RXUHJl3S2z26Em3SeltnxtdVM3kv7weXJKvBEjG6I34,5035
+dpdispatcher/dpcloudserver/__init__.py,sha256=FnX9HH-2dXADluNfucg98JPMfruMoBpN9ER9lZkVQvQ,49
+dpdispatcher/dpcloudserver/client.py,sha256=w1wQ8g-FMQlyh00LIAbJLE1xirGXocpp7zAnhbeM4V0,11152
+dpdispatcher/dpcloudserver/config.py,sha256=vBRtzExJXTGfXPeBObXrZNAhBNXoFFzMkzSuSrrjHEQ,635
+dpdispatcher/dpcloudserver/retcode.py,sha256=1qAF8gFZx55u2sO8KbtYSIIrjcO-IGufEUlwbkSfC1g,721
+dpdispatcher/dpcloudserver/temp_test.py,sha256=jklOSu7tZ_wW5gycGRiUsbBWMLZDqCBslSYOCb2hTHw,2932
+dpdispatcher/dpcloudserver/zip_file.py,sha256=f9WrlktwHW0YipaWg5Y0kxjMZlhD1cJYa6EUpvu4Cro,2611
+dpdispatcher-0.5.8.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
+dpdispatcher-0.5.8.dist-info/METADATA,sha256=o2oD8_6Ohc04mRTkJWi51-KOPamYqH0kvUD-E0iW-c0,12280
+dpdispatcher-0.5.8.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
+dpdispatcher-0.5.8.dist-info/entry_points.txt,sha256=3bKn6IB6SYhKOUbbcOdBBevz4gsDmhmbogKMVn4ptOQ,52
+dpdispatcher-0.5.8.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
+dpdispatcher-0.5.8.dist-info/RECORD,,
dpdispatcher-0.5.6.dist-info/RECORD
REMOVED
@@ -1,33 +0,0 @@
-dpdispatcher/JobStatus.py,sha256=Eszs4TPLfszCuf6zLaFonf25feXDUguF28spYOjJpQE,233
-dpdispatcher/__init__.py,sha256=U8OLDjSGHxILiz8XH-HYBxjIlhD429HEqqxQ-vVK1a4,2866
-dpdispatcher/_version.py,sha256=J0O-QTcfk70wRsnrg-XApMYr8T5heaCiqHkl3PJ9zfs,160
-dpdispatcher/arginfo.py,sha256=pNaxYIE6ahBidpR7OCKZdw8iGt003uTXGSlVzwiuvRg,188
-dpdispatcher/base_context.py,sha256=XTKN0T_ffhVipEsbVEYNPmbKds8qMuwWbCixAhV8mUc,3690
-dpdispatcher/distributed_shell.py,sha256=vbNT8VHaYwEu2zIVFyosQGji4C3_QSpWKMFZURkJC7c,6941
-dpdispatcher/dp_cloud_server.py,sha256=mwXt2rtQeW4uMYBP05JcYE5vm5dp2JeitxiS8KjpFQc,9758
-dpdispatcher/dp_cloud_server_context.py,sha256=t47Kfn3cyQ223cBU-HVVdKvTm8yQiqHsFcMnQcIXdgk,11300
-dpdispatcher/dpdisp.py,sha256=_dyH8xEgUR-s2xKkB20D9FIYhSHUCmzc2PxWgo9ildQ,94
-dpdispatcher/hdfs_cli.py,sha256=9Vrf7Kz_kJgXP2xEdZqNVNxRGbui5RrtnLtEjxfcq9A,6047
-dpdispatcher/hdfs_context.py,sha256=IGvXsw9wdR8aemQ9kOE5WaciwVLtZbr-t2mrCQjxywU,8980
-dpdispatcher/lazy_local_context.py,sha256=V0jVuAgOHKw_PkYPCKnr3OkMyWfLRTJ8B7As3VCzLX8,5775
-dpdispatcher/local_context.py,sha256=8tML77WRSydJoPA6DseYsIshV-id5xO-6kWsqDsJHQ0,11855
-dpdispatcher/lsf.py,sha256=PjsjNO8YZkWBzFe_277G1oVrLcAm1Qz1fN-3FZ4dsK4,7553
-dpdispatcher/machine.py,sha256=vafq9zTW6NYZ-ZOMEfVEiZkoZzbEjco09d4VwEb9Jk8,14949
-dpdispatcher/pbs.py,sha256=RScX8rX1lGlvilgCEFxzUAeTA-Em5AeAcM2yT2OKY3s,6057
-dpdispatcher/shell.py,sha256=SZoJynOmqldMrl-lIMsNOY1RELFNFWWffeSg7XJsi9g,3843
-dpdispatcher/slurm.py,sha256=bsSTaRe5t3-z5BdjMyerGGBEXjJ_BfzMcawNMaFULfs,12886
-dpdispatcher/ssh_context.py,sha256=aeGiTUmBzfwkZ4xgNsyXBSpHAUHoUEVltidkNFWLXUE,34670
-dpdispatcher/submission.py,sha256=5qt1Nw3qRQeFq-w5DvzoWb9K-qMLiOFUjy_JhtnxFfI,41960
-dpdispatcher/utils.py,sha256=RXUHJl3S2z26Em3SeltnxtdVM3kv7weXJKvBEjG6I34,5035
-dpdispatcher/dpcloudserver/__init__.py,sha256=FnX9HH-2dXADluNfucg98JPMfruMoBpN9ER9lZkVQvQ,49
-dpdispatcher/dpcloudserver/client.py,sha256=a1KzBbbBKz6ZMH9iWhQfrdhL5BwrWevLS1vlTK4WP8w,11154
-dpdispatcher/dpcloudserver/config.py,sha256=vBRtzExJXTGfXPeBObXrZNAhBNXoFFzMkzSuSrrjHEQ,635
-dpdispatcher/dpcloudserver/retcode.py,sha256=1qAF8gFZx55u2sO8KbtYSIIrjcO-IGufEUlwbkSfC1g,721
-dpdispatcher/dpcloudserver/temp_test.py,sha256=jklOSu7tZ_wW5gycGRiUsbBWMLZDqCBslSYOCb2hTHw,2932
-dpdispatcher/dpcloudserver/zip_file.py,sha256=f9WrlktwHW0YipaWg5Y0kxjMZlhD1cJYa6EUpvu4Cro,2611
-dpdispatcher-0.5.6.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
-dpdispatcher-0.5.6.dist-info/METADATA,sha256=KtfW7Uy8R5YHqMkRH3oDE5jh3JHXk_RcZ1W5H_gv2EM,11481
-dpdispatcher-0.5.6.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
-dpdispatcher-0.5.6.dist-info/entry_points.txt,sha256=3bKn6IB6SYhKOUbbcOdBBevz4gsDmhmbogKMVn4ptOQ,52
-dpdispatcher-0.5.6.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
-dpdispatcher-0.5.6.dist-info/RECORD,,
{dpdispatcher-0.5.6.dist-info → dpdispatcher-0.5.8.dist-info}/LICENSE
File without changes
{dpdispatcher-0.5.6.dist-info → dpdispatcher-0.5.8.dist-info}/WHEEL
File without changes
{dpdispatcher-0.5.6.dist-info → dpdispatcher-0.5.8.dist-info}/entry_points.txt
File without changes
{dpdispatcher-0.5.6.dist-info → dpdispatcher-0.5.8.dist-info}/top_level.txt
File without changes