dpdispatcher 0.6.6__py3-none-any.whl → 0.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dpdispatcher/_version.py +2 -2
- dpdispatcher/base_context.py +60 -1
- dpdispatcher/contexts/dp_cloud_server_context.py +5 -0
- dpdispatcher/contexts/hdfs_context.py +5 -0
- dpdispatcher/contexts/lazy_local_context.py +0 -17
- dpdispatcher/contexts/local_context.py +57 -31
- dpdispatcher/contexts/openapi_context.py +5 -0
- dpdispatcher/contexts/ssh_context.py +16 -38
- dpdispatcher/machine.py +12 -0
- dpdispatcher/machines/JH_UniScheduler.py +2 -3
- dpdispatcher/machines/distributed_shell.py +2 -4
- dpdispatcher/machines/lsf.py +1 -2
- dpdispatcher/machines/pbs.py +14 -10
- dpdispatcher/machines/shell.py +7 -8
- dpdispatcher/machines/slurm.py +18 -27
- dpdispatcher/submission.py +4 -11
- dpdispatcher/utils/hdfs_cli.py +6 -11
- dpdispatcher/utils/utils.py +1 -1
- {dpdispatcher-0.6.6.dist-info → dpdispatcher-0.6.8.dist-info}/METADATA +25 -25
- {dpdispatcher-0.6.6.dist-info → dpdispatcher-0.6.8.dist-info}/RECORD +24 -24
- {dpdispatcher-0.6.6.dist-info → dpdispatcher-0.6.8.dist-info}/WHEEL +1 -1
- {dpdispatcher-0.6.6.dist-info → dpdispatcher-0.6.8.dist-info}/LICENSE +0 -0
- {dpdispatcher-0.6.6.dist-info → dpdispatcher-0.6.8.dist-info}/entry_points.txt +0 -0
- {dpdispatcher-0.6.6.dist-info → dpdispatcher-0.6.8.dist-info}/top_level.txt +0 -0
dpdispatcher/_version.py
CHANGED
dpdispatcher/base_context.py
CHANGED
@@ -1,5 +1,5 @@
 from abc import ABCMeta, abstractmethod
-from typing import List, Tuple
+from typing import Any, List, Tuple
 
 from dargs import Argument
 
@@ -73,6 +73,65 @@ class BaseContext(metaclass=ABCMeta):
     def check_finish(self, proc):
         raise NotImplementedError("abstract method")
 
+    def block_checkcall(self, cmd, asynchronously=False) -> Tuple[Any, Any, Any]:
+        """Run command with arguments. Wait for command to complete.
+
+        Parameters
+        ----------
+        cmd : str
+            The command to run.
+        asynchronously : bool, optional, default=False
+            Run command asynchronously. If True, `nohup` will be used to run the command.
+
+        Returns
+        -------
+        stdin
+            standard input
+        stdout
+            standard output
+        stderr
+            standard error
+
+        Raises
+        ------
+        RuntimeError
+            when the return code is not zero
+        """
+        if asynchronously:
+            cmd = f"nohup {cmd} >/dev/null &"
+        exit_status, stdin, stdout, stderr = self.block_call(cmd)
+        if exit_status != 0:
+            raise RuntimeError(
+                "Get error code {} in calling {} with job: {} . message: {}".format(
+                    exit_status,
+                    cmd,
+                    self.submission.submission_hash,
+                    stderr.read().decode("utf-8"),
+                )
+            )
+        return stdin, stdout, stderr
+
+    @abstractmethod
+    def block_call(self, cmd) -> Tuple[int, Any, Any, Any]:
+        """Run command with arguments. Wait for command to complete.
+
+        Parameters
+        ----------
+        cmd : str
+            The command to run.
+
+        Returns
+        -------
+        exit_status
+            exit code
+        stdin
+            standard input
+        stdout
+            standard output
+        stderr
+            standard error
+        """
+
     @classmethod
     def machine_arginfo(cls) -> Argument:
         """Generate the machine arginfo.
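The upshot of this refactor is that `block_checkcall` moves into `BaseContext` and `block_call` becomes the abstract primitive each context must supply. A minimal standalone sketch of the new contract (the `MockLocalContext` class below is illustrative, not part of the package):

import io
import subprocess as sp


class MockLocalContext:
    # The primitive: return (exit_status, stdin, stdout, stderr),
    # mirroring the shape block_call now promises.
    def block_call(self, cmd):
        proc = sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
        out, err = proc.communicate()
        return proc.returncode, None, io.BytesIO(out), io.BytesIO(err)

    # The shared wrapper: raise on nonzero exit, as BaseContext now does.
    def block_checkcall(self, cmd):
        exit_status, stdin, stdout, stderr = self.block_call(cmd)
        if exit_status != 0:
            raise RuntimeError(
                f"Get error code {exit_status} in calling {cmd}: "
                f"{stderr.read().decode('utf-8')}"
            )
        return stdin, stdout, stderr


ctx = MockLocalContext()
_, stdout, _ = ctx.block_checkcall("echo hello")
print(stdout.read().decode())  # -> hello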
dpdispatcher/contexts/dp_cloud_server_context.py
CHANGED

@@ -335,6 +335,11 @@ class BohriumContext(BaseContext):
             )
         ]
 
+    def block_call(self, cmd):
+        raise RuntimeError(
+            "Unsupported method. You may use an unsupported combination of the machine and the context."
+        )
+
 
 DpCloudServerContext = BohriumContext
 LebesgueContext = BohriumContext
dpdispatcher/contexts/hdfs_context.py
CHANGED

@@ -244,3 +244,8 @@ class HDFSContext(BaseContext):
 
     def read_file(self, fname):
         return HDFS.read_hdfs_file(os.path.join(self.remote_root, fname))
+
+    def block_call(self, cmd):
+        raise RuntimeError(
+            "Unsupported method. You may use an unsupported combination of the machine and the context."
+        )
dpdispatcher/contexts/lazy_local_context.py
CHANGED

@@ -112,23 +112,6 @@ class LazyLocalContext(BaseContext):
         # else:
         #     raise RuntimeError('do not find download file ' + fname)
 
-    def block_checkcall(self, cmd):
-        # script_dir = os.path.join(self.local_root, self.submission.work_base)
-        # os.chdir(script_dir)
-        proc = sp.Popen(
-            cmd, cwd=self.local_root, shell=True, stdout=sp.PIPE, stderr=sp.PIPE
-        )
-        o, e = proc.communicate()
-        stdout = SPRetObj(o)
-        stderr = SPRetObj(e)
-        code = proc.returncode
-        if code != 0:
-            raise RuntimeError(
-                "Get error code %d in locally calling %s with job: %s ",
-                (code, cmd, self.submission.submission_hash),
-            )
-        return None, stdout, stderr
-
     def block_call(self, cmd):
         proc = sp.Popen(
             cmd, cwd=self.local_root, shell=True, stdout=sp.PIPE, stderr=sp.PIPE
dpdispatcher/contexts/local_context.py
CHANGED

@@ -3,6 +3,9 @@ import shutil
 import subprocess as sp
 from glob import glob
 from subprocess import TimeoutExpired
+from typing import List
+
+from dargs import Argument
 
 from dpdispatcher.base_context import BaseContext
 from dpdispatcher.dlog import dlog
@@ -60,6 +63,7 @@ class LocalContext(BaseContext):
         self.temp_local_root = os.path.abspath(local_root)
         self.temp_remote_root = os.path.abspath(remote_root)
         self.remote_profile = remote_profile
+        self.symlink = remote_profile.get("symlink", True)
 
     @classmethod
     def load_from_dict(cls, context_dict):
@@ -83,6 +87,25 @@ class LocalContext(BaseContext):
             self.temp_remote_root, submission.submission_hash
         )
 
+    def _copy_from_local_to_remote(self, local_path, remote_path):
+        if not os.path.exists(local_path):
+            raise FileNotFoundError(
+                f"cannot find uploaded file {os.path.join(local_path)}"
+            )
+        if os.path.exists(remote_path):
+            os.remove(remote_path)
+        _check_file_path(remote_path)
+
+        if self.symlink:
+            # ensure the file exists
+            os.symlink(local_path, remote_path)
+        elif os.path.isfile(local_path):
+            shutil.copyfile(local_path, remote_path)
+        elif os.path.isdir(local_path):
+            shutil.copytree(local_path, remote_path)
+        else:
+            raise ValueError(f"Unknown file type: {local_path}")
+
     def upload(self, submission):
         os.makedirs(self.remote_root, exist_ok=True)
         for ii in submission.belonging_tasks:
@@ -103,14 +126,9 @@ class LocalContext(BaseContext):
         file_list.extend(rel_file_list)
 
         for jj in file_list:
-            if not os.path.exists(os.path.join(local_job, jj)):
-                raise FileNotFoundError(
-                    "cannot find upload file " + os.path.join(local_job, jj)
-                )
-            if os.path.exists(os.path.join(remote_job, jj)):
-                os.remove(os.path.join(remote_job, jj))
-            _check_file_path(os.path.join(remote_job, jj))
-            os.symlink(os.path.join(local_job, jj), os.path.join(remote_job, jj))
+            self._copy_from_local_to_remote(
+                os.path.join(local_job, jj), os.path.join(remote_job, jj)
+            )
 
         local_job = self.local_root
         remote_job = self.remote_root
@@ -128,14 +146,9 @@ class LocalContext(BaseContext):
         file_list.extend(rel_file_list)
 
         for jj in file_list:
-            if not os.path.exists(os.path.join(local_job, jj)):
-                raise FileNotFoundError(
-                    "cannot find upload file " + os.path.join(local_job, jj)
-                )
-            if os.path.exists(os.path.join(remote_job, jj)):
-                os.remove(os.path.join(remote_job, jj))
-            _check_file_path(os.path.join(remote_job, jj))
-            os.symlink(os.path.join(local_job, jj), os.path.join(remote_job, jj))
+            self._copy_from_local_to_remote(
+                os.path.join(local_job, jj), os.path.join(remote_job, jj)
+            )
 
     def download(
         self, submission, check_exists=False, mark_failure=True, back_error=False
@@ -288,21 +301,6 @@ class LocalContext(BaseContext):
         # no nothing in the case of linked files
         pass
 
-    def block_checkcall(self, cmd):
-        proc = sp.Popen(
-            cmd, cwd=self.remote_root, shell=True, stdout=sp.PIPE, stderr=sp.PIPE
-        )
-        o, e = proc.communicate()
-        stdout = SPRetObj(o)
-        stderr = SPRetObj(e)
-        code = proc.returncode
-        if code != 0:
-            raise RuntimeError(
-                f"Get error code {code} in locally calling {cmd} with job: {self.submission.submission_hash}"
-                f"\nStandard error: {stderr}"
-            )
-        return None, stdout, stderr
-
     def block_call(self, cmd):
         proc = sp.Popen(
             cmd, cwd=self.remote_root, shell=True, stdout=sp.PIPE, stderr=sp.PIPE
@@ -351,3 +349,31 @@ class LocalContext(BaseContext):
         stdout = None
         stderr = None
         return ret, stdout, stderr
+
+    @classmethod
+    def machine_subfields(cls) -> List[Argument]:
+        """Generate the machine subfields.
+
+        Returns
+        -------
+        list[Argument]
+            machine subfields
+        """
+        doc_remote_profile = "The information used to maintain the local machine."
+        return [
+            Argument(
+                "remote_profile",
+                dict,
+                optional=True,
+                doc=doc_remote_profile,
+                sub_fields=[
+                    Argument(
+                        "symlink",
+                        bool,
+                        optional=True,
+                        default=True,
+                        doc="Whether to use symbolic links to replace copy. This option should be turned off if the local directory is not accessible on the Batch system.",
+                    ),
+                ],
+            )
+        ]
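Because `symlink` defaults to True, existing machine configurations keep their old behavior; with it off, upload falls back to `shutil.copyfile`/`copytree` via the new `_copy_from_local_to_remote` helper. A hypothetical machine dict that disables it (all paths here are placeholders):

machine_dict = {
    "batch_type": "Slurm",
    "context_type": "LocalContext",
    "local_root": "./work",
    "remote_root": "/scratch/jobs",
    "remote_profile": {
        # copy files instead of symlinking, e.g. when the batch nodes
        # cannot resolve links into local_root
        "symlink": False,
    },
}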
dpdispatcher/contexts/openapi_context.py
CHANGED

@@ -258,3 +258,8 @@ class OpenAPIContext(BaseContext):
         dir_to_be_removed = os.path.join(local_root, "backup")
         if os.path.exists(dir_to_be_removed):
             shutil.rmtree(dir_to_be_removed)
+
+    def block_call(self, cmd):
+        raise RuntimeError(
+            "Unsupported method. You may use an unsupported combination of the machine and the context."
+        )
dpdispatcher/contexts/ssh_context.py
CHANGED

@@ -44,6 +44,7 @@ class SSHSession:
         totp_secret=None,
         tar_compress=True,
         look_for_keys=True,
+        execute_command=None,
     ):
         self.hostname = hostname
         self.username = username
@@ -56,6 +57,7 @@ class SSHSession:
         self.ssh = None
         self.tar_compress = tar_compress
         self.look_for_keys = look_for_keys
+        self.execute_command = execute_command
         self._keyboard_interactive_auth = False
         self._setup_ssh()
 
@@ -88,8 +90,7 @@ class SSHSession:
         while not self._check_alive():
             if count == max_check:
                 raise RuntimeError(
-                    "cannot connect ssh after %d failures at interval %d s"
-                    % (max_check, sleep_time)
+                    f"cannot connect ssh after {max_check} failures at interval {sleep_time} s"
                 )
             dlog.info("connection check failed, try to reconnect to " + self.hostname)
             self._setup_ssh()
@@ -237,6 +238,8 @@ class SSHSession:
         self.ssh._transport = ts  # type: ignore
         # reset sftp
         self._sftp = None
+        if self.execute_command is not None:
+            self.exec_command(self.execute_command)
 
     def inter_handler(self, title, instructions, prompt_list):
         """inter_handler: the callback for paramiko.transport.auth_interactive.
@@ -338,6 +341,7 @@ class SSHSession:
         doc_look_for_keys = (
             "enable searching for discoverable private key files in ~/.ssh/"
         )
+        doc_execute_command = "execute command after ssh connection is established."
         ssh_remote_profile_args = [
             Argument("hostname", str, optional=False, doc=doc_hostname),
             Argument("username", str, optional=False, doc=doc_username),
@@ -379,6 +383,13 @@ class SSHSession:
                 default=True,
                 doc=doc_look_for_keys,
             ),
+            Argument(
+                "execute_command",
+                str,
+                optional=True,
+                default=None,
+                doc=doc_execute_command,
+            ),
         ]
         ssh_remote_profile_format = Argument(
             "ssh_session", dict, ssh_remote_profile_args
@@ -438,7 +449,9 @@ class SSHContext(BaseContext):
         self.init_local_root = local_root
         self.init_remote_root = remote_root
         self.temp_local_root = os.path.abspath(local_root)
-        assert os.path.isabs(remote_root), "remote_root must be a abspath"
+        assert os.path.isabs(os.path.realpath(remote_root)), (
+            "remote_root must be a abspath"
+        )
         self.temp_remote_root = remote_root
         self.remote_profile = remote_profile
         self.remote_root = None
@@ -755,41 +768,6 @@ class SSHContext(BaseContext):
             tar_compress=self.remote_profile.get("tar_compress", None),
         )
 
-    def block_checkcall(self, cmd, asynchronously=False, stderr_whitelist=None):
-        """Run command with arguments. Wait for command to complete. If the return code
-        was zero then return, otherwise raise RuntimeError.
-
-        Parameters
-        ----------
-        cmd : str
-            The command to run.
-        asynchronously : bool, optional, default=False
-            Run command asynchronously. If True, `nohup` will be used to run the command.
-        stderr_whitelist : list of str, optional, default=None
-            If not None, the stderr will be checked against the whitelist. If the stderr
-            contains any of the strings in the whitelist, the command will be considered
-            successful.
-        """
-        assert self.remote_root is not None
-        self.ssh_session.ensure_alive()
-        if asynchronously:
-            cmd = f"nohup {cmd} >/dev/null &"
-        stdin, stdout, stderr = self.ssh_session.exec_command(
-            (f"cd {shlex.quote(self.remote_root)} ;") + cmd
-        )
-        exit_status = stdout.channel.recv_exit_status()
-        if exit_status != 0:
-            raise RuntimeError(
-                "Get error code %d in calling %s through ssh with job: %s . message: %s"
-                % (
-                    exit_status,
-                    cmd,
-                    self.submission.submission_hash,
-                    stderr.read().decode("utf-8"),
-                )
-            )
-        return stdin, stdout, stderr
-
     def block_call(self, cmd):
         assert self.remote_root is not None
         self.ssh_session.ensure_alive()
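The new `execute_command` runs once each time the SSH connection is (re)established, which suits hosts that need an initialization step before jobs can be submitted. An illustrative SSH remote profile (hostname, username, and the command are placeholders):

machine_dict = {
    "batch_type": "Slurm",
    "context_type": "SSHContext",
    "local_root": "./work",
    "remote_root": "/home/user/jobs",
    "remote_profile": {
        "hostname": "hpc.example.com",
        "username": "user",
        # run right after the connection is established,
        # e.g. to refresh a credential or start an agent
        "execute_command": "echo connected",
    },
}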
dpdispatcher/machine.py
CHANGED
@@ -161,6 +161,9 @@ class Machine(metaclass=ABCMeta):
             machine_dict["remote_profile"] = self.context.remote_profile
         else:
             machine_dict["remote_profile"] = {}
+        # normalize the dict
+        base = self.arginfo()
+        machine_dict = base.normalize_value(machine_dict, trim_pattern="_*")
         return machine_dict
 
     def __eq__(self, other):
@@ -265,6 +268,15 @@ class Machine(metaclass=ABCMeta):
 
         export_envs_part = ""
         envs = job.resources.envs
+        envs = {
+            # export resources information to the environment variables
+            "DPDISPATCHER_NUMBER_NODE": job.resources.number_node,
+            "DPDISPATCHER_CPU_PER_NODE": job.resources.cpu_per_node,
+            "DPDISPATCHER_GPU_PER_NODE": job.resources.gpu_per_node,
+            "DPDISPATCHER_QUEUE_NAME": job.resources.queue_name,
+            "DPDISPATCHER_GROUP_SIZE": job.resources.group_size,
+            **envs,
+        }
         for k, v in envs.items():
             if isinstance(v, list):
                 for each_value in v:
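Since the user-supplied `envs` dict is unpacked last, user values override the exported defaults. An illustrative task whose command reads the new variables at run time (the command and paths are placeholders):

task_dict = {
    "command": 'echo "nodes=$DPDISPATCHER_NUMBER_NODE cpus=$DPDISPATCHER_CPU_PER_NODE"',
    "task_work_path": "./",
    "forward_files": [],
    "backward_files": [],
}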
dpdispatcher/machines/JH_UniScheduler.py
CHANGED

@@ -39,7 +39,7 @@ class JH_UniScheduler(Machine):
         custom_gpu_line = resources.kwargs.get("custom_gpu_line", None)
         if not custom_gpu_line:
             script_header_dict["JH_UniScheduler_number_gpu_line"] = (
-                "" f"#JSUB -gpgpu {resources.gpu_per_node}"
+                f"#JSUB -gpgpu {resources.gpu_per_node}"
             )
         else:
             script_header_dict["JH_UniScheduler_number_gpu_line"] = custom_gpu_line
@@ -105,8 +105,7 @@ class JH_UniScheduler(Machine):
         elif ret != 0:
             # just retry when any unknown error raised.
             raise RetrySignal(
-                "Get error code %d in checking status with job: %s . message: %s"
-                % (ret, job.job_hash, err_str)
+                f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
             )
         status_out = stdout.read().decode("utf-8").split("\n")
         if len(status_out) < 2:
dpdispatcher/machines/distributed_shell.py
CHANGED

@@ -181,8 +181,7 @@ class DistributedShell(Machine):
         if ret != 0:
             err_str = stderr.decode("utf-8")
             raise RuntimeError(
-                "Command fails to execute, error message:%s\nreturn code %d\n"
-                % (err_str, ret)
+                f"Command {cmd} fails to execute, error message:{err_str}\nreturn code {ret}\n"
             )
         job_id = int(stdout.decode("utf-8").strip())
 
@@ -200,8 +199,7 @@ class DistributedShell(Machine):
         if ret != 0:
             err_str = stderr.decode("utf-8")
             raise RuntimeError(
-                "Command fails to execute, error message:%s\nreturn code %d\n"
-                % (err_str, ret)
+                f"Command fails to execute, error message:{err_str}\nreturn code {ret}\n"
             )
 
         if_job_exists = bool(stdout.decode("utf-8").strip())
dpdispatcher/machines/lsf.py
CHANGED
@@ -129,8 +129,7 @@ class LSF(Machine):
         elif ret != 0:
             # just retry when any unknown error raised.
             raise RetrySignal(
-                "Get error code %d in checking status with job: %s . message: %s"
-                % (ret, job.job_hash, err_str)
+                f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
             )
         status_out = stdout.read().decode("utf-8").split("\n")
         if len(status_out) < 2:
dpdispatcher/machines/pbs.py
CHANGED
@@ -76,7 +76,8 @@ class PBS(Machine):
         job_id = job.job_id
         if job_id == "":
             return JobStatus.unsubmitted
-        ret, stdin, stdout, stderr = self.context.block_call("qstat -x " + job_id)
+        command = "qstat -x " + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         err_str = stderr.read().decode("utf-8")
         if ret != 0:
             if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
@@ -86,8 +87,7 @@ class PBS(Machine):
                 return JobStatus.terminated
             else:
                 raise RuntimeError(
-                    "status command qstat fails to execute. erro info: %s return code %d"
-                    % (err_str, ret)
+                    f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
                 )
         status_line = stdout.read().decode("utf-8").split("\n")[-2]
         status_word = status_line.split()[-2]
@@ -126,7 +126,8 @@ class Torque(PBS):
         job_id = job.job_id
         if job_id == "":
             return JobStatus.unsubmitted
-        ret, stdin, stdout, stderr = self.context.block_call("qstat -l " + job_id)
+        command = "qstat -l " + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         err_str = stderr.read().decode("utf-8")
         if ret != 0:
             if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
@@ -136,8 +137,7 @@ class Torque(PBS):
                 return JobStatus.terminated
             else:
                 raise RuntimeError(
-                    "status command qstat fails to execute. erro info: %s return code %d"
-                    % (err_str, ret)
+                    f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
                 )
         status_line = stdout.read().decode("utf-8").split("\n")[-2]
         status_word = status_line.split()[-2]
@@ -259,15 +259,17 @@ class SGE(PBS):
         pass
 
     def check_status(self, job):
+        ### https://softpanorama.org/HPC/Grid_engine/Queues/queue_states.shtml
         job_id = job.job_id
         status_line = None
         if job_id == "":
             return JobStatus.unsubmitted
-        ret, stdin, stdout, stderr = self.context.block_call("qstat")
+        command = "qstat"
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         err_str = stderr.read().decode("utf-8")
         if ret != 0:
             raise RuntimeError(
-                f"status command qstat fails to execute. erro info: {err_str} return code {ret}"
+                f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
             )
         status_text_list = stdout.read().decode("utf-8").split("\n")
         for txt in status_text_list:
@@ -291,10 +293,12 @@ class SGE(PBS):
         else:
             status_word = status_line.split()[4]
             # dlog.info (status_word)
-            if status_word in ["qw"]:
+            if status_word in ["qw", "hqw", "t"]:
                 return JobStatus.waiting
-            elif status_word in ["r"]:
+            elif status_word in ["r", "Rr"]:
                 return JobStatus.running
+            elif status_word in ["Eqw", "dr", "dt"]:
+                return JobStatus.terminated
             else:
                 return JobStatus.unknown
 
dpdispatcher/machines/shell.py
CHANGED
@@ -38,14 +38,12 @@ class Shell(Machine):
         script_run_str = self.gen_script_command(job)
         script_run_file_name = f"{job.script_file_name}.run"
         self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
-        ret, stdin, stdout, stderr = self.context.block_call(
-            f"cd {shlex.quote(self.context.remote_root)} && {{ nohup bash {script_file_name} 1>>{output_name} 2>>{output_name} & }} && echo $!"
-        )
+        cmd = f"cd {shlex.quote(self.context.remote_root)} && {{ nohup bash {script_file_name} 1>>{output_name} 2>>{output_name} & }} && echo $!"
+        ret, stdin, stdout, stderr = self.context.block_call(cmd)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             raise RuntimeError(
-                "status command fails to execute\nerror message:%s\nreturn code %d\n"
-                % (err_str, ret)
+                f"status command {cmd} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
             )
         job_id = int(stdout.read().decode("utf-8").strip())
         self.context.write_file(job_id_name, str(job_id))
@@ -73,14 +71,15 @@ class Shell(Machine):
             return JobStatus.unsubmitted
 
         # mark defunct process as terminated
-        ret, stdin, stdout, stderr = self.context.block_call(
+        cmd = (
+            r"""command -v ps >/dev/null 2>&1 || { echo >&2 "I require ps but it's not installed. Aborting."; exit 1; };"""
             f"if ps -p {job_id} > /dev/null && ! (ps -o command -p {job_id} | grep defunct >/dev/null) ; then echo 1; fi"
         )
+        ret, stdin, stdout, stderr = self.context.block_call(cmd)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             raise RuntimeError(
-                "status command fails to execute\nerror message:%s\nreturn code %d\n"
-                % (err_str, ret)
+                f"status command {cmd} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
            )
 
         if_job_exists = bool(stdout.read().decode("utf-8").strip())
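The submit line leans on a small shell idiom: the script is detached with `nohup ... &`, and `echo $!` prints the PID of that background process, which Shell then stores as the job id. A standalone sketch of the idiom (the `sleep 30` payload stands in for the real job script):

import subprocess

proc = subprocess.run(
    "{ nohup sleep 30 >/dev/null 2>&1 & } && echo $!",
    shell=True,
    capture_output=True,
    text=True,
)
pid = int(proc.stdout.strip())  # PID of the detached process
print(pid)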
dpdispatcher/machines/slurm.py
CHANGED
@@ -83,13 +83,12 @@ class Slurm(Machine):
         script_run_file_name = f"{job.script_file_name}.run"
         self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
         # self.context.write_file(fname=os.path.join(self.context.submission.work_base, script_file_name), write_str=script_str)
-        ret, stdin, stdout, stderr = self.context.block_call(
-            "cd {} && {} {}".format(
-                shlex.quote(self.context.remote_root),
-                "sbatch",
-                shlex.quote(script_file_name),
-            )
+        command = "cd {} && {} {}".format(
+            shlex.quote(self.context.remote_root),
+            "sbatch",
+            shlex.quote(script_file_name),
         )
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             if (
@@ -98,8 +97,7 @@ class Slurm(Machine):
         ):
             # server network error, retry 3 times
             raise RetrySignal(
-                "Get error code %d in submitting with job: %s . message: %s"
-                % (ret, job.job_hash, err_str)
+                f"Get error code {ret} in submitting with job: {job.job_hash} . message: {err_str}"
             )
         elif (
             "Job violates accounting/QOS policy" in err_str
@@ -110,8 +108,7 @@ class Slurm(Machine):
             # job number exceeds, skip the submitting
             return ""
         raise RuntimeError(
-            "command fails to execute\nerror message:%s\nreturn code %d\n"
-            % (err_str, ret)
+            f"command {command} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
         )
         subret = stdout.readlines()
         # --parsable
@@ -129,9 +126,8 @@ class Slurm(Machine):
         job_id = job.job_id
         if job_id == "":
             return JobStatus.unsubmitted
-        ret, stdin, stdout, stderr = self.context.block_call(
-            'squeue -o "%.18i %.2t" -j ' + job_id
-        )
+        command = 'squeue -o "%.18i %.2t" -j ' + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             if "Invalid job id specified" in err_str:
@@ -147,13 +143,11 @@ class Slurm(Machine):
         ):
             # retry 3 times
             raise RetrySignal(
-                "Get error code %d in checking status with job: %s . message: %s"
-                % (ret, job.job_hash, err_str)
+                f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
             )
         raise RuntimeError(
-            "status command squeue fails to execute."
-            "job_id:%s \n error message:%s\n return code %d\n"
-            % (job_id, err_str, ret)
+            f"status command {command} fails to execute."
+            f"job_id:{job_id} \n error message:{err_str}\n return code {ret}\n"
         )
         status_line = stdout.read().decode("utf-8").split("\n")[-2]
         status_word = status_line.split()[-1]
@@ -257,7 +251,7 @@ class SlurmJobArray(Slurm):
             return super().gen_script_header(job) + "\n#SBATCH --array={}".format(
                 ",".join(map(str, job_array))
            )
-        return super().gen_script_header(job) + "\n#SBATCH --array=0-%…
+        return super().gen_script_header(job) + "\n#SBATCH --array=0-%s" % (
             math.ceil(len(job.job_task_list) / slurm_job_size) - 1
         )
@@ -319,9 +313,8 @@ class SlurmJobArray(Slurm):
         job_id = job.job_id
         if job_id == "":
             return JobStatus.unsubmitted
-        ret, stdin, stdout, stderr = self.context.block_call(
-            'squeue -h -o "%.18i %.2t" -j ' + job_id
-        )
+        command = 'squeue -h -o "%.18i %.2t" -j ' + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             if "Invalid job id specified" in err_str:
@@ -336,13 +329,11 @@ class SlurmJobArray(Slurm):
         ):
             # retry 3 times
             raise RetrySignal(
-                "Get error code %d in checking status with job: %s . message: %s"
-                % (ret, job.job_hash, err_str)
+                f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
             )
         raise RuntimeError(
-            "status command squeue fails to execute."
-            "job_id:%s \n error message:%s\n return code %d\n"
-            % (job_id, err_str, ret)
+            f"status command {command} fails to execute."
+            f"job_id:{job_id} \n error message:{err_str}\n return code {ret}\n"
         )
         status_lines = stdout.read().decode("utf-8").split("\n")[:-1]
         status = []
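For SlurmJobArray, the array upper bound is simply ceil(n_tasks / slurm_job_size) - 1. A quick check of the arithmetic (the task count and group size here are made up):

import math

n_tasks, slurm_job_size = 10, 3
upper = math.ceil(n_tasks / slurm_job_size) - 1
print(f"#SBATCH --array=0-{upper}")  # -> #SBATCH --array=0-3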
dpdispatcher/submission.py
CHANGED
@@ -55,7 +55,6 @@ class Submission:
         *,
         task_list=[],
     ):
-        # self.submission_list = submission_list
         self.local_root = None
         self.work_base = work_base
         self._abs_work_base = os.path.abspath(work_base)
@@ -324,8 +323,7 @@ class Submission:
         kwargs = {**{"clean": False}, **kwargs}
         if kwargs["clean"]:
             dlog.warning(
-                "Using async submission with `clean=True`, "
-                "job may fail in queue system"
+                "Using async submission with `clean=True`, job may fail in queue system"
             )
         loop = asyncio.get_event_loop()
         wrapped_submission = functools.partial(self.run_submission, **kwargs)
@@ -515,12 +513,9 @@ class Submission:
     def submission_from_json(cls, json_file_name="submission.json"):
         with open(json_file_name) as f:
             submission_dict = json.load(f)
-        # submission_dict = machine.context.read_file(json_file_name)
         submission = cls.deserialize(submission_dict=submission_dict, machine=None)
         return submission
 
-    # def check_if_recover()
-
     def try_recover_from_json(self):
         submission_file_name = f"{self.submission_hash}.json"
         if_recover = self.machine.context.check_file_exists(submission_file_name)
@@ -545,7 +540,6 @@ class Submission:
                 f"machine.context.remote_root:{self.machine.context.remote_root}; "
                 f"submission.work_base:{submission.work_base};"
             )
-            # self = submission.bind_machine(machine=self.machine)
         else:
             print(self.serialize())
             print(submission.serialize())
@@ -759,7 +753,6 @@ class Job:
         self.fail_count = 0
         self.job_uuid = uuid.uuid4()
 
-        # self.job_hash = self.get_hash()
         self.job_hash = self.get_hash()
         self.script_file_name = self.job_hash + ".sub"
 
@@ -1122,9 +1115,9 @@ class Resources:
 
     @staticmethod
     def arginfo(detail_kwargs=True):
-        doc_number_node = "The number of node need for each `job`"
-        doc_cpu_per_node = "cpu numbers of each node assigned to each job."
-        doc_gpu_per_node = "gpu numbers of each node assigned to each job."
+        doc_number_node = "The number of nodes required for each `job`."
+        doc_cpu_per_node = "CPU numbers of each node assigned to each job."
+        doc_gpu_per_node = "GPU numbers of each node assigned to each job."
         doc_queue_name = "The queue name of batch job scheduler system."
         doc_group_size = "The number of `tasks` in a `job`. 0 means infinity."
         doc_custom_flags = "The extra lines pass to job submitting script header"
dpdispatcher/utils/hdfs_cli.py
CHANGED
@@ -28,7 +28,7 @@ class HDFS:
             )
         except Exception as e:
             raise RuntimeError(
-                f"Cannot check existence of hdfs uri[{uri}] " f"with cmd[{cmd}]"
+                f"Cannot check existence of hdfs uri[{uri}] with cmd[{cmd}]"
             ) from e
 
     @staticmethod
@@ -48,9 +48,7 @@ class HDFS:
                 f"with cmd[{cmd}]; ret[{ret}] output[{out}] stderr[{err}]"
             )
         except Exception as e:
-            raise RuntimeError(
-                f"Cannot remove hdfs uri[{uri}] " f"with cmd[{cmd}]"
-            ) from e
+            raise RuntimeError(f"Cannot remove hdfs uri[{uri}] with cmd[{cmd}]") from e
 
     @staticmethod
     def mkdir(uri):
@@ -70,7 +68,7 @@ class HDFS:
             )
         except Exception as e:
             raise RuntimeError(
-                f"Cannot mkdir of hdfs uri[{uri}] " f"with cmd[{cmd}]"
+                f"Cannot mkdir of hdfs uri[{uri}] with cmd[{cmd}]"
             ) from e
 
     @staticmethod
@@ -80,7 +78,7 @@ class HDFS:
         """
         # Make sure local_path is accessible
         if not os.path.exists(local_path) or not os.access(local_path, os.R_OK):
-            raise RuntimeError(f"try to access local_path[{local_path}] " "but failed")
+            raise RuntimeError(f"try to access local_path[{local_path}] but failed")
         cmd = f"hadoop fs -copyFromLocal -f {local_path} {to_uri}"
         try:
             ret, out, err = run_cmd_with_all_output(cmd)
@@ -132,9 +130,7 @@ class HDFS:
                 f"cmd [{cmd}] ret[{ret}] output[{out}] stderr[{err}]"
            )
         except Exception as e:
-            raise RuntimeError(
-                f"Cannot read text from uri[{uri}]" f"cmd [{cmd}]"
-            ) from e
+            raise RuntimeError(f"Cannot read text from uri[{uri}]cmd [{cmd}]") from e
 
     @staticmethod
     def move(from_uri, to_uri):
@@ -151,6 +147,5 @@ class HDFS:
             )
         except Exception as e:
             raise RuntimeError(
-                f"Cannot move from_uri[{from_uri}] to "
-                f"to_uri[{to_uri}] with cmd[{cmd}]"
+                f"Cannot move from_uri[{from_uri}] to to_uri[{to_uri}] with cmd[{cmd}]"
             ) from e
dpdispatcher/utils/utils.py
CHANGED

{dpdispatcher-0.6.6.dist-info → dpdispatcher-0.6.8.dist-info}/METADATA
CHANGED

@@ -1,9 +1,9 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: dpdispatcher
-Version: 0.6.6
+Version: 0.6.8
 Summary: Generate HPC scheduler systems jobs input scripts, submit these scripts to HPC systems, and poke until they finish
 Author: DeepModeling
-License: …
+License: GNU LESSER GENERAL PUBLIC LICENSE
                        Version 3, 29 June 2007
 
  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
@@ -186,32 +186,32 @@ Requires-Python: >=3.7
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: paramiko
-Requires-Dist: dargs >=0.4.1
+Requires-Dist: dargs>=0.4.1
 Requires-Dist: requests
-Requires-Dist: tqdm >=4.9.0
+Requires-Dist: tqdm>=4.9.0
+Requires-Dist: typing_extensions; python_version < "3.7"
 Requires-Dist: pyyaml
-Requires-Dist: tomli >=1.1.0 ; python_version < "3.11"
-Requires-Dist: typing-extensions ; python_version < "3.7"
-Provides-Extra: bohrium
-Requires-Dist: oss2 ; extra == 'bohrium'
-Requires-Dist: tqdm ; extra == 'bohrium'
-Requires-Dist: bohrium-sdk ; extra == 'bohrium'
-Provides-Extra: cloudserver
-Requires-Dist: oss2 ; extra == 'cloudserver'
-Requires-Dist: tqdm ; extra == 'cloudserver'
-Requires-Dist: bohrium-sdk ; extra == 'cloudserver'
+Requires-Dist: tomli>=1.1.0; python_version < "3.11"
 Provides-Extra: docs
-Requires-Dist: sphinx ; extra == 'docs'
-Requires-Dist: myst-parser ; extra == 'docs'
-Requires-Dist: sphinx-book-theme ; extra == 'docs'
-Requires-Dist: numpydoc ; extra == 'docs'
-Requires-Dist: deepmodeling-sphinx >=0.3.0 ; extra == 'docs'
-Requires-Dist: dargs >=0.3.1 ; extra == 'docs'
-Requires-Dist: sphinx-argparse <0.5.0 ; extra == 'docs'
+Requires-Dist: sphinx; extra == "docs"
+Requires-Dist: myst-parser; extra == "docs"
+Requires-Dist: sphinx-book-theme; extra == "docs"
+Requires-Dist: numpydoc; extra == "docs"
+Requires-Dist: deepmodeling-sphinx>=0.3.0; extra == "docs"
+Requires-Dist: dargs>=0.3.1; extra == "docs"
+Requires-Dist: sphinx-argparse<0.5.0; extra == "docs"
+Provides-Extra: cloudserver
+Requires-Dist: oss2; extra == "cloudserver"
+Requires-Dist: tqdm; extra == "cloudserver"
+Requires-Dist: bohrium-sdk; extra == "cloudserver"
+Provides-Extra: bohrium
+Requires-Dist: oss2; extra == "bohrium"
+Requires-Dist: tqdm; extra == "bohrium"
+Requires-Dist: bohrium-sdk; extra == "bohrium"
 Provides-Extra: gui
-Requires-Dist: dpgui ; extra == 'gui'
+Requires-Dist: dpgui; extra == "gui"
 Provides-Extra: test
-Requires-Dist: dpgui ; extra == 'test'
+Requires-Dist: dpgui; extra == "test"
 
 # DPDispatcher
 
@@ -221,7 +221,7 @@ Requires-Dist: dpgui ; extra == 'test'
 [](https://dpdispatcher.readthedocs.io/)
 
 DPDispatcher is a Python package used to generate HPC (High-Performance Computing) scheduler systems (Slurm/PBS/LSF/Bohrium) jobs input scripts, submit them to HPC systems, and poke until they finish.
-
+
 DPDispatcher will monitor (poke) until these jobs finish and download the results files (if these jobs are running on remote systems connected by SSH).
 
 For more information, check the [documentation](https://dpdispatcher.readthedocs.io/).
|
@@ -1,49 +1,49 @@
|
|
|
1
1
|
dpdispatcher/__init__.py,sha256=CLZP_N5CTp14ujWCykEHuJjoIfKR6CwrclXhjWUgNoE,517
|
|
2
2
|
dpdispatcher/__main__.py,sha256=BFhG-mSBzVZUEezQJqXWZnt2WsnhAHT_zpT8Y6gpOz0,116
|
|
3
|
-
dpdispatcher/_version.py,sha256=
|
|
3
|
+
dpdispatcher/_version.py,sha256=jFQ70HGO-FOLhIy6SpujRlrxJLVWsy5CAtewPppbaOs,411
|
|
4
4
|
dpdispatcher/arginfo.py,sha256=pNaxYIE6ahBidpR7OCKZdw8iGt003uTXGSlVzwiuvRg,188
|
|
5
|
-
dpdispatcher/base_context.py,sha256=
|
|
5
|
+
dpdispatcher/base_context.py,sha256=W4eWDWVzYeL6EuEkivmJp-_h_B2mV9PtRWc09l1_Qzc,5242
|
|
6
6
|
dpdispatcher/dlog.py,sha256=QJKAwB6gV3Zb6zQUL9dZ_uIoTIEy9Z7ecmVQ-8WNmD8,1081
|
|
7
7
|
dpdispatcher/dpdisp.py,sha256=jhuTmwPY7KBF4WukaQomEwZcfYoISaMbKwuxdDGSluc,4206
|
|
8
|
-
dpdispatcher/machine.py,sha256=
|
|
8
|
+
dpdispatcher/machine.py,sha256=k53ycs_v7xrl4D93URc5ht0shoO9NPrVl0rYr4v5OiU,16696
|
|
9
9
|
dpdispatcher/run.py,sha256=tFHbJAioXXpgHTE5bhRRAuc8w7cX1ET9SBbiAg3Rw-I,5382
|
|
10
|
-
dpdispatcher/submission.py,sha256=
|
|
10
|
+
dpdispatcher/submission.py,sha256=zLzdKJkMXhvaicD2el33NxDHP_9LL29HBombxR1l-Sw,48086
|
|
11
11
|
dpdispatcher/contexts/__init__.py,sha256=jlvcIppmUnS39yBlkZEDvIQFV-j_BR75ZTbZALF_RB0,336
|
|
12
|
-
dpdispatcher/contexts/dp_cloud_server_context.py,sha256=
|
|
13
|
-
dpdispatcher/contexts/hdfs_context.py,sha256=
|
|
14
|
-
dpdispatcher/contexts/lazy_local_context.py,sha256=
|
|
15
|
-
dpdispatcher/contexts/local_context.py,sha256=
|
|
16
|
-
dpdispatcher/contexts/openapi_context.py,sha256=
|
|
17
|
-
dpdispatcher/contexts/ssh_context.py,sha256=
|
|
12
|
+
dpdispatcher/contexts/dp_cloud_server_context.py,sha256=PGRMef3q2hfK-o5dNIWWvzPca2NK1HrWEgungM4L9Go,12420
|
|
13
|
+
dpdispatcher/contexts/hdfs_context.py,sha256=mYQzXMZ4A9EjjWBAH3Ba6HOErUhMMwCsKxOjpd5R57Y,9105
|
|
14
|
+
dpdispatcher/contexts/lazy_local_context.py,sha256=FAClbLD2F4LizUqFzMOg3t0Z6NLeTDLJy7NkRcDELFs,5070
|
|
15
|
+
dpdispatcher/contexts/local_context.py,sha256=VbaSXGAc_EDMT0K5WV_flBF0bX87ntrwO_hq_Bkcb04,14590
|
|
16
|
+
dpdispatcher/contexts/openapi_context.py,sha256=M7L9axpjOrzvdTpLMDuEzZqe4ZuKIxjS0bzZUv8W2IQ,9674
|
|
17
|
+
dpdispatcher/contexts/ssh_context.py,sha256=qaj8h2TdY1i-YYdDstUBs9IJaLwzytwnQkdntMEZ7vg,37664
|
|
18
18
|
dpdispatcher/dpcloudserver/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
19
19
|
dpdispatcher/dpcloudserver/client.py,sha256=k1niKjG6zFnMtHn_UuCjYoOcMju3o3PV-GdyVLr5-KM,165
|
|
20
20
|
dpdispatcher/entrypoints/__init__.py,sha256=exKSFT3j2oCerGwtI8WbHQK-D0K-CyifocRji1xntT4,20
|
|
21
21
|
dpdispatcher/entrypoints/gui.py,sha256=29lMXqbmSRbLj4rfBv7Jnw89NLU9syTB88IUP6IRJsU,830
|
|
22
22
|
dpdispatcher/entrypoints/run.py,sha256=tRkHfeAktV6gF31yb2MVOSTlpNGZFw3N0jHBmM1YfIg,175
|
|
23
23
|
dpdispatcher/entrypoints/submission.py,sha256=ikVwIZAQL0SsYO5xaMIdKXgO6qtc05w1vqmvtG7Nk5M,3401
|
|
24
|
-
dpdispatcher/machines/JH_UniScheduler.py,sha256=
|
|
24
|
+
dpdispatcher/machines/JH_UniScheduler.py,sha256=ZeUZXqyGrN5Zec4gwpwH5r6FJXaJLRUJMQWDCP7X3Nk,5756
|
|
25
25
|
dpdispatcher/machines/__init__.py,sha256=tOQuPUlW1Ab4qcC0oSAIyDjZA_WyE67h_EIxPCWGhys,336
|
|
26
|
-
dpdispatcher/machines/distributed_shell.py,sha256=
|
|
26
|
+
dpdispatcher/machines/distributed_shell.py,sha256=c0-lGeGz_M-PY2gPciT-uYZLQht5XTMaxJSNxkbMffc,7489
|
|
27
27
|
dpdispatcher/machines/dp_cloud_server.py,sha256=SR69gsFb2BvOQCW1QnWfP3cQvu_qHLJNsycp5wzosJU,11706
|
|
28
28
|
dpdispatcher/machines/fugaku.py,sha256=oY2hD2ldL2dztwtJ9WNisdsfPnaX-5yTRXewIT9r60I,4314
|
|
29
|
-
dpdispatcher/machines/lsf.py,sha256=
|
|
29
|
+
dpdispatcher/machines/lsf.py,sha256=xGDq8OLAk83E9EjK_3-QtEOyahvBGspWbxT__7mnSTw,7896
|
|
30
30
|
dpdispatcher/machines/openapi.py,sha256=Gzzbo8YOAybXGTrgMutexErcaEi3ts7uTUNvOhThFS8,8858
|
|
31
|
-
dpdispatcher/machines/pbs.py,sha256=
|
|
32
|
-
dpdispatcher/machines/shell.py,sha256=
|
|
33
|
-
dpdispatcher/machines/slurm.py,sha256=
|
|
31
|
+
dpdispatcher/machines/pbs.py,sha256=gUoj3OGQbZRBK4P-WXlhrxlQqTeUi9X8JGLOkAB__wE,12669
|
|
32
|
+
dpdispatcher/machines/shell.py,sha256=EeYnRCowXdzO3Nh25Yh_t5xeM6frq4uChk4GVx7OjH8,4797
|
|
33
|
+
dpdispatcher/machines/slurm.py,sha256=oyMX9iZQpVRR951zwz0wRNfl3_uJZzdtzxMbTJotlQU,15402
|
|
34
34
|
dpdispatcher/utils/__init__.py,sha256=fwvwkMf7DFNQkNBiIce8Y8gRA6FhICwKjkKiXu_BEJg,13
|
|
35
|
-
dpdispatcher/utils/hdfs_cli.py,sha256=
|
|
35
|
+
dpdispatcher/utils/hdfs_cli.py,sha256=a1a9PJAzt3wsTcdaSw_oD1vcNw59pMooxpAHjYOaaGA,5209
|
|
36
36
|
dpdispatcher/utils/job_status.py,sha256=Eszs4TPLfszCuf6zLaFonf25feXDUguF28spYOjJpQE,233
|
|
37
37
|
dpdispatcher/utils/record.py,sha256=c8jdPmCuLzRmFo_jOjR0j9zFR1EWX3NSHVuPEIYCycg,2147
|
|
38
|
-
dpdispatcher/utils/utils.py,sha256=
|
|
38
|
+
dpdispatcher/utils/utils.py,sha256=Wo-8tGO05e2KkRyLXoIg3UlxzkuM-x1phRrTA1Hh7Ko,5328
|
|
39
39
|
dpdispatcher/utils/dpcloudserver/__init__.py,sha256=FnX9HH-2dXADluNfucg98JPMfruMoBpN9ER9lZkVQvQ,49
|
|
40
40
|
dpdispatcher/utils/dpcloudserver/client.py,sha256=CLfXswvzI4inDrW2bYkfMQ6gQJFcZOgLhiXBz_EI17M,12029
|
|
41
41
|
dpdispatcher/utils/dpcloudserver/config.py,sha256=NteQzf1OeEkz2UbkXHHQ0B72cUu23zLVzpM9Yh4v1Cc,559
|
|
42
42
|
dpdispatcher/utils/dpcloudserver/retcode.py,sha256=1qAF8gFZx55u2sO8KbtYSIIrjcO-IGufEUlwbkSfC1g,721
|
|
43
43
|
dpdispatcher/utils/dpcloudserver/zip_file.py,sha256=f9WrlktwHW0YipaWg5Y0kxjMZlhD1cJYa6EUpvu4Cro,2611
|
|
44
|
-
dpdispatcher-0.6.
|
|
45
|
-
dpdispatcher-0.6.
|
|
46
|
-
dpdispatcher-0.6.
|
|
47
|
-
dpdispatcher-0.6.
|
|
48
|
-
dpdispatcher-0.6.
|
|
49
|
-
dpdispatcher-0.6.
|
|
44
|
+
dpdispatcher-0.6.8.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
|
|
45
|
+
dpdispatcher-0.6.8.dist-info/METADATA,sha256=mneS5eFsvLVeWxt9dGHBLyDQv0NSdIdC9x00TqFHhGI,12811
|
|
46
|
+
dpdispatcher-0.6.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
47
|
+
dpdispatcher-0.6.8.dist-info/entry_points.txt,sha256=NRHUV0IU_u7_XtcmmEDnVzAcUmurhiEAGwENckrajo4,233
|
|
48
|
+
dpdispatcher-0.6.8.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
|
|
49
|
+
dpdispatcher-0.6.8.dist-info/RECORD,,
|
|
{dpdispatcher-0.6.6.dist-info → dpdispatcher-0.6.8.dist-info}/LICENSE
File without changes

{dpdispatcher-0.6.6.dist-info → dpdispatcher-0.6.8.dist-info}/entry_points.txt
File without changes

{dpdispatcher-0.6.6.dist-info → dpdispatcher-0.6.8.dist-info}/top_level.txt
File without changes