dpdispatcher 0.6.5__py3-none-any.whl → 0.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dpdispatcher might be problematic.
- dpdispatcher/_version.py +2 -2
- dpdispatcher/base_context.py +61 -1
- dpdispatcher/contexts/dp_cloud_server_context.py +5 -0
- dpdispatcher/contexts/hdfs_context.py +5 -0
- dpdispatcher/contexts/lazy_local_context.py +0 -17
- dpdispatcher/contexts/local_context.py +57 -31
- dpdispatcher/contexts/openapi_context.py +5 -0
- dpdispatcher/contexts/ssh_context.py +67 -77
- dpdispatcher/machine.py +13 -2
- dpdispatcher/machines/JH_UniScheduler.py +1 -1
- dpdispatcher/machines/distributed_shell.py +2 -2
- dpdispatcher/machines/lsf.py +1 -1
- dpdispatcher/machines/pbs.py +83 -19
- dpdispatcher/machines/shell.py +9 -8
- dpdispatcher/machines/slurm.py +18 -21
- dpdispatcher/submission.py +3 -3
- {dpdispatcher-0.6.5.dist-info → dpdispatcher-0.6.7.dist-info}/METADATA +2 -2
- {dpdispatcher-0.6.5.dist-info → dpdispatcher-0.6.7.dist-info}/RECORD +22 -22
- {dpdispatcher-0.6.5.dist-info → dpdispatcher-0.6.7.dist-info}/WHEEL +1 -1
- {dpdispatcher-0.6.5.dist-info → dpdispatcher-0.6.7.dist-info}/LICENSE +0 -0
- {dpdispatcher-0.6.5.dist-info → dpdispatcher-0.6.7.dist-info}/entry_points.txt +0 -0
- {dpdispatcher-0.6.5.dist-info → dpdispatcher-0.6.7.dist-info}/top_level.txt +0 -0
dpdispatcher/_version.py
CHANGED
dpdispatcher/base_context.py
CHANGED
@@ -1,5 +1,5 @@
 from abc import ABCMeta, abstractmethod
-from typing import List, Tuple
+from typing import Any, List, Tuple

 from dargs import Argument

@@ -73,6 +73,66 @@ class BaseContext(metaclass=ABCMeta):
     def check_finish(self, proc):
         raise NotImplementedError("abstract method")

+    def block_checkcall(self, cmd, asynchronously=False) -> Tuple[Any, Any, Any]:
+        """Run command with arguments. Wait for command to complete.
+
+        Parameters
+        ----------
+        cmd : str
+            The command to run.
+        asynchronously : bool, optional, default=False
+            Run command asynchronously. If True, `nohup` will be used to run the command.
+
+        Returns
+        -------
+        stdin
+            standard inout
+        stdout
+            standard output
+        stderr
+            standard error
+
+        Raises
+        ------
+        RuntimeError
+            when the return code is not zero
+        """
+        if asynchronously:
+            cmd = f"nohup {cmd} >/dev/null &"
+        exit_status, stdin, stdout, stderr = self.block_call(cmd)
+        if exit_status != 0:
+            raise RuntimeError(
+                "Get error code %d in calling %s with job: %s . message: %s"
+                % (
+                    exit_status,
+                    cmd,
+                    self.submission.submission_hash,
+                    stderr.read().decode("utf-8"),
+                )
+            )
+        return stdin, stdout, stderr
+
+    @abstractmethod
+    def block_call(self, cmd) -> Tuple[int, Any, Any, Any]:
+        """Run command with arguments. Wait for command to complete.
+
+        Parameters
+        ----------
+        cmd : str
+            The command to run.
+
+        Returns
+        -------
+        exit_status
+            exit code
+        stdin
+            standard inout
+        stdout
+            standard output
+        stderr
+            standard error
+        """
+
     @classmethod
     def machine_arginfo(cls) -> Argument:
         """Generate the machine arginfo.
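The upshot of this hunk: `block_checkcall` is now implemented once on `BaseContext` on top of a new abstract `block_call`, so each context only has to supply the four-tuple primitive. A minimal sketch of a conforming subclass follows; the `EchoContext` name and the `subprocess` wiring are illustrative, not part of the package, and the other abstract methods of `BaseContext` are omitted.

import io
import subprocess as sp
from typing import Any, Tuple

from dpdispatcher.base_context import BaseContext


class EchoContext(BaseContext):
    # Illustrative only: upload/download/check_finish etc. are omitted.
    def block_call(self, cmd) -> Tuple[int, Any, Any, Any]:
        # Contract: return (exit_status, stdin, stdout, stderr) with file-like
        # stdout/stderr, so that the inherited block_checkcall can call
        # stderr.read().decode("utf-8") when the exit status is nonzero.
        proc = sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
        o, e = proc.communicate()
        return proc.returncode, None, io.BytesIO(o), io.BytesIO(e)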
dpdispatcher/contexts/dp_cloud_server_context.py
CHANGED

@@ -335,6 +335,11 @@ class BohriumContext(BaseContext):
             )
         ]

+    def block_call(self, cmd):
+        raise RuntimeError(
+            "Unsupported method. You may use an unsupported combination of the machine and the context."
+        )
+

 DpCloudServerContext = BohriumContext
 LebesgueContext = BohriumContext
dpdispatcher/contexts/hdfs_context.py
CHANGED

@@ -244,3 +244,8 @@ class HDFSContext(BaseContext):

     def read_file(self, fname):
         return HDFS.read_hdfs_file(os.path.join(self.remote_root, fname))
+
+    def block_call(self, cmd):
+        raise RuntimeError(
+            "Unsupported method. You may use an unsupported combination of the machine and the context."
+        )
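The Bohrium, HDFS, and OpenAPI contexts cannot execute arbitrary shell commands, so they satisfy the new abstract `block_call` by raising immediately; a mismatched machine/context pairing now fails with a clear message instead of an `AttributeError`. A sketch of what a caller sees, assuming `ctx` is an already-constructed `HDFSContext`:

# Sketch: the stub turns an unsupported machine/context pairing into a loud error.
try:
    ctx.block_call("ls")
except RuntimeError as err:
    print(err)  # Unsupported method. You may use an unsupported combination ...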
dpdispatcher/contexts/lazy_local_context.py
CHANGED

@@ -112,23 +112,6 @@ class LazyLocalContext(BaseContext):
         # else:
         #     raise RuntimeError('do not find download file ' + fname)

-    def block_checkcall(self, cmd):
-        # script_dir = os.path.join(self.local_root, self.submission.work_base)
-        # os.chdir(script_dir)
-        proc = sp.Popen(
-            cmd, cwd=self.local_root, shell=True, stdout=sp.PIPE, stderr=sp.PIPE
-        )
-        o, e = proc.communicate()
-        stdout = SPRetObj(o)
-        stderr = SPRetObj(e)
-        code = proc.returncode
-        if code != 0:
-            raise RuntimeError(
-                "Get error code %d in locally calling %s with job: %s ",
-                (code, cmd, self.submission.submission_hash),
-            )
-        return None, stdout, stderr
-
     def block_call(self, cmd):
         proc = sp.Popen(
             cmd, cwd=self.local_root, shell=True, stdout=sp.PIPE, stderr=sp.PIPE
dpdispatcher/contexts/local_context.py
CHANGED

@@ -3,6 +3,9 @@ import shutil
 import subprocess as sp
 from glob import glob
 from subprocess import TimeoutExpired
+from typing import List
+
+from dargs import Argument

 from dpdispatcher.base_context import BaseContext
 from dpdispatcher.dlog import dlog
@@ -60,6 +63,7 @@ class LocalContext(BaseContext):
         self.temp_local_root = os.path.abspath(local_root)
         self.temp_remote_root = os.path.abspath(remote_root)
         self.remote_profile = remote_profile
+        self.symlink = remote_profile.get("symlink", True)

     @classmethod
     def load_from_dict(cls, context_dict):
@@ -83,6 +87,25 @@ class LocalContext(BaseContext):
             self.temp_remote_root, submission.submission_hash
         )

+    def _copy_from_local_to_remote(self, local_path, remote_path):
+        if not os.path.exists(local_path):
+            raise FileNotFoundError(
+                f"cannot find uploaded file {os.path.join(local_path)}"
+            )
+        if os.path.exists(remote_path):
+            os.remove(remote_path)
+        _check_file_path(remote_path)
+
+        if self.symlink:
+            # ensure the file exist
+            os.symlink(local_path, remote_path)
+        elif os.path.isfile(local_path):
+            shutil.copyfile(local_path, remote_path)
+        elif os.path.isdir(local_path):
+            shutil.copytree(local_path, remote_path)
+        else:
+            raise ValueError(f"Unknown file type: {local_path}")
+
     def upload(self, submission):
         os.makedirs(self.remote_root, exist_ok=True)
         for ii in submission.belonging_tasks:
@@ -103,14 +126,9 @@ class LocalContext(BaseContext):
             file_list.extend(rel_file_list)

             for jj in file_list:
-                …
-                )
-                if os.path.exists(os.path.join(remote_job, jj)):
-                    os.remove(os.path.join(remote_job, jj))
-                _check_file_path(os.path.join(remote_job, jj))
-                os.symlink(os.path.join(local_job, jj), os.path.join(remote_job, jj))
+                self._copy_from_local_to_remote(
+                    os.path.join(local_job, jj), os.path.join(remote_job, jj)
+                )

         local_job = self.local_root
         remote_job = self.remote_root
@@ -128,14 +146,9 @@ class LocalContext(BaseContext):
             file_list.extend(rel_file_list)

             for jj in file_list:
-                …
-                )
-                if os.path.exists(os.path.join(remote_job, jj)):
-                    os.remove(os.path.join(remote_job, jj))
-                _check_file_path(os.path.join(remote_job, jj))
-                os.symlink(os.path.join(local_job, jj), os.path.join(remote_job, jj))
+                self._copy_from_local_to_remote(
+                    os.path.join(local_job, jj), os.path.join(remote_job, jj)
+                )

     def download(
         self, submission, check_exists=False, mark_failure=True, back_error=False
@@ -288,21 +301,6 @@ class LocalContext(BaseContext):
             # no nothing in the case of linked files
             pass

-    def block_checkcall(self, cmd):
-        proc = sp.Popen(
-            cmd, cwd=self.remote_root, shell=True, stdout=sp.PIPE, stderr=sp.PIPE
-        )
-        o, e = proc.communicate()
-        stdout = SPRetObj(o)
-        stderr = SPRetObj(e)
-        code = proc.returncode
-        if code != 0:
-            raise RuntimeError(
-                f"Get error code {code} in locally calling {cmd} with job: {self.submission.submission_hash}"
-                f"\nStandard error: {stderr}"
-            )
-        return None, stdout, stderr
-
     def block_call(self, cmd):
         proc = sp.Popen(
             cmd, cwd=self.remote_root, shell=True, stdout=sp.PIPE, stderr=sp.PIPE
@@ -351,3 +349,31 @@ class LocalContext(BaseContext):
         stdout = None
         stderr = None
         return ret, stdout, stderr
+
+    @classmethod
+    def machine_subfields(cls) -> List[Argument]:
+        """Generate the machine subfields.
+
+        Returns
+        -------
+        list[Argument]
+            machine subfields
+        """
+        doc_remote_profile = "The information used to maintain the local machine."
+        return [
+            Argument(
+                "remote_profile",
+                dict,
+                optional=True,
+                doc=doc_remote_profile,
+                sub_fields=[
+                    Argument(
+                        "symlink",
+                        bool,
+                        optional=True,
+                        default=True,
+                        doc="Whether to use symbolic links to replace copy. This option should be turned off if the local directory is not accessible on the Batch system.",
+                    ),
+                ],
+            )
+        ]
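The new `symlink` flag is read once in the constructor and consulted by `_copy_from_local_to_remote`, so a single switch decides between linking and copying for every uploaded file. A machine dict that turns it off might look like this sketch; the paths are placeholders:

# Hypothetical machine dict: copy files instead of symlinking, for the case the
# doc string above describes (the batch system cannot see the local directory).
machine_dict = {
    "batch_type": "Slurm",
    "context_type": "LocalContext",
    "local_root": "./work",
    "remote_root": "/tmp/dpdispatcher_remote",
    "remote_profile": {"symlink": False},
}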
dpdispatcher/contexts/openapi_context.py
CHANGED

@@ -258,3 +258,8 @@ class OpenAPIContext(BaseContext):
         dir_to_be_removed = os.path.join(local_root, "backup")
         if os.path.exists(dir_to_be_removed):
             shutil.rmtree(dir_to_be_removed)
+
+    def block_call(self, cmd):
+        raise RuntimeError(
+            "Unsupported method. You may use an unsupported combination of the machine and the context."
+        )
dpdispatcher/contexts/ssh_context.py
CHANGED

@@ -44,6 +44,7 @@ class SSHSession:
         totp_secret=None,
         tar_compress=True,
         look_for_keys=True,
+        execute_command=None,
     ):
         self.hostname = hostname
         self.username = username
@@ -56,6 +57,7 @@ class SSHSession:
         self.ssh = None
         self.tar_compress = tar_compress
         self.look_for_keys = look_for_keys
+        self.execute_command = execute_command
         self._keyboard_interactive_auth = False
         self._setup_ssh()

@@ -237,6 +239,8 @@ class SSHSession:
         self.ssh._transport = ts  # type: ignore
         # reset sftp
         self._sftp = None
+        if self.execute_command is not None:
+            self.exec_command(self.execute_command)

     def inter_handler(self, title, instructions, prompt_list):
         """inter_handler: the callback for paramiko.transport.auth_interactive.
@@ -295,7 +299,11 @@ class SSHSession:
         assert self.ssh is not None
         try:
             return self.ssh.exec_command(cmd)
-        except (
+        except (
+            paramiko.ssh_exception.SSHException,
+            socket.timeout,
+            EOFError,
+        ) as e:
             # SSH session not active
             # retry for up to 3 times
             # ensure alive
@@ -334,6 +342,7 @@ class SSHSession:
         doc_look_for_keys = (
             "enable searching for discoverable private key files in ~/.ssh/"
         )
+        doc_execute_command = "execute command after ssh connection is established."
         ssh_remote_profile_args = [
             Argument("hostname", str, optional=False, doc=doc_hostname),
             Argument("username", str, optional=False, doc=doc_username),
@@ -355,10 +364,18 @@ class SSHSession:
             ),
             Argument("timeout", int, optional=True, default=10, doc=doc_timeout),
             Argument(
-                "totp_secret",
+                "totp_secret",
+                str,
+                optional=True,
+                default=None,
+                doc=doc_totp_secret,
             ),
             Argument(
-                "tar_compress",
+                "tar_compress",
+                bool,
+                optional=True,
+                default=True,
+                doc=doc_tar_compress,
             ),
             Argument(
                 "look_for_keys",
@@ -367,6 +384,13 @@ class SSHSession:
                 default=True,
                 doc=doc_look_for_keys,
             ),
+            Argument(
+                "execute_command",
+                str,
+                optional=True,
+                default=None,
+                doc=doc_execute_command,
+            ),
         ]
         ssh_remote_profile_format = Argument(
             "ssh_session", dict, ssh_remote_profile_args
@@ -603,7 +627,10 @@ class SSHContext(BaseContext):
             directory_list,
         )
         self._walk_directory(
-            submission.forward_common_files,
+            submission.forward_common_files,
+            self.local_root,
+            file_list,
+            directory_list,
         )

         # convert to relative path to local_root
@@ -621,9 +648,9 @@ class SSHContext(BaseContext):
             ).as_posix()
             sha256_list.append(f"{sha256} {jj_rel}")
         # write to remote
-        sha256_file =
-            self.remote_root, ".tmp.sha256." + str(uuid.uuid4())
-        )
+        sha256_file = pathlib.PurePath(
+            os.path.join(self.remote_root, ".tmp.sha256." + str(uuid.uuid4()))
+        ).as_posix()
         self.write_file(sha256_file, "\n".join(sha256_list))
         # check sha256
         # `:` means pass: https://stackoverflow.com/a/2421592/9567349
@@ -736,43 +763,9 @@ class SSHContext(BaseContext):
         file_list.extend(submission.backward_common_files)
         if len(file_list) > 0:
             self._get_files(
-                file_list,
-                …
-            …
-    def block_checkcall(self, cmd, asynchronously=False, stderr_whitelist=None):
-        """Run command with arguments. Wait for command to complete. If the return code
-        was zero then return, otherwise raise RuntimeError.
-
-        Parameters
-        ----------
-        cmd : str
-            The command to run.
-        asynchronously : bool, optional, default=False
-            Run command asynchronously. If True, `nohup` will be used to run the command.
-        stderr_whitelist : list of str, optional, default=None
-            If not None, the stderr will be checked against the whitelist. If the stderr
-            contains any of the strings in the whitelist, the command will be considered
-            successful.
-        """
-        assert self.remote_root is not None
-        self.ssh_session.ensure_alive()
-        if asynchronously:
-            cmd = f"nohup {cmd} >/dev/null &"
-        stdin, stdout, stderr = self.ssh_session.exec_command(
-            (f"cd {shlex.quote(self.remote_root)} ;") + cmd
-        )
-        exit_status = stdout.channel.recv_exit_status()
-        if exit_status != 0:
-            raise RuntimeError(
-                "Get error code %d in calling %s through ssh with job: %s . message: %s"
-                % (
-                    exit_status,
-                    cmd,
-                    self.submission.submission_hash,
-                    stderr.read().decode("utf-8"),
-                )
+                file_list,
+                tar_compress=self.remote_profile.get("tar_compress", None),
             )
-        return stdin, stdout, stderr

     def block_call(self, cmd):
         assert self.remote_root is not None
@@ -793,18 +786,23 @@ class SSHContext(BaseContext):
         fname = pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix()
         # to prevent old file from being overwritten but cancelled, create a temporary file first
         # when it is fully written, rename it to the original file name
-        …
-        …
+        temp_fname = fname + "_tmp"
+        try:
+            with self.sftp.open(temp_fname, "w") as fp:
+                fp.write(write_str)
+            # Rename the temporary file
+            self.block_checkcall(f"mv {shlex.quote(temp_fname)} {shlex.quote(fname)}")
         # sftp.rename may throw OSError
-        …
-        "
-        …
+        except OSError as e:
+            dlog.exception(f"Error writing to file {fname}")
+            raise e

     def read_file(self, fname):
         assert self.remote_root is not None
         self.ssh_session.ensure_alive()
         with self.sftp.open(
-            pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix(),
+            pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix(),
+            "r",
         ) as fp:
             ret = fp.read().decode("utf-8")
         return ret
@@ -945,36 +943,28 @@ class SSHContext(BaseContext):
         per_nfile = 100
         ntar = len(files) // per_nfile + 1
         if ntar <= 1:
-            …
-            …
-                "tar {} {} {}".format(
-                    tar_command,
-                    shlex.quote(of),
-                    " ".join([shlex.quote(file) for file in files]),
-                )
-            )
-        except RuntimeError as e:
-            if "No such file or directory" in str(e):
-                raise FileNotFoundError(
-                    "Any of the backward files does not exist in the remote directory."
-                ) from e
-            raise e
+            file_list = " ".join([shlex.quote(file) for file in files])
+            tar_cmd = f"tar {tar_command} {shlex.quote(of)} {file_list}"
         else:
-            file_list_file =
-                self.remote_root, ".
-            )
+            file_list_file = pathlib.PurePath(
+                os.path.join(self.remote_root, f".tmp_tar_{uuid.uuid4()}")
+            ).as_posix()
             self.write_file(file_list_file, "\n".join(files))
-            …
-            raise
-            …
+            tar_cmd = (
+                f"tar {tar_command} {shlex.quote(of)} -T {shlex.quote(file_list_file)}"
+            )
+
+        # Execute the tar command remotely
+        try:
+            self.block_checkcall(tar_cmd)
+        except RuntimeError as e:
+            if "No such file or directory" in str(e):
+                raise FileNotFoundError(
+                    "Backward files do not exist in the remote directory."
+                ) from e
+            raise e
+
+        # Transfer the archive from remote to local
         from_f = pathlib.PurePath(os.path.join(self.remote_root, of)).as_posix()
         to_f = pathlib.PurePath(os.path.join(self.local_root, of)).as_posix()
         if os.path.isfile(to_f):
dpdispatcher/machine.py
CHANGED
@@ -161,6 +161,9 @@ class Machine(metaclass=ABCMeta):
             machine_dict["remote_profile"] = self.context.remote_profile
         else:
             machine_dict["remote_profile"] = {}
+        # normalize the dict
+        base = self.arginfo()
+        machine_dict = base.normalize_value(machine_dict, trim_pattern="_*")
         return machine_dict

     def __eq__(self, other):
@@ -261,11 +264,19 @@ class Machine(metaclass=ABCMeta):

         source_list = job.resources.source_list
         for ii in source_list:
-            …
-            source_files_part += line
+            source_files_part += f"source {ii}\n"

         export_envs_part = ""
         envs = job.resources.envs
+        envs = {
+            # export resources information to the environment variables
+            "DPDISPATCHER_NUMBER_NODE": job.resources.number_node,
+            "DPDISPATCHER_CPU_PER_NODE": job.resources.cpu_per_node,
+            "DPDISPATCHER_GPU_PER_NODE": job.resources.gpu_per_node,
+            "DPDISPATCHER_QUEUE_NAME": job.resources.queue_name,
+            "DPDISPATCHER_GROUP_SIZE": job.resources.group_size,
+            **envs,
+        }
         for k, v in envs.items():
             if isinstance(v, list):
                 for each_value in v:

dpdispatcher/machines/JH_UniScheduler.py
CHANGED

@@ -105,7 +105,7 @@ class JH_UniScheduler(Machine):
         elif ret != 0:
             # just retry when any unknown error raised.
             raise RetrySignal(
-                "Get error code %d in checking status
+                "Get error code %d in checking status with job: %s . message: %s"
                 % (ret, job.job_hash, err_str)
             )
         status_out = stdout.read().decode("utf-8").split("\n")

dpdispatcher/machines/distributed_shell.py
CHANGED

@@ -181,8 +181,8 @@ class DistributedShell(Machine):
         if ret != 0:
             err_str = stderr.decode("utf-8")
             raise RuntimeError(
-                "Command
-                % (err_str, ret)
+                "Command %s fails to execute, error message:%s\nreturn code %d\n"
+                % (cmd, err_str, ret)
             )
         job_id = int(stdout.decode("utf-8").strip())
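The machine.py hunk above means every generated job script now exports DPDISPATCHER_NUMBER_NODE, DPDISPATCHER_CPU_PER_NODE, DPDISPATCHER_GPU_PER_NODE, DPDISPATCHER_QUEUE_NAME, and DPDISPATCHER_GROUP_SIZE before any user-supplied variables; because the user `envs` are spread last, they still win on collision. A task command can therefore read the resource counts instead of hard-coding them. A sketch, where `./my_program` is a placeholder and the `Task` import follows the documented public API:

from dpdispatcher import Task

# Hypothetical task: pick up the CPU count exported by the hunk above.
task = Task(
    command='mpirun -n "$DPDISPATCHER_CPU_PER_NODE" ./my_program',
    task_work_path="./",
    forward_files=[],
    backward_files=[],
)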
dpdispatcher/machines/lsf.py
CHANGED
@@ -129,7 +129,7 @@ class LSF(Machine):
         elif ret != 0:
             # just retry when any unknown error raised.
             raise RetrySignal(
-                "Get error code %d in checking status
+                "Get error code %d in checking status with job: %s . message: %s"
                 % (ret, job.job_hash, err_str)
             )
         status_out = stdout.read().decode("utf-8").split("\n")
dpdispatcher/machines/pbs.py
CHANGED
@@ -1,4 +1,7 @@
 import shlex
+from typing import List
+
+from dargs import Argument

 from dpdispatcher.dlog import dlog
 from dpdispatcher.machine import Machine
@@ -73,7 +76,8 @@ class PBS(Machine):
         job_id = job.job_id
         if job_id == "":
             return JobStatus.unsubmitted
-        …
+        command = "qstat -x " + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         err_str = stderr.read().decode("utf-8")
         if ret != 0:
             if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
@@ -83,8 +87,8 @@ class PBS(Machine):
                 return JobStatus.terminated
             else:
                 raise RuntimeError(
-                    "status command
-                    % (err_str, ret)
+                    "status command %s fails to execute. erro info: %s return code %d"
+                    % (command, err_str, ret)
                 )
         status_line = stdout.read().decode("utf-8").split("\n")[-2]
         status_word = status_line.split()[-2]
@@ -123,7 +127,8 @@ class Torque(PBS):
         job_id = job.job_id
         if job_id == "":
             return JobStatus.unsubmitted
-        …
+        command = "qstat -l " + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         err_str = stderr.read().decode("utf-8")
         if ret != 0:
             if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
@@ -133,8 +138,8 @@ class Torque(PBS):
                 return JobStatus.terminated
             else:
                 raise RuntimeError(
-                    "status command
-                    % (err_str, ret)
+                    "status command %s fails to execute. erro info: %s return code %d"
+                    % (command, err_str, ret)
                 )
         status_line = stdout.read().decode("utf-8").split("\n")[-2]
         status_word = status_line.split()[-2]
@@ -181,10 +186,9 @@

 sge_script_header_template = """
 #!/bin/bash
-#$ -
-{select_node_line}
+#$ -S /bin/bash
 #$ -cwd
-…
+{select_node_line}
 """
@@ -209,14 +213,31 @@ class SGE(PBS):
         )

     def gen_script_header(self, job):
+        ### Ref:https://softpanorama.org/HPC/PBS_and_derivatives/Reference/pbs_command_vs_sge_commands.shtml
+        # resources.number_node is not used in SGE
         resources = job.resources
+        job_name = resources.kwargs.get("job_name", "wDPjob")
+        pe_name = resources.kwargs.get("pe_name", "mpi")
         sge_script_header_dict = {}
-        …
-        sge_script_header_dict["select_node_line"]
-        f"#$ -pe
+        sge_script_header_dict["select_node_line"] = f"#$ -N {job_name}\n"
+        sge_script_header_dict["select_node_line"] += (
+            f"#$ -pe {pe_name} {resources.cpu_per_node}\n"
         )
-        …
-        …
+
+        if resources.queue_name != "":
+            sge_script_header_dict["select_node_line"] += (
+                f"#$ -q {resources.queue_name}"
+            )
+        if (
+            resources["strategy"].get("customized_script_header_template_file")
+            is not None
+        ):
+            file_name = resources["strategy"]["customized_script_header_template_file"]
+            sge_script_header = customized_script_header_template(file_name, resources)
+        else:
+            sge_script_header = sge_script_header_template.format(
+                **sge_script_header_dict
+            )
         return sge_script_header

     def do_submit(self, job):
@@ -224,6 +245,9 @@ class SGE(PBS):
         script_str = self.gen_script(job)
         job_id_name = job.job_hash + "_job_id"
         self.context.write_file(fname=script_file_name, write_str=script_str)
+        script_run_str = self.gen_script_command(job)
+        script_run_file_name = f"{job.script_file_name}.run"
+        self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
         script_file_dir = self.context.remote_root
         stdin, stdout, stderr = self.context.block_checkcall(
             "cd {} && {} {}".format(script_file_dir, "qsub", script_file_name)
@@ -241,12 +265,12 @@ class SGE(PBS):
         status_line = None
         if job_id == "":
             return JobStatus.unsubmitted
-        …
+        command = "qstat"
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         err_str = stderr.read().decode("utf-8")
         if ret != 0:
             raise RuntimeError(
-                "status command
-                % (err_str, ret)
+                f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
             )
         status_text_list = stdout.read().decode("utf-8").split("\n")
         for txt in status_text_list:
@@ -259,8 +283,7 @@ class SGE(PBS):
             if self.check_finish_tag(job=job):
                 return JobStatus.finished
             dlog.info(
-                "not tag_finished detected, execute sync command and wait. count "
-                + str(count)
+                f"not tag_finished detected, execute sync command and wait. count {count}"
             )
             self.context.block_call("sync")
             import time
@@ -281,3 +304,44 @@ class SGE(PBS):
     def check_finish_tag(self, job):
         job_tag_finished = job.job_hash + "_job_tag_finished"
         return self.context.check_file_exists(job_tag_finished)
+
+    @classmethod
+    def resources_subfields(cls) -> List[Argument]:
+        """Generate the resources subfields.
+
+        pe_name : str
+            The parallel environment name of SGE.
+
+        Returns
+        -------
+        list[Argument]
+            resources subfields
+        """
+        doc_pe_name = "The parallel environment name of SGE system."
+        doc_job_name = "The name of SGE's job."
+
+        return [
+            Argument(
+                "kwargs",
+                dict,
+                [
+                    Argument(
+                        "pe_name",
+                        str,
+                        optional=True,
+                        default="mpi",
+                        doc=doc_pe_name,
+                        alias=["sge_pe_name"],
+                    ),
+                    Argument(
+                        "job_name",
+                        str,
+                        optional=True,
+                        default="wDPjob",
+                        doc=doc_job_name,
+                    ),
+                ],
+                optional=False,
+                doc="Extra arguments.",
+            )
+        ]
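With the new `resources_subfields`, SGE jobs accept two extra `kwargs`. A resources dict exercising them might look like the sketch below (queue and names are placeholders); with `cpu_per_node` set to 8 it would yield a header containing `#$ -N my_sge_job`, `#$ -pe smp 8`, and `#$ -q all.q`:

# Hypothetical resources dict for batch_type SGE; pe_name defaults to "mpi"
# and job_name to "wDPjob" when omitted.
resources_dict = {
    "number_node": 1,  # accepted but not used by SGE, per the comment in the hunk
    "cpu_per_node": 8,
    "gpu_per_node": 0,
    "queue_name": "all.q",
    "group_size": 1,
    "kwargs": {"pe_name": "smp", "job_name": "my_sge_job"},
}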
dpdispatcher/machines/shell.py
CHANGED
@@ -38,14 +38,13 @@ class Shell(Machine):
         script_run_str = self.gen_script_command(job)
         script_run_file_name = f"{job.script_file_name}.run"
         self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
-        …
-        …
-        )
+        cmd = f"cd {shlex.quote(self.context.remote_root)} && {{ nohup bash {script_file_name} 1>>{output_name} 2>>{output_name} & }} && echo $!"
+        ret, stdin, stdout, stderr = self.context.block_call(cmd)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             raise RuntimeError(
-                "status command
-                % (err_str, ret)
+                "status command %s fails to execute\nerror message:%s\nreturn code %d\n"
+                % (cmd, err_str, ret)
             )
         job_id = int(stdout.read().decode("utf-8").strip())
         self.context.write_file(job_id_name, str(job_id))
@@ -73,14 +72,16 @@ class Shell(Machine):
             return JobStatus.unsubmitted

         # mark defunct process as terminated
-        …
+        cmd = (
+            r"""command -v ps >/dev/null 2>&1 || { echo >&2 "I require ps but it's not installed. Aborting."; exit 1; };"""
             f"if ps -p {job_id} > /dev/null && ! (ps -o command -p {job_id} | grep defunct >/dev/null) ; then echo 1; fi"
         )
+        ret, stdin, stdout, stderr = self.context.block_call(cmd)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             raise RuntimeError(
-                "status command
-                % (err_str, ret)
+                "status command %s fails to execute\nerror message:%s\nreturn code %d\n"
+                % (cmd, err_str, ret)
             )

         if_job_exists = bool(stdout.read().decode("utf-8").strip())
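In the first hunk the submit command detaches the script with `nohup ... &` and immediately echoes `$!`, so the PID of the background shell becomes dpdispatcher's job id. The round trip, reduced to a sketch where the paths and the `context` object are placeholders:

# Sketch of Shell.do_submit's round trip: the PID echoed by `$!` is parsed
# back as the job id. `context` stands for any dpdispatcher context instance.
cmd = "cd /tmp/work && { nohup bash job.sub 1>>log 2>>log & } && echo $!"
ret, stdin, stdout, stderr = context.block_call(cmd)
job_id = int(stdout.read().decode("utf-8").strip())  # the background PID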
dpdispatcher/machines/slurm.py
CHANGED
@@ -83,13 +83,12 @@ class Slurm(Machine):
         script_run_file_name = f"{job.script_file_name}.run"
         self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
         # self.context.write_file(fname=os.path.join(self.context.submission.work_base, script_file_name), write_str=script_str)
-        …
-        …
-        …
-        …
-            shlex.quote(script_file_name),
-        )
+        command = "cd {} && {} {}".format(
+            shlex.quote(self.context.remote_root),
+            "sbatch",
+            shlex.quote(script_file_name),
         )
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             if (
@@ -98,7 +97,7 @@ class Slurm(Machine):
         ):
             # server network error, retry 3 times
             raise RetrySignal(
-                "Get error code %d in submitting
+                "Get error code %d in submitting with job: %s . message: %s"
                 % (ret, job.job_hash, err_str)
             )
         elif (
@@ -110,8 +109,8 @@ class Slurm(Machine):
             # job number exceeds, skip the submitting
             return ""
         raise RuntimeError(
-            "
-            % (err_str, ret)
+            "command %s fails to execute\nerror message:%s\nreturn code %d\n"
+            % (command, err_str, ret)
         )
         subret = stdout.readlines()
         # --parsable
@@ -129,9 +128,8 @@ class Slurm(Machine):
         job_id = job.job_id
         if job_id == "":
             return JobStatus.unsubmitted
-        …
-        …
-        )
+        command = 'squeue -o "%.18i %.2t" -j ' + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             if "Invalid job id specified" in err_str:
@@ -147,13 +145,13 @@ class Slurm(Machine):
         ):
             # retry 3 times
             raise RetrySignal(
-                "Get error code %d in checking status
+                "Get error code %d in checking status with job: %s . message: %s"
                 % (ret, job.job_hash, err_str)
             )
         raise RuntimeError(
-            "status command
+            "status command %s fails to execute."
             "job_id:%s \n error message:%s\n return code %d\n"
-            % (job_id, err_str, ret)
+            % (command, job_id, err_str, ret)
         )
         status_line = stdout.read().decode("utf-8").split("\n")[-2]
         status_word = status_line.split()[-1]
@@ -319,9 +317,8 @@ class SlurmJobArray(Slurm):
         job_id = job.job_id
         if job_id == "":
             return JobStatus.unsubmitted
-        …
-        …
-        )
+        command = 'squeue -h -o "%.18i %.2t" -j ' + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             if "Invalid job id specified" in err_str:
@@ -336,13 +333,13 @@ class SlurmJobArray(Slurm):
         ):
             # retry 3 times
             raise RetrySignal(
-                "Get error code %d in checking status
+                "Get error code %d in checking status with job: %s . message: %s"
                 % (ret, job.job_hash, err_str)
             )
         raise RuntimeError(
-            "status command
+            "status command %s fails to execute."
             "job_id:%s \n error message:%s\n return code %d\n"
-            % (job_id, err_str, ret)
+            % (command, job_id, err_str, ret)
         )
         status_lines = stdout.read().decode("utf-8").split("\n")[:-1]
         status = []
dpdispatcher/submission.py
CHANGED
@@ -1122,9 +1122,9 @@ class Resources:

     @staticmethod
     def arginfo(detail_kwargs=True):
-        doc_number_node = "The number of
-        doc_cpu_per_node = "
-        doc_gpu_per_node = "
+        doc_number_node = "The number of nodes required for each `job`."
+        doc_cpu_per_node = "CPU numbers of each node assigned to each job."
+        doc_gpu_per_node = "GPU numbers of each node assigned to each job."
         doc_queue_name = "The queue name of batch job scheduler system."
         doc_group_size = "The number of `tasks` in a `job`. 0 means infinity."
         doc_custom_flags = "The extra lines pass to job submitting script header"
{dpdispatcher-0.6.5.dist-info → dpdispatcher-0.6.7.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dpdispatcher
-Version: 0.6.5
+Version: 0.6.7
 Summary: Generate HPC scheduler systems jobs input scripts, submit these scripts to HPC systems, and poke until they finish
 Author: DeepModeling
 License: GNU LESSER GENERAL PUBLIC LICENSE
@@ -207,7 +207,7 @@ Requires-Dist: sphinx-rtd-theme >=1.0.0rc1 ; extra == 'docs'
 Requires-Dist: numpydoc ; extra == 'docs'
 Requires-Dist: deepmodeling-sphinx >=0.1.1 ; extra == 'docs'
 Requires-Dist: dargs >=0.3.1 ; extra == 'docs'
-Requires-Dist: sphinx-argparse ; extra == 'docs'
+Requires-Dist: sphinx-argparse <0.5.0 ; extra == 'docs'
 Provides-Extra: gui
 Requires-Dist: dpgui ; extra == 'gui'
 Provides-Extra: test
{dpdispatcher-0.6.5.dist-info → dpdispatcher-0.6.7.dist-info}/RECORD
CHANGED

@@ -1,36 +1,36 @@
 dpdispatcher/__init__.py,sha256=CLZP_N5CTp14ujWCykEHuJjoIfKR6CwrclXhjWUgNoE,517
 dpdispatcher/__main__.py,sha256=BFhG-mSBzVZUEezQJqXWZnt2WsnhAHT_zpT8Y6gpOz0,116
-dpdispatcher/_version.py,sha256=…
+dpdispatcher/_version.py,sha256=iLXz9haw4jSV4Xm2-5_V8999GBAYoJkXg9-YOwMJpLY,411
 dpdispatcher/arginfo.py,sha256=pNaxYIE6ahBidpR7OCKZdw8iGt003uTXGSlVzwiuvRg,188
-dpdispatcher/base_context.py,sha256=…
+dpdispatcher/base_context.py,sha256=FDie50yHuLxTwV-k3B_sgAYqR5rLLLVOwk24wSiu4SQ,5254
 dpdispatcher/dlog.py,sha256=QJKAwB6gV3Zb6zQUL9dZ_uIoTIEy9Z7ecmVQ-8WNmD8,1081
 dpdispatcher/dpdisp.py,sha256=jhuTmwPY7KBF4WukaQomEwZcfYoISaMbKwuxdDGSluc,4206
-dpdispatcher/machine.py,sha256=…
+dpdispatcher/machine.py,sha256=k53ycs_v7xrl4D93URc5ht0shoO9NPrVl0rYr4v5OiU,16696
 dpdispatcher/run.py,sha256=tFHbJAioXXpgHTE5bhRRAuc8w7cX1ET9SBbiAg3Rw-I,5382
-dpdispatcher/submission.py,sha256=…
+dpdispatcher/submission.py,sha256=NaljgA88NLv0rvxoSMZvUMq0sQEggkgKlcT8gXUnqFs,48367
 dpdispatcher/contexts/__init__.py,sha256=jlvcIppmUnS39yBlkZEDvIQFV-j_BR75ZTbZALF_RB0,336
-dpdispatcher/contexts/dp_cloud_server_context.py,sha256=…
-dpdispatcher/contexts/hdfs_context.py,sha256=…
-dpdispatcher/contexts/lazy_local_context.py,sha256=…
-dpdispatcher/contexts/local_context.py,sha256=…
-dpdispatcher/contexts/openapi_context.py,sha256=…
-dpdispatcher/contexts/ssh_context.py,sha256=…
+dpdispatcher/contexts/dp_cloud_server_context.py,sha256=PGRMef3q2hfK-o5dNIWWvzPca2NK1HrWEgungM4L9Go,12420
+dpdispatcher/contexts/hdfs_context.py,sha256=mYQzXMZ4A9EjjWBAH3Ba6HOErUhMMwCsKxOjpd5R57Y,9105
+dpdispatcher/contexts/lazy_local_context.py,sha256=FAClbLD2F4LizUqFzMOg3t0Z6NLeTDLJy7NkRcDELFs,5070
+dpdispatcher/contexts/local_context.py,sha256=VbaSXGAc_EDMT0K5WV_flBF0bX87ntrwO_hq_Bkcb04,14590
+dpdispatcher/contexts/openapi_context.py,sha256=M7L9axpjOrzvdTpLMDuEzZqe4ZuKIxjS0bzZUv8W2IQ,9674
+dpdispatcher/contexts/ssh_context.py,sha256=s0K-gSKPSykq2PyOzAt4yNEczAdsVGvQ1QmPJpZ4_Vo,37648
 dpdispatcher/dpcloudserver/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dpdispatcher/dpcloudserver/client.py,sha256=k1niKjG6zFnMtHn_UuCjYoOcMju3o3PV-GdyVLr5-KM,165
 dpdispatcher/entrypoints/__init__.py,sha256=exKSFT3j2oCerGwtI8WbHQK-D0K-CyifocRji1xntT4,20
 dpdispatcher/entrypoints/gui.py,sha256=29lMXqbmSRbLj4rfBv7Jnw89NLU9syTB88IUP6IRJsU,830
 dpdispatcher/entrypoints/run.py,sha256=tRkHfeAktV6gF31yb2MVOSTlpNGZFw3N0jHBmM1YfIg,175
 dpdispatcher/entrypoints/submission.py,sha256=ikVwIZAQL0SsYO5xaMIdKXgO6qtc05w1vqmvtG7Nk5M,3401
-dpdispatcher/machines/JH_UniScheduler.py,sha256=…
+dpdispatcher/machines/JH_UniScheduler.py,sha256=B-LGldr9H8qPQYdCYoEaXFCEFBPmjFEi0fwEWp0wdR0,5783
 dpdispatcher/machines/__init__.py,sha256=tOQuPUlW1Ab4qcC0oSAIyDjZA_WyE67h_EIxPCWGhys,336
-dpdispatcher/machines/distributed_shell.py,sha256=…
+dpdispatcher/machines/distributed_shell.py,sha256=TVnXFNqQmBgWk3s34rKSZo0S5N5KPZVmAG3Xbu_kuBo,7535
 dpdispatcher/machines/dp_cloud_server.py,sha256=SR69gsFb2BvOQCW1QnWfP3cQvu_qHLJNsycp5wzosJU,11706
 dpdispatcher/machines/fugaku.py,sha256=oY2hD2ldL2dztwtJ9WNisdsfPnaX-5yTRXewIT9r60I,4314
-dpdispatcher/machines/lsf.py,sha256=…
+dpdispatcher/machines/lsf.py,sha256=fOZoOTpFn1nKx79lYkvZQOhNwz39YAIEytxICd56AFU,7920
 dpdispatcher/machines/openapi.py,sha256=Gzzbo8YOAybXGTrgMutexErcaEi3ts7uTUNvOhThFS8,8858
-dpdispatcher/machines/pbs.py,sha256=…
-dpdispatcher/machines/shell.py,sha256=…
-dpdispatcher/machines/slurm.py,sha256=…
+dpdispatcher/machines/pbs.py,sha256=XeeFQMZoH9DscsrJ_Ykv6fNUtc9TBp4epuFqbUyr3dk,12531
+dpdispatcher/machines/shell.py,sha256=ONaUJpszsCwCcbyVLvC6VoJ-ig2QTU9JQdA-nlgXnu8,4845
+dpdispatcher/machines/slurm.py,sha256=HLYk9E1dChnTeHjOOWNG854AWdlUJVYYmgwaiVswPQ8,15560
 dpdispatcher/utils/__init__.py,sha256=fwvwkMf7DFNQkNBiIce8Y8gRA6FhICwKjkKiXu_BEJg,13
 dpdispatcher/utils/hdfs_cli.py,sha256=n3EIfFIralsISlaEewawD35f0P8mabo-u8D8UW3k_7Y,5308
 dpdispatcher/utils/job_status.py,sha256=Eszs4TPLfszCuf6zLaFonf25feXDUguF28spYOjJpQE,233
@@ -41,9 +41,9 @@ dpdispatcher/utils/dpcloudserver/client.py,sha256=CLfXswvzI4inDrW2bYkfMQ6gQJFcZO
 dpdispatcher/utils/dpcloudserver/config.py,sha256=NteQzf1OeEkz2UbkXHHQ0B72cUu23zLVzpM9Yh4v1Cc,559
 dpdispatcher/utils/dpcloudserver/retcode.py,sha256=1qAF8gFZx55u2sO8KbtYSIIrjcO-IGufEUlwbkSfC1g,721
 dpdispatcher/utils/dpcloudserver/zip_file.py,sha256=f9WrlktwHW0YipaWg5Y0kxjMZlhD1cJYa6EUpvu4Cro,2611
-dpdispatcher-0.6.5.dist-info/LICENSE,sha256=…
-dpdispatcher-0.6.5.dist-info/METADATA,sha256=…
-dpdispatcher-0.6.5.dist-info/WHEEL,sha256=…
-dpdispatcher-0.6.5.dist-info/entry_points.txt,sha256=…
-dpdispatcher-0.6.5.dist-info/top_level.txt,sha256=…
-dpdispatcher-0.6.5.dist-info/RECORD,,
+dpdispatcher-0.6.7.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
+dpdispatcher-0.6.7.dist-info/METADATA,sha256=lNsC7Ruo7GmUOQl1TadlThoLvrOETbZ1s0-sXmrRYL4,12828
+dpdispatcher-0.6.7.dist-info/WHEEL,sha256=UvcQYKBHoFqaQd6LKyqHw9fxEolWLQnlzP0h_LgJAfI,91
+dpdispatcher-0.6.7.dist-info/entry_points.txt,sha256=NRHUV0IU_u7_XtcmmEDnVzAcUmurhiEAGwENckrajo4,233
+dpdispatcher-0.6.7.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
+dpdispatcher-0.6.7.dist-info/RECORD,,
{dpdispatcher-0.6.5.dist-info → dpdispatcher-0.6.7.dist-info}/LICENSE
File without changes

{dpdispatcher-0.6.5.dist-info → dpdispatcher-0.6.7.dist-info}/entry_points.txt
File without changes

{dpdispatcher-0.6.5.dist-info → dpdispatcher-0.6.7.dist-info}/top_level.txt
File without changes