dpdispatcher 0.6.6__py3-none-any.whl → 0.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dpdispatcher might be problematic. Click here for more details.
- dpdispatcher/_version.py +2 -2
- dpdispatcher/base_context.py +61 -1
- dpdispatcher/contexts/dp_cloud_server_context.py +5 -0
- dpdispatcher/contexts/hdfs_context.py +5 -0
- dpdispatcher/contexts/lazy_local_context.py +0 -17
- dpdispatcher/contexts/local_context.py +57 -31
- dpdispatcher/contexts/openapi_context.py +5 -0
- dpdispatcher/contexts/ssh_context.py +12 -35
- dpdispatcher/machine.py +12 -0
- dpdispatcher/machines/JH_UniScheduler.py +1 -1
- dpdispatcher/machines/distributed_shell.py +2 -2
- dpdispatcher/machines/lsf.py +1 -1
- dpdispatcher/machines/pbs.py +11 -8
- dpdispatcher/machines/shell.py +9 -8
- dpdispatcher/machines/slurm.py +18 -21
- dpdispatcher/submission.py +3 -3
- {dpdispatcher-0.6.6.dist-info → dpdispatcher-0.6.7.dist-info}/METADATA +1 -1
- {dpdispatcher-0.6.6.dist-info → dpdispatcher-0.6.7.dist-info}/RECORD +22 -22
- {dpdispatcher-0.6.6.dist-info → dpdispatcher-0.6.7.dist-info}/WHEEL +1 -1
- {dpdispatcher-0.6.6.dist-info → dpdispatcher-0.6.7.dist-info}/LICENSE +0 -0
- {dpdispatcher-0.6.6.dist-info → dpdispatcher-0.6.7.dist-info}/entry_points.txt +0 -0
- {dpdispatcher-0.6.6.dist-info → dpdispatcher-0.6.7.dist-info}/top_level.txt +0 -0
dpdispatcher/_version.py
CHANGED
dpdispatcher/base_context.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from abc import ABCMeta, abstractmethod
|
|
2
|
-
from typing import List, Tuple
|
|
2
|
+
from typing import Any, List, Tuple
|
|
3
3
|
|
|
4
4
|
from dargs import Argument
|
|
5
5
|
|
|
@@ -73,6 +73,66 @@ class BaseContext(metaclass=ABCMeta):
|
|
|
73
73
|
def check_finish(self, proc):
|
|
74
74
|
raise NotImplementedError("abstract method")
|
|
75
75
|
|
|
76
|
+
def block_checkcall(self, cmd, asynchronously=False) -> Tuple[Any, Any, Any]:
|
|
77
|
+
"""Run command with arguments. Wait for command to complete.
|
|
78
|
+
|
|
79
|
+
Parameters
|
|
80
|
+
----------
|
|
81
|
+
cmd : str
|
|
82
|
+
The command to run.
|
|
83
|
+
asynchronously : bool, optional, default=False
|
|
84
|
+
Run command asynchronously. If True, `nohup` will be used to run the command.
|
|
85
|
+
|
|
86
|
+
Returns
|
|
87
|
+
-------
|
|
88
|
+
stdin
|
|
89
|
+
standard input
|
|
90
|
+
stdout
|
|
91
|
+
standard output
|
|
92
|
+
stderr
|
|
93
|
+
standard error
|
|
94
|
+
|
|
95
|
+
Raises
|
|
96
|
+
------
|
|
97
|
+
RuntimeError
|
|
98
|
+
when the return code is not zero
|
|
99
|
+
"""
|
|
100
|
+
if asynchronously:
|
|
101
|
+
cmd = f"nohup {cmd} >/dev/null &"
|
|
102
|
+
exit_status, stdin, stdout, stderr = self.block_call(cmd)
|
|
103
|
+
if exit_status != 0:
|
|
104
|
+
raise RuntimeError(
|
|
105
|
+
"Get error code %d in calling %s with job: %s . message: %s"
|
|
106
|
+
% (
|
|
107
|
+
exit_status,
|
|
108
|
+
cmd,
|
|
109
|
+
self.submission.submission_hash,
|
|
110
|
+
stderr.read().decode("utf-8"),
|
|
111
|
+
)
|
|
112
|
+
)
|
|
113
|
+
return stdin, stdout, stderr
|
|
114
|
+
|
|
115
|
+
@abstractmethod
|
|
116
|
+
def block_call(self, cmd) -> Tuple[int, Any, Any, Any]:
|
|
117
|
+
"""Run command with arguments. Wait for command to complete.
|
|
118
|
+
|
|
119
|
+
Parameters
|
|
120
|
+
----------
|
|
121
|
+
cmd : str
|
|
122
|
+
The command to run.
|
|
123
|
+
|
|
124
|
+
Returns
|
|
125
|
+
-------
|
|
126
|
+
exit_status
|
|
127
|
+
exit code
|
|
128
|
+
stdin
|
|
129
|
+
standard input
|
|
130
|
+
stdout
|
|
131
|
+
standard output
|
|
132
|
+
stderr
|
|
133
|
+
standard error
|
|
134
|
+
"""
|
|
135
|
+
|
|
76
136
|
@classmethod
|
|
77
137
|
def machine_arginfo(cls) -> Argument:
|
|
78
138
|
"""Generate the machine arginfo.
|
|
@@ -335,6 +335,11 @@ class BohriumContext(BaseContext):
|
|
|
335
335
|
)
|
|
336
336
|
]
|
|
337
337
|
|
|
338
|
+
def block_call(self, cmd):
|
|
339
|
+
raise RuntimeError(
|
|
340
|
+
"Unsupported method. You may use an unsupported combination of the machine and the context."
|
|
341
|
+
)
|
|
342
|
+
|
|
338
343
|
|
|
339
344
|
DpCloudServerContext = BohriumContext
|
|
340
345
|
LebesgueContext = BohriumContext
|
|
@@ -244,3 +244,8 @@ class HDFSContext(BaseContext):
|
|
|
244
244
|
|
|
245
245
|
def read_file(self, fname):
|
|
246
246
|
return HDFS.read_hdfs_file(os.path.join(self.remote_root, fname))
|
|
247
|
+
|
|
248
|
+
def block_call(self, cmd):
|
|
249
|
+
raise RuntimeError(
|
|
250
|
+
"Unsupported method. You may use an unsupported combination of the machine and the context."
|
|
251
|
+
)
|
|
@@ -112,23 +112,6 @@ class LazyLocalContext(BaseContext):
|
|
|
112
112
|
# else:
|
|
113
113
|
# raise RuntimeError('do not find download file ' + fname)
|
|
114
114
|
|
|
115
|
-
def block_checkcall(self, cmd):
|
|
116
|
-
# script_dir = os.path.join(self.local_root, self.submission.work_base)
|
|
117
|
-
# os.chdir(script_dir)
|
|
118
|
-
proc = sp.Popen(
|
|
119
|
-
cmd, cwd=self.local_root, shell=True, stdout=sp.PIPE, stderr=sp.PIPE
|
|
120
|
-
)
|
|
121
|
-
o, e = proc.communicate()
|
|
122
|
-
stdout = SPRetObj(o)
|
|
123
|
-
stderr = SPRetObj(e)
|
|
124
|
-
code = proc.returncode
|
|
125
|
-
if code != 0:
|
|
126
|
-
raise RuntimeError(
|
|
127
|
-
"Get error code %d in locally calling %s with job: %s ",
|
|
128
|
-
(code, cmd, self.submission.submission_hash),
|
|
129
|
-
)
|
|
130
|
-
return None, stdout, stderr
|
|
131
|
-
|
|
132
115
|
def block_call(self, cmd):
|
|
133
116
|
proc = sp.Popen(
|
|
134
117
|
cmd, cwd=self.local_root, shell=True, stdout=sp.PIPE, stderr=sp.PIPE
|
|
@@ -3,6 +3,9 @@ import shutil
|
|
|
3
3
|
import subprocess as sp
|
|
4
4
|
from glob import glob
|
|
5
5
|
from subprocess import TimeoutExpired
|
|
6
|
+
from typing import List
|
|
7
|
+
|
|
8
|
+
from dargs import Argument
|
|
6
9
|
|
|
7
10
|
from dpdispatcher.base_context import BaseContext
|
|
8
11
|
from dpdispatcher.dlog import dlog
|
|
@@ -60,6 +63,7 @@ class LocalContext(BaseContext):
|
|
|
60
63
|
self.temp_local_root = os.path.abspath(local_root)
|
|
61
64
|
self.temp_remote_root = os.path.abspath(remote_root)
|
|
62
65
|
self.remote_profile = remote_profile
|
|
66
|
+
self.symlink = remote_profile.get("symlink", True)
|
|
63
67
|
|
|
64
68
|
@classmethod
|
|
65
69
|
def load_from_dict(cls, context_dict):
|
|
@@ -83,6 +87,25 @@ class LocalContext(BaseContext):
|
|
|
83
87
|
self.temp_remote_root, submission.submission_hash
|
|
84
88
|
)
|
|
85
89
|
|
|
90
|
+
def _copy_from_local_to_remote(self, local_path, remote_path):
|
|
91
|
+
if not os.path.exists(local_path):
|
|
92
|
+
raise FileNotFoundError(
|
|
93
|
+
f"cannot find uploaded file {os.path.join(local_path)}"
|
|
94
|
+
)
|
|
95
|
+
if os.path.exists(remote_path):
|
|
96
|
+
os.remove(remote_path)
|
|
97
|
+
_check_file_path(remote_path)
|
|
98
|
+
|
|
99
|
+
if self.symlink:
|
|
100
|
+
# ensure the file exists
|
|
101
|
+
os.symlink(local_path, remote_path)
|
|
102
|
+
elif os.path.isfile(local_path):
|
|
103
|
+
shutil.copyfile(local_path, remote_path)
|
|
104
|
+
elif os.path.isdir(local_path):
|
|
105
|
+
shutil.copytree(local_path, remote_path)
|
|
106
|
+
else:
|
|
107
|
+
raise ValueError(f"Unknown file type: {local_path}")
|
|
108
|
+
|
|
86
109
|
def upload(self, submission):
|
|
87
110
|
os.makedirs(self.remote_root, exist_ok=True)
|
|
88
111
|
for ii in submission.belonging_tasks:
|
|
@@ -103,14 +126,9 @@ class LocalContext(BaseContext):
|
|
|
103
126
|
file_list.extend(rel_file_list)
|
|
104
127
|
|
|
105
128
|
for jj in file_list:
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
)
|
|
110
|
-
if os.path.exists(os.path.join(remote_job, jj)):
|
|
111
|
-
os.remove(os.path.join(remote_job, jj))
|
|
112
|
-
_check_file_path(os.path.join(remote_job, jj))
|
|
113
|
-
os.symlink(os.path.join(local_job, jj), os.path.join(remote_job, jj))
|
|
129
|
+
self._copy_from_local_to_remote(
|
|
130
|
+
os.path.join(local_job, jj), os.path.join(remote_job, jj)
|
|
131
|
+
)
|
|
114
132
|
|
|
115
133
|
local_job = self.local_root
|
|
116
134
|
remote_job = self.remote_root
|
|
@@ -128,14 +146,9 @@ class LocalContext(BaseContext):
|
|
|
128
146
|
file_list.extend(rel_file_list)
|
|
129
147
|
|
|
130
148
|
for jj in file_list:
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
)
|
|
135
|
-
if os.path.exists(os.path.join(remote_job, jj)):
|
|
136
|
-
os.remove(os.path.join(remote_job, jj))
|
|
137
|
-
_check_file_path(os.path.join(remote_job, jj))
|
|
138
|
-
os.symlink(os.path.join(local_job, jj), os.path.join(remote_job, jj))
|
|
149
|
+
self._copy_from_local_to_remote(
|
|
150
|
+
os.path.join(local_job, jj), os.path.join(remote_job, jj)
|
|
151
|
+
)
|
|
139
152
|
|
|
140
153
|
def download(
|
|
141
154
|
self, submission, check_exists=False, mark_failure=True, back_error=False
|
|
@@ -288,21 +301,6 @@ class LocalContext(BaseContext):
|
|
|
288
301
|
# do nothing in the case of linked files
|
|
289
302
|
pass
|
|
290
303
|
|
|
291
|
-
def block_checkcall(self, cmd):
|
|
292
|
-
proc = sp.Popen(
|
|
293
|
-
cmd, cwd=self.remote_root, shell=True, stdout=sp.PIPE, stderr=sp.PIPE
|
|
294
|
-
)
|
|
295
|
-
o, e = proc.communicate()
|
|
296
|
-
stdout = SPRetObj(o)
|
|
297
|
-
stderr = SPRetObj(e)
|
|
298
|
-
code = proc.returncode
|
|
299
|
-
if code != 0:
|
|
300
|
-
raise RuntimeError(
|
|
301
|
-
f"Get error code {code} in locally calling {cmd} with job: {self.submission.submission_hash}"
|
|
302
|
-
f"\nStandard error: {stderr}"
|
|
303
|
-
)
|
|
304
|
-
return None, stdout, stderr
|
|
305
|
-
|
|
306
304
|
def block_call(self, cmd):
|
|
307
305
|
proc = sp.Popen(
|
|
308
306
|
cmd, cwd=self.remote_root, shell=True, stdout=sp.PIPE, stderr=sp.PIPE
|
|
@@ -351,3 +349,31 @@ class LocalContext(BaseContext):
|
|
|
351
349
|
stdout = None
|
|
352
350
|
stderr = None
|
|
353
351
|
return ret, stdout, stderr
|
|
352
|
+
|
|
353
|
+
@classmethod
|
|
354
|
+
def machine_subfields(cls) -> List[Argument]:
|
|
355
|
+
"""Generate the machine subfields.
|
|
356
|
+
|
|
357
|
+
Returns
|
|
358
|
+
-------
|
|
359
|
+
list[Argument]
|
|
360
|
+
machine subfields
|
|
361
|
+
"""
|
|
362
|
+
doc_remote_profile = "The information used to maintain the local machine."
|
|
363
|
+
return [
|
|
364
|
+
Argument(
|
|
365
|
+
"remote_profile",
|
|
366
|
+
dict,
|
|
367
|
+
optional=True,
|
|
368
|
+
doc=doc_remote_profile,
|
|
369
|
+
sub_fields=[
|
|
370
|
+
Argument(
|
|
371
|
+
"symlink",
|
|
372
|
+
bool,
|
|
373
|
+
optional=True,
|
|
374
|
+
default=True,
|
|
375
|
+
doc="Whether to use symbolic links to replace copy. This option should be turned off if the local directory is not accessible on the Batch system.",
|
|
376
|
+
),
|
|
377
|
+
],
|
|
378
|
+
)
|
|
379
|
+
]
|
|
@@ -258,3 +258,8 @@ class OpenAPIContext(BaseContext):
|
|
|
258
258
|
dir_to_be_removed = os.path.join(local_root, "backup")
|
|
259
259
|
if os.path.exists(dir_to_be_removed):
|
|
260
260
|
shutil.rmtree(dir_to_be_removed)
|
|
261
|
+
|
|
262
|
+
def block_call(self, cmd):
|
|
263
|
+
raise RuntimeError(
|
|
264
|
+
"Unsupported method. You may use an unsupported combination of the machine and the context."
|
|
265
|
+
)
|
|
@@ -44,6 +44,7 @@ class SSHSession:
|
|
|
44
44
|
totp_secret=None,
|
|
45
45
|
tar_compress=True,
|
|
46
46
|
look_for_keys=True,
|
|
47
|
+
execute_command=None,
|
|
47
48
|
):
|
|
48
49
|
self.hostname = hostname
|
|
49
50
|
self.username = username
|
|
@@ -56,6 +57,7 @@ class SSHSession:
|
|
|
56
57
|
self.ssh = None
|
|
57
58
|
self.tar_compress = tar_compress
|
|
58
59
|
self.look_for_keys = look_for_keys
|
|
60
|
+
self.execute_command = execute_command
|
|
59
61
|
self._keyboard_interactive_auth = False
|
|
60
62
|
self._setup_ssh()
|
|
61
63
|
|
|
@@ -237,6 +239,8 @@ class SSHSession:
|
|
|
237
239
|
self.ssh._transport = ts # type: ignore
|
|
238
240
|
# reset sftp
|
|
239
241
|
self._sftp = None
|
|
242
|
+
if self.execute_command is not None:
|
|
243
|
+
self.exec_command(self.execute_command)
|
|
240
244
|
|
|
241
245
|
def inter_handler(self, title, instructions, prompt_list):
|
|
242
246
|
"""inter_handler: the callback for paramiko.transport.auth_interactive.
|
|
@@ -338,6 +342,7 @@ class SSHSession:
|
|
|
338
342
|
doc_look_for_keys = (
|
|
339
343
|
"enable searching for discoverable private key files in ~/.ssh/"
|
|
340
344
|
)
|
|
345
|
+
doc_execute_command = "execute command after ssh connection is established."
|
|
341
346
|
ssh_remote_profile_args = [
|
|
342
347
|
Argument("hostname", str, optional=False, doc=doc_hostname),
|
|
343
348
|
Argument("username", str, optional=False, doc=doc_username),
|
|
@@ -379,6 +384,13 @@ class SSHSession:
|
|
|
379
384
|
default=True,
|
|
380
385
|
doc=doc_look_for_keys,
|
|
381
386
|
),
|
|
387
|
+
Argument(
|
|
388
|
+
"execute_command",
|
|
389
|
+
str,
|
|
390
|
+
optional=True,
|
|
391
|
+
default=None,
|
|
392
|
+
doc=doc_execute_command,
|
|
393
|
+
),
|
|
382
394
|
]
|
|
383
395
|
ssh_remote_profile_format = Argument(
|
|
384
396
|
"ssh_session", dict, ssh_remote_profile_args
|
|
@@ -755,41 +767,6 @@ class SSHContext(BaseContext):
|
|
|
755
767
|
tar_compress=self.remote_profile.get("tar_compress", None),
|
|
756
768
|
)
|
|
757
769
|
|
|
758
|
-
def block_checkcall(self, cmd, asynchronously=False, stderr_whitelist=None):
|
|
759
|
-
"""Run command with arguments. Wait for command to complete. If the return code
|
|
760
|
-
was zero then return, otherwise raise RuntimeError.
|
|
761
|
-
|
|
762
|
-
Parameters
|
|
763
|
-
----------
|
|
764
|
-
cmd : str
|
|
765
|
-
The command to run.
|
|
766
|
-
asynchronously : bool, optional, default=False
|
|
767
|
-
Run command asynchronously. If True, `nohup` will be used to run the command.
|
|
768
|
-
stderr_whitelist : list of str, optional, default=None
|
|
769
|
-
If not None, the stderr will be checked against the whitelist. If the stderr
|
|
770
|
-
contains any of the strings in the whitelist, the command will be considered
|
|
771
|
-
successful.
|
|
772
|
-
"""
|
|
773
|
-
assert self.remote_root is not None
|
|
774
|
-
self.ssh_session.ensure_alive()
|
|
775
|
-
if asynchronously:
|
|
776
|
-
cmd = f"nohup {cmd} >/dev/null &"
|
|
777
|
-
stdin, stdout, stderr = self.ssh_session.exec_command(
|
|
778
|
-
(f"cd {shlex.quote(self.remote_root)} ;") + cmd
|
|
779
|
-
)
|
|
780
|
-
exit_status = stdout.channel.recv_exit_status()
|
|
781
|
-
if exit_status != 0:
|
|
782
|
-
raise RuntimeError(
|
|
783
|
-
"Get error code %d in calling %s through ssh with job: %s . message: %s"
|
|
784
|
-
% (
|
|
785
|
-
exit_status,
|
|
786
|
-
cmd,
|
|
787
|
-
self.submission.submission_hash,
|
|
788
|
-
stderr.read().decode("utf-8"),
|
|
789
|
-
)
|
|
790
|
-
)
|
|
791
|
-
return stdin, stdout, stderr
|
|
792
|
-
|
|
793
770
|
def block_call(self, cmd):
|
|
794
771
|
assert self.remote_root is not None
|
|
795
772
|
self.ssh_session.ensure_alive()
|
dpdispatcher/machine.py
CHANGED
|
@@ -161,6 +161,9 @@ class Machine(metaclass=ABCMeta):
|
|
|
161
161
|
machine_dict["remote_profile"] = self.context.remote_profile
|
|
162
162
|
else:
|
|
163
163
|
machine_dict["remote_profile"] = {}
|
|
164
|
+
# normalize the dict
|
|
165
|
+
base = self.arginfo()
|
|
166
|
+
machine_dict = base.normalize_value(machine_dict, trim_pattern="_*")
|
|
164
167
|
return machine_dict
|
|
165
168
|
|
|
166
169
|
def __eq__(self, other):
|
|
@@ -265,6 +268,15 @@ class Machine(metaclass=ABCMeta):
|
|
|
265
268
|
|
|
266
269
|
export_envs_part = ""
|
|
267
270
|
envs = job.resources.envs
|
|
271
|
+
envs = {
|
|
272
|
+
# export resources information to the environment variables
|
|
273
|
+
"DPDISPATCHER_NUMBER_NODE": job.resources.number_node,
|
|
274
|
+
"DPDISPATCHER_CPU_PER_NODE": job.resources.cpu_per_node,
|
|
275
|
+
"DPDISPATCHER_GPU_PER_NODE": job.resources.gpu_per_node,
|
|
276
|
+
"DPDISPATCHER_QUEUE_NAME": job.resources.queue_name,
|
|
277
|
+
"DPDISPATCHER_GROUP_SIZE": job.resources.group_size,
|
|
278
|
+
**envs,
|
|
279
|
+
}
|
|
268
280
|
for k, v in envs.items():
|
|
269
281
|
if isinstance(v, list):
|
|
270
282
|
for each_value in v:
|
|
@@ -105,7 +105,7 @@ class JH_UniScheduler(Machine):
|
|
|
105
105
|
elif ret != 0:
|
|
106
106
|
# just retry when any unknown error raised.
|
|
107
107
|
raise RetrySignal(
|
|
108
|
-
"Get error code %d in checking status
|
|
108
|
+
"Get error code %d in checking status with job: %s . message: %s"
|
|
109
109
|
% (ret, job.job_hash, err_str)
|
|
110
110
|
)
|
|
111
111
|
status_out = stdout.read().decode("utf-8").split("\n")
|
|
@@ -181,8 +181,8 @@ class DistributedShell(Machine):
|
|
|
181
181
|
if ret != 0:
|
|
182
182
|
err_str = stderr.decode("utf-8")
|
|
183
183
|
raise RuntimeError(
|
|
184
|
-
"Command
|
|
185
|
-
% (err_str, ret)
|
|
184
|
+
"Command %s fails to execute, error message:%s\nreturn code %d\n"
|
|
185
|
+
% (cmd, err_str, ret)
|
|
186
186
|
)
|
|
187
187
|
job_id = int(stdout.decode("utf-8").strip())
|
|
188
188
|
|
dpdispatcher/machines/lsf.py
CHANGED
|
@@ -129,7 +129,7 @@ class LSF(Machine):
|
|
|
129
129
|
elif ret != 0:
|
|
130
130
|
# just retry when any unknown error raised.
|
|
131
131
|
raise RetrySignal(
|
|
132
|
-
"Get error code %d in checking status
|
|
132
|
+
"Get error code %d in checking status with job: %s . message: %s"
|
|
133
133
|
% (ret, job.job_hash, err_str)
|
|
134
134
|
)
|
|
135
135
|
status_out = stdout.read().decode("utf-8").split("\n")
|
dpdispatcher/machines/pbs.py
CHANGED
|
@@ -76,7 +76,8 @@ class PBS(Machine):
|
|
|
76
76
|
job_id = job.job_id
|
|
77
77
|
if job_id == "":
|
|
78
78
|
return JobStatus.unsubmitted
|
|
79
|
-
|
|
79
|
+
command = "qstat -x " + job_id
|
|
80
|
+
ret, stdin, stdout, stderr = self.context.block_call(command)
|
|
80
81
|
err_str = stderr.read().decode("utf-8")
|
|
81
82
|
if ret != 0:
|
|
82
83
|
if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
|
|
@@ -86,8 +87,8 @@ class PBS(Machine):
|
|
|
86
87
|
return JobStatus.terminated
|
|
87
88
|
else:
|
|
88
89
|
raise RuntimeError(
|
|
89
|
-
"status command
|
|
90
|
-
% (err_str, ret)
|
|
90
|
+
"status command %s fails to execute. erro info: %s return code %d"
|
|
91
|
+
% (command, err_str, ret)
|
|
91
92
|
)
|
|
92
93
|
status_line = stdout.read().decode("utf-8").split("\n")[-2]
|
|
93
94
|
status_word = status_line.split()[-2]
|
|
@@ -126,7 +127,8 @@ class Torque(PBS):
|
|
|
126
127
|
job_id = job.job_id
|
|
127
128
|
if job_id == "":
|
|
128
129
|
return JobStatus.unsubmitted
|
|
129
|
-
|
|
130
|
+
command = "qstat -l " + job_id
|
|
131
|
+
ret, stdin, stdout, stderr = self.context.block_call(command)
|
|
130
132
|
err_str = stderr.read().decode("utf-8")
|
|
131
133
|
if ret != 0:
|
|
132
134
|
if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
|
|
@@ -136,8 +138,8 @@ class Torque(PBS):
|
|
|
136
138
|
return JobStatus.terminated
|
|
137
139
|
else:
|
|
138
140
|
raise RuntimeError(
|
|
139
|
-
"status command
|
|
140
|
-
% (err_str, ret)
|
|
141
|
+
"status command %s fails to execute. erro info: %s return code %d"
|
|
142
|
+
% (command, err_str, ret)
|
|
141
143
|
)
|
|
142
144
|
status_line = stdout.read().decode("utf-8").split("\n")[-2]
|
|
143
145
|
status_word = status_line.split()[-2]
|
|
@@ -263,11 +265,12 @@ class SGE(PBS):
|
|
|
263
265
|
status_line = None
|
|
264
266
|
if job_id == "":
|
|
265
267
|
return JobStatus.unsubmitted
|
|
266
|
-
|
|
268
|
+
command = "qstat"
|
|
269
|
+
ret, stdin, stdout, stderr = self.context.block_call(command)
|
|
267
270
|
err_str = stderr.read().decode("utf-8")
|
|
268
271
|
if ret != 0:
|
|
269
272
|
raise RuntimeError(
|
|
270
|
-
f"status command
|
|
273
|
+
f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
|
|
271
274
|
)
|
|
272
275
|
status_text_list = stdout.read().decode("utf-8").split("\n")
|
|
273
276
|
for txt in status_text_list:
|
dpdispatcher/machines/shell.py
CHANGED
|
@@ -38,14 +38,13 @@ class Shell(Machine):
|
|
|
38
38
|
script_run_str = self.gen_script_command(job)
|
|
39
39
|
script_run_file_name = f"{job.script_file_name}.run"
|
|
40
40
|
self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
)
|
|
41
|
+
cmd = f"cd {shlex.quote(self.context.remote_root)} && {{ nohup bash {script_file_name} 1>>{output_name} 2>>{output_name} & }} && echo $!"
|
|
42
|
+
ret, stdin, stdout, stderr = self.context.block_call(cmd)
|
|
44
43
|
if ret != 0:
|
|
45
44
|
err_str = stderr.read().decode("utf-8")
|
|
46
45
|
raise RuntimeError(
|
|
47
|
-
"status command
|
|
48
|
-
% (err_str, ret)
|
|
46
|
+
"status command %s fails to execute\nerror message:%s\nreturn code %d\n"
|
|
47
|
+
% (cmd, err_str, ret)
|
|
49
48
|
)
|
|
50
49
|
job_id = int(stdout.read().decode("utf-8").strip())
|
|
51
50
|
self.context.write_file(job_id_name, str(job_id))
|
|
@@ -73,14 +72,16 @@ class Shell(Machine):
|
|
|
73
72
|
return JobStatus.unsubmitted
|
|
74
73
|
|
|
75
74
|
# mark defunct process as terminated
|
|
76
|
-
|
|
75
|
+
cmd = (
|
|
76
|
+
r"""command -v ps >/dev/null 2>&1 || { echo >&2 "I require ps but it's not installed. Aborting."; exit 1; };"""
|
|
77
77
|
f"if ps -p {job_id} > /dev/null && ! (ps -o command -p {job_id} | grep defunct >/dev/null) ; then echo 1; fi"
|
|
78
78
|
)
|
|
79
|
+
ret, stdin, stdout, stderr = self.context.block_call(cmd)
|
|
79
80
|
if ret != 0:
|
|
80
81
|
err_str = stderr.read().decode("utf-8")
|
|
81
82
|
raise RuntimeError(
|
|
82
|
-
"status command
|
|
83
|
-
% (err_str, ret)
|
|
83
|
+
"status command %s fails to execute\nerror message:%s\nreturn code %d\n"
|
|
84
|
+
% (cmd, err_str, ret)
|
|
84
85
|
)
|
|
85
86
|
|
|
86
87
|
if_job_exists = bool(stdout.read().decode("utf-8").strip())
|
dpdispatcher/machines/slurm.py
CHANGED
|
@@ -83,13 +83,12 @@ class Slurm(Machine):
|
|
|
83
83
|
script_run_file_name = f"{job.script_file_name}.run"
|
|
84
84
|
self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
|
|
85
85
|
# self.context.write_file(fname=os.path.join(self.context.submission.work_base, script_file_name), write_str=script_str)
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
shlex.quote(script_file_name),
|
|
91
|
-
)
|
|
86
|
+
command = "cd {} && {} {}".format(
|
|
87
|
+
shlex.quote(self.context.remote_root),
|
|
88
|
+
"sbatch",
|
|
89
|
+
shlex.quote(script_file_name),
|
|
92
90
|
)
|
|
91
|
+
ret, stdin, stdout, stderr = self.context.block_call(command)
|
|
93
92
|
if ret != 0:
|
|
94
93
|
err_str = stderr.read().decode("utf-8")
|
|
95
94
|
if (
|
|
@@ -98,7 +97,7 @@ class Slurm(Machine):
|
|
|
98
97
|
):
|
|
99
98
|
# server network error, retry 3 times
|
|
100
99
|
raise RetrySignal(
|
|
101
|
-
"Get error code %d in submitting
|
|
100
|
+
"Get error code %d in submitting with job: %s . message: %s"
|
|
102
101
|
% (ret, job.job_hash, err_str)
|
|
103
102
|
)
|
|
104
103
|
elif (
|
|
@@ -110,8 +109,8 @@ class Slurm(Machine):
|
|
|
110
109
|
# job number exceeds, skip the submitting
|
|
111
110
|
return ""
|
|
112
111
|
raise RuntimeError(
|
|
113
|
-
"
|
|
114
|
-
% (err_str, ret)
|
|
112
|
+
"command %s fails to execute\nerror message:%s\nreturn code %d\n"
|
|
113
|
+
% (command, err_str, ret)
|
|
115
114
|
)
|
|
116
115
|
subret = stdout.readlines()
|
|
117
116
|
# --parsable
|
|
@@ -129,9 +128,8 @@ class Slurm(Machine):
|
|
|
129
128
|
job_id = job.job_id
|
|
130
129
|
if job_id == "":
|
|
131
130
|
return JobStatus.unsubmitted
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
)
|
|
131
|
+
command = 'squeue -o "%.18i %.2t" -j ' + job_id
|
|
132
|
+
ret, stdin, stdout, stderr = self.context.block_call(command)
|
|
135
133
|
if ret != 0:
|
|
136
134
|
err_str = stderr.read().decode("utf-8")
|
|
137
135
|
if "Invalid job id specified" in err_str:
|
|
@@ -147,13 +145,13 @@ class Slurm(Machine):
|
|
|
147
145
|
):
|
|
148
146
|
# retry 3 times
|
|
149
147
|
raise RetrySignal(
|
|
150
|
-
"Get error code %d in checking status
|
|
148
|
+
"Get error code %d in checking status with job: %s . message: %s"
|
|
151
149
|
% (ret, job.job_hash, err_str)
|
|
152
150
|
)
|
|
153
151
|
raise RuntimeError(
|
|
154
|
-
"status command
|
|
152
|
+
"status command %s fails to execute."
|
|
155
153
|
"job_id:%s \n error message:%s\n return code %d\n"
|
|
156
|
-
% (job_id, err_str, ret)
|
|
154
|
+
% (command, job_id, err_str, ret)
|
|
157
155
|
)
|
|
158
156
|
status_line = stdout.read().decode("utf-8").split("\n")[-2]
|
|
159
157
|
status_word = status_line.split()[-1]
|
|
@@ -319,9 +317,8 @@ class SlurmJobArray(Slurm):
|
|
|
319
317
|
job_id = job.job_id
|
|
320
318
|
if job_id == "":
|
|
321
319
|
return JobStatus.unsubmitted
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
)
|
|
320
|
+
command = 'squeue -h -o "%.18i %.2t" -j ' + job_id
|
|
321
|
+
ret, stdin, stdout, stderr = self.context.block_call(command)
|
|
325
322
|
if ret != 0:
|
|
326
323
|
err_str = stderr.read().decode("utf-8")
|
|
327
324
|
if "Invalid job id specified" in err_str:
|
|
@@ -336,13 +333,13 @@ class SlurmJobArray(Slurm):
|
|
|
336
333
|
):
|
|
337
334
|
# retry 3 times
|
|
338
335
|
raise RetrySignal(
|
|
339
|
-
"Get error code %d in checking status
|
|
336
|
+
"Get error code %d in checking status with job: %s . message: %s"
|
|
340
337
|
% (ret, job.job_hash, err_str)
|
|
341
338
|
)
|
|
342
339
|
raise RuntimeError(
|
|
343
|
-
"status command
|
|
340
|
+
"status command %s fails to execute."
|
|
344
341
|
"job_id:%s \n error message:%s\n return code %d\n"
|
|
345
|
-
% (job_id, err_str, ret)
|
|
342
|
+
% (command, job_id, err_str, ret)
|
|
346
343
|
)
|
|
347
344
|
status_lines = stdout.read().decode("utf-8").split("\n")[:-1]
|
|
348
345
|
status = []
|
dpdispatcher/submission.py
CHANGED
|
@@ -1122,9 +1122,9 @@ class Resources:
|
|
|
1122
1122
|
|
|
1123
1123
|
@staticmethod
|
|
1124
1124
|
def arginfo(detail_kwargs=True):
|
|
1125
|
-
doc_number_node = "The number of
|
|
1126
|
-
doc_cpu_per_node = "
|
|
1127
|
-
doc_gpu_per_node = "
|
|
1125
|
+
doc_number_node = "The number of nodes required for each `job`."
|
|
1126
|
+
doc_cpu_per_node = "CPU numbers of each node assigned to each job."
|
|
1127
|
+
doc_gpu_per_node = "GPU numbers of each node assigned to each job."
|
|
1128
1128
|
doc_queue_name = "The queue name of batch job scheduler system."
|
|
1129
1129
|
doc_group_size = "The number of `tasks` in a `job`. 0 means infinity."
|
|
1130
1130
|
doc_custom_flags = "The extra lines pass to job submitting script header"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: dpdispatcher
|
|
3
|
-
Version: 0.6.
|
|
3
|
+
Version: 0.6.7
|
|
4
4
|
Summary: Generate HPC scheduler systems jobs input scripts, submit these scripts to HPC systems, and poke until they finish
|
|
5
5
|
Author: DeepModeling
|
|
6
6
|
License: GNU LESSER GENERAL PUBLIC LICENSE
|
|
@@ -1,36 +1,36 @@
|
|
|
1
1
|
dpdispatcher/__init__.py,sha256=CLZP_N5CTp14ujWCykEHuJjoIfKR6CwrclXhjWUgNoE,517
|
|
2
2
|
dpdispatcher/__main__.py,sha256=BFhG-mSBzVZUEezQJqXWZnt2WsnhAHT_zpT8Y6gpOz0,116
|
|
3
|
-
dpdispatcher/_version.py,sha256=
|
|
3
|
+
dpdispatcher/_version.py,sha256=iLXz9haw4jSV4Xm2-5_V8999GBAYoJkXg9-YOwMJpLY,411
|
|
4
4
|
dpdispatcher/arginfo.py,sha256=pNaxYIE6ahBidpR7OCKZdw8iGt003uTXGSlVzwiuvRg,188
|
|
5
|
-
dpdispatcher/base_context.py,sha256=
|
|
5
|
+
dpdispatcher/base_context.py,sha256=FDie50yHuLxTwV-k3B_sgAYqR5rLLLVOwk24wSiu4SQ,5254
|
|
6
6
|
dpdispatcher/dlog.py,sha256=QJKAwB6gV3Zb6zQUL9dZ_uIoTIEy9Z7ecmVQ-8WNmD8,1081
|
|
7
7
|
dpdispatcher/dpdisp.py,sha256=jhuTmwPY7KBF4WukaQomEwZcfYoISaMbKwuxdDGSluc,4206
|
|
8
|
-
dpdispatcher/machine.py,sha256=
|
|
8
|
+
dpdispatcher/machine.py,sha256=k53ycs_v7xrl4D93URc5ht0shoO9NPrVl0rYr4v5OiU,16696
|
|
9
9
|
dpdispatcher/run.py,sha256=tFHbJAioXXpgHTE5bhRRAuc8w7cX1ET9SBbiAg3Rw-I,5382
|
|
10
|
-
dpdispatcher/submission.py,sha256=
|
|
10
|
+
dpdispatcher/submission.py,sha256=NaljgA88NLv0rvxoSMZvUMq0sQEggkgKlcT8gXUnqFs,48367
|
|
11
11
|
dpdispatcher/contexts/__init__.py,sha256=jlvcIppmUnS39yBlkZEDvIQFV-j_BR75ZTbZALF_RB0,336
|
|
12
|
-
dpdispatcher/contexts/dp_cloud_server_context.py,sha256=
|
|
13
|
-
dpdispatcher/contexts/hdfs_context.py,sha256=
|
|
14
|
-
dpdispatcher/contexts/lazy_local_context.py,sha256=
|
|
15
|
-
dpdispatcher/contexts/local_context.py,sha256=
|
|
16
|
-
dpdispatcher/contexts/openapi_context.py,sha256=
|
|
17
|
-
dpdispatcher/contexts/ssh_context.py,sha256=
|
|
12
|
+
dpdispatcher/contexts/dp_cloud_server_context.py,sha256=PGRMef3q2hfK-o5dNIWWvzPca2NK1HrWEgungM4L9Go,12420
|
|
13
|
+
dpdispatcher/contexts/hdfs_context.py,sha256=mYQzXMZ4A9EjjWBAH3Ba6HOErUhMMwCsKxOjpd5R57Y,9105
|
|
14
|
+
dpdispatcher/contexts/lazy_local_context.py,sha256=FAClbLD2F4LizUqFzMOg3t0Z6NLeTDLJy7NkRcDELFs,5070
|
|
15
|
+
dpdispatcher/contexts/local_context.py,sha256=VbaSXGAc_EDMT0K5WV_flBF0bX87ntrwO_hq_Bkcb04,14590
|
|
16
|
+
dpdispatcher/contexts/openapi_context.py,sha256=M7L9axpjOrzvdTpLMDuEzZqe4ZuKIxjS0bzZUv8W2IQ,9674
|
|
17
|
+
dpdispatcher/contexts/ssh_context.py,sha256=s0K-gSKPSykq2PyOzAt4yNEczAdsVGvQ1QmPJpZ4_Vo,37648
|
|
18
18
|
dpdispatcher/dpcloudserver/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
19
19
|
dpdispatcher/dpcloudserver/client.py,sha256=k1niKjG6zFnMtHn_UuCjYoOcMju3o3PV-GdyVLr5-KM,165
|
|
20
20
|
dpdispatcher/entrypoints/__init__.py,sha256=exKSFT3j2oCerGwtI8WbHQK-D0K-CyifocRji1xntT4,20
|
|
21
21
|
dpdispatcher/entrypoints/gui.py,sha256=29lMXqbmSRbLj4rfBv7Jnw89NLU9syTB88IUP6IRJsU,830
|
|
22
22
|
dpdispatcher/entrypoints/run.py,sha256=tRkHfeAktV6gF31yb2MVOSTlpNGZFw3N0jHBmM1YfIg,175
|
|
23
23
|
dpdispatcher/entrypoints/submission.py,sha256=ikVwIZAQL0SsYO5xaMIdKXgO6qtc05w1vqmvtG7Nk5M,3401
|
|
24
|
-
dpdispatcher/machines/JH_UniScheduler.py,sha256=
|
|
24
|
+
dpdispatcher/machines/JH_UniScheduler.py,sha256=B-LGldr9H8qPQYdCYoEaXFCEFBPmjFEi0fwEWp0wdR0,5783
|
|
25
25
|
dpdispatcher/machines/__init__.py,sha256=tOQuPUlW1Ab4qcC0oSAIyDjZA_WyE67h_EIxPCWGhys,336
|
|
26
|
-
dpdispatcher/machines/distributed_shell.py,sha256=
|
|
26
|
+
dpdispatcher/machines/distributed_shell.py,sha256=TVnXFNqQmBgWk3s34rKSZo0S5N5KPZVmAG3Xbu_kuBo,7535
|
|
27
27
|
dpdispatcher/machines/dp_cloud_server.py,sha256=SR69gsFb2BvOQCW1QnWfP3cQvu_qHLJNsycp5wzosJU,11706
|
|
28
28
|
dpdispatcher/machines/fugaku.py,sha256=oY2hD2ldL2dztwtJ9WNisdsfPnaX-5yTRXewIT9r60I,4314
|
|
29
|
-
dpdispatcher/machines/lsf.py,sha256=
|
|
29
|
+
dpdispatcher/machines/lsf.py,sha256=fOZoOTpFn1nKx79lYkvZQOhNwz39YAIEytxICd56AFU,7920
|
|
30
30
|
dpdispatcher/machines/openapi.py,sha256=Gzzbo8YOAybXGTrgMutexErcaEi3ts7uTUNvOhThFS8,8858
|
|
31
|
-
dpdispatcher/machines/pbs.py,sha256=
|
|
32
|
-
dpdispatcher/machines/shell.py,sha256=
|
|
33
|
-
dpdispatcher/machines/slurm.py,sha256=
|
|
31
|
+
dpdispatcher/machines/pbs.py,sha256=XeeFQMZoH9DscsrJ_Ykv6fNUtc9TBp4epuFqbUyr3dk,12531
|
|
32
|
+
dpdispatcher/machines/shell.py,sha256=ONaUJpszsCwCcbyVLvC6VoJ-ig2QTU9JQdA-nlgXnu8,4845
|
|
33
|
+
dpdispatcher/machines/slurm.py,sha256=HLYk9E1dChnTeHjOOWNG854AWdlUJVYYmgwaiVswPQ8,15560
|
|
34
34
|
dpdispatcher/utils/__init__.py,sha256=fwvwkMf7DFNQkNBiIce8Y8gRA6FhICwKjkKiXu_BEJg,13
|
|
35
35
|
dpdispatcher/utils/hdfs_cli.py,sha256=n3EIfFIralsISlaEewawD35f0P8mabo-u8D8UW3k_7Y,5308
|
|
36
36
|
dpdispatcher/utils/job_status.py,sha256=Eszs4TPLfszCuf6zLaFonf25feXDUguF28spYOjJpQE,233
|
|
@@ -41,9 +41,9 @@ dpdispatcher/utils/dpcloudserver/client.py,sha256=CLfXswvzI4inDrW2bYkfMQ6gQJFcZO
|
|
|
41
41
|
dpdispatcher/utils/dpcloudserver/config.py,sha256=NteQzf1OeEkz2UbkXHHQ0B72cUu23zLVzpM9Yh4v1Cc,559
|
|
42
42
|
dpdispatcher/utils/dpcloudserver/retcode.py,sha256=1qAF8gFZx55u2sO8KbtYSIIrjcO-IGufEUlwbkSfC1g,721
|
|
43
43
|
dpdispatcher/utils/dpcloudserver/zip_file.py,sha256=f9WrlktwHW0YipaWg5Y0kxjMZlhD1cJYa6EUpvu4Cro,2611
|
|
44
|
-
dpdispatcher-0.6.
|
|
45
|
-
dpdispatcher-0.6.
|
|
46
|
-
dpdispatcher-0.6.
|
|
47
|
-
dpdispatcher-0.6.
|
|
48
|
-
dpdispatcher-0.6.
|
|
49
|
-
dpdispatcher-0.6.
|
|
44
|
+
dpdispatcher-0.6.7.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
|
|
45
|
+
dpdispatcher-0.6.7.dist-info/METADATA,sha256=lNsC7Ruo7GmUOQl1TadlThoLvrOETbZ1s0-sXmrRYL4,12828
|
|
46
|
+
dpdispatcher-0.6.7.dist-info/WHEEL,sha256=UvcQYKBHoFqaQd6LKyqHw9fxEolWLQnlzP0h_LgJAfI,91
|
|
47
|
+
dpdispatcher-0.6.7.dist-info/entry_points.txt,sha256=NRHUV0IU_u7_XtcmmEDnVzAcUmurhiEAGwENckrajo4,233
|
|
48
|
+
dpdispatcher-0.6.7.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
|
|
49
|
+
dpdispatcher-0.6.7.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|