dpdispatcher 0.6.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dpdispatcher/_version.py +22 -4
- dpdispatcher/base_context.py +60 -1
- dpdispatcher/contexts/__init__.py +1 -0
- dpdispatcher/contexts/dp_cloud_server_context.py +8 -1
- dpdispatcher/contexts/hdfs_context.py +16 -11
- dpdispatcher/contexts/lazy_local_context.py +2 -19
- dpdispatcher/contexts/local_context.py +77 -43
- dpdispatcher/contexts/openapi_context.py +78 -14
- dpdispatcher/contexts/ssh_context.py +117 -98
- dpdispatcher/dlog.py +9 -5
- dpdispatcher/dpcloudserver/__init__.py +0 -0
- dpdispatcher/dpcloudserver/client.py +7 -0
- dpdispatcher/dpdisp.py +21 -0
- dpdispatcher/entrypoints/run.py +9 -0
- dpdispatcher/entrypoints/submission.py +21 -1
- dpdispatcher/machine.py +15 -4
- dpdispatcher/machines/JH_UniScheduler.py +171 -0
- dpdispatcher/machines/__init__.py +1 -0
- dpdispatcher/machines/distributed_shell.py +6 -10
- dpdispatcher/machines/fugaku.py +9 -12
- dpdispatcher/machines/lsf.py +3 -9
- dpdispatcher/machines/openapi.py +48 -15
- dpdispatcher/machines/pbs.py +183 -20
- dpdispatcher/machines/shell.py +7 -16
- dpdispatcher/machines/slurm.py +30 -42
- dpdispatcher/run.py +172 -0
- dpdispatcher/submission.py +5 -14
- dpdispatcher/utils/dpcloudserver/client.py +10 -6
- dpdispatcher/utils/hdfs_cli.py +10 -19
- dpdispatcher/utils/utils.py +21 -7
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/METADATA +35 -29
- dpdispatcher-1.0.0.dist-info/RECORD +49 -0
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/WHEEL +1 -1
- dpdispatcher-0.6.1.dist-info/RECORD +0 -44
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/entry_points.txt +0 -0
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info/licenses}/LICENSE +0 -0
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/top_level.txt +0 -0
dpdispatcher/machines/JH_UniScheduler.py
ADDED
@@ -0,0 +1,171 @@
+import shlex
+from typing import List
+
+from dargs import Argument
+
+from dpdispatcher.dlog import dlog
+from dpdispatcher.machine import Machine
+from dpdispatcher.utils.job_status import JobStatus
+from dpdispatcher.utils.utils import (
+    RetrySignal,
+    customized_script_header_template,
+    retry,
+)
+
+JH_UniScheduler_script_header_template = """\
+#!/bin/bash -l
+#JSUB -e %J.err
+#JSUB -o %J.out
+{JH_UniScheduler_nodes_line}
+{JH_UniScheduler_ptile_line}
+{JH_UniScheduler_partition_line}
+{JH_UniScheduler_number_gpu_line}"""
+
+
+class JH_UniScheduler(Machine):
+    """JH_UniScheduler batch."""
+
+    def gen_script(self, job):
+        JH_UniScheduler_script = super().gen_script(job)
+        return JH_UniScheduler_script
+
+    def gen_script_header(self, job):
+        resources = job.resources
+        script_header_dict = {
+            "JH_UniScheduler_nodes_line": f"#JSUB -n {resources.number_node * resources.cpu_per_node}",
+            "JH_UniScheduler_ptile_line": f"#JSUB -R 'span[ptile={resources.cpu_per_node}]'",
+            "JH_UniScheduler_partition_line": f"#JSUB -q {resources.queue_name}",
+        }
+        custom_gpu_line = resources.kwargs.get("custom_gpu_line", None)
+        if not custom_gpu_line:
+            script_header_dict["JH_UniScheduler_number_gpu_line"] = (
+                f"#JSUB -gpgpu {resources.gpu_per_node}"
+            )
+        else:
+            script_header_dict["JH_UniScheduler_number_gpu_line"] = custom_gpu_line
+        if (
+            resources["strategy"].get("customized_script_header_template_file")
+            is not None
+        ):
+            JH_UniScheduler_script_header = customized_script_header_template(
+                resources["strategy"]["customized_script_header_template_file"],
+                resources,
+            )
+        else:
+            JH_UniScheduler_script_header = (
+                JH_UniScheduler_script_header_template.format(**script_header_dict)
+            )
+
+        return JH_UniScheduler_script_header
+
+    @retry()
+    def do_submit(self, job):
+        script_file_name = job.script_file_name
+        script_str = self.gen_script(job)
+        job_id_name = job.job_hash + "_job_id"
+        self.context.write_file(fname=script_file_name, write_str=script_str)
+        script_run_str = self.gen_script_command(job)
+        script_run_file_name = f"{job.script_file_name}.run"
+        self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
+
+        try:
+            stdin, stdout, stderr = self.context.block_checkcall(
+                "cd {} && {} {}".format(
+                    shlex.quote(self.context.remote_root),
+                    "jsub < ",
+                    shlex.quote(script_file_name),
+                )
+            )
+        except RuntimeError as err:
+            raise RetrySignal(err) from err
+
+        subret = stdout.readlines()
+        job_id = subret[0].split()[1][1:-1]
+        self.context.write_file(job_id_name, job_id)
+        return job_id
+
+    @retry()
+    def check_status(self, job):
+        try:
+            job_id = job.job_id
+        except AttributeError:
+            return JobStatus.terminated
+        if job_id == "":
+            return JobStatus.unsubmitted
+        ret, stdin, stdout, stderr = self.context.block_call("jjobs " + job_id)
+        err_str = stderr.read().decode("utf-8")
+        if (f"Job <{job_id}> is not found") in err_str:
+            if self.check_finish_tag(job):
+                return JobStatus.finished
+            else:
+                return JobStatus.terminated
+        elif ret != 0:
+            # just retry when any unknown error raised.
+            raise RetrySignal(
+                f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
+            )
+        status_out = stdout.read().decode("utf-8").split("\n")
+        if len(status_out) < 2:
+            return JobStatus.unknown
+        else:
+            status_line = status_out[1]
+            status_word = status_line.split()[2]
+
+        if status_word in ["PEND"]:
+            return JobStatus.waiting
+        elif status_word in ["RUN", "PSUSP", "SSUSP", "USUSP"]:
+            return JobStatus.running
+        elif status_word in ["DONE", "EXIT"]:
+            if self.check_finish_tag(job):
+                dlog.info(f"job: {job.job_hash} {job.job_id} finished")
+                return JobStatus.finished
+            else:
+                return JobStatus.terminated
+        else:
+            return JobStatus.unknown
+
+    def check_finish_tag(self, job):
+        job_tag_finished = job.job_hash + "_job_tag_finished"
+        return self.context.check_file_exists(job_tag_finished)
+
+    @classmethod
+    def resources_subfields(cls) -> List[Argument]:
+        """Generate the resources subfields.
+
+        Returns
+        -------
+        list[Argument]
+            resources subfields
+        """
+        doc_custom_gpu_line = "Custom GPU configuration, starting with #JSUB"
+
+        return [
+            Argument(
+                "kwargs",
+                dict,
+                [
+                    Argument(
+                        "custom_gpu_line",
+                        str,
+                        optional=True,
+                        default=None,
+                        doc=doc_custom_gpu_line,
+                    ),
+                ],
+                optional=False,
+                doc="Extra arguments.",
+            )
+        ]
+
+    def kill(self, job):
+        """Kill the job.
+
+        Parameters
+        ----------
+        job : Job
+            job
+        """
+        job_id = job.job_id
+        ret, stdin, stdout, stderr = self.context.block_call(
+            "jctrl kill " + str(job_id)
+        )
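For orientation, here is a minimal sketch of how the new batch type might be driven through dpdispatcher's documented Python API. The host, paths, queue name, and custom_gpu_line value below are illustrative placeholders, not values taken from this release.

# Hypothetical dispatch script for the new JH_UniScheduler batch type.
# All host names, paths, and queue names below are placeholders.
from dpdispatcher import Machine, Resources, Submission, Task

machine = Machine.load_from_dict(
    {
        "batch_type": "JH_UniScheduler",  # registered by this new module
        "context_type": "SSHContext",
        "local_root": "./work",
        "remote_root": "/data/jobs",
        "remote_profile": {"hostname": "login-node", "username": "user"},
    }
)
resources = Resources.load_from_dict(
    {
        "number_node": 1,
        "cpu_per_node": 8,
        "gpu_per_node": 1,
        "queue_name": "gpu",
        "group_size": 1,
        # Optional: overrides the default "#JSUB -gpgpu ..." header line
        "kwargs": {"custom_gpu_line": "#JSUB -gpgpu 2"},
    }
)
task = Task(
    command="echo hello",
    task_work_path="./",
    forward_files=[],
    backward_files=[],
)
submission = Submission(
    work_base="./",
    machine=machine,
    resources=resources,
    task_list=[task],
)
submission.run_submission()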
dpdispatcher/machines/distributed_shell.py
CHANGED
@@ -64,7 +64,7 @@ class DistributedShell(Machine):
 
         source_list = job.resources.source_list
         for ii in source_list:
-            line = "{ source %s; } \n" % ii
+            line = f"{{ source {ii}; }} \n"
            source_files_part += line
 
        export_envs_part = ""
@@ -96,7 +96,7 @@ class DistributedShell(Machine):
     def gen_script_end(self, job):
         all_task_dirs = ""
         for task in job.job_task_list:
-            all_task_dirs += "%s " % task.task_work_path
+            all_task_dirs += f"{task.task_work_path} "
         job_tag_finished = job.job_hash + "_job_tag_finished"
         flag_if_job_task_fail = job.job_hash + "_flag_if_job_task_fail"
 
@@ -173,18 +173,15 @@ class DistributedShell(Machine):
             )
         )
 
-        cmd = "{ nohup %s 1>%s 2>%s & } && echo $!" % (
-            submit_command,
-            output_name,
-            output_name,
+        cmd = (
+            f"{{ nohup {submit_command} 1>{output_name} 2>{output_name} & }} && echo $!"
         )
         ret, stdout, stderr = run_cmd_with_all_output(cmd)
 
        if ret != 0:
            err_str = stderr.decode("utf-8")
            raise RuntimeError(
-                "Command fails to execute, error message:%s\nreturn code %d\n"
-                % (err_str, ret)
+                f"Command {cmd} fails to execute, error message:{err_str}\nreturn code {ret}\n"
            )
        job_id = int(stdout.decode("utf-8").strip())
 
@@ -202,8 +199,7 @@ class DistributedShell(Machine):
         if ret != 0:
             err_str = stderr.decode("utf-8")
             raise RuntimeError(
-                "Command fails to execute, error message:%s\nreturn code %d\n"
-                % (err_str, ret)
+                f"Command fails to execute, error message:{err_str}\nreturn code {ret}\n"
             )
 
         if_job_exists = bool(stdout.decode("utf-8").strip())
dpdispatcher/machines/fugaku.py
CHANGED
@@ -20,15 +20,15 @@ class Fugaku(Machine):
     def gen_script_header(self, job):
         resources = job.resources
         fugaku_script_header_dict = {}
-        fugaku_script_header_dict[
-            "fugaku_node_number_line"
-        ] = f'#PJM -L "node={resources.number_node}" '
-        fugaku_script_header_dict[
-            "fugaku_ntasks_per_node_line"
-        ] = f'#PJM --mpi "max-proc-per-node={resources.cpu_per_node}"'
-        fugaku_script_header_dict[
-            "queue_name_line"
-        ] = f'#PJM -L "rscgrp={resources.queue_name}"'
+        fugaku_script_header_dict["fugaku_node_number_line"] = (
+            f'#PJM -L "node={resources.number_node}" '
+        )
+        fugaku_script_header_dict["fugaku_ntasks_per_node_line"] = (
+            f'#PJM --mpi "max-proc-per-node={resources.cpu_per_node}"'
+        )
+        fugaku_script_header_dict["queue_name_line"] = (
+            f'#PJM -L "rscgrp={resources.queue_name}"'
+        )
         if (
             resources["strategy"].get("customized_script_header_template_file")
             is not None
@@ -67,9 +67,6 @@ class Fugaku(Machine):
         self.context.write_file(job_id_name, job_id)
         return job_id
 
-    def default_resources(self, resources):
-        pass
-
     def check_status(self, job):
         job_id = job.job_id
         if job_id == "":
dpdispatcher/machines/lsf.py
CHANGED
@@ -32,9 +32,7 @@ class LSF(Machine):
     def gen_script_header(self, job):
         resources = job.resources
         script_header_dict = {
-            "lsf_nodes_line": "#BSUB -n {number_cores}".format(
-                number_cores=resources.number_node * resources.cpu_per_node
-            ),
+            "lsf_nodes_line": f"#BSUB -n {resources.number_node * resources.cpu_per_node}",
             "lsf_ptile_line": f"#BSUB -R 'span[ptile={resources.cpu_per_node}]'",
             "lsf_partition_line": f"#BSUB -q {resources.queue_name}",
         }
@@ -104,9 +102,6 @@ class LSF(Machine):
         return job_id
 
     # TODO: derive abstract methods
-    def default_resources(self, resources):
-        pass
-
     def sub_script_cmd(self, res):
         pass
 
@@ -123,7 +118,7 @@ class LSF(Machine):
             return JobStatus.unsubmitted
         ret, stdin, stdout, stderr = self.context.block_call("bjobs " + job_id)
         err_str = stderr.read().decode("utf-8")
-        if ("Job <%s> is not found" % job_id) in err_str:
+        if (f"Job <{job_id}> is not found") in err_str:
             if self.check_finish_tag(job):
                 return JobStatus.finished
             else:
@@ -131,8 +126,7 @@ class LSF(Machine):
         elif ret != 0:
             # just retry when any unknown error raised.
             raise RetrySignal(
-                "Get error code %d in checking status with job: %s . message: %s"
-                % (ret, job.job_hash, err_str)
+                f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
             )
         status_out = stdout.read().decode("utf-8").split("\n")
         if len(status_out) < 2:
dpdispatcher/machines/openapi.py
CHANGED
@@ -1,14 +1,13 @@
 import os
 import shutil
 import time
+from zipfile import ZipFile
 
 from dpdispatcher.utils.utils import customized_script_header_template
 
 try:
-    from bohriumsdk.client import Client
-    from bohriumsdk.job import Job
-    from bohriumsdk.storage import Storage
-    from bohriumsdk.util import Util
+    from bohrium import Bohrium
+    from bohrium.resources import Job, Tiefblue
 except ModuleNotFoundError:
     found_bohriumsdk = False
 else:
@@ -23,6 +22,12 @@ shell_script_header_template = """
 """
 
 
+def unzip_file(zip_file, out_dir="./"):
+    obj = ZipFile(zip_file, "r")
+    for item in obj.namelist():
+        obj.extract(item, out_dir)
+
+
 class OpenAPI(Machine):
     def __init__(self, context):
         if not found_bohriumsdk:
@@ -35,9 +40,35 @@ class OpenAPI(Machine):
         self.grouped = self.remote_profile.get("grouped", True)
         self.retry_count = self.remote_profile.get("retry_count", 3)
         self.ignore_exit_code = context.remote_profile.get("ignore_exit_code", True)
-        self.client = Client()
-        self.storage = Storage(client=self.client)
-        self.job = Job(client=self.client)
+
+        access_key = (
+            self.remote_profile.get("access_key", None)
+            or os.getenv("BOHRIUM_ACCESS_KEY", None)
+            or os.getenv("ACCESS_KEY", None)
+        )
+        project_id = (
+            self.remote_profile.get("project_id", None)
+            or os.getenv("BOHRIUM_PROJECT_ID", None)
+            or os.getenv("PROJECT_ID", None)
+        )
+        app_key = (
+            self.remote_profile.get("app_key", None)
+            or os.getenv("BOHRIUM_APP_KEY", None)
+            or os.getenv("APP_KEY", None)
+        )
+        if access_key is None:
+            raise ValueError(
+                "remote_profile must contain 'access_key' or set environment variable 'BOHRIUM_ACCESS_KEY'"
+            )
+        if project_id is None:
+            raise ValueError(
+                "remote_profile must contain 'project_id' or set environment variable 'BOHRIUM_PROJECT_ID'"
+            )
+        self.client = Bohrium(  # type: ignore[reportPossiblyUnboundVariable]
+            access_key=access_key, project_id=project_id, app_key=app_key
+        )
+        self.storage = Tiefblue()  # type: ignore[reportPossiblyUnboundVariable]
+        self.job = Job(client=self.client)  # type: ignore[reportPossiblyUnboundVariable]
         self.group_id = None
 
     def gen_script(self, job):
@@ -98,11 +129,13 @@ class OpenAPI(Machine):
             ),
             "out_files": self._gen_backward_files_list(job),
             "platform": self.remote_profile.get("platform", "ali"),
-            "image_name": self.remote_profile.get("image_name", ""),
+            "image_name": self.remote_profile.get("image_address", ""),
         }
-        if job.job_id:
-            openapi_params["job_id"] = job.job_id
-
+        if "real_user_id" in self.remote_profile:
+            openapi_params["real_user_id"] = self.remote_profile["real_user_id"]
+        if "session_id" in self.remote_profile:
+            openapi_params["session_id"] = self.remote_profile["session_id"]
+        openapi_params["job_id"] = job.job_id
         data = self.job.insert(**openapi_params)
 
         job.job_id = data.get("jobId", 0)  # type: ignore
@@ -152,8 +185,8 @@ class OpenAPI(Machine):
             self.ignore_exit_code,
         )
         if job_state == JobStatus.finished:
-            job_log = self.job.log(job_id)
             if self.remote_profile.get("output_log"):
+                job_log = self.job.log(job_id)
                 print(job_log, end="")
             self._download_job(job)
         elif self.remote_profile.get("output_log") and job_state == JobStatus.running:
@@ -163,14 +196,14 @@ class OpenAPI(Machine):
 
     def _download_job(self, job):
         data = self.job.detail(job.job_id)
-        job_url = data["resultUrl"]
+        job_url = data["resultUrl"]  # type: ignore
         if not job_url:
             return
         job_hash = job.job_hash
         result_filename = job_hash + "_back.zip"
         target_result_zip = os.path.join(self.context.local_root, result_filename)
         self.storage.download_from_url(job_url, target_result_zip)
-        Util.unzip_file(target_result_zip, out_dir=self.context.local_root)
+        unzip_file(target_result_zip, out_dir=self.context.local_root)
         try:
             os.makedirs(os.path.join(self.context.local_root, "backup"), exist_ok=True)
             shutil.move(
@@ -213,7 +246,7 @@ class OpenAPI(Machine):
         if status not in map_dict:
             dlog.error(f"unknown job status {status}")
             return JobStatus.unknown
-        if status == -1 and
+        if status == -1 and ignore_exit_code:
             return JobStatus.finished
         return map_dict[status]
 
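In practice, the rewritten __init__ means Bohrium credentials no longer have to be hard-coded in remote_profile; they can be resolved from the environment. A hypothetical configuration sketch follows (all values are placeholders, not taken from this release):

# Hypothetical OpenAPI machine configuration after this change; values are placeholders.
import os

os.environ["BOHRIUM_ACCESS_KEY"] = "<access-key>"
os.environ["BOHRIUM_PROJECT_ID"] = "<project-id>"

machine_dict = {
    "batch_type": "OpenAPI",
    "context_type": "OpenAPIContext",
    "local_root": "./work",
    "remote_profile": {
        # access_key/project_id omitted: resolved from the BOHRIUM_* variables above
        "image_address": "<registry/image:tag>",  # feeds the job's image_name parameter
        "output_log": True,  # the job log is now fetched only when this flag is set
    },
}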
dpdispatcher/machines/pbs.py
CHANGED
@@ -1,4 +1,7 @@
 import shlex
+from typing import List
+
+from dargs import Argument
 
 from dpdispatcher.dlog import dlog
 from dpdispatcher.machine import Machine
@@ -21,13 +24,13 @@ class PBS(Machine):
     def gen_script_header(self, job):
         resources = job.resources
         pbs_script_header_dict = {}
-        pbs_script_header_dict[
-            "select_node_line"
-        ] = f"#PBS -l select={resources.number_node}:ncpus={resources.cpu_per_node}"
+        pbs_script_header_dict["select_node_line"] = (
+            f"#PBS -l select={resources.number_node}:ncpus={resources.cpu_per_node}"
+        )
         if resources.gpu_per_node != 0:
-            pbs_script_header_dict[
-                "select_node_line"
-            ] += f":ngpus={resources.gpu_per_node}"
+            pbs_script_header_dict["select_node_line"] += (
+                f":ngpus={resources.gpu_per_node}"
+            )
         pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}"
         if (
             resources["strategy"].get("customized_script_header_template_file")
@@ -66,14 +69,12 @@ class PBS(Machine):
         self.context.write_file(job_id_name, job_id)
         return job_id
 
-    def default_resources(self, resources):
-        pass
-
     def check_status(self, job):
         job_id = job.job_id
         if job_id == "":
             return JobStatus.unsubmitted
-        ret, stdin, stdout, stderr = self.context.block_call("qstat -x " + job_id)
+        command = "qstat -x " + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         err_str = stderr.read().decode("utf-8")
         if ret != 0:
             if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
@@ -83,8 +84,7 @@ class PBS(Machine):
                 return JobStatus.terminated
             else:
                 raise RuntimeError(
-                    "status command qstat fails to execute. erro info: %s return code %d"
-                    % (err_str, ret)
+                    f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
                 )
         status_line = stdout.read().decode("utf-8").split("\n")[-2]
         status_word = status_line.split()[-2]
@@ -123,7 +123,8 @@ class Torque(PBS):
         job_id = job.job_id
         if job_id == "":
             return JobStatus.unsubmitted
-        ret, stdin, stdout, stderr = self.context.block_call("qstat -l " + job_id)
+        command = "qstat -l " + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         err_str = stderr.read().decode("utf-8")
         if ret != 0:
             if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
@@ -133,8 +134,7 @@ class Torque(PBS):
                 return JobStatus.terminated
             else:
                 raise RuntimeError(
-                    "status command qstat fails to execute. erro info: %s return code %d"
-                    % (err_str, ret)
+                    f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
                 )
         status_line = stdout.read().decode("utf-8").split("\n")[-2]
         status_word = status_line.split()[-2]
@@ -156,12 +156,12 @@ class Torque(PBS):
         # ref: https://support.adaptivecomputing.com/wp-content/uploads/2021/02/torque/torque.htm#topics/torque/2-jobs/requestingRes.htm
         resources = job.resources
         pbs_script_header_dict = {}
-        pbs_script_header_dict[
-            "select_node_line"
-        ] = f"#PBS -l nodes={resources.number_node}:ppn={resources.cpu_per_node}"
+        pbs_script_header_dict["select_node_line"] = (
+            f"#PBS -l nodes={resources.number_node}:ppn={resources.cpu_per_node}"
+        )
         if resources.gpu_per_node != 0:
-            pbs_script_header_dict["select_node_line"] += ":gpus={gpu_per_node}".format(
-                gpu_per_node=resources.gpu_per_node
+            pbs_script_header_dict["select_node_line"] += (
+                f":gpus={resources.gpu_per_node}"
             )
         pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}"
         if (
@@ -177,3 +177,166 @@ class Torque(PBS):
             **pbs_script_header_dict
         )
         return pbs_script_header
+
+
+sge_script_header_template = """
+#!/bin/bash
+#$ -S /bin/bash
+#$ -cwd
+{select_node_line}
+"""
+
+
+class SGE(PBS):
+    def __init__(
+        self,
+        batch_type=None,
+        context_type=None,
+        local_root=None,
+        remote_root=None,
+        remote_profile={},
+        *,
+        context=None,
+    ):
+        super(PBS, self).__init__(
+            batch_type,
+            context_type,
+            local_root,
+            remote_root,
+            remote_profile,
+            context=context,
+        )
+
+    def gen_script_header(self, job):
+        ### Ref:https://softpanorama.org/HPC/PBS_and_derivatives/Reference/pbs_command_vs_sge_commands.shtml
+        # resources.number_node is not used in SGE
+        resources = job.resources
+        job_name = resources.kwargs.get("job_name", "wDPjob")
+        pe_name = resources.kwargs.get("pe_name", "mpi")
+        sge_script_header_dict = {}
+        sge_script_header_dict["select_node_line"] = f"#$ -N {job_name}\n"
+        sge_script_header_dict["select_node_line"] += (
+            f"#$ -pe {pe_name} {resources.cpu_per_node}\n"
+        )
+
+        if resources.queue_name != "":
+            sge_script_header_dict["select_node_line"] += (
+                f"#$ -q {resources.queue_name}"
+            )
+        if (
+            resources["strategy"].get("customized_script_header_template_file")
+            is not None
+        ):
+            file_name = resources["strategy"]["customized_script_header_template_file"]
+            sge_script_header = customized_script_header_template(file_name, resources)
+        else:
+            sge_script_header = sge_script_header_template.format(
+                **sge_script_header_dict
+            )
+        return sge_script_header
+
+    def do_submit(self, job):
+        script_file_name = job.script_file_name
+        script_str = self.gen_script(job)
+        job_id_name = job.job_hash + "_job_id"
+        self.context.write_file(fname=script_file_name, write_str=script_str)
+        script_run_str = self.gen_script_command(job)
+        script_run_file_name = f"{job.script_file_name}.run"
+        self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
+        script_file_dir = self.context.remote_root
+        stdin, stdout, stderr = self.context.block_checkcall(
+            "cd {} && {} {}".format(script_file_dir, "qsub", script_file_name)
+        )
+        subret = stdout.readlines()
+        job_id = subret[0].split()[2]
+        self.context.write_file(job_id_name, job_id)
+        return job_id
+
+    def check_status(self, job):
+        ### https://softpanorama.org/HPC/Grid_engine/Queues/queue_states.shtml
+        job_id = job.job_id
+        status_line = None
+        if job_id == "":
+            return JobStatus.unsubmitted
+        command = "qstat"
+        ret, stdin, stdout, stderr = self.context.block_call(command)
+        err_str = stderr.read().decode("utf-8")
+        if ret != 0:
+            raise RuntimeError(
+                f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
+            )
+        status_text_list = stdout.read().decode("utf-8").split("\n")
+        for txt in status_text_list:
+            if job_id in txt:
+                status_line = txt
+
+        if status_line is None:
+            count = 0
+            while count <= 6:
+                if self.check_finish_tag(job=job):
+                    return JobStatus.finished
+                dlog.info(
+                    f"not tag_finished detected, execute sync command and wait. count {count}"
+                )
+                self.context.block_call("sync")
+                import time
+
+                time.sleep(10)
+                count += 1
+            return JobStatus.terminated
+        else:
+            status_word = status_line.split()[4]
+            # dlog.info (status_word)
+            if status_word in ["qw", "hqw", "t"]:
+                return JobStatus.waiting
+            elif status_word in ["r", "Rr"]:
+                return JobStatus.running
+            elif status_word in ["Eqw", "dr", "dt"]:
+                return JobStatus.terminated
+            else:
+                return JobStatus.unknown
+
+    def check_finish_tag(self, job):
+        job_tag_finished = job.job_hash + "_job_tag_finished"
+        return self.context.check_file_exists(job_tag_finished)
+
+    @classmethod
+    def resources_subfields(cls) -> List[Argument]:
+        """Generate the resources subfields.
+
+        pe_name : str
+            The parallel environment name of SGE.
+
+        Returns
+        -------
+        list[Argument]
+            resources subfields
+        """
+        doc_pe_name = "The parallel environment name of SGE system."
+        doc_job_name = "The name of SGE's job."
+
+        return [
+            Argument(
+                "kwargs",
+                dict,
+                [
+                    Argument(
+                        "pe_name",
+                        str,
+                        optional=True,
+                        default="mpi",
+                        doc=doc_pe_name,
+                        alias=["sge_pe_name"],
+                    ),
+                    Argument(
+                        "job_name",
+                        str,
+                        optional=True,
+                        default="wDPjob",
+                        doc=doc_job_name,
+                    ),
+                ],
+                optional=False,
+                doc="Extra arguments.",
+            )
+        ]
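Since the kwargs schema above routes SGE tuning through resources rather than the machine block, a hypothetical resources dictionary might look like this (values are placeholders, not from this release):

# Hypothetical resources for the new SGE batch type; values are placeholders.
from dpdispatcher import Resources

resources = Resources.load_from_dict(
    {
        "number_node": 1,       # noted above as unused by SGE's gen_script_header
        "cpu_per_node": 16,     # rendered as "#$ -pe mpi 16"
        "gpu_per_node": 0,
        "queue_name": "all.q",  # rendered as "#$ -q all.q"; "" omits the line
        "group_size": 1,
        "kwargs": {"pe_name": "mpi", "job_name": "wDPjob"},
    }
)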