dpdispatcher 0.6.6__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dpdispatcher/_version.py +22 -4
- dpdispatcher/base_context.py +60 -1
- dpdispatcher/contexts/dp_cloud_server_context.py +8 -1
- dpdispatcher/contexts/hdfs_context.py +5 -0
- dpdispatcher/contexts/lazy_local_context.py +2 -19
- dpdispatcher/contexts/local_context.py +57 -31
- dpdispatcher/contexts/openapi_context.py +78 -14
- dpdispatcher/contexts/ssh_context.py +54 -47
- dpdispatcher/machine.py +13 -1
- dpdispatcher/machines/JH_UniScheduler.py +2 -6
- dpdispatcher/machines/distributed_shell.py +2 -4
- dpdispatcher/machines/fugaku.py +0 -3
- dpdispatcher/machines/lsf.py +1 -5
- dpdispatcher/machines/openapi.py +48 -15
- dpdispatcher/machines/pbs.py +14 -16
- dpdispatcher/machines/shell.py +7 -11
- dpdispatcher/machines/slurm.py +18 -30
- dpdispatcher/submission.py +4 -11
- dpdispatcher/utils/dpcloudserver/client.py +10 -6
- dpdispatcher/utils/hdfs_cli.py +6 -11
- dpdispatcher/utils/utils.py +21 -7
- {dpdispatcher-0.6.6.dist-info → dpdispatcher-1.0.0.dist-info}/METADATA +34 -29
- dpdispatcher-1.0.0.dist-info/RECORD +49 -0
- {dpdispatcher-0.6.6.dist-info → dpdispatcher-1.0.0.dist-info}/WHEEL +1 -1
- dpdispatcher-0.6.6.dist-info/RECORD +0 -49
- {dpdispatcher-0.6.6.dist-info → dpdispatcher-1.0.0.dist-info}/entry_points.txt +0 -0
- {dpdispatcher-0.6.6.dist-info → dpdispatcher-1.0.0.dist-info/licenses}/LICENSE +0 -0
- {dpdispatcher-0.6.6.dist-info → dpdispatcher-1.0.0.dist-info}/top_level.txt +0 -0
dpdispatcher/machine.py
CHANGED
|
@@ -161,6 +161,9 @@ class Machine(metaclass=ABCMeta):
|
|
|
161
161
|
machine_dict["remote_profile"] = self.context.remote_profile
|
|
162
162
|
else:
|
|
163
163
|
machine_dict["remote_profile"] = {}
|
|
164
|
+
# normalize the dict
|
|
165
|
+
base = self.arginfo()
|
|
166
|
+
machine_dict = base.normalize_value(machine_dict, trim_pattern="_*")
|
|
164
167
|
return machine_dict
|
|
165
168
|
|
|
166
169
|
def __eq__(self, other):
|
|
@@ -224,7 +227,7 @@ class Machine(metaclass=ABCMeta):
|
|
|
224
227
|
return if_recover
|
|
225
228
|
|
|
226
229
|
@abstractmethod
|
|
227
|
-
def check_finish_tag(self,
|
|
230
|
+
def check_finish_tag(self, job):
|
|
228
231
|
raise NotImplementedError(
|
|
229
232
|
"abstract method check_finish_tag should be implemented by derived class"
|
|
230
233
|
)
|
|
@@ -265,6 +268,15 @@ class Machine(metaclass=ABCMeta):
|
|
|
265
268
|
|
|
266
269
|
export_envs_part = ""
|
|
267
270
|
envs = job.resources.envs
|
|
271
|
+
envs = {
|
|
272
|
+
# export resources information to the environment variables
|
|
273
|
+
"DPDISPATCHER_NUMBER_NODE": job.resources.number_node,
|
|
274
|
+
"DPDISPATCHER_CPU_PER_NODE": job.resources.cpu_per_node,
|
|
275
|
+
"DPDISPATCHER_GPU_PER_NODE": job.resources.gpu_per_node,
|
|
276
|
+
"DPDISPATCHER_QUEUE_NAME": job.resources.queue_name,
|
|
277
|
+
"DPDISPATCHER_GROUP_SIZE": job.resources.group_size,
|
|
278
|
+
**envs,
|
|
279
|
+
}
|
|
268
280
|
for k, v in envs.items():
|
|
269
281
|
if isinstance(v, list):
|
|
270
282
|
for each_value in v:
|
|
@@ -39,7 +39,7 @@ class JH_UniScheduler(Machine):
|
|
|
39
39
|
custom_gpu_line = resources.kwargs.get("custom_gpu_line", None)
|
|
40
40
|
if not custom_gpu_line:
|
|
41
41
|
script_header_dict["JH_UniScheduler_number_gpu_line"] = (
|
|
42
|
-
|
|
42
|
+
f"#JSUB -gpgpu {resources.gpu_per_node}"
|
|
43
43
|
)
|
|
44
44
|
else:
|
|
45
45
|
script_header_dict["JH_UniScheduler_number_gpu_line"] = custom_gpu_line
|
|
@@ -84,9 +84,6 @@ class JH_UniScheduler(Machine):
|
|
|
84
84
|
self.context.write_file(job_id_name, job_id)
|
|
85
85
|
return job_id
|
|
86
86
|
|
|
87
|
-
def default_resources(self, resources):
|
|
88
|
-
pass
|
|
89
|
-
|
|
90
87
|
@retry()
|
|
91
88
|
def check_status(self, job):
|
|
92
89
|
try:
|
|
@@ -105,8 +102,7 @@ class JH_UniScheduler(Machine):
|
|
|
105
102
|
elif ret != 0:
|
|
106
103
|
# just retry when any unknown error raised.
|
|
107
104
|
raise RetrySignal(
|
|
108
|
-
"Get error code
|
|
109
|
-
% (ret, job.job_hash, err_str)
|
|
105
|
+
f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
|
|
110
106
|
)
|
|
111
107
|
status_out = stdout.read().decode("utf-8").split("\n")
|
|
112
108
|
if len(status_out) < 2:
|
|
@@ -181,8 +181,7 @@ class DistributedShell(Machine):
|
|
|
181
181
|
if ret != 0:
|
|
182
182
|
err_str = stderr.decode("utf-8")
|
|
183
183
|
raise RuntimeError(
|
|
184
|
-
"Command
|
|
185
|
-
% (err_str, ret)
|
|
184
|
+
f"Command {cmd} fails to execute, error message:{err_str}\nreturn code {ret}\n"
|
|
186
185
|
)
|
|
187
186
|
job_id = int(stdout.decode("utf-8").strip())
|
|
188
187
|
|
|
@@ -200,8 +199,7 @@ class DistributedShell(Machine):
|
|
|
200
199
|
if ret != 0:
|
|
201
200
|
err_str = stderr.decode("utf-8")
|
|
202
201
|
raise RuntimeError(
|
|
203
|
-
"Command fails to execute, error message
|
|
204
|
-
% (err_str, ret)
|
|
202
|
+
f"Command fails to execute, error message:{err_str}\nreturn code {ret}\n"
|
|
205
203
|
)
|
|
206
204
|
|
|
207
205
|
if_job_exists = bool(stdout.decode("utf-8").strip())
|
dpdispatcher/machines/fugaku.py
CHANGED
dpdispatcher/machines/lsf.py
CHANGED
|
@@ -102,9 +102,6 @@ class LSF(Machine):
|
|
|
102
102
|
return job_id
|
|
103
103
|
|
|
104
104
|
# TODO: derive abstract methods
|
|
105
|
-
def default_resources(self, resources):
|
|
106
|
-
pass
|
|
107
|
-
|
|
108
105
|
def sub_script_cmd(self, res):
|
|
109
106
|
pass
|
|
110
107
|
|
|
@@ -129,8 +126,7 @@ class LSF(Machine):
|
|
|
129
126
|
elif ret != 0:
|
|
130
127
|
# just retry when any unknown error raised.
|
|
131
128
|
raise RetrySignal(
|
|
132
|
-
"Get error code
|
|
133
|
-
% (ret, job.job_hash, err_str)
|
|
129
|
+
f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
|
|
134
130
|
)
|
|
135
131
|
status_out = stdout.read().decode("utf-8").split("\n")
|
|
136
132
|
if len(status_out) < 2:
|
dpdispatcher/machines/openapi.py
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import shutil
|
|
3
3
|
import time
|
|
4
|
+
from zipfile import ZipFile
|
|
4
5
|
|
|
5
6
|
from dpdispatcher.utils.utils import customized_script_header_template
|
|
6
7
|
|
|
7
8
|
try:
|
|
8
|
-
from
|
|
9
|
-
from
|
|
10
|
-
from bohriumsdk.storage import Storage
|
|
11
|
-
from bohriumsdk.util import Util
|
|
9
|
+
from bohrium import Bohrium
|
|
10
|
+
from bohrium.resources import Job, Tiefblue
|
|
12
11
|
except ModuleNotFoundError:
|
|
13
12
|
found_bohriumsdk = False
|
|
14
13
|
else:
|
|
@@ -23,6 +22,12 @@ shell_script_header_template = """
|
|
|
23
22
|
"""
|
|
24
23
|
|
|
25
24
|
|
|
25
|
+
def unzip_file(zip_file, out_dir="./"):
|
|
26
|
+
obj = ZipFile(zip_file, "r")
|
|
27
|
+
for item in obj.namelist():
|
|
28
|
+
obj.extract(item, out_dir)
|
|
29
|
+
|
|
30
|
+
|
|
26
31
|
class OpenAPI(Machine):
|
|
27
32
|
def __init__(self, context):
|
|
28
33
|
if not found_bohriumsdk:
|
|
@@ -35,9 +40,35 @@ class OpenAPI(Machine):
|
|
|
35
40
|
self.grouped = self.remote_profile.get("grouped", True)
|
|
36
41
|
self.retry_count = self.remote_profile.get("retry_count", 3)
|
|
37
42
|
self.ignore_exit_code = context.remote_profile.get("ignore_exit_code", True)
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
43
|
+
|
|
44
|
+
access_key = (
|
|
45
|
+
self.remote_profile.get("access_key", None)
|
|
46
|
+
or os.getenv("BOHRIUM_ACCESS_KEY", None)
|
|
47
|
+
or os.getenv("ACCESS_KEY", None)
|
|
48
|
+
)
|
|
49
|
+
project_id = (
|
|
50
|
+
self.remote_profile.get("project_id", None)
|
|
51
|
+
or os.getenv("BOHRIUM_PROJECT_ID", None)
|
|
52
|
+
or os.getenv("PROJECT_ID", None)
|
|
53
|
+
)
|
|
54
|
+
app_key = (
|
|
55
|
+
self.remote_profile.get("app_key", None)
|
|
56
|
+
or os.getenv("BOHRIUM_APP_KEY", None)
|
|
57
|
+
or os.getenv("APP_KEY", None)
|
|
58
|
+
)
|
|
59
|
+
if access_key is None:
|
|
60
|
+
raise ValueError(
|
|
61
|
+
"remote_profile must contain 'access_key' or set environment variable 'BOHRIUM_ACCESS_KEY'"
|
|
62
|
+
)
|
|
63
|
+
if project_id is None:
|
|
64
|
+
raise ValueError(
|
|
65
|
+
"remote_profile must contain 'project_id' or set environment variable 'BOHRIUM_PROJECT_ID'"
|
|
66
|
+
)
|
|
67
|
+
self.client = Bohrium( # type: ignore[reportPossiblyUnboundVariable]
|
|
68
|
+
access_key=access_key, project_id=project_id, app_key=app_key
|
|
69
|
+
)
|
|
70
|
+
self.storage = Tiefblue() # type: ignore[reportPossiblyUnboundVariable]
|
|
71
|
+
self.job = Job(client=self.client) # type: ignore[reportPossiblyUnboundVariable]
|
|
41
72
|
self.group_id = None
|
|
42
73
|
|
|
43
74
|
def gen_script(self, job):
|
|
@@ -98,11 +129,13 @@ class OpenAPI(Machine):
|
|
|
98
129
|
),
|
|
99
130
|
"out_files": self._gen_backward_files_list(job),
|
|
100
131
|
"platform": self.remote_profile.get("platform", "ali"),
|
|
101
|
-
"
|
|
132
|
+
"image_name": self.remote_profile.get("image_address", ""),
|
|
102
133
|
}
|
|
103
|
-
if
|
|
104
|
-
openapi_params["
|
|
105
|
-
|
|
134
|
+
if "real_user_id" in self.remote_profile:
|
|
135
|
+
openapi_params["real_user_id"] = self.remote_profile["real_user_id"]
|
|
136
|
+
if "session_id" in self.remote_profile:
|
|
137
|
+
openapi_params["session_id"] = self.remote_profile["session_id"]
|
|
138
|
+
openapi_params["job_id"] = job.job_id
|
|
106
139
|
data = self.job.insert(**openapi_params)
|
|
107
140
|
|
|
108
141
|
job.job_id = data.get("jobId", 0) # type: ignore
|
|
@@ -152,8 +185,8 @@ class OpenAPI(Machine):
|
|
|
152
185
|
self.ignore_exit_code,
|
|
153
186
|
)
|
|
154
187
|
if job_state == JobStatus.finished:
|
|
155
|
-
job_log = self.job.log(job_id)
|
|
156
188
|
if self.remote_profile.get("output_log"):
|
|
189
|
+
job_log = self.job.log(job_id)
|
|
157
190
|
print(job_log, end="")
|
|
158
191
|
self._download_job(job)
|
|
159
192
|
elif self.remote_profile.get("output_log") and job_state == JobStatus.running:
|
|
@@ -163,14 +196,14 @@ class OpenAPI(Machine):
|
|
|
163
196
|
|
|
164
197
|
def _download_job(self, job):
|
|
165
198
|
data = self.job.detail(job.job_id)
|
|
166
|
-
job_url = data["
|
|
199
|
+
job_url = data["resultUrl"] # type: ignore
|
|
167
200
|
if not job_url:
|
|
168
201
|
return
|
|
169
202
|
job_hash = job.job_hash
|
|
170
203
|
result_filename = job_hash + "_back.zip"
|
|
171
204
|
target_result_zip = os.path.join(self.context.local_root, result_filename)
|
|
172
205
|
self.storage.download_from_url(job_url, target_result_zip)
|
|
173
|
-
|
|
206
|
+
unzip_file(target_result_zip, out_dir=self.context.local_root)
|
|
174
207
|
try:
|
|
175
208
|
os.makedirs(os.path.join(self.context.local_root, "backup"), exist_ok=True)
|
|
176
209
|
shutil.move(
|
|
@@ -213,7 +246,7 @@ class OpenAPI(Machine):
|
|
|
213
246
|
if status not in map_dict:
|
|
214
247
|
dlog.error(f"unknown job status {status}")
|
|
215
248
|
return JobStatus.unknown
|
|
216
|
-
if status == -1 and
|
|
249
|
+
if status == -1 and ignore_exit_code:
|
|
217
250
|
return JobStatus.finished
|
|
218
251
|
return map_dict[status]
|
|
219
252
|
|
dpdispatcher/machines/pbs.py
CHANGED
|
@@ -69,14 +69,12 @@ class PBS(Machine):
|
|
|
69
69
|
self.context.write_file(job_id_name, job_id)
|
|
70
70
|
return job_id
|
|
71
71
|
|
|
72
|
-
def default_resources(self, resources):
|
|
73
|
-
pass
|
|
74
|
-
|
|
75
72
|
def check_status(self, job):
|
|
76
73
|
job_id = job.job_id
|
|
77
74
|
if job_id == "":
|
|
78
75
|
return JobStatus.unsubmitted
|
|
79
|
-
|
|
76
|
+
command = "qstat -x " + job_id
|
|
77
|
+
ret, stdin, stdout, stderr = self.context.block_call(command)
|
|
80
78
|
err_str = stderr.read().decode("utf-8")
|
|
81
79
|
if ret != 0:
|
|
82
80
|
if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
|
|
@@ -86,8 +84,7 @@ class PBS(Machine):
|
|
|
86
84
|
return JobStatus.terminated
|
|
87
85
|
else:
|
|
88
86
|
raise RuntimeError(
|
|
89
|
-
"status command
|
|
90
|
-
% (err_str, ret)
|
|
87
|
+
f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
|
|
91
88
|
)
|
|
92
89
|
status_line = stdout.read().decode("utf-8").split("\n")[-2]
|
|
93
90
|
status_word = status_line.split()[-2]
|
|
@@ -126,7 +123,8 @@ class Torque(PBS):
|
|
|
126
123
|
job_id = job.job_id
|
|
127
124
|
if job_id == "":
|
|
128
125
|
return JobStatus.unsubmitted
|
|
129
|
-
|
|
126
|
+
command = "qstat -l " + job_id
|
|
127
|
+
ret, stdin, stdout, stderr = self.context.block_call(command)
|
|
130
128
|
err_str = stderr.read().decode("utf-8")
|
|
131
129
|
if ret != 0:
|
|
132
130
|
if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
|
|
@@ -136,8 +134,7 @@ class Torque(PBS):
|
|
|
136
134
|
return JobStatus.terminated
|
|
137
135
|
else:
|
|
138
136
|
raise RuntimeError(
|
|
139
|
-
"status command
|
|
140
|
-
% (err_str, ret)
|
|
137
|
+
f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
|
|
141
138
|
)
|
|
142
139
|
status_line = stdout.read().decode("utf-8").split("\n")[-2]
|
|
143
140
|
status_word = status_line.split()[-2]
|
|
@@ -255,19 +252,18 @@ class SGE(PBS):
|
|
|
255
252
|
self.context.write_file(job_id_name, job_id)
|
|
256
253
|
return job_id
|
|
257
254
|
|
|
258
|
-
def default_resources(self, resources):
|
|
259
|
-
pass
|
|
260
|
-
|
|
261
255
|
def check_status(self, job):
|
|
256
|
+
### https://softpanorama.org/HPC/Grid_engine/Queues/queue_states.shtml
|
|
262
257
|
job_id = job.job_id
|
|
263
258
|
status_line = None
|
|
264
259
|
if job_id == "":
|
|
265
260
|
return JobStatus.unsubmitted
|
|
266
|
-
|
|
261
|
+
command = "qstat"
|
|
262
|
+
ret, stdin, stdout, stderr = self.context.block_call(command)
|
|
267
263
|
err_str = stderr.read().decode("utf-8")
|
|
268
264
|
if ret != 0:
|
|
269
265
|
raise RuntimeError(
|
|
270
|
-
f"status command
|
|
266
|
+
f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
|
|
271
267
|
)
|
|
272
268
|
status_text_list = stdout.read().decode("utf-8").split("\n")
|
|
273
269
|
for txt in status_text_list:
|
|
@@ -291,10 +287,12 @@ class SGE(PBS):
|
|
|
291
287
|
else:
|
|
292
288
|
status_word = status_line.split()[4]
|
|
293
289
|
# dlog.info (status_word)
|
|
294
|
-
if status_word in ["qw"]:
|
|
290
|
+
if status_word in ["qw", "hqw", "t"]:
|
|
295
291
|
return JobStatus.waiting
|
|
296
|
-
elif status_word in ["r"]:
|
|
292
|
+
elif status_word in ["r", "Rr"]:
|
|
297
293
|
return JobStatus.running
|
|
294
|
+
elif status_word in ["Eqw", "dr", "dt"]:
|
|
295
|
+
return JobStatus.terminated
|
|
298
296
|
else:
|
|
299
297
|
return JobStatus.unknown
|
|
300
298
|
|
dpdispatcher/machines/shell.py
CHANGED
|
@@ -38,14 +38,12 @@ class Shell(Machine):
|
|
|
38
38
|
script_run_str = self.gen_script_command(job)
|
|
39
39
|
script_run_file_name = f"{job.script_file_name}.run"
|
|
40
40
|
self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
)
|
|
41
|
+
cmd = f"cd {shlex.quote(self.context.remote_root)} && {{ nohup bash {script_file_name} 1>>{output_name} 2>>{output_name} & }} && echo $!"
|
|
42
|
+
ret, stdin, stdout, stderr = self.context.block_call(cmd)
|
|
44
43
|
if ret != 0:
|
|
45
44
|
err_str = stderr.read().decode("utf-8")
|
|
46
45
|
raise RuntimeError(
|
|
47
|
-
"status command
|
|
48
|
-
% (err_str, ret)
|
|
46
|
+
f"status command {cmd} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
|
|
49
47
|
)
|
|
50
48
|
job_id = int(stdout.read().decode("utf-8").strip())
|
|
51
49
|
self.context.write_file(job_id_name, str(job_id))
|
|
@@ -62,9 +60,6 @@ class Shell(Machine):
|
|
|
62
60
|
# self.context.write_file(job_id_name, job_id)
|
|
63
61
|
# return job_id
|
|
64
62
|
|
|
65
|
-
def default_resources(self, resources):
|
|
66
|
-
pass
|
|
67
|
-
|
|
68
63
|
def check_status(self, job):
|
|
69
64
|
job_id = job.job_id
|
|
70
65
|
# print('shell.check_status.job_id', job_id)
|
|
@@ -73,14 +68,15 @@ class Shell(Machine):
|
|
|
73
68
|
return JobStatus.unsubmitted
|
|
74
69
|
|
|
75
70
|
# mark defunct process as terminated
|
|
76
|
-
|
|
71
|
+
cmd = (
|
|
72
|
+
r"""command -v ps >/dev/null 2>&1 || { echo >&2 "I require ps but it's not installed. Aborting."; exit 1; };"""
|
|
77
73
|
f"if ps -p {job_id} > /dev/null && ! (ps -o command -p {job_id} | grep defunct >/dev/null) ; then echo 1; fi"
|
|
78
74
|
)
|
|
75
|
+
ret, stdin, stdout, stderr = self.context.block_call(cmd)
|
|
79
76
|
if ret != 0:
|
|
80
77
|
err_str = stderr.read().decode("utf-8")
|
|
81
78
|
raise RuntimeError(
|
|
82
|
-
"status command
|
|
83
|
-
% (err_str, ret)
|
|
79
|
+
f"status command {cmd} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
|
|
84
80
|
)
|
|
85
81
|
|
|
86
82
|
if_job_exists = bool(stdout.read().decode("utf-8").strip())
|
dpdispatcher/machines/slurm.py
CHANGED
|
@@ -83,13 +83,12 @@ class Slurm(Machine):
|
|
|
83
83
|
script_run_file_name = f"{job.script_file_name}.run"
|
|
84
84
|
self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
|
|
85
85
|
# self.context.write_file(fname=os.path.join(self.context.submission.work_base, script_file_name), write_str=script_str)
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
shlex.quote(script_file_name),
|
|
91
|
-
)
|
|
86
|
+
command = "cd {} && {} {}".format(
|
|
87
|
+
shlex.quote(self.context.remote_root),
|
|
88
|
+
"sbatch --parsable",
|
|
89
|
+
shlex.quote(script_file_name),
|
|
92
90
|
)
|
|
91
|
+
ret, stdin, stdout, stderr = self.context.block_call(command)
|
|
93
92
|
if ret != 0:
|
|
94
93
|
err_str = stderr.read().decode("utf-8")
|
|
95
94
|
if (
|
|
@@ -98,8 +97,7 @@ class Slurm(Machine):
|
|
|
98
97
|
):
|
|
99
98
|
# server network error, retry 3 times
|
|
100
99
|
raise RetrySignal(
|
|
101
|
-
"Get error code
|
|
102
|
-
% (ret, job.job_hash, err_str)
|
|
100
|
+
f"Get error code {ret} in submitting with job: {job.job_hash} . message: {err_str}"
|
|
103
101
|
)
|
|
104
102
|
elif (
|
|
105
103
|
"Job violates accounting/QOS policy" in err_str
|
|
@@ -110,8 +108,7 @@ class Slurm(Machine):
|
|
|
110
108
|
# job number exceeds, skip the submitting
|
|
111
109
|
return ""
|
|
112
110
|
raise RuntimeError(
|
|
113
|
-
"
|
|
114
|
-
% (err_str, ret)
|
|
111
|
+
f"command {command} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
|
|
115
112
|
)
|
|
116
113
|
subret = stdout.readlines()
|
|
117
114
|
# --parsable
|
|
@@ -121,17 +118,13 @@ class Slurm(Machine):
|
|
|
121
118
|
self.context.write_file(job_id_name, job_id)
|
|
122
119
|
return job_id
|
|
123
120
|
|
|
124
|
-
def default_resources(self, resources):
|
|
125
|
-
pass
|
|
126
|
-
|
|
127
121
|
@retry()
|
|
128
122
|
def check_status(self, job):
|
|
129
123
|
job_id = job.job_id
|
|
130
124
|
if job_id == "":
|
|
131
125
|
return JobStatus.unsubmitted
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
)
|
|
126
|
+
command = 'squeue -o "%.18i %.2t" -j ' + job_id
|
|
127
|
+
ret, stdin, stdout, stderr = self.context.block_call(command)
|
|
135
128
|
if ret != 0:
|
|
136
129
|
err_str = stderr.read().decode("utf-8")
|
|
137
130
|
if "Invalid job id specified" in err_str:
|
|
@@ -147,13 +140,11 @@ class Slurm(Machine):
|
|
|
147
140
|
):
|
|
148
141
|
# retry 3 times
|
|
149
142
|
raise RetrySignal(
|
|
150
|
-
"Get error code
|
|
151
|
-
% (ret, job.job_hash, err_str)
|
|
143
|
+
f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
|
|
152
144
|
)
|
|
153
145
|
raise RuntimeError(
|
|
154
|
-
"status command
|
|
155
|
-
"job_id
|
|
156
|
-
% (job_id, err_str, ret)
|
|
146
|
+
f"status command {command} fails to execute."
|
|
147
|
+
f"job_id:{job_id} \n error message:{err_str}\n return code {ret}\n"
|
|
157
148
|
)
|
|
158
149
|
status_line = stdout.read().decode("utf-8").split("\n")[-2]
|
|
159
150
|
status_word = status_line.split()[-1]
|
|
@@ -257,7 +248,7 @@ class SlurmJobArray(Slurm):
|
|
|
257
248
|
return super().gen_script_header(job) + "\n#SBATCH --array={}".format(
|
|
258
249
|
",".join(map(str, job_array))
|
|
259
250
|
)
|
|
260
|
-
return super().gen_script_header(job) + "\n#SBATCH --array=0-%
|
|
251
|
+
return super().gen_script_header(job) + "\n#SBATCH --array=0-%s" % (
|
|
261
252
|
math.ceil(len(job.job_task_list) / slurm_job_size) - 1
|
|
262
253
|
)
|
|
263
254
|
|
|
@@ -319,9 +310,8 @@ class SlurmJobArray(Slurm):
|
|
|
319
310
|
job_id = job.job_id
|
|
320
311
|
if job_id == "":
|
|
321
312
|
return JobStatus.unsubmitted
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
)
|
|
313
|
+
command = 'squeue -h -o "%.18i %.2t" -j ' + job_id
|
|
314
|
+
ret, stdin, stdout, stderr = self.context.block_call(command)
|
|
325
315
|
if ret != 0:
|
|
326
316
|
err_str = stderr.read().decode("utf-8")
|
|
327
317
|
if "Invalid job id specified" in err_str:
|
|
@@ -336,13 +326,11 @@ class SlurmJobArray(Slurm):
|
|
|
336
326
|
):
|
|
337
327
|
# retry 3 times
|
|
338
328
|
raise RetrySignal(
|
|
339
|
-
"Get error code
|
|
340
|
-
% (ret, job.job_hash, err_str)
|
|
329
|
+
f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
|
|
341
330
|
)
|
|
342
331
|
raise RuntimeError(
|
|
343
|
-
"status command
|
|
344
|
-
"job_id
|
|
345
|
-
% (job_id, err_str, ret)
|
|
332
|
+
f"status command {command} fails to execute."
|
|
333
|
+
f"job_id:{job_id} \n error message:{err_str}\n return code {ret}\n"
|
|
346
334
|
)
|
|
347
335
|
status_lines = stdout.read().decode("utf-8").split("\n")[:-1]
|
|
348
336
|
status = []
|
dpdispatcher/submission.py
CHANGED
|
@@ -55,7 +55,6 @@ class Submission:
|
|
|
55
55
|
*,
|
|
56
56
|
task_list=[],
|
|
57
57
|
):
|
|
58
|
-
# self.submission_list = submission_list
|
|
59
58
|
self.local_root = None
|
|
60
59
|
self.work_base = work_base
|
|
61
60
|
self._abs_work_base = os.path.abspath(work_base)
|
|
@@ -324,8 +323,7 @@ class Submission:
|
|
|
324
323
|
kwargs = {**{"clean": False}, **kwargs}
|
|
325
324
|
if kwargs["clean"]:
|
|
326
325
|
dlog.warning(
|
|
327
|
-
"Using async submission with `clean=True`, "
|
|
328
|
-
"job may fail in queue system"
|
|
326
|
+
"Using async submission with `clean=True`, job may fail in queue system"
|
|
329
327
|
)
|
|
330
328
|
loop = asyncio.get_event_loop()
|
|
331
329
|
wrapped_submission = functools.partial(self.run_submission, **kwargs)
|
|
@@ -515,12 +513,9 @@ class Submission:
|
|
|
515
513
|
def submission_from_json(cls, json_file_name="submission.json"):
|
|
516
514
|
with open(json_file_name) as f:
|
|
517
515
|
submission_dict = json.load(f)
|
|
518
|
-
# submission_dict = machine.context.read_file(json_file_name)
|
|
519
516
|
submission = cls.deserialize(submission_dict=submission_dict, machine=None)
|
|
520
517
|
return submission
|
|
521
518
|
|
|
522
|
-
# def check_if_recover()
|
|
523
|
-
|
|
524
519
|
def try_recover_from_json(self):
|
|
525
520
|
submission_file_name = f"{self.submission_hash}.json"
|
|
526
521
|
if_recover = self.machine.context.check_file_exists(submission_file_name)
|
|
@@ -545,7 +540,6 @@ class Submission:
|
|
|
545
540
|
f"machine.context.remote_root:{self.machine.context.remote_root}; "
|
|
546
541
|
f"submission.work_base:{submission.work_base};"
|
|
547
542
|
)
|
|
548
|
-
# self = submission.bind_machine(machine=self.machine)
|
|
549
543
|
else:
|
|
550
544
|
print(self.serialize())
|
|
551
545
|
print(submission.serialize())
|
|
@@ -759,7 +753,6 @@ class Job:
|
|
|
759
753
|
self.fail_count = 0
|
|
760
754
|
self.job_uuid = uuid.uuid4()
|
|
761
755
|
|
|
762
|
-
# self.job_hash = self.get_hash()
|
|
763
756
|
self.job_hash = self.get_hash()
|
|
764
757
|
self.script_file_name = self.job_hash + ".sub"
|
|
765
758
|
|
|
@@ -1122,9 +1115,9 @@ class Resources:
|
|
|
1122
1115
|
|
|
1123
1116
|
@staticmethod
|
|
1124
1117
|
def arginfo(detail_kwargs=True):
|
|
1125
|
-
doc_number_node = "The number of
|
|
1126
|
-
doc_cpu_per_node = "
|
|
1127
|
-
doc_gpu_per_node = "
|
|
1118
|
+
doc_number_node = "The number of nodes required for each `job`."
|
|
1119
|
+
doc_cpu_per_node = "CPU numbers of each node assigned to each job."
|
|
1120
|
+
doc_gpu_per_node = "GPU numbers of each node assigned to each job."
|
|
1128
1121
|
doc_queue_name = "The queue name of batch job scheduler system."
|
|
1129
1122
|
doc_group_size = "The number of `tasks` in a `job`. 0 means infinity."
|
|
1130
1123
|
doc_custom_flags = "The extra lines pass to job submitting script header"
|
|
@@ -142,10 +142,10 @@ class Client:
|
|
|
142
142
|
res = self.get("/data/get_sts_token", {})
|
|
143
143
|
# print('debug>>>>>>>>>>>>>', res)
|
|
144
144
|
dlog.debug(f"debug: _get_oss_bucket: res:{res}")
|
|
145
|
-
auth = oss2.StsAuth(
|
|
145
|
+
auth = oss2.StsAuth( # type: ignore[reportPossiblyUnboundVariable]
|
|
146
146
|
res["AccessKeyId"], res["AccessKeySecret"], res["SecurityToken"]
|
|
147
147
|
)
|
|
148
|
-
return oss2.Bucket(auth, endpoint, bucket_name)
|
|
148
|
+
return oss2.Bucket(auth, endpoint, bucket_name) # type: ignore[reportPossiblyUnboundVariable]
|
|
149
149
|
|
|
150
150
|
def download(self, oss_file, save_file, endpoint, bucket_name):
|
|
151
151
|
bucket = self._get_oss_bucket(endpoint, bucket_name)
|
|
@@ -184,7 +184,7 @@ class Client:
|
|
|
184
184
|
)
|
|
185
185
|
bucket = self._get_oss_bucket(endpoint, bucket_name)
|
|
186
186
|
total_size = os.path.getsize(zip_task_file)
|
|
187
|
-
part_size = determine_part_size(total_size, preferred_size=1000 * 1024)
|
|
187
|
+
part_size = determine_part_size(total_size, preferred_size=1000 * 1024) # type: ignore[reportPossiblyUnboundVariable]
|
|
188
188
|
upload_id = bucket.init_multipart_upload(oss_task_zip).upload_id
|
|
189
189
|
parts = []
|
|
190
190
|
with open(zip_task_file, "rb") as fileobj:
|
|
@@ -196,9 +196,9 @@ class Client:
|
|
|
196
196
|
oss_task_zip,
|
|
197
197
|
upload_id,
|
|
198
198
|
part_number,
|
|
199
|
-
SizedFileAdapter(fileobj, num_to_upload),
|
|
199
|
+
SizedFileAdapter(fileobj, num_to_upload), # type: ignore[reportPossiblyUnboundVariable]
|
|
200
200
|
)
|
|
201
|
-
parts.append(PartInfo(part_number, result.etag))
|
|
201
|
+
parts.append(PartInfo(part_number, result.etag)) # type: ignore[reportPossiblyUnboundVariable]
|
|
202
202
|
offset += num_to_upload
|
|
203
203
|
part_number += 1
|
|
204
204
|
# result = bucket.complete_multipart_upload(oss_task_zip, upload_id, parts)
|
|
@@ -278,7 +278,11 @@ class Client:
|
|
|
278
278
|
return ""
|
|
279
279
|
resp = requests.get(url, headers={"Range": f"bytes={self.last_log_offset}-"})
|
|
280
280
|
self.last_log_offset += len(resp.content)
|
|
281
|
-
|
|
281
|
+
try:
|
|
282
|
+
return resp.content.decode("utf-8")
|
|
283
|
+
except Exception as e:
|
|
284
|
+
dlog.error(f"Error decoding job log: {e}", stack_info=ENABLE_STACK)
|
|
285
|
+
return ""
|
|
282
286
|
|
|
283
287
|
def _get_job_log(self, job_id):
|
|
284
288
|
ret = self.get(
|
dpdispatcher/utils/hdfs_cli.py
CHANGED
|
@@ -28,7 +28,7 @@ class HDFS:
|
|
|
28
28
|
)
|
|
29
29
|
except Exception as e:
|
|
30
30
|
raise RuntimeError(
|
|
31
|
-
f"Cannot check existence of hdfs uri[{uri}]
|
|
31
|
+
f"Cannot check existence of hdfs uri[{uri}] with cmd[{cmd}]"
|
|
32
32
|
) from e
|
|
33
33
|
|
|
34
34
|
@staticmethod
|
|
@@ -48,9 +48,7 @@ class HDFS:
|
|
|
48
48
|
f"with cmd[{cmd}]; ret[{ret}] output[{out}] stderr[{err}]"
|
|
49
49
|
)
|
|
50
50
|
except Exception as e:
|
|
51
|
-
raise RuntimeError(
|
|
52
|
-
f"Cannot remove hdfs uri[{uri}] " f"with cmd[{cmd}]"
|
|
53
|
-
) from e
|
|
51
|
+
raise RuntimeError(f"Cannot remove hdfs uri[{uri}] with cmd[{cmd}]") from e
|
|
54
52
|
|
|
55
53
|
@staticmethod
|
|
56
54
|
def mkdir(uri):
|
|
@@ -70,7 +68,7 @@ class HDFS:
|
|
|
70
68
|
)
|
|
71
69
|
except Exception as e:
|
|
72
70
|
raise RuntimeError(
|
|
73
|
-
f"Cannot mkdir of hdfs uri[{uri}]
|
|
71
|
+
f"Cannot mkdir of hdfs uri[{uri}] with cmd[{cmd}]"
|
|
74
72
|
) from e
|
|
75
73
|
|
|
76
74
|
@staticmethod
|
|
@@ -80,7 +78,7 @@ class HDFS:
|
|
|
80
78
|
"""
|
|
81
79
|
# Make sure local_path is accessible
|
|
82
80
|
if not os.path.exists(local_path) or not os.access(local_path, os.R_OK):
|
|
83
|
-
raise RuntimeError(f"try to access local_path[{local_path}]
|
|
81
|
+
raise RuntimeError(f"try to access local_path[{local_path}] but failed")
|
|
84
82
|
cmd = f"hadoop fs -copyFromLocal -f {local_path} {to_uri}"
|
|
85
83
|
try:
|
|
86
84
|
ret, out, err = run_cmd_with_all_output(cmd)
|
|
@@ -132,9 +130,7 @@ class HDFS:
|
|
|
132
130
|
f"cmd [{cmd}] ret[{ret}] output[{out}] stderr[{err}]"
|
|
133
131
|
)
|
|
134
132
|
except Exception as e:
|
|
135
|
-
raise RuntimeError(
|
|
136
|
-
f"Cannot read text from uri[{uri}]" f"cmd [{cmd}]"
|
|
137
|
-
) from e
|
|
133
|
+
raise RuntimeError(f"Cannot read text from uri[{uri}]cmd [{cmd}]") from e
|
|
138
134
|
|
|
139
135
|
@staticmethod
|
|
140
136
|
def move(from_uri, to_uri):
|
|
@@ -151,6 +147,5 @@ class HDFS:
|
|
|
151
147
|
)
|
|
152
148
|
except Exception as e:
|
|
153
149
|
raise RuntimeError(
|
|
154
|
-
f"Cannot move from_uri[{from_uri}] to "
|
|
155
|
-
f"to_uri[{to_uri}] with cmd[{cmd}]"
|
|
150
|
+
f"Cannot move from_uri[{from_uri}] to to_uri[{to_uri}] with cmd[{cmd}]"
|
|
156
151
|
) from e
|