dpdispatcher 0.6.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dpdispatcher/_version.py +22 -4
- dpdispatcher/base_context.py +60 -1
- dpdispatcher/contexts/__init__.py +1 -0
- dpdispatcher/contexts/dp_cloud_server_context.py +8 -1
- dpdispatcher/contexts/hdfs_context.py +16 -11
- dpdispatcher/contexts/lazy_local_context.py +2 -19
- dpdispatcher/contexts/local_context.py +77 -43
- dpdispatcher/contexts/openapi_context.py +78 -14
- dpdispatcher/contexts/ssh_context.py +117 -98
- dpdispatcher/dlog.py +9 -5
- dpdispatcher/dpcloudserver/__init__.py +0 -0
- dpdispatcher/dpcloudserver/client.py +7 -0
- dpdispatcher/dpdisp.py +21 -0
- dpdispatcher/entrypoints/run.py +9 -0
- dpdispatcher/entrypoints/submission.py +21 -1
- dpdispatcher/machine.py +15 -4
- dpdispatcher/machines/JH_UniScheduler.py +171 -0
- dpdispatcher/machines/__init__.py +1 -0
- dpdispatcher/machines/distributed_shell.py +6 -10
- dpdispatcher/machines/fugaku.py +9 -12
- dpdispatcher/machines/lsf.py +3 -9
- dpdispatcher/machines/openapi.py +48 -15
- dpdispatcher/machines/pbs.py +183 -20
- dpdispatcher/machines/shell.py +7 -16
- dpdispatcher/machines/slurm.py +30 -42
- dpdispatcher/run.py +172 -0
- dpdispatcher/submission.py +5 -14
- dpdispatcher/utils/dpcloudserver/client.py +10 -6
- dpdispatcher/utils/hdfs_cli.py +10 -19
- dpdispatcher/utils/utils.py +21 -7
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/METADATA +35 -29
- dpdispatcher-1.0.0.dist-info/RECORD +49 -0
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/WHEEL +1 -1
- dpdispatcher-0.6.1.dist-info/RECORD +0 -44
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/entry_points.txt +0 -0
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info/licenses}/LICENSE +0 -0
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/top_level.txt +0 -0
dpdispatcher/machines/shell.py
CHANGED
@@ -38,19 +38,12 @@ class Shell(Machine):
         script_run_str = self.gen_script_command(job)
         script_run_file_name = f"{job.script_file_name}.run"
         self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
-
-
-                shlex.quote(self.context.remote_root),
-                script_file_name,
-                output_name,
-                output_name,
-            )
-        )
+        cmd = f"cd {shlex.quote(self.context.remote_root)} && {{ nohup bash {script_file_name} 1>>{output_name} 2>>{output_name} & }} && echo $!"
+        ret, stdin, stdout, stderr = self.context.block_call(cmd)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             raise RuntimeError(
-                "status command
-                % (err_str, ret)
+                f"status command {cmd} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
             )
         job_id = int(stdout.read().decode("utf-8").strip())
         self.context.write_file(job_id_name, str(job_id))
@@ -67,9 +60,6 @@ class Shell(Machine):
         # self.context.write_file(job_id_name, job_id)
         # return job_id
 
-    def default_resources(self, resources):
-        pass
-
     def check_status(self, job):
         job_id = job.job_id
         # print('shell.check_status.job_id', job_id)
@@ -78,14 +68,15 @@ class Shell(Machine):
             return JobStatus.unsubmitted
 
         # mark defunct process as terminated
-
+        cmd = (
+            r"""command -v ps >/dev/null 2>&1 || { echo >&2 "I require ps but it's not installed. Aborting."; exit 1; };"""
             f"if ps -p {job_id} > /dev/null && ! (ps -o command -p {job_id} | grep defunct >/dev/null) ; then echo 1; fi"
         )
+        ret, stdin, stdout, stderr = self.context.block_call(cmd)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             raise RuntimeError(
-                "status command
-                % (err_str, ret)
+                f"status command {cmd} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
             )
 
         if_job_exists = bool(stdout.read().decode("utf-8").strip())
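The rewritten Shell.do_submit folds submission into one command: change into the remote root, background the job script with nohup, and print the PID of the background process with `echo $!`; that PID is stored as the shell "job id" and later probed with `ps -p` in check_status. Below is a minimal local sketch of the same pattern; the temporary directory and demo script are stand-ins, not dpdispatcher code:

    import os
    import shlex
    import subprocess
    import tempfile

    workdir = tempfile.mkdtemp()      # stand-in for context.remote_root
    script = "demo.sub"               # stand-in for the generated job script
    out = "demo.out"
    with open(os.path.join(workdir, script), "w") as f:
        f.write("sleep 1\n")

    cmd = (
        f"cd {shlex.quote(workdir)} && "
        f"{{ nohup bash {script} 1>>{out} 2>>{out} & }} && echo $!"
    )
    # `echo $!` prints the PID of the backgrounded bash process; dpdispatcher
    # keeps this PID as the job id and later checks it with `ps -p <pid>`.
    pid = int(subprocess.run(cmd, shell=True, capture_output=True, text=True).stdout.strip())
    print("job pid:", pid)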
dpdispatcher/machines/slurm.py
CHANGED
@@ -39,23 +39,23 @@ class Slurm(Machine):
     def gen_script_header(self, job):
         resources = job.resources
         script_header_dict = {}
-        script_header_dict["slurm_nodes_line"] =
-
+        script_header_dict["slurm_nodes_line"] = (
+            f"#SBATCH --nodes {resources.number_node}"
+        )
+        script_header_dict["slurm_ntasks_per_node_line"] = (
+            f"#SBATCH --ntasks-per-node {resources.cpu_per_node}"
         )
-        script_header_dict[
-            "slurm_ntasks_per_node_line"
-        ] = f"#SBATCH --ntasks-per-node {resources.cpu_per_node}"
         custom_gpu_line = resources.kwargs.get("custom_gpu_line", None)
         if not custom_gpu_line:
-            script_header_dict[
-                "
-
+            script_header_dict["slurm_number_gpu_line"] = (
+                f"#SBATCH --gres=gpu:{resources.gpu_per_node}"
+            )
         else:
             script_header_dict["slurm_number_gpu_line"] = custom_gpu_line
         if resources.queue_name != "":
-            script_header_dict[
-                "
-
+            script_header_dict["slurm_partition_line"] = (
+                f"#SBATCH --partition {resources.queue_name}"
+            )
         else:
             script_header_dict["slurm_partition_line"] = ""
         if (
@@ -83,13 +83,12 @@ class Slurm(Machine):
         script_run_file_name = f"{job.script_file_name}.run"
         self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
         # self.context.write_file(fname=os.path.join(self.context.submission.work_base, script_file_name), write_str=script_str)
-
-
-
-
-            shlex.quote(script_file_name),
-        )
+        command = "cd {} && {} {}".format(
+            shlex.quote(self.context.remote_root),
+            "sbatch --parsable",
+            shlex.quote(script_file_name),
         )
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             if (
@@ -98,8 +97,7 @@ class Slurm(Machine):
             ):
                 # server network error, retry 3 times
                 raise RetrySignal(
-                    "Get error code
-                    % (ret, job.job_hash, err_str)
+                    f"Get error code {ret} in submitting with job: {job.job_hash} . message: {err_str}"
                 )
             elif (
                 "Job violates accounting/QOS policy" in err_str
@@ -110,8 +108,7 @@ class Slurm(Machine):
                 # job number exceeds, skip the submitting
                 return ""
             raise RuntimeError(
-                "
-                % (err_str, ret)
+                f"command {command} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
             )
         subret = stdout.readlines()
         # --parsable
@@ -121,17 +118,13 @@ class Slurm(Machine):
         self.context.write_file(job_id_name, job_id)
         return job_id
 
-    def default_resources(self, resources):
-        pass
-
     @retry()
     def check_status(self, job):
         job_id = job.job_id
         if job_id == "":
             return JobStatus.unsubmitted
-
-
-        )
+        command = 'squeue -o "%.18i %.2t" -j ' + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             if "Invalid job id specified" in err_str:
@@ -147,13 +140,11 @@ class Slurm(Machine):
             ):
                 # retry 3 times
                 raise RetrySignal(
-                    "Get error code
-                    % (ret, job.job_hash, err_str)
+                    f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
                 )
             raise RuntimeError(
-                "status command
-                "job_id
-                % (job_id, err_str, ret)
+                f"status command {command} fails to execute."
+                f"job_id:{job_id} \n error message:{err_str}\n return code {ret}\n"
             )
         status_line = stdout.read().decode("utf-8").split("\n")[-2]
         status_word = status_line.split()[-1]
@@ -254,10 +245,10 @@ class SlurmJobArray(Slurm):
                 ).as_posix()
                 if not self.context.check_file_exists(task_tag_finished):
                     job_array.add(ii // slurm_job_size)
-            return super().gen_script_header(job) + "\n#SBATCH --array
+            return super().gen_script_header(job) + "\n#SBATCH --array={}".format(
                 ",".join(map(str, job_array))
             )
-        return super().gen_script_header(job) + "\n#SBATCH --array=0-%
+        return super().gen_script_header(job) + "\n#SBATCH --array=0-%s" % (
            math.ceil(len(job.job_task_list) / slurm_job_size) - 1
        )
 
@@ -319,9 +310,8 @@ class SlurmJobArray(Slurm):
        job_id = job.job_id
        if job_id == "":
            return JobStatus.unsubmitted
-
-
-        )
+        command = 'squeue -h -o "%.18i %.2t" -j ' + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
        if ret != 0:
            err_str = stderr.read().decode("utf-8")
            if "Invalid job id specified" in err_str:
@@ -336,13 +326,11 @@ class SlurmJobArray(Slurm):
            ):
                # retry 3 times
                raise RetrySignal(
-                    "Get error code
-                    % (ret, job.job_hash, err_str)
+                    f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
                )
            raise RuntimeError(
-                "status command
-                "job_id
-                % (job_id, err_str, ret)
+                f"status command {command} fails to execute."
+                f"job_id:{job_id} \n error message:{err_str}\n return code {ret}\n"
            )
        status_lines = stdout.read().decode("utf-8").split("\n")[:-1]
        status = []
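The Slurm changes keep the same two shell interactions, now built as plain f-strings: submission goes through `sbatch --parsable`, whose stdout is just the job id (optionally suffixed with `;cluster`), and status checks go through `squeue -o "%.18i %.2t" -j <job_id>`, whose last column is the short state code. A small sketch of that parsing, using invented sample output rather than real scheduler output:

    def parse_sbatch_parsable(stdout_text: str) -> str:
        # `sbatch --parsable` prints only "<jobid>" or "<jobid>;<cluster>".
        return stdout_text.strip().split(";")[0]

    def parse_squeue_state(stdout_text: str) -> str:
        # `squeue -o "%.18i %.2t" -j <id>` prints a header plus one line per job;
        # the last field of the last data line is the short state code (PD, R, CG, ...),
        # which check_status then maps onto JobStatus values.
        status_line = stdout_text.split("\n")[-2]  # same slicing as check_status above
        return status_line.split()[-1]

    print(parse_sbatch_parsable("12345;cluster0\n"))       # -> 12345
    print(parse_squeue_state("  JOBID ST\n  12345  R\n"))  # -> R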
dpdispatcher/run.py
ADDED
@@ -0,0 +1,172 @@
+import os
+import re
+import sys
+from glob import glob
+from hashlib import sha1
+
+from dpdispatcher.machine import Machine
+from dpdispatcher.submission import Resources, Submission, Task
+
+if sys.version_info >= (3, 11):
+    import tomllib
+else:
+    import tomli as tomllib
+from typing import List, Optional
+
+from dargs import Argument
+
+from dpdispatcher.arginfo import machine_dargs, resources_dargs, task_dargs
+
+REGEX = r"(?m)^# /// (?P<type>[a-zA-Z0-9-]+)$\s(?P<content>(^#(| .*)$\s)+)^# ///$"
+
+
+def read_pep723(script: str) -> Optional[dict]:
+    """Read a PEP 723 script metadata from a script string.
+
+    Parameters
+    ----------
+    script : str
+        Script content.
+
+    Returns
+    -------
+    dict
+        PEP 723 metadata.
+    """
+    name = "script"
+    matches = list(
+        filter(lambda m: m.group("type") == name, re.finditer(REGEX, script))
+    )
+    if len(matches) > 1:
+        # TODO: Add tests for scenarios where multiple script blocks are found
+        raise ValueError(f"Multiple {name} blocks found")
+    elif len(matches) == 1:
+        content = "".join(
+            line[2:] if line.startswith("# ") else line[1:]
+            for line in matches[0].group("content").splitlines(keepends=True)
+        )
+        return tomllib.loads(content)
+    else:
+        # TODO: Add tests for scenarios where no metadata is found
+        return None
+
+
+def pep723_args() -> Argument:
+    """Return the argument parser for PEP 723 metadata."""
+    machine_args = machine_dargs()
+    machine_args.fold_subdoc = True
+    machine_args.doc = "Machine configuration. See related documentation for details."
+    resources_args = resources_dargs(detail_kwargs=False)
+    resources_args.fold_subdoc = True
+    resources_args.doc = (
+        "Resources configuration. See related documentation for details."
+    )
+    task_args = task_dargs()
+    command_arg = task_args["command"]
+    command_arg.doc = (
+        "Python interpreter or launcher. No need to contain the Python script filename."
+    )
+    command_arg.default = "python"
+    command_arg.optional = True
+    task_args["task_work_path"].doc += " Can be a glob pattern."
+    task_args.name = "task_list"
+    task_args.doc = "List of tasks to execute."
+    task_args.repeat = True
+    task_args.dtype = (list,)
+    return Argument(
+        "pep723",
+        dtype=dict,
+        doc="PEP 723 metadata",
+        sub_fields=[
+            Argument(
+                "work_base",
+                dtype=str,
+                optional=True,
+                default="./",
+                doc="Base directory for the work",
+            ),
+            Argument(
+                "forward_common_files",
+                dtype=List[str],
+                optional=True,
+                default=[],
+                doc="Common files to forward to the remote machine",
+            ),
+            Argument(
+                "backward_common_files",
+                dtype=List[str],
+                optional=True,
+                default=[],
+                doc="Common files to backward from the remote machine",
+            ),
+            machine_args,
+            resources_args,
+            task_args,
+        ],
+    )
+
+
+def create_submission(metadata: dict, hash: str) -> Submission:
+    """Create a Submission instance from a PEP 723 metadata.
+
+    Parameters
+    ----------
+    metadata : dict
+        PEP 723 metadata.
+    hash : str
+        Submission hash.
+
+    Returns
+    -------
+    Submission
+        Submission instance.
+    """
+    base = pep723_args()
+    metadata = base.normalize_value(metadata, trim_pattern="_*")
+    base.check_value(metadata, strict=False)
+
+    tasks = []
+    for task in metadata["task_list"]:
+        task = task.copy()
+        task["command"] += f" $REMOTE_ROOT/script_{hash}.py"
+        task_work_path = os.path.join(
+            metadata["machine"]["local_root"],
+            metadata["work_base"],
+            task["task_work_path"],
+        )
+        if os.path.isdir(task_work_path):
+            tasks.append(Task.load_from_dict(task))
+        elif glob(task_work_path):
+            for file in glob(task_work_path):
+                tasks.append(Task.load_from_dict({**task, "task_work_path": file}))
+            # TODO: Add tests for scenarios where the task work path is a glob pattern
+        else:
+            # TODO: Add tests for scenarios where the task work path is not found
+            raise FileNotFoundError(f"Task work path {task_work_path} not found.")
+    return Submission(
+        work_base=metadata["work_base"],
+        forward_common_files=metadata["forward_common_files"],
+        backward_common_files=metadata["backward_common_files"],
+        machine=Machine.load_from_dict(metadata["machine"]),
+        resources=Resources.load_from_dict(metadata["resources"]),
+        task_list=tasks,
+    )
+
+
+def run_pep723(script: str):
+    """Run a PEP 723 script.
+
+    Parameters
+    ----------
+    script : str
+        Script content.
+    """
+    metadata = read_pep723(script)
+    if metadata is None:
+        raise ValueError("No PEP 723 metadata found.")
+    dpdispatcher_metadata = metadata["tool"]["dpdispatcher"]
+    script_hash = sha1(script.encode("utf-8")).hexdigest()
+    submission = create_submission(dpdispatcher_metadata, script_hash)
+    submission.machine.context.write_file(f"script_{script_hash}.py", script)
+    # write script
+    submission.run_submission()
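The new run.py pulls the standard PEP 723 `# /// script` block out of a Python file, reads a `[tool.dpdispatcher]` table from it, uploads the script itself as `script_<hash>.py`, and appends that remote path to each task's command; the accompanying entrypoints/run.py and dpdisp.py changes suggest this backs a new `run` subcommand. A hedged example of what such a script header could look like; the machine, queue, and path values below are placeholders, not defaults shipped with the package:

    # /// script
    # [tool.dpdispatcher]
    # work_base = "./"
    # forward_common_files = []
    # backward_common_files = []
    #
    # [tool.dpdispatcher.machine]
    # batch_type = "Slurm"
    # context_type = "SSHContext"
    # local_root = "./"
    # remote_root = "/scratch/jobs"              # placeholder path
    # [tool.dpdispatcher.machine.remote_profile]
    # hostname = "login.example.org"             # placeholder host
    # username = "user"
    #
    # [tool.dpdispatcher.resources]
    # number_node = 1
    # cpu_per_node = 4
    # gpu_per_node = 0
    # queue_name = "cpu"                         # placeholder queue
    # group_size = 1
    #
    # [[tool.dpdispatcher.task_list]]
    # command = "python"
    # task_work_path = "task_*"                  # glob patterns are accepted
    # forward_files = []
    # backward_files = []
    # ///
    print("payload executed on the remote machine")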
dpdispatcher/submission.py
CHANGED
@@ -55,7 +55,6 @@ class Submission:
         *,
         task_list=[],
     ):
-        # self.submission_list = submission_list
         self.local_root = None
         self.work_base = work_base
         self._abs_work_base = os.path.abspath(work_base)
@@ -324,8 +323,7 @@ class Submission:
         kwargs = {**{"clean": False}, **kwargs}
         if kwargs["clean"]:
             dlog.warning(
-                "Using async submission with `clean=True`, "
-                "job may fail in queue system"
+                "Using async submission with `clean=True`, job may fail in queue system"
             )
         loop = asyncio.get_event_loop()
         wrapped_submission = functools.partial(self.run_submission, **kwargs)
@@ -515,12 +513,9 @@ class Submission:
     def submission_from_json(cls, json_file_name="submission.json"):
         with open(json_file_name) as f:
             submission_dict = json.load(f)
-        # submission_dict = machine.context.read_file(json_file_name)
         submission = cls.deserialize(submission_dict=submission_dict, machine=None)
         return submission
 
-    # def check_if_recover()
-
     def try_recover_from_json(self):
         submission_file_name = f"{self.submission_hash}.json"
         if_recover = self.machine.context.check_file_exists(submission_file_name)
@@ -545,7 +540,6 @@ class Submission:
                    f"machine.context.remote_root:{self.machine.context.remote_root}; "
                    f"submission.work_base:{submission.work_base};"
                )
-            # self = submission.bind_machine(machine=self.machine)
        else:
            print(self.serialize())
            print(submission.serialize())
@@ -759,7 +753,6 @@ class Job:
        self.fail_count = 0
        self.job_uuid = uuid.uuid4()
 
-        # self.job_hash = self.get_hash()
        self.job_hash = self.get_hash()
        self.script_file_name = self.job_hash + ".sub"
 
@@ -863,9 +856,7 @@ class Job:
            self.submit_job()
            if self.job_state != JobStatus.unsubmitted:
                dlog.info(
-                    "job:{job_hash} re-submit after terminated; new job_id is {job_id}"
-                    job_hash=self.job_hash, job_id=self.job_id
-                )
+                    f"job:{self.job_hash} re-submit after terminated; new job_id is {self.job_id}"
                )
            time.sleep(0.2)
            self.get_job_state()
@@ -1124,9 +1115,9 @@ class Resources:
 
    @staticmethod
    def arginfo(detail_kwargs=True):
-        doc_number_node = "The number of
-        doc_cpu_per_node = "
-        doc_gpu_per_node = "
+        doc_number_node = "The number of nodes required for each `job`."
+        doc_cpu_per_node = "CPU numbers of each node assigned to each job."
+        doc_gpu_per_node = "GPU numbers of each node assigned to each job."
        doc_queue_name = "The queue name of batch job scheduler system."
        doc_group_size = "The number of `tasks` in a `job`. 0 means infinity."
        doc_custom_flags = "The extra lines pass to job submitting script header"
dpdispatcher/utils/dpcloudserver/client.py
CHANGED

@@ -142,10 +142,10 @@ class Client:
         res = self.get("/data/get_sts_token", {})
         # print('debug>>>>>>>>>>>>>', res)
         dlog.debug(f"debug: _get_oss_bucket: res:{res}")
-        auth = oss2.StsAuth(
+        auth = oss2.StsAuth(  # type: ignore[reportPossiblyUnboundVariable]
             res["AccessKeyId"], res["AccessKeySecret"], res["SecurityToken"]
         )
-        return oss2.Bucket(auth, endpoint, bucket_name)
+        return oss2.Bucket(auth, endpoint, bucket_name)  # type: ignore[reportPossiblyUnboundVariable]
 
     def download(self, oss_file, save_file, endpoint, bucket_name):
         bucket = self._get_oss_bucket(endpoint, bucket_name)
@@ -184,7 +184,7 @@ class Client:
         )
         bucket = self._get_oss_bucket(endpoint, bucket_name)
         total_size = os.path.getsize(zip_task_file)
-        part_size = determine_part_size(total_size, preferred_size=1000 * 1024)
+        part_size = determine_part_size(total_size, preferred_size=1000 * 1024)  # type: ignore[reportPossiblyUnboundVariable]
         upload_id = bucket.init_multipart_upload(oss_task_zip).upload_id
         parts = []
         with open(zip_task_file, "rb") as fileobj:
@@ -196,9 +196,9 @@ class Client:
                     oss_task_zip,
                     upload_id,
                     part_number,
-                    SizedFileAdapter(fileobj, num_to_upload),
+                    SizedFileAdapter(fileobj, num_to_upload),  # type: ignore[reportPossiblyUnboundVariable]
                 )
-                parts.append(PartInfo(part_number, result.etag))
+                parts.append(PartInfo(part_number, result.etag))  # type: ignore[reportPossiblyUnboundVariable]
                 offset += num_to_upload
                 part_number += 1
             # result = bucket.complete_multipart_upload(oss_task_zip, upload_id, parts)
@@ -278,7 +278,11 @@ class Client:
             return ""
         resp = requests.get(url, headers={"Range": f"bytes={self.last_log_offset}-"})
         self.last_log_offset += len(resp.content)
-
+        try:
+            return resp.content.decode("utf-8")
+        except Exception as e:
+            dlog.error(f"Error decoding job log: {e}", stack_info=ENABLE_STACK)
+            return ""
 
     def _get_job_log(self, job_id):
         ret = self.get(
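The `# type: ignore[reportPossiblyUnboundVariable]` markers added above are the usual workaround when a name comes from an optional import guarded by try/except: if the import fails, the name is unbound and a static checker flags every later use. A minimal sketch of that pattern, assuming this is why the suppressions were needed; the import block below is illustrative, not copied from client.py:

    try:
        import oss2  # optional dependency; may be missing at runtime
    except ImportError:
        pass

    def make_sts_auth(key_id: str, secret: str, token: str):
        # If the import above failed, `oss2` is unbound here, so the checker
        # reports it as possibly unbound; the diff silences this per call site.
        return oss2.StsAuth(key_id, secret, token)  # type: ignore[reportPossiblyUnboundVariable]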
dpdispatcher/utils/hdfs_cli.py
CHANGED
@@ -28,7 +28,7 @@ class HDFS:
             )
         except Exception as e:
             raise RuntimeError(
-                f"Cannot check existence of hdfs uri[{uri}]
+                f"Cannot check existence of hdfs uri[{uri}] with cmd[{cmd}]"
             ) from e
 
     @staticmethod
@@ -48,9 +48,7 @@ class HDFS:
                    f"with cmd[{cmd}]; ret[{ret}] output[{out}] stderr[{err}]"
                )
        except Exception as e:
-            raise RuntimeError(
-                f"Cannot remove hdfs uri[{uri}] " f"with cmd[{cmd}]"
-            ) from e
+            raise RuntimeError(f"Cannot remove hdfs uri[{uri}] with cmd[{cmd}]") from e
 
     @staticmethod
     def mkdir(uri):
@@ -70,7 +68,7 @@ class HDFS:
             )
         except Exception as e:
             raise RuntimeError(
-                f"Cannot mkdir of hdfs uri[{uri}]
+                f"Cannot mkdir of hdfs uri[{uri}] with cmd[{cmd}]"
             ) from e
 
     @staticmethod
@@ -80,7 +78,7 @@ class HDFS:
        """
        # Make sure local_path is accessible
        if not os.path.exists(local_path) or not os.access(local_path, os.R_OK):
-            raise RuntimeError(f"try to access local_path[{local_path}]
+            raise RuntimeError(f"try to access local_path[{local_path}] but failed")
        cmd = f"hadoop fs -copyFromLocal -f {local_path} {to_uri}"
        try:
            ret, out, err = run_cmd_with_all_output(cmd)
@@ -88,10 +86,8 @@ class HDFS:
                return True, out
            else:
                raise RuntimeError(
-                    "Cannot copy local[{}] to remote[{}] with cmd[{}]; "
-                    "ret[{}] output[{}] stderr[{}]"
-                    local_path, to_uri, cmd, ret, out, err
-                )
+                    f"Cannot copy local[{local_path}] to remote[{to_uri}] with cmd[{cmd}]; "
+                    f"ret[{ret}] output[{out}] stderr[{err}]"
                )
        except Exception as e:
            raise RuntimeError(
@@ -113,10 +109,8 @@ class HDFS:
                return True
            else:
                raise RuntimeError(
-                    "Cannot copy remote[{}] to local[{}] with cmd[{}]; "
-                    "ret[{}] output[{}] stderr[{}]"
-                    from_uri, local_path, cmd, ret, out, err
-                )
+                    f"Cannot copy remote[{from_uri}] to local[{local_path}] with cmd[{cmd}]; "
+                    f"ret[{ret}] output[{out}] stderr[{err}]"
                )
        except Exception as e:
            raise RuntimeError(
@@ -136,9 +130,7 @@ class HDFS:
                    f"cmd [{cmd}] ret[{ret}] output[{out}] stderr[{err}]"
                )
        except Exception as e:
-            raise RuntimeError(
-                f"Cannot read text from uri[{uri}]" f"cmd [{cmd}]"
-            ) from e
+            raise RuntimeError(f"Cannot read text from uri[{uri}]cmd [{cmd}]") from e
 
     @staticmethod
     def move(from_uri, to_uri):
@@ -155,6 +147,5 @@ class HDFS:
            )
        except Exception as e:
            raise RuntimeError(
-                f"Cannot move from_uri[{from_uri}] to "
-                f"to_uri[{to_uri}] with cmd[{cmd}]"
+                f"Cannot move from_uri[{from_uri}] to to_uri[{to_uri}] with cmd[{cmd}]"
            ) from e
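All of the HDFS helpers above share one shape: build a `hadoop fs ...` command line, run it, and wrap any failure in a RuntimeError that carries the command (and, where available, the return code and output), chained to the original exception with `from e`. A compact sketch of that shape; the run helper and the exact hadoop subcommand are illustrative stand-ins for run_cmd_with_all_output and the real methods:

    import subprocess

    def run_cmd(cmd: str):
        # Stand-in for dpdispatcher's run_cmd_with_all_output.
        proc = subprocess.run(cmd, shell=True, capture_output=True, text=True)
        return proc.returncode, proc.stdout, proc.stderr

    def exists(uri: str) -> bool:
        cmd = f"hadoop fs -test -e {uri}"  # illustrative subcommand
        try:
            ret, out, err = run_cmd(cmd)
            return ret == 0
        except Exception as e:
            # Same wrapping style as hdfs_cli: keep the uri and cmd in the
            # message and chain the original exception.
            raise RuntimeError(f"Cannot check existence of hdfs uri[{uri}] with cmd[{cmd}]") from e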
dpdispatcher/utils/utils.py
CHANGED
@@ -2,6 +2,7 @@ import base64
 import hashlib
 import hmac
 import os
+import shlex
 import struct
 import subprocess
 import time
@@ -89,6 +90,7 @@ def rsync(
     port: int = 22,
     key_filename: Optional[str] = None,
     timeout: Union[int, float] = 10,
+    proxy_command: Optional[str] = None,
 ):
     """Call rsync to transfer files.
 
@@ -104,6 +106,8 @@ def rsync(
         identity file name
     timeout : int, default=10
         timeout for ssh
+    proxy_command : str, optional
+        ProxyCommand to use for SSH connection
 
     Raises
     ------
@@ -124,20 +128,30 @@ def rsync(
     ]
     if key_filename is not None:
         ssh_cmd.extend(["-i", key_filename])
+
+    # Use proxy_command if provided
+    if proxy_command is not None:
+        ssh_cmd.extend(["-o", f"ProxyCommand={proxy_command}"])
+
+    # Properly escape the SSH command for rsync's -e option
+    ssh_cmd_str = " ".join(shlex.quote(part) for part in ssh_cmd)
+
     cmd = [
         "rsync",
-        # -
-        # -z: compress
-        "-
+        # -r: recursive, -l: links, -p: perms, -t: times, -D: devices/specials
+        # -z: compress (exclude -o: owner, -g: group to avoid permission issues)
+        "-rlptDz",
         "-e",
-
+        ssh_cmd_str,
         "-q",
         from_file,
         to_file,
     ]
-
+    # Convert to string for shell=True
+    cmd_str = " ".join(shlex.quote(arg) for arg in cmd)
+    ret, out, err = run_cmd_with_all_output(cmd_str, shell=True)
     if ret != 0:
-        raise RuntimeError(f"Failed to run {
+        raise RuntimeError(f"Failed to run {cmd_str}: {err}")
 
 
 class RetrySignal(Exception):
@@ -191,7 +205,7 @@ def retry(
             else:
                 # raise all exceptions
                 raise RuntimeError(
-                    "Failed to run
+                    f"Failed to run {func.__name__} for {current_retry} times"
                 ) from errors[-1]
 
     return wrapper
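With the new proxy_command parameter, the ssh invocation is assembled first, each part shell-quoted, and passed to rsync via -e; the rsync argument list is then itself quoted and joined because it is executed with shell=True. A hedged sketch of the resulting command line: hosts, key path, and file paths are placeholders, the base ssh options are illustrative, and only the -i and ProxyCommand handling mirror the diff:

    import shlex

    ssh_cmd = [
        "ssh",
        "-p", "22",
        "-i", "/home/user/.ssh/id_rsa",                       # placeholder key file
        "-o", "ProxyCommand=ssh -W %h:%p jump.example.org",   # placeholder jump host
    ]
    ssh_cmd_str = " ".join(shlex.quote(part) for part in ssh_cmd)

    cmd = [
        "rsync",
        "-rlptDz",          # recursive, preserve links/perms/times/devices, compress
        "-e", ssh_cmd_str,  # the quoted ssh command becomes rsync's transport
        "-q",
        "local_dir/",
        "user@remote.example.org:/scratch/dir/",
    ]
    # This joined string is what would be handed to the shell=True runner.
    print(" ".join(shlex.quote(arg) for arg in cmd))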