dpdispatcher 0.5.11__py3-none-any.whl → 0.6.1__py3-none-any.whl
- dpdispatcher/__init__.py +7 -89
- dpdispatcher/__main__.py +8 -0
- dpdispatcher/_version.py +14 -2
- dpdispatcher/base_context.py +1 -1
- dpdispatcher/contexts/__init__.py +11 -0
- dpdispatcher/{dp_cloud_server_context.py → contexts/dp_cloud_server_context.py} +7 -3
- dpdispatcher/{hdfs_context.py → contexts/hdfs_context.py} +2 -2
- dpdispatcher/{local_context.py → contexts/local_context.py} +51 -14
- dpdispatcher/{openapi_context.py → contexts/openapi_context.py} +3 -2
- dpdispatcher/{ssh_context.py → contexts/ssh_context.py} +113 -34
- dpdispatcher/dlog.py +31 -0
- dpdispatcher/dpdisp.py +113 -1
- dpdispatcher/entrypoints/__init__.py +1 -0
- dpdispatcher/entrypoints/gui.py +31 -0
- dpdispatcher/entrypoints/submission.py +83 -0
- dpdispatcher/machine.py +18 -4
- dpdispatcher/machines/__init__.py +11 -0
- dpdispatcher/{distributed_shell.py → machines/distributed_shell.py} +20 -4
- dpdispatcher/{dp_cloud_server.py → machines/dp_cloud_server.py} +21 -5
- dpdispatcher/{fugaku.py → machines/fugaku.py} +18 -5
- dpdispatcher/{lsf.py → machines/lsf.py} +20 -4
- dpdispatcher/{openapi.py → machines/openapi.py} +23 -4
- dpdispatcher/{pbs.py → machines/pbs.py} +30 -4
- dpdispatcher/{shell.py → machines/shell.py} +17 -3
- dpdispatcher/{slurm.py → machines/slurm.py} +37 -6
- dpdispatcher/submission.py +83 -39
- dpdispatcher/utils/__init__.py +1 -0
- dpdispatcher/{dpcloudserver → utils/dpcloudserver}/client.py +1 -1
- dpdispatcher/{hdfs_cli.py → utils/hdfs_cli.py} +1 -1
- dpdispatcher/utils/record.py +79 -0
- dpdispatcher/{utils.py → utils/utils.py} +14 -2
- {dpdispatcher-0.5.11.dist-info → dpdispatcher-0.6.1.dist-info}/METADATA +7 -2
- dpdispatcher-0.6.1.dist-info/RECORD +44 -0
- {dpdispatcher-0.5.11.dist-info → dpdispatcher-0.6.1.dist-info}/WHEEL +1 -1
- dpdispatcher-0.6.1.dist-info/entry_points.txt +7 -0
- dpdispatcher/dpcloudserver/temp_test.py +0 -90
- dpdispatcher-0.5.11.dist-info/RECORD +0 -36
- dpdispatcher-0.5.11.dist-info/entry_points.txt +0 -2
- /dpdispatcher/{lazy_local_context.py → contexts/lazy_local_context.py} +0 -0
- /dpdispatcher/{dpcloudserver → utils/dpcloudserver}/__init__.py +0 -0
- /dpdispatcher/{dpcloudserver → utils/dpcloudserver}/config.py +0 -0
- /dpdispatcher/{dpcloudserver → utils/dpcloudserver}/retcode.py +0 -0
- /dpdispatcher/{dpcloudserver → utils/dpcloudserver}/zip_file.py +0 -0
- /dpdispatcher/{JobStatus.py → utils/job_status.py} +0 -0
- {dpdispatcher-0.5.11.dist-info → dpdispatcher-0.6.1.dist-info}/LICENSE +0 -0
- {dpdispatcher-0.5.11.dist-info → dpdispatcher-0.6.1.dist-info}/top_level.txt +0 -0
dpdispatcher/dpdisp.py
CHANGED
@@ -1,8 +1,120 @@
 #!/usr/bin/env python
+import argparse
+from typing import List, Optional
+
+from dpdispatcher.entrypoints.gui import start_dpgui
+from dpdispatcher.entrypoints.submission import handle_submission
+
+
+def main_parser() -> argparse.ArgumentParser:
+    """Dpdispatcher commandline options argument parser.
+
+    Notes
+    -----
+    This function is used by documentation.
+
+    Returns
+    -------
+    argparse.ArgumentParser
+        the argument parser
+    """
+    parser = argparse.ArgumentParser(
+        description="dpdispatcher: Generate HPC scheduler systems jobs input scripts, submit these scripts to HPC systems, and poke until they finish",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    subparsers = parser.add_subparsers(title="Valid subcommands", dest="command")
+    ##########################################
+    # backward
+    parser_submission = subparsers.add_parser(
+        "submission",
+        help="Handle terminated submission.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser_submission.add_argument(
+        "SUBMISSION_HASH",
+        type=str,
+        help="Submission hash to download.",
+    )
+    parser_submission_action = parser_submission.add_argument_group(
+        "Actions",
+        description="One or more actions to take on submission.",
+    )
+    parser_submission_action.add_argument(
+        "--download-terminated-log",
+        action="store_true",
+        help="Download log files of terminated tasks.",
+    )
+    parser_submission_action.add_argument(
+        "--download-finished-task",
+        action="store_true",
+        help="Download finished tasks.",
+    )
+    parser_submission_action.add_argument(
+        "--clean",
+        action="store_true",
+        help="Clean submission.",
+    )
+    ##########################################
+    # gui
+    parser_gui = subparsers.add_parser(
+        "gui",
+        help="Serve DP-GUI.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser_gui.add_argument(
+        "-p",
+        "--port",
+        type=int,
+        default=6042,
+        help="The port to serve DP-GUI on.",
+    )
+    parser_gui.add_argument(
+        "--bind_all",
+        action="store_true",
+        help=(
+            "Serve on all public interfaces. This will expose your DP-GUI instance "
+            "to the network on both IPv4 and IPv6 (where available)."
+        ),
+    )
+    return parser
+
+
+def parse_args(args: Optional[List[str]] = None):
+    """Dpdispatcher commandline options argument parsing.
+
+    Parameters
+    ----------
+    args : List[str]
+        list of command line arguments, main purpose is testing default option None
+        takes arguments from sys.argv
+    """
+    parser = main_parser()
+
+    parsed_args = parser.parse_args(args=args)
+    if parsed_args.command is None:
+        parser.print_help()
+
+    return parsed_args
 
 
 def main():
-
+    args = parse_args()
+    if args.command == "submission":
+        handle_submission(
+            submission_hash=args.SUBMISSION_HASH,
+            download_terminated_log=args.download_terminated_log,
+            download_finished_task=args.download_finished_task,
+            clean=args.clean,
+        )
+    elif args.command == "gui":
+        start_dpgui(
+            port=args.port,
+            bind_all=args.bind_all,
+        )
+    elif args.command is None:
+        pass
+    else:
+        raise RuntimeError(f"unknown command {args.command}")
 
 
 if __name__ == "__main__":
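Note: the new `dpdisp` console script routes these subcommands through `parse_args`. A minimal sketch of exercising the parser directly (the submission hash is a placeholder):

    from dpdispatcher.dpdisp import parse_args

    # Equivalent to: dpdisp submission f1a2b3 --download-terminated-log
    args = parse_args(["submission", "f1a2b3", "--download-terminated-log"])
    assert args.command == "submission"
    assert args.SUBMISSION_HASH == "f1a2b3"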
dpdispatcher/entrypoints/__init__.py
ADDED
@@ -0,0 +1 @@
+"""Entry points."""
dpdispatcher/entrypoints/gui.py
ADDED
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""DP-GUI entrypoint."""
+
+
+def start_dpgui(*, port: int, bind_all: bool, **kwargs):
+    """Host DP-GUI server.
+
+    Parameters
+    ----------
+    port : int
+        The port to serve DP-GUI on.
+    bind_all : bool
+        Serve on all public interfaces. This will expose your DP-GUI instance
+        to the network on both IPv4 and IPv6 (where available).
+    **kwargs
+        additional arguments
+
+    Raises
+    ------
+    ModuleNotFoundError
+        The dpgui package is not installed
+    """
+    try:
+        from dpgui import (
+            start_dpgui,
+        )
+    except ModuleNotFoundError as e:
+        raise ModuleNotFoundError(
+            "To use DP-GUI, please install the dpgui package:\npip install dpgui"
+        ) from e
+    start_dpgui(port=port, bind_all=bind_all)
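Note: `start_dpgui` imports the optional dpgui package lazily, so the dependency is only required when the GUI is actually served. A minimal usage sketch (requires `pip install dpgui`):

    from dpdispatcher.entrypoints.gui import start_dpgui

    # Serve on localhost only; bind_all=True exposes the instance to the network.
    start_dpgui(port=6042, bind_all=False)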
dpdispatcher/entrypoints/submission.py
ADDED
@@ -0,0 +1,83 @@
+from pathlib import Path
+
+from dpdispatcher.dlog import dlog
+from dpdispatcher.submission import Submission
+from dpdispatcher.utils.job_status import JobStatus
+from dpdispatcher.utils.record import record
+
+
+def handle_submission(
+    *,
+    submission_hash: str,
+    download_terminated_log: bool = False,
+    download_finished_task: bool = False,
+    clean: bool = False,
+):
+    """Handle terminated submission.
+
+    Parameters
+    ----------
+    submission_hash : str
+        Submission hash to download.
+    download_terminated_log : bool, optional
+        Download log files of terminated tasks.
+    download_finished_task : bool, optional
+        Download finished tasks.
+    clean : bool, optional
+        Clean submission.
+
+    Raises
+    ------
+    ValueError
+        At least one action should be specified.
+    """
+    if int(download_terminated_log) + int(download_finished_task) + int(clean) == 0:
+        raise ValueError("At least one action should be specified.")
+
+    submission_file = record.get_submission(submission_hash)
+    submission = Submission.submission_from_json(str(submission_file))
+    submission.belonging_tasks = [
+        task for job in submission.belonging_jobs for task in job.job_task_list
+    ]
+    # TODO: for unclear reason, the submission_hash may be changed
+    submission.submission_hash = submission_hash
+    submission.machine.context.bind_submission(submission)
+    submission.update_submission_state()
+
+    terminated_tasks = []
+    finished_tasks = []
+    for task in submission.belonging_tasks:
+        task.get_task_state(submission.machine.context)
+        if task.task_state == JobStatus.terminated:
+            terminated_tasks.append(task)
+        elif task.task_state == JobStatus.finished:
+            finished_tasks.append(task)
+    submission.belonging_tasks = []
+
+    if download_terminated_log:
+        for task in terminated_tasks:
+            task.backward_files = [task.outlog, task.errlog]
+        submission.belonging_tasks += terminated_tasks
+    if download_finished_task:
+        submission.belonging_tasks += finished_tasks
+
+    submission.download_jobs()
+
+    if download_terminated_log:
+        terminated_log_files = []
+        for task in terminated_tasks:
+            assert submission.local_root is not None
+            terminated_log_files.append(
+                Path(submission.local_root) / task.task_work_path / task.outlog
+            )
+            terminated_log_files.append(
+                Path(submission.local_root) / task.task_work_path / task.errlog
+            )
+
+        dlog.info(
+            "Terminated logs are downloaded into:\n "
+            + "\n ".join([str(f) for f in terminated_log_files])
+        )
+
+    if clean:
+        submission.clean_jobs()
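Note: `handle_submission` is the programmatic counterpart of the `dpdisp submission` subcommand; it raises ValueError unless at least one action flag is set. A minimal sketch (the hash is a placeholder for a previously recorded submission):

    from dpdispatcher.entrypoints.submission import handle_submission

    # Re-attach to a recorded submission and fetch the logs of terminated tasks.
    handle_submission(
        submission_hash="f1a2b3",
        download_terminated_log=True,
    )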
dpdispatcher/machine.py
CHANGED
@@ -4,10 +4,11 @@ import shlex
 from abc import ABCMeta, abstractmethod
 from typing import List, Tuple
 
+import yaml
 from dargs import Argument, Variant
 
-from dpdispatcher import dlog
 from dpdispatcher.base_context import BaseContext
+from dpdispatcher.dlog import dlog
 
 script_template = """\
 {script_header}
@@ -35,7 +36,7 @@ cd {task_work_path}
 test $? -ne 0 && exit 1
 if [ ! -f {task_tag_finished} ] ;then
   {command_env} ( {command} ) {log_err_part}
-  if test $? -eq 0; then touch {task_tag_finished}; else echo 1 > $REMOTE_ROOT/{flag_if_job_task_fail};fi
+  if test $? -eq 0; then touch {task_tag_finished}; else echo 1 > $REMOTE_ROOT/{flag_if_job_task_fail};tail -v -c 1000 $REMOTE_ROOT/{task_work_path}/{err_file} > $REMOTE_ROOT/{last_err_file};fi
 fi &
 """
 
@@ -124,6 +125,13 @@ class Machine(metaclass=ABCMeta):
         machine = cls.load_from_dict(machine_dict=machine_dict)
         return machine
 
+    @classmethod
+    def load_from_yaml(cls, yaml_path):
+        with open(yaml_path) as f:
+            machine_dict = yaml.safe_load(f)
+        machine = cls.load_from_dict(machine_dict=machine_dict)
+        return machine
+
     @classmethod
     def load_from_dict(cls, machine_dict):
         batch_type = machine_dict["batch_type"]
@@ -191,17 +199,20 @@ class Machine(metaclass=ABCMeta):
             "abstract method do_submit should be implemented by derived class"
         )
 
+    def gen_script_run_command(self, job):
+        return f"source $REMOTE_ROOT/{job.script_file_name}.run"
+
     def gen_script(self, job):
         script_header = self.gen_script_header(job)
         script_custom_flags = self.gen_script_custom_flags_lines(job)
         script_env = self.gen_script_env(job)
-        script_command = self.gen_script_command(job)
+        script_run_command = self.gen_script_run_command(job)
         script_end = self.gen_script_end(job)
         script = script_template.format(
             script_header=script_header,
             script_custom_flags=script_custom_flags,
             script_env=script_env,
-            script_command=script_command,
+            script_command=script_run_command,
             script_end=script_end,
         )
         return script
@@ -295,6 +306,7 @@ class Machine(metaclass=ABCMeta):
             log_err_part += f"2>>{shlex.quote(task.errlog)} "
 
         flag_if_job_task_fail = job.job_hash + "_flag_if_job_task_fail"
+        last_err_file = job.job_hash + "_last_err_file"
         single_script_command = script_command_template.format(
             flag_if_job_task_fail=flag_if_job_task_fail,
             command_env=command_env,
@@ -304,6 +316,8 @@
             command=task.command,
             task_tag_finished=task_tag_finished,
             log_err_part=log_err_part,
+            err_file=shlex.quote(task.errlog),
+            last_err_file=shlex.quote(last_err_file),
         )
         script_command += single_script_command
 
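Note: `load_from_yaml` mirrors the existing JSON loader, reading the same machine dict from YAML; the other change in this file moves the per-task command block into a separate `{script}.run` file that the main script sources via `gen_script_run_command`. A minimal sketch of the YAML loader (file name and field values are illustrative):

    from dpdispatcher.machine import Machine

    # machine.yaml might contain, e.g.:
    #   batch_type: Shell
    #   context_type: LocalContext
    #   local_root: ./
    #   remote_root: /tmp/dpdispatcher_work
    machine = Machine.load_from_yaml("machine.yaml")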
dpdispatcher/machines/__init__.py
ADDED
@@ -0,0 +1,11 @@
+"""Machines."""
+import importlib
+from pathlib import Path
+
+PACKAGE_BASE = "dpdispatcher.machines"
+NOT_LOADABLE = ("__init__.py",)
+
+for module_file in Path(__file__).parent.glob("*.py"):
+    if module_file.name not in NOT_LOADABLE:
+        module_name = f".{module_file.stem}"
+        importlib.import_module(module_name, PACKAGE_BASE)
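Note: importing `dpdispatcher.machines` runs the glob loop above, so every machine module is loaded and its Machine subclass is defined before `load_from_dict` resolves `batch_type`. A quick check (a sketch; assumes dpdispatcher 0.6.x is installed):

    import sys

    import dpdispatcher.machines  # triggers the loader above

    assert "dpdispatcher.machines.shell" in sys.modules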
dpdispatcher/{distributed_shell.py → machines/distributed_shell.py}
RENAMED
@@ -1,7 +1,10 @@
-from dpdispatcher import dlog
-from dpdispatcher.JobStatus import JobStatus
+from dpdispatcher.dlog import dlog
 from dpdispatcher.machine import Machine
-from dpdispatcher.utils import run_cmd_with_all_output
+from dpdispatcher.utils.job_status import JobStatus
+from dpdispatcher.utils.utils import (
+    customized_script_header_template,
+    run_cmd_with_all_output,
+)
 
 shell_script_header_template = """
 #!/bin/bash -l
@@ -112,7 +115,17 @@ class DistributedShell(Machine):
         return script_end
 
     def gen_script_header(self, job):
-        shell_script_header = shell_script_header_template
+        resources = job.resources
+        if (
+            resources["strategy"].get("customized_script_header_template_file")
+            is not None
+        ):
+            shell_script_header = customized_script_header_template(
+                resources["strategy"]["customized_script_header_template_file"],
+                resources,
+            )
+        else:
+            shell_script_header = shell_script_header_template
         return shell_script_header
 
     def do_submit(self, job):
@@ -133,6 +146,9 @@ class DistributedShell(Machine):
         job_id_name = job.job_hash + "_job_id"
         output_name = job.job_hash + ".out"
         self.context.write_file(fname=script_file_name, write_str=script_str)
+        script_run_str = self.gen_script_command(job)
+        script_run_file_name = f"{job.script_file_name}.run"
+        self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
 
         resources = job.resources
         submit_command = (
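Note: the `gen_script_header` change above recurs in the machines below (Bohrium, Fugaku, LSF, OpenAPI, PBS/Torque, Shell): when `strategy.customized_script_header_template_file` is set in the resources, that file is rendered against the resources and replaces the built-in header template. A minimal sketch (path and template fields are illustrative):

    # Part of a resources dict enabling a custom script header.
    resources_dict = {
        "number_node": 1,
        "cpu_per_node": 4,
        "gpu_per_node": 0,
        "queue_name": "normal",
        "group_size": 1,
        "strategy": {
            "customized_script_header_template_file": "./header.tpl",
        },
    }
    # header.tpl might contain, e.g.:
    #   #!/bin/bash
    #   #SBATCH --partition {queue_name}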
dpdispatcher/{dp_cloud_server.py → machines/dp_cloud_server.py}
RENAMED
@@ -4,11 +4,12 @@ import time
 import uuid
 import warnings
 
-from dpdispatcher import dlog
-from dpdispatcher.dpcloudserver import Client, zip_file
-from dpdispatcher.dpcloudserver.config import ALI_OSS_BUCKET_URL
-from dpdispatcher.JobStatus import JobStatus
+from dpdispatcher.dlog import dlog
 from dpdispatcher.machine import Machine
+from dpdispatcher.utils.dpcloudserver import Client, zip_file
+from dpdispatcher.utils.dpcloudserver.config import ALI_OSS_BUCKET_URL
+from dpdispatcher.utils.job_status import JobStatus
+from dpdispatcher.utils.utils import customized_script_header_template
 
 shell_script_header_template = """
 #!/bin/bash -l
@@ -71,13 +72,28 @@ class Bohrium(Machine):
         return shell_script
 
     def gen_script_header(self, job):
-        shell_script_header = shell_script_header_template
+        resources = job.resources
+        if (
+            resources["strategy"].get("customized_script_header_template_file")
+            is not None
+        ):
+            shell_script_header = customized_script_header_template(
+                resources["strategy"]["customized_script_header_template_file"],
+                resources,
+            )
+        else:
+            shell_script_header = shell_script_header_template
         return shell_script_header
 
     def gen_local_script(self, job):
         script_str = self.gen_script(job)
         script_file_name = job.script_file_name
         self.context.write_local_file(fname=script_file_name, write_str=script_str)
+        script_run_str = self.gen_script_command(job)
+        script_run_file_name = f"{job.script_file_name}.run"
+        self.context.write_local_file(
+            fname=script_run_file_name, write_str=script_run_str
+        )
         return script_file_name
 
     def _gen_backward_files_list(self, job):
dpdispatcher/{fugaku.py → machines/fugaku.py}
RENAMED
@@ -1,8 +1,9 @@
 import shlex
 
-from dpdispatcher import dlog
-from dpdispatcher.JobStatus import JobStatus
+from dpdispatcher.dlog import dlog
 from dpdispatcher.machine import Machine
+from dpdispatcher.utils.job_status import JobStatus
+from dpdispatcher.utils.utils import customized_script_header_template
 
 fugaku_script_header_template = """\
 {queue_name_line}
@@ -28,9 +29,18 @@ class Fugaku(Machine):
         fugaku_script_header_dict[
             "queue_name_line"
         ] = f'#PJM -L "rscgrp={resources.queue_name}"'
-        fugaku_script_header = fugaku_script_header_template.format(
-            **fugaku_script_header_dict
-        )
+        if (
+            resources["strategy"].get("customized_script_header_template_file")
+            is not None
+        ):
+            fugaku_script_header = customized_script_header_template(
+                resources["strategy"]["customized_script_header_template_file"],
+                resources,
+            )
+        else:
+            fugaku_script_header = fugaku_script_header_template.format(
+                **fugaku_script_header_dict
+            )
         return fugaku_script_header
 
     def do_submit(self, job):
@@ -39,6 +49,9 @@ class Fugaku(Machine):
         job_id_name = job.job_hash + "_job_id"
         # script_str = self.sub_script(job_dirs, cmd, args=args, resources=resources, outlog=outlog, errlog=errlog)
         self.context.write_file(fname=script_file_name, write_str=script_str)
+        script_run_str = self.gen_script_command(job)
+        script_run_file_name = f"{job.script_file_name}.run"
+        self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
         # self.context.write_file(fname=os.path.join(self.context.submission.work_base, script_file_name), write_str=script_str)
         # script_file_dir = os.path.join(self.context.submission.work_base)
         script_file_dir = self.context.remote_root
dpdispatcher/{lsf.py → machines/lsf.py}
RENAMED
@@ -3,10 +3,14 @@ from typing import List
 
 from dargs import Argument
 
-from dpdispatcher import dlog
-from dpdispatcher.JobStatus import JobStatus
+from dpdispatcher.dlog import dlog
 from dpdispatcher.machine import Machine
-from dpdispatcher.utils import RetrySignal, retry
+from dpdispatcher.utils.job_status import JobStatus
+from dpdispatcher.utils.utils import (
+    RetrySignal,
+    customized_script_header_template,
+    retry,
+)
 
 lsf_script_header_template = """\
 #!/bin/bash -l
@@ -60,7 +64,16 @@ class LSF(Machine):
             script_header_dict["lsf_number_gpu_line"] = ""
         else:
             script_header_dict["lsf_number_gpu_line"] = custom_gpu_line
-        lsf_script_header = lsf_script_header_template.format(**script_header_dict)
+        if (
+            resources["strategy"].get("customized_script_header_template_file")
+            is not None
+        ):
+            lsf_script_header = customized_script_header_template(
+                resources["strategy"]["customized_script_header_template_file"],
+                resources,
+            )
+        else:
+            lsf_script_header = lsf_script_header_template.format(**script_header_dict)
 
         return lsf_script_header
 
@@ -70,6 +83,9 @@ class LSF(Machine):
         script_str = self.gen_script(job)
         job_id_name = job.job_hash + "_job_id"
         self.context.write_file(fname=script_file_name, write_str=script_str)
+        script_run_str = self.gen_script_command(job)
+        script_run_file_name = f"{job.script_file_name}.run"
+        self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
 
         try:
             stdin, stdout, stderr = self.context.block_checkcall(
dpdispatcher/{openapi.py → machines/openapi.py}
RENAMED
@@ -2,6 +2,8 @@ import os
 import shutil
 import time
 
+from dpdispatcher.utils.utils import customized_script_header_template
+
 try:
     from bohriumsdk.client import Client
     from bohriumsdk.job import Job
@@ -12,9 +14,9 @@ except ModuleNotFoundError:
 else:
     found_bohriumsdk = True
 
-from dpdispatcher import dlog
-from dpdispatcher.JobStatus import JobStatus
+from dpdispatcher.dlog import dlog
 from dpdispatcher.machine import Machine
+from dpdispatcher.utils.job_status import JobStatus
 
 shell_script_header_template = """
 #!/bin/bash -l
@@ -43,13 +45,28 @@ class OpenAPI(Machine):
         return shell_script
 
     def gen_script_header(self, job):
-        shell_script_header = shell_script_header_template
+        resources = job.resources
+        if (
+            resources["strategy"].get("customized_script_header_template_file")
+            is not None
+        ):
+            shell_script_header = customized_script_header_template(
+                resources["strategy"]["customized_script_header_template_file"],
+                resources,
+            )
+        else:
+            shell_script_header = shell_script_header_template
         return shell_script_header
 
     def gen_local_script(self, job):
         script_str = self.gen_script(job)
         script_file_name = job.script_file_name
         self.context.write_local_file(fname=script_file_name, write_str=script_str)
+        script_run_str = self.gen_script_command(job)
+        script_run_file_name = f"{job.script_file_name}.run"
+        self.context.write_local_file(
+            fname=script_run_file_name, write_str=script_run_str
+        )
         return script_file_name
 
     def _gen_backward_files_list(self, job):
@@ -130,7 +147,9 @@ class OpenAPI(Machine):
         )
 
         job_state = self.map_dp_job_state(
-            dp_job_status,
+            dp_job_status,
+            check_return.get("exitCode", 0),  # type: ignore
+            self.ignore_exit_code,
         )
         if job_state == JobStatus.finished:
             job_log = self.job.log(job_id)
dpdispatcher/{pbs.py → machines/pbs.py}
RENAMED
@@ -1,8 +1,9 @@
 import shlex
 
-from dpdispatcher import dlog
-from dpdispatcher.JobStatus import JobStatus
+from dpdispatcher.dlog import dlog
 from dpdispatcher.machine import Machine
+from dpdispatcher.utils.job_status import JobStatus
+from dpdispatcher.utils.utils import customized_script_header_template
 
 pbs_script_header_template = """
 #!/bin/bash -l
@@ -28,7 +29,18 @@ class PBS(Machine):
                 "select_node_line"
             ] += f":ngpus={resources.gpu_per_node}"
         pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}"
-        pbs_script_header = pbs_script_header_template.format(**pbs_script_header_dict)
+        if (
+            resources["strategy"].get("customized_script_header_template_file")
+            is not None
+        ):
+            pbs_script_header = customized_script_header_template(
+                resources["strategy"]["customized_script_header_template_file"],
+                resources,
+            )
+        else:
+            pbs_script_header = pbs_script_header_template.format(
+                **pbs_script_header_dict
+            )
         return pbs_script_header
 
     def do_submit(self, job):
@@ -37,6 +49,9 @@ class PBS(Machine):
         job_id_name = job.job_hash + "_job_id"
         # script_str = self.sub_script(job_dirs, cmd, args=args, resources=resources, outlog=outlog, errlog=errlog)
         self.context.write_file(fname=script_file_name, write_str=script_str)
+        script_run_str = self.gen_script_command(job)
+        script_run_file_name = f"{job.script_file_name}.run"
+        self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
         # self.context.write_file(fname=os.path.join(self.context.submission.work_base, script_file_name), write_str=script_str)
         # script_file_dir = os.path.join(self.context.submission.work_base)
         script_file_dir = self.context.remote_root
@@ -149,5 +164,16 @@ class Torque(PBS):
             gpu_per_node=resources.gpu_per_node
         )
         pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}"
-        pbs_script_header = pbs_script_header_template.format(**pbs_script_header_dict)
+        if (
+            resources["strategy"].get("customized_script_header_template_file")
+            is not None
+        ):
+            pbs_script_header = customized_script_header_template(
+                resources["strategy"]["customized_script_header_template_file"],
+                resources,
+            )
+        else:
+            pbs_script_header = pbs_script_header_template.format(
+                **pbs_script_header_dict
+            )
         return pbs_script_header
dpdispatcher/{shell.py → machines/shell.py}
RENAMED
@@ -1,8 +1,9 @@
 import shlex
 
-from dpdispatcher import dlog
-from dpdispatcher.JobStatus import JobStatus
+from dpdispatcher.dlog import dlog
 from dpdispatcher.machine import Machine
+from dpdispatcher.utils.job_status import JobStatus
+from dpdispatcher.utils.utils import customized_script_header_template
 
 shell_script_header_template = """
 #!/bin/bash -l
@@ -15,7 +16,17 @@ class Shell(Machine):
         return shell_script
 
     def gen_script_header(self, job):
-        shell_script_header = shell_script_header_template
+        resources = job.resources
+        if (
+            resources["strategy"].get("customized_script_header_template_file")
+            is not None
+        ):
+            shell_script_header = customized_script_header_template(
+                resources["strategy"]["customized_script_header_template_file"],
+                resources,
+            )
+        else:
+            shell_script_header = shell_script_header_template
         return shell_script_header
 
     def do_submit(self, job):
@@ -24,6 +35,9 @@ class Shell(Machine):
         job_id_name = job.job_hash + "_job_id"
         output_name = job.job_hash + ".out"
         self.context.write_file(fname=script_file_name, write_str=script_str)
+        script_run_str = self.gen_script_command(job)
+        script_run_file_name = f"{job.script_file_name}.run"
+        self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
         ret, stdin, stdout, stderr = self.context.block_call(
             "cd {} && {{ nohup bash {} 1>>{} 2>>{} & }} && echo $!".format(
                 shlex.quote(self.context.remote_root),