dpdispatcher 0.5.8__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dpdispatcher might be problematic.
- dpdispatcher/__init__.py +4 -0
- dpdispatcher/_version.py +2 -2
- dpdispatcher/dp_cloud_server.py +7 -0
- dpdispatcher/dp_cloud_server_context.py +10 -7
- dpdispatcher/dpcloudserver/client.py +22 -9
- dpdispatcher/fugaku.py +1 -3
- dpdispatcher/hdfs_cli.py +4 -12
- dpdispatcher/hdfs_context.py +1 -4
- dpdispatcher/lsf.py +2 -6
- dpdispatcher/machine.py +1 -3
- dpdispatcher/openapi.py +198 -0
- dpdispatcher/openapi_context.py +259 -0
- dpdispatcher/pbs.py +4 -12
- dpdispatcher/slurm.py +2 -6
- dpdispatcher/submission.py +9 -19
- {dpdispatcher-0.5.8.dist-info → dpdispatcher-0.5.10.dist-info}/METADATA +12 -1
- dpdispatcher-0.5.10.dist-info/RECORD +36 -0
- {dpdispatcher-0.5.8.dist-info → dpdispatcher-0.5.10.dist-info}/WHEEL +1 -1
- dpdispatcher-0.5.8.dist-info/RECORD +0 -34
- {dpdispatcher-0.5.8.dist-info → dpdispatcher-0.5.10.dist-info}/LICENSE +0 -0
- {dpdispatcher-0.5.8.dist-info → dpdispatcher-0.5.10.dist-info}/entry_points.txt +0 -0
- {dpdispatcher-0.5.8.dist-info → dpdispatcher-0.5.10.dist-info}/top_level.txt +0 -0
dpdispatcher/__init__.py
CHANGED
@@ -49,6 +49,8 @@ from .lazy_local_context import LazyLocalContext
 from .local_context import LocalContext
 from .lsf import LSF
 from .machine import Machine
+from .openapi import OpenAPI
+from .openapi_context import OpenAPIContext
 from .pbs import PBS, Torque
 from .shell import Shell
 from .slurm import Slurm
@@ -77,6 +79,8 @@ __all__ = [
     "__version__",
     "DistributedShell",
     "DpCloudServer",
+    "OpenAPI",
+    "OpenAPIContext",
     "DpCloudServerContext",
     "HDFSContext",
     "LazyLocalContext",
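With these additions the two new classes are re-exported from the package root, so user code can import them directly. A minimal check, assuming dpdispatcher 0.5.10 is installed:

```python
# Both names are re-exported by dpdispatcher/__init__.py as of 0.5.10.
from dpdispatcher import OpenAPI, OpenAPIContext

print(OpenAPI.__name__, OpenAPIContext.__name__)
```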
dpdispatcher/_version.py
CHANGED
dpdispatcher/dp_cloud_server.py
CHANGED
@@ -31,6 +31,13 @@ class Bohrium(Machine):
         phone = context.remote_profile.get("phone", None)
         username = context.remote_profile.get("username", None)
         password = context.remote_profile.get("password", None)
+
+        ticket = os.environ.get("BOHR_TICKET", None)
+        if ticket:
+            self.api = Client(ticket=ticket)
+            self.group_id = None
+            return
+
         if email is None and username is not None:
             raise DeprecationWarning(
                 "username is no longer support in current version, "
dpdispatcher/dp_cloud_server_context.py
CHANGED
@@ -21,7 +21,7 @@ DP_CLOUD_SERVER_HOME_DIR = os.path.join(
     os.path.expanduser("~"), ".dpdispatcher/", "dp_cloud_server/"
 )
 ENDPOINT = "http://oss-cn-shenzhen.aliyuncs.com"
-BUCKET_NAME = "dpcloudserver"
+BUCKET_NAME = os.environ.get("BUCKET_NAME", "dpcloudserver")
 
 
 class BohriumContext(BaseContext):
@@ -39,9 +39,16 @@ class BohriumContext(BaseContext):
         self.init_remote_root = remote_root
         self.temp_local_root = os.path.abspath(local_root)
         self.remote_profile = remote_profile
+        ticket = os.environ.get("BOHR_TICKET", None)
         email = remote_profile.get("email", None)
         phone = remote_profile.get("phone", None)
         password = remote_profile.get("password")
+        os.makedirs(DP_CLOUD_SERVER_HOME_DIR, exist_ok=True)
+
+        if ticket is not None:
+            self.api = Client(ticket=ticket)
+            return
+
         if email is None and phone is None:
             raise ValueError(
                 "can not find email/phone number in remote_profile, please check your machine file."
@@ -57,8 +64,6 @@ class BohriumContext(BaseContext):
 
         self.api = Client(account, password)
 
-        os.makedirs(DP_CLOUD_SERVER_HOME_DIR, exist_ok=True)
-
     @classmethod
     def load_from_dict(cls, context_dict):
         local_root = context_dict["local_root"]
@@ -256,9 +261,7 @@ class BohriumContext(BaseContext):
         return os.path.isfile(os.path.join(DP_CLOUD_SERVER_HOME_DIR, fname))
 
     def clean(self):
-        submission_file_name = "{submission_hash}.json".format(
-            submission_hash=self.submission.submission_hash
-        )
+        submission_file_name = f"{self.submission.submission_hash}.json"
         submission_json = os.path.join(DP_CLOUD_SERVER_HOME_DIR, submission_file_name)
         os.remove(submission_json)
         return True
@@ -288,7 +291,7 @@ class BohriumContext(BaseContext):
             dict,
             [
                 Argument("email", str, optional=True, doc="Email"),
-                Argument("password", str, optional=
+                Argument("password", str, optional=True, doc="Password"),
                 Argument(
                     "program_id",
                     int,
dpdispatcher/dpcloudserver/client.py
CHANGED
@@ -25,7 +25,9 @@ class RequestInfoException(Exception):
 
 
 class Client:
-    def __init__(self, email=None, password=None, debug=False, base_url=API_HOST):
+    def __init__(
+        self, email=None, password=None, debug=False, ticket=None, base_url=API_HOST
+    ):
         self.debug = debug
         self.debug = os.getenv("LBG_CLI_DEBUG_PRINT", debug)
         self.config = {}
@@ -35,6 +37,7 @@ class Client:
         self.config["password"] = password
         self.base_url = base_url
         self.last_log_offset = 0
+        self.ticket = ticket
 
     def post(self, url, data=None, header=None, params=None, retry=5):
         return self._req(
@@ -51,19 +54,26 @@ class Client:
         header = {}
         if not self.token:
             self.refresh_token()
+            self.ticket = os.environ.get("BOHR_TICKET", "")
         header["Authorization"] = f"jwt {self.token}"
+        header["Brm-Ticket"] = self.ticket
         resp_code = None
         err = None
         for i in range(retry):
             resp = None
-            ... (seven deleted lines; their content was not captured in this diff view)
+            try:
+                if method == "GET":
+                    resp = requests.get(url, params=params, headers=header)
+                else:
+                    if self.debug:
+                        print(data)
+                    resp = requests.post(url, json=data, params=params, headers=header)
+            except Exception as e:
+                dlog.error(f"request({i}) error {e}", i, stack_info=ENABLE_STACK)
+                err = e
+                time.sleep(1 * i)
+                continue
+
             resp_code = resp.status_code
             if not resp.ok:
                 if self.debug:
@@ -96,6 +106,9 @@ class Client:
         self.user_id = resp["user_id"]
 
     def refresh_token(self, retry=3):
+        self.ticket = os.environ.get("BOHR_TICKET", "")
+        if self.ticket:
+            return
         url = "/account/login"
         post_data = {"email": self.config["email"], "password": self.config["password"]}
         resp_code = None
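The recurring change across dp_cloud_server.py, dp_cloud_server_context.py, and client.py is a BOHR_TICKET escape hatch: when the environment variable is set, the ticket is sent in a Brm-Ticket header and the email/password login in refresh_token is skipped. A minimal sketch of the pattern, using a hypothetical stand-in class rather than the package's own Client:

```python
import os


# Hypothetical stand-in illustrating the BOHR_TICKET short-circuit;
# not the package's actual Client class.
class TicketAwareClient:
    def __init__(self, email=None, password=None, ticket=None):
        self.email = email
        self.password = password
        # An explicit ticket or the BOHR_TICKET environment variable
        # takes precedence over email/password credentials.
        self.ticket = ticket or os.environ.get("BOHR_TICKET", "")

    def auth_headers(self, token=""):
        # Mirrors the diff: a jwt Authorization header plus a
        # Brm-Ticket header carrying the ticket (possibly empty).
        return {"Authorization": f"jwt {token}", "Brm-Ticket": self.ticket}

    def refresh_token(self):
        if self.ticket:
            # Ticket auth: skip the /account/login round trip entirely.
            return
        # ... email/password login would happen here ...


client = TicketAwareClient()  # picks up BOHR_TICKET if exported
print(client.auth_headers())
```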
dpdispatcher/fugaku.py
CHANGED
@@ -24,9 +24,7 @@ class Fugaku(Machine):
         ] = f'#PJM -L "node={resources.number_node}" '
         fugaku_script_header_dict[
             "fugaku_ntasks_per_node_line"
-        ] = '#PJM --mpi "max-proc-per-node={cpu_per_node}"'.format(
-            cpu_per_node=resources.cpu_per_node
-        )
+        ] = f'#PJM --mpi "max-proc-per-node={resources.cpu_per_node}"'
         fugaku_script_header_dict[
             "queue_name_line"
         ] = f'#PJM -L "rscgrp={resources.queue_name}"'
dpdispatcher/hdfs_cli.py
CHANGED
@@ -90,9 +90,7 @@ class HDFS:
             raise RuntimeError(
                 "try to access local_path[{}] " "but failed".format(local_path)
             )
-        cmd = "hadoop fs -copyFromLocal -f {local} {remote}".format(
-            local=local_path, remote=to_uri
-        )
+        cmd = f"hadoop fs -copyFromLocal -f {local_path} {to_uri}"
         try:
             ret, out, err = run_cmd_with_all_output(cmd)
             if ret == 0:
@@ -106,9 +104,7 @@ class HDFS:
             )
         except Exception as e:
             raise RuntimeError(
-                "Cannot copy local[{}] to remote[{}] with cmd[{}]".format(
-                    local_path, to_uri, cmd
-                )
+                f"Cannot copy local[{local_path}] to remote[{to_uri}] with cmd[{cmd}]"
             ) from e
 
     @staticmethod
@@ -118,9 +114,7 @@ class HDFS:
             remote = from_uri
         elif isinstance(from_uri, list) or isinstance(from_uri, tuple):
             remote = " ".join(from_uri)
-        cmd = "hadoop fs -copyToLocal {remote} {local}".format(
-            remote=remote, local=local_path
-        )
+        cmd = f"hadoop fs -copyToLocal {remote} {local_path}"
 
         try:
             ret, out, err = run_cmd_with_all_output(cmd)
@@ -135,9 +129,7 @@ class HDFS:
             )
         except Exception as e:
             raise RuntimeError(
-                "Cannot copy remote[{}] to local[{}] with cmd[{}]".format(
-                    from_uri, local_path, cmd
-                )
+                f"Cannot copy remote[{from_uri}] to local[{local_path}] with cmd[{cmd}]"
             ) from e
 
     @staticmethod
dpdispatcher/hdfs_context.py
CHANGED
@@ -137,10 +137,7 @@ class HDFSContext(BaseContext):
         if os.path.exists(gz_dir):
             shutil.rmtree(gz_dir, ignore_errors=True)
         os.mkdir(os.path.join(self.local_root, "tmp"))
-        rfile_tgz = "{}/{}_*_download.tar.gz".format(
-            self.remote_root,
-            submission.submission_hash,
-        )
+        rfile_tgz = f"{self.remote_root}/{submission.submission_hash}_*_download.tar.gz"
         lfile_tgz = "%s/tmp/" % (self.local_root)
         HDFS.copy_to_local(rfile_tgz, lfile_tgz)
 
dpdispatcher/lsf.py
CHANGED
@@ -31,12 +31,8 @@ class LSF(Machine):
             "lsf_nodes_line": "#BSUB -n {number_cores}".format(
                 number_cores=resources.number_node * resources.cpu_per_node
             ),
-            "lsf_ptile_line": "#BSUB -R 'span[ptile={cpu_per_node}]'".format(
-                cpu_per_node=resources.cpu_per_node
-            ),
-            "lsf_partition_line": "#BSUB -q {queue_name}".format(
-                queue_name=resources.queue_name
-            ),
+            "lsf_ptile_line": f"#BSUB -R 'span[ptile={resources.cpu_per_node}]'",
+            "lsf_partition_line": f"#BSUB -q {resources.queue_name}",
         }
         gpu_usage_flag = resources.kwargs.get("gpu_usage", False)
         gpu_new_syntax_flag = resources.kwargs.get("gpu_new_syntax", False)
dpdispatcher/machine.py
CHANGED
@@ -208,9 +208,7 @@ class Machine(metaclass=ABCMeta):
 
     def check_if_recover(self, submission):
         submission_hash = submission.submission_hash
-        submission_file_name = "{submission_hash}.json".format(
-            submission_hash=submission_hash
-        )
+        submission_file_name = f"{submission_hash}.json"
         if_recover = self.context.check_file_exists(submission_file_name)
         return if_recover
 
dpdispatcher/openapi.py
ADDED
@@ -0,0 +1,198 @@
+import os
+import shutil
+import time
+
+try:
+    from bohriumsdk.client import Client
+    from bohriumsdk.job import Job
+    from bohriumsdk.storage import Storage
+    from bohriumsdk.util import Util
+except ModuleNotFoundError:
+    found_bohriumsdk = False
+else:
+    found_bohriumsdk = True
+
+from dpdispatcher import dlog
+from dpdispatcher.JobStatus import JobStatus
+from dpdispatcher.machine import Machine
+
+shell_script_header_template = """
+#!/bin/bash -l
+"""
+
+
+class OpenAPI(Machine):
+    def __init__(self, context):
+        if not found_bohriumsdk:
+            raise ModuleNotFoundError(
+                "bohriumsdk not installed. Install dpdispatcher with `pip install dpdispatcher[bohrium]`"
+            )
+        self.context = context
+        self.remote_profile = context.remote_profile.copy()
+
+        self.grouped = self.remote_profile.get("grouped", True)
+        self.client = Client()
+        self.job = Job(client=self.client)
+        self.storage = Storage(client=self.client)
+        self.group_id = None
+
+    def gen_script(self, job):
+        shell_script = super().gen_script(job)
+        return shell_script
+
+    def gen_script_header(self, job):
+        shell_script_header = shell_script_header_template
+        return shell_script_header
+
+    def gen_local_script(self, job):
+        script_str = self.gen_script(job)
+        script_file_name = job.script_file_name
+        self.context.write_local_file(fname=script_file_name, write_str=script_str)
+        return script_file_name
+
+    def _gen_backward_files_list(self, job):
+        result_file_list = []
+        # result_file_list.extend(job.backward_common_files)
+        for task in job.job_task_list:
+            result_file_list.extend(
+                [os.path.join(task.task_work_path, b_f) for b_f in task.backward_files]
+            )
+        result_file_list = list(set(result_file_list))
+        return result_file_list
+
+    def do_submit(self, job):
+        self.gen_local_script(job)
+
+        project_id = self.remote_profile.get("project_id", 0)
+
+        openapi_params = {
+            "oss_path": job.upload_path,
+            "input_file_type": 3,
+            "input_file_method": 1,
+            "job_type": "container",
+            "job_name": self.remote_profile.get("job_name", "DP-GEN"),
+            "project_id": project_id,
+            "scass_type": self.remote_profile.get("machine_type", ""),
+            "cmd": f"bash {job.script_file_name}",
+            "log_files": os.path.join(
+                job.job_task_list[0].task_work_path, job.job_task_list[0].outlog
+            ),
+            "out_files": self._gen_backward_files_list(job),
+            "platform": self.remote_profile.get("platform", "ali"),
+            "image_address": self.remote_profile.get("image_address", ""),
+            "job_id": job.job_id,
+        }
+
+        data = self.job.insert(**openapi_params)
+
+        job.job_id = data.get("jobId", 0)  # type: ignore
+        # self.job_group_id = data.get("jobGroupId")
+        job.job_state = JobStatus.waiting
+        return job.job_id
+
+    def _get_job_detail(self, job_id, group_id):
+        check_return = self.job.detail(job_id)
+        assert check_return is not None, (
+            f"Failed to retrieve tasks information. To resubmit this job, please "
+            f"try again, if this problem still exists please delete the submission "
+            f"file and try again.\nYou can check submission.submission_hash in the "
+            f'previous log or type `grep -rl "{job_id}:job_group_id:{group_id}" '
+            f"~/.dpdispatcher/dp_cloud_server/` to find corresponding file. "
+            f"You can try with command:\n "
+            f'rm $(grep -rl "{job_id}:job_group_id:{group_id}" ~/.dpdispatcher/dp_cloud_server/)'
+        )
+        return check_return
+
+    def check_status(self, job):
+        if job.job_id == "":
+            return JobStatus.unsubmitted
+        job_id = job.job_id
+        group_id = None
+        if hasattr(job, "jgid"):
+            group_id = job.jgid
+        check_return = self._get_job_detail(job_id, group_id)
+        try:
+            dp_job_status = check_return["status"]  # type: ignore
+        except IndexError as e:
+            dlog.error(
+                f"cannot find job information in bohrium for job {job.job_id}. check_return:{check_return}; retry one more time after 60 seconds"
+            )
+            time.sleep(60)
+            retry_return = self._get_job_detail(job_id, group_id)
+            try:
+                dp_job_status = retry_return["status"]  # type: ignore
+            except IndexError as e:
+                raise RuntimeError(
+                    f"cannot find job information in bohrium for job {job.job_id} {check_return} {retry_return}"
+                )
+
+        job_state = self.map_dp_job_state(dp_job_status)
+        if job_state == JobStatus.finished:
+            job_log = self.job.log(job_id)
+            if self.remote_profile.get("output_log"):
+                print(job_log, end="")
+            # print(job.job_id)
+            self._download_job(job)
+        elif self.remote_profile.get("output_log") and job_state == JobStatus.running:
+            job_log = self.job.log(job_id)
+            print(job_log, end="")
+        return job_state
+
+    def _download_job(self, job):
+        data = self.job.detail(job.job_id)
+        # print(data)
+        job_url = data["jobFiles"]["outFiles"][0]["url"]  # type: ignore
+        if not job_url:
+            return
+        job_hash = job.job_hash
+        result_filename = job_hash + "_back.zip"
+        target_result_zip = os.path.join(self.context.local_root, result_filename)
+        self.storage.download_from_url(job_url, target_result_zip)
+        Util.unzip_file(target_result_zip, out_dir=self.context.local_root)
+        try:
+            os.makedirs(os.path.join(self.context.local_root, "backup"), exist_ok=True)
+            shutil.move(
+                target_result_zip,
+                os.path.join(
+                    self.context.local_root,
+                    "backup",
+                    os.path.split(target_result_zip)[1],
+                ),
+            )
+        except (OSError, shutil.Error) as e:
+            dlog.exception("unable to backup file, " + str(e))
+
+    def check_finish_tag(self, job):
+        job_tag_finished = job.job_hash + "_job_tag_finished"
+        dlog.info("check if job finished: ", job.job_id, job_tag_finished)
+        return self.context.check_file_exists(job_tag_finished)
+        # return
+        # pass
+
+    def check_if_recover(self, submission):
+        return False
+        # pass
+
+    @staticmethod
+    def map_dp_job_state(status):
+        if isinstance(status, JobStatus):
+            return status
+        map_dict = {
+            -1: JobStatus.terminated,
+            0: JobStatus.waiting,
+            1: JobStatus.running,
+            2: JobStatus.finished,
+            3: JobStatus.waiting,
+            4: JobStatus.running,
+            5: JobStatus.terminated,
+            6: JobStatus.running,
+            9: JobStatus.waiting,
+        }
+        if status not in map_dict:
+            dlog.error(f"unknown job status {status}")
+            return JobStatus.unknown
+        return map_dict[status]
+
+    # def check_finish_tag(self, job):
+    #     job_tag_finished = job.job_hash + '_job_tag_finished'
+    #     return self.context.check_file_exists(job_tag_finished)
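The remote_profile keys read by OpenAPI.do_submit suggest what a machine configuration for the new backend looks like. A sketch follows; the surrounding batch_type/context_type layout is dpdispatcher's usual class-name convention and is an assumption here, not something this diff shows, and all values are hypothetical:

```python
# Sketch of a machine dict selecting the new OpenAPI machine and context.
# remote_profile keys mirror the .get() calls in do_submit(); every value
# below is a hypothetical placeholder.
machine_dict = {
    "batch_type": "OpenAPI",
    "context_type": "OpenAPIContext",
    "local_root": "./",
    "remote_profile": {
        "project_id": 123456,          # hypothetical Bohrium project id
        "job_name": "DP-GEN",
        "machine_type": "c8_m16_cpu",  # becomes scass_type in do_submit
        "platform": "ali",
        "image_address": "registry.example.com/image:tag",  # hypothetical
        "output_log": False,
        "keep_backup": True,
    },
}
```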
dpdispatcher/openapi_context.py
ADDED
@@ -0,0 +1,259 @@
+import os
+import shutil
+import uuid
+
+import tqdm
+
+try:
+    from bohriumsdk.client import Client
+    from bohriumsdk.job import Job
+    from bohriumsdk.storage import Storage
+    from bohriumsdk.util import Util
+except ModuleNotFoundError:
+    found_bohriumsdk = False
+else:
+    found_bohriumsdk = True
+
+from dpdispatcher import dlog
+from dpdispatcher.base_context import BaseContext
+from dpdispatcher.JobStatus import JobStatus
+
+DP_CLOUD_SERVER_HOME_DIR = os.path.join(
+    os.path.expanduser("~"), ".dpdispatcher/", "dp_cloud_server/"
+)
+
+
+class OpenAPIContext(BaseContext):
+    def __init__(
+        self,
+        local_root,
+        remote_root=None,
+        remote_profile={},
+        *args,
+        **kwargs,
+    ):
+        if not found_bohriumsdk:
+            raise ModuleNotFoundError(
+                "bohriumsdk not installed. Install dpdispatcher with `pip install dpdispatcher[bohrium]`"
+            )
+        self.init_local_root = local_root
+        self.init_remote_root = remote_root
+        self.temp_local_root = os.path.abspath(local_root)
+        self.remote_profile = remote_profile
+        self.client = Client()
+        self.storage = Storage(client=self.client)
+        self.job = Job(client=self.client)
+        self.util = Util()
+        self.jgid = None
+
+    @classmethod
+    def load_from_dict(cls, context_dict):
+        local_root = context_dict.get("local_root", "./")
+        remote_root = context_dict.get("remote_root", None)
+        remote_profile = context_dict.get("remote_profile", {})
+
+        bohrium_context = cls(
+            local_root=local_root,
+            remote_root=remote_root,
+            remote_profile=remote_profile,
+        )
+        return bohrium_context
+
+    def bind_submission(self, submission):
+        self.submission = submission
+        self.local_root = os.path.join(self.temp_local_root, submission.work_base)
+        self.remote_root = "."
+
+        self.submission_hash = submission.submission_hash
+
+        self.machine = submission.machine
+
+    def _gen_object_key(self, job, zip_filename):
+        if hasattr(job, "upload_path") and job.upload_path:
+            return job.upload_path
+        else:
+            project_id = self.remote_profile.get("project_id")
+
+            uid = uuid.uuid4()
+            path = os.path.join(str(project_id), str(uid), zip_filename)
+            setattr(job, "upload_path", path)
+            return path
+
+    def upload_job(self, job, common_files=None):
+        if common_files is None:
+            common_files = []
+        self.machine.gen_local_script(job)
+        zip_filename = job.job_hash + ".zip"
+        zip_task_file = os.path.join(self.local_root, zip_filename)
+
+        upload_file_list = [
+            job.script_file_name,
+        ]
+
+        upload_file_list.extend(common_files)
+
+        for task in job.job_task_list:
+            for file in task.forward_files:
+                upload_file_list.append(os.path.join(task.task_work_path, file))
+
+        upload_zip = Util.zip_file_list(
+            self.local_root, zip_task_file, file_list=upload_file_list
+        )
+        project_id = self.remote_profile.get("project_id", 0)
+
+        data = self.job.create(
+            project_id=project_id,
+            name=self.remote_profile.get("job_name", "DP-GEN"),
+            group_id=self.jgid,  # type: ignore
+        )
+        self.jgid = data.get("jobGroupId", "")  # type: ignore
+        token = data.get("token", "")  # type: ignore
+
+        object_key = os.path.join(data["storePath"], zip_filename)  # type: ignore
+        job.upload_path = object_key
+        job.job_id = data["jobId"]  # type: ignore
+        job.jgid = data["jobGroupId"]  # type: ignore
+        self.storage.upload_From_file_multi_part(
+            object_key=object_key, file_path=upload_zip, token=token
+        )
+
+        # self._backup(self.local_root, upload_zip)
+
+    def upload(self, submission):
+        # oss_task_dir = os.path.join('%s/%s/%s.zip' % ('indicate', file_uuid, file_uuid))
+        # zip_filename = submission.submission_hash + '.zip'
+        # oss_task_zip = 'indicate/' + submission.submission_hash + '/' + zip_filename
+
+        # zip_path = "/home/felix/workplace/22_dpdispatcher/dpdispatcher-yfb/dpdispatcher/dpcloudserver/t.txt"
+        # zip_path = self.local_root
+        bar_format = "{l_bar}{bar}| {n:.02f}/{total:.02f} % [{elapsed}<{remaining}, {rate_fmt}{postfix}]"
+        job_to_be_uploaded = []
+        result = None
+        dlog.info("checking all job has been uploaded")
+        for job in submission.belonging_jobs:
+            if job.job_state == JobStatus.unsubmitted:
+                job_to_be_uploaded.append(job)
+        if len(job_to_be_uploaded) == 0:
+            dlog.info("all job has been uploaded, continue")
+            return result
+        for job in tqdm.tqdm(
+            job_to_be_uploaded,
+            desc="Uploading to tiefblue",
+            bar_format=bar_format,
+            leave=False,
+            disable=None,
+        ):
+            self.upload_job(job, submission.forward_common_files)
+        return result
+        # return oss_task_zip
+        # api.upload(self.oss_task_dir, zip_task_file)
+
+    def download(self, submission):
+        jobs = submission.belonging_jobs
+        job_hashs = {}
+        job_infos = {}
+        job_result = []
+        for job in jobs:
+            jid = job.job_id
+            job_hashs[jid] = job.job_hash
+            jobinfo = self.job.detail(jid)
+            # jobinfo = self.api.get_job_detail(jid)
+            job_result.append(jobinfo)
+        # if group_id is not None:
+        #     job_result = self.api.get_tasks_list(group_id)
+        for each in job_result:
+            if "resultUrl" in each and each["resultUrl"] != "" and each["status"] == 2:
+                job_hash = ""
+                if each["id"] not in job_hashs:
+                    dlog.info(
+                        f"find unexpect job_hash, but task {each['id']} still been download."
+                    )
+                    dlog.debug(str(job_hashs))
+                    job_hash = str(each["id"])
+                else:
+                    job_hash = job_hashs[each["id"]]
+                job_infos[job_hash] = each
+        bar_format = "{l_bar}{bar}| {n:.02f}/{total:.02f} % [{elapsed}<{remaining}, {rate_fmt}{postfix}]"
+        for job_hash, info in tqdm.tqdm(
+            job_infos.items(),
+            desc="Validating download file from Lebesgue",
+            bar_format=bar_format,
+            leave=False,
+            disable=None,
+        ):
+            result_filename = job_hash + "_back.zip"
+            target_result_zip = os.path.join(self.local_root, result_filename)
+            if self._check_if_job_has_already_downloaded(
+                target_result_zip, self.local_root
+            ):
+                continue
+            self.storage.download_from_url(info["resultUrl"], target_result_zip)
+            Util.unzip_file(target_result_zip, out_dir=self.local_root)
+            self._backup(self.local_root, target_result_zip)
+        self._clean_backup(
+            self.local_root, keep_backup=self.remote_profile.get("keep_backup", True)
+        )
+        return True
+
+    def write_file(self, fname, write_str):
+        result = self.write_home_file(fname, write_str)
+        return result
+
+    def write_local_file(self, fname, write_str):
+        local_filename = os.path.join(self.local_root, fname)
+        with open(local_filename, "w") as f:
+            f.write(write_str)
+        return local_filename
+
+    def read_file(self, fname):
+        result = self.read_home_file(fname)
+        return result
+
+    def write_home_file(self, fname, write_str):
+        # os.makedirs(self.remote_root, exist_ok = True)
+        with open(os.path.join(DP_CLOUD_SERVER_HOME_DIR, fname), "w") as fp:
+            fp.write(write_str)
+        return True
+
+    def read_home_file(self, fname):
+        with open(os.path.join(DP_CLOUD_SERVER_HOME_DIR, fname)) as fp:
+            ret = fp.read()
+        return ret
+
+    def check_file_exists(self, fname):
+        result = self.check_home_file_exits(fname)
+        return result
+
+    def check_home_file_exits(self, fname):
+        return os.path.isfile(os.path.join(DP_CLOUD_SERVER_HOME_DIR, fname))
+
+    def clean(self):
+        submission_file_name = f"{self.submission.submission_hash}.json"
+        submission_json = os.path.join(DP_CLOUD_SERVER_HOME_DIR, submission_file_name)
+        os.remove(submission_json)
+        return True
+
+    def _check_if_job_has_already_downloaded(self, target, local_root):
+        backup_file_location = os.path.join(
+            local_root, "backup", os.path.split(target)[1]
+        )
+        if os.path.exists(backup_file_location):
+            return True
+        else:
+            return False
+
+    def _backup(self, local_root, target):
+        try:
+            # move to backup directory
+            os.makedirs(os.path.join(local_root, "backup"), exist_ok=True)
+            shutil.move(
+                target, os.path.join(local_root, "backup", os.path.split(target)[1])
+            )
+        except (OSError, shutil.Error) as e:
+            dlog.exception("unable to backup file, " + str(e))
+
+    def _clean_backup(self, local_root, keep_backup=True):
+        if not keep_backup:
+            dir_to_be_removed = os.path.join(local_root, "backup")
+            if os.path.exists(dir_to_be_removed):
+                shutil.rmtree(dir_to_be_removed)
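OpenAPIContext.download skips any job whose result zip already sits in local_root/backup, which is where _backup parks each archive after extraction; the presence of the backup file is the "already downloaded" marker. The logic reduces to this sketch (standalone helpers, assuming the same directory layout):

```python
import os
import shutil


def already_downloaded(local_root, result_zip):
    # A result zip previously moved into local_root/backup marks the job as done.
    backup_path = os.path.join(local_root, "backup", os.path.basename(result_zip))
    return os.path.exists(backup_path)


def backup(local_root, result_zip):
    # After extraction, park the archive in backup/ so a re-run skips it.
    os.makedirs(os.path.join(local_root, "backup"), exist_ok=True)
    shutil.move(
        result_zip, os.path.join(local_root, "backup", os.path.basename(result_zip))
    )
```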
dpdispatcher/pbs.py
CHANGED
@@ -22,16 +22,12 @@ class PBS(Machine):
         pbs_script_header_dict = {}
         pbs_script_header_dict[
             "select_node_line"
-        ] = "#PBS -l select={number_node}:ncpus={cpu_per_node}".format(
-            number_node=resources.number_node, cpu_per_node=resources.cpu_per_node
-        )
+        ] = f"#PBS -l select={resources.number_node}:ncpus={resources.cpu_per_node}"
         if resources.gpu_per_node != 0:
             pbs_script_header_dict[
                 "select_node_line"
             ] += f":ngpus={resources.gpu_per_node}"
-        pbs_script_header_dict["queue_name_line"] = "#PBS -q {queue_name}".format(
-            queue_name=resources.queue_name
-        )
+        pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}"
         pbs_script_header = pbs_script_header_template.format(**pbs_script_header_dict)
         return pbs_script_header
 
@@ -147,15 +143,11 @@ class Torque(PBS):
         pbs_script_header_dict = {}
         pbs_script_header_dict[
             "select_node_line"
-        ] = "#PBS -l nodes={number_node}:ppn={cpu_per_node}".format(
-            number_node=resources.number_node, cpu_per_node=resources.cpu_per_node
-        )
+        ] = f"#PBS -l nodes={resources.number_node}:ppn={resources.cpu_per_node}"
         if resources.gpu_per_node != 0:
             pbs_script_header_dict["select_node_line"] += ":gpus={gpu_per_node}".format(
                 gpu_per_node=resources.gpu_per_node
             )
-        pbs_script_header_dict["queue_name_line"] = "#PBS -q {queue_name}".format(
-            queue_name=resources.queue_name
-        )
+        pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}"
         pbs_script_header = pbs_script_header_template.format(**pbs_script_header_dict)
         return pbs_script_header
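This is the same mechanical refactor applied throughout the release (fugaku.py, hdfs_cli.py, hdfs_context.py, lsf.py, machine.py, pbs.py, slurm.py, submission.py): a multi-line str.format() call collapsed into a single f-string. Behavior is unchanged, for example:

```python
# Illustrative values; the real code reads these from a Resources object.
number_node, cpu_per_node = 4, 32

# Before (0.5.8 style): named placeholders filled via str.format().
old = "#PBS -l select={number_node}:ncpus={cpu_per_node}".format(
    number_node=number_node, cpu_per_node=cpu_per_node
)

# After (0.5.10 style): a single f-string.
new = f"#PBS -l select={number_node}:ncpus={cpu_per_node}"

assert old == new == "#PBS -l select=4:ncpus=32"
```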
dpdispatcher/slurm.py
CHANGED
@@ -34,16 +34,12 @@ class Slurm(Machine):
         )
         script_header_dict[
             "slurm_ntasks_per_node_line"
-        ] = "#SBATCH --ntasks-per-node {cpu_per_node}".format(
-            cpu_per_node=resources.cpu_per_node
-        )
+        ] = f"#SBATCH --ntasks-per-node {resources.cpu_per_node}"
         custom_gpu_line = resources.kwargs.get("custom_gpu_line", None)
         if not custom_gpu_line:
             script_header_dict[
                 "slurm_number_gpu_line"
-            ] = "#SBATCH --gres=gpu:{gpu_per_node}".format(
-                gpu_per_node=resources.gpu_per_node
-            )
+            ] = f"#SBATCH --gres=gpu:{resources.gpu_per_node}"
         else:
             script_header_dict["slurm_number_gpu_line"] = custom_gpu_line
         if resources.queue_name != "":
dpdispatcher/submission.py
CHANGED
@@ -266,7 +266,7 @@ class Submission:
 
     def try_download_result(self):
         start_time = time.time()
-        retry_interval = 60  #
+        retry_interval = 60  # retry every 1 minute
         success = False
         while not success:
             try:
@@ -275,14 +275,14 @@ class Submission:
             except (EOFError, Exception) as e:
                 dlog.exception(e)
             elapsed_time = time.time() - start_time
-            if elapsed_time < 3600:  # 1
+            if elapsed_time < 3600:  # in 1 h
                 dlog.info("Retrying in 1 minute...")
                 time.sleep(retry_interval)
-            elif elapsed_time < 86400:  # 1
-                retry_interval = 600  #
+            elif elapsed_time < 86400:  # 1 h ~ 24 h
+                retry_interval = 600  # retry every 10 min
                 dlog.info("Retrying in 10 minutes...")
                 time.sleep(retry_interval)
-            else:  #
+            else:  # > 24 h
                 dlog.info("Maximum retries time reached. Exiting.")
                 break
 
@@ -509,9 +509,7 @@ class Submission:
     def submission_to_json(self):
        # self.update_submission_state()
        write_str = json.dumps(self.serialize(), indent=4, default=str)
-        submission_file_name = "{submission_hash}.json".format(
-            submission_hash=self.submission_hash
-        )
+        submission_file_name = f"{self.submission_hash}.json"
         self.machine.context.write_file(submission_file_name, write_str=write_str)
 
     @classmethod
@@ -525,9 +523,7 @@ class Submission:
     # def check_if_recover()
 
     def try_recover_from_json(self):
-        submission_file_name = "{submission_hash}.json".format(
-            submission_hash=self.submission_hash
-        )
+        submission_file_name = f"{self.submission_hash}.json"
         if_recover = self.machine.context.check_file_exists(submission_file_name)
         submission = None
         submission_dict = {}
@@ -787,9 +783,7 @@ class Job:
         """
         if len(job_dict.keys()) != 1:
             raise RuntimeError(
-                "json file may be broken, len(job_dict.keys()) must be 1. {job_dict}".format(
-                    job_dict=job_dict
-                )
+                f"json file may be broken, len(job_dict.keys()) must be 1. {job_dict}"
             )
         job_hash = list(job_dict.keys())[0]
 
@@ -871,11 +865,7 @@ class Job:
             # raise RuntimeError("job:job {job} failed 3 times".format(job=self))
             self.submit_job()
             if self.job_state != JobStatus.unsubmitted:
-                dlog.info(
-                    "job: {job_hash} submit; job_id is {job_id}".format(
-                        job_hash=self.job_hash, job_id=self.job_id
-                    )
-                )
+                dlog.info(f"job: {self.job_hash} submit; job_id is {self.job_id}")
             if self.resources.wait_time != 0:
                 time.sleep(self.resources.wait_time)
             # self.get_job_state()
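The restored comments make the download retry schedule explicit: retry every minute during the first hour, every ten minutes until 24 hours have passed, then give up. As a standalone sketch (the download callable is a placeholder for the submission's own download step):

```python
import time


def retry_download(download):
    # Escalating schedule from try_download_result: 1-minute retries for
    # the first hour, 10-minute retries up to 24 hours, then stop.
    start_time = time.time()
    retry_interval = 60  # retry every 1 minute
    while True:
        try:
            download()
            return True
        except Exception:
            pass
        elapsed_time = time.time() - start_time
        if elapsed_time < 3600:  # in 1 h
            time.sleep(retry_interval)
        elif elapsed_time < 86400:  # 1 h ~ 24 h
            retry_interval = 600  # retry every 10 min
            time.sleep(retry_interval)
        else:  # > 24 h
            return False
```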
{dpdispatcher-0.5.8.dist-info → dpdispatcher-0.5.10.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dpdispatcher
-Version: 0.5.8
+Version: 0.5.10
 Summary: Generate HPC scheduler systems jobs input scripts, submit these scripts to HPC systems, and poke until they finish
 Author: DeepModeling
 License: GNU LESSER GENERAL PUBLIC LICENSE
@@ -190,9 +190,14 @@ Requires-Dist: dargs (>=0.2.9)
 Requires-Dist: requests
 Requires-Dist: tqdm (>=4.9.0)
 Requires-Dist: typing-extensions ; python_version < "3.7"
+Provides-Extra: bohrium
+Requires-Dist: oss2 ; extra == 'bohrium'
+Requires-Dist: tqdm ; extra == 'bohrium'
+Requires-Dist: bohrium-sdk ; extra == 'bohrium'
 Provides-Extra: cloudserver
 Requires-Dist: oss2 ; extra == 'cloudserver'
 Requires-Dist: tqdm ; extra == 'cloudserver'
+Requires-Dist: bohrium-sdk ; extra == 'cloudserver'
 Provides-Extra: docs
 Requires-Dist: sphinx ; extra == 'docs'
 Requires-Dist: myst-parser ; extra == 'docs'
@@ -223,6 +228,12 @@ DPDispatcher can be installed by `pip`:
 pip install dpdispatcher
 ```
 
+To add [Bohrium](https://bohrium.dp.tech/) support, execute
+
+```bash
+pip install dpdispatcher[bohrium]
+```
+
 ## Usage
 
 See [Getting Started](https://dpdispatcher.readthedocs.io/en/latest/getting-started.html) for usage.
dpdispatcher-0.5.10.dist-info/RECORD
ADDED
@@ -0,0 +1,36 @@
+dpdispatcher/JobStatus.py,sha256=Eszs4TPLfszCuf6zLaFonf25feXDUguF28spYOjJpQE,233
+dpdispatcher/__init__.py,sha256=i33piTZkPDYIm_qiTEI_J3bmHlFayEN1T8T8TyzYdqg,3017
+dpdispatcher/_version.py,sha256=RvtoZGalK6X9Bv5tbHI4epbrqfU-LImzqZ_rqfMFZHA,162
+dpdispatcher/arginfo.py,sha256=pNaxYIE6ahBidpR7OCKZdw8iGt003uTXGSlVzwiuvRg,188
+dpdispatcher/base_context.py,sha256=Hfri0x41XC4MRUjxc0-WMiZB_E4NvLp94ZYaHfYCWHM,3610
+dpdispatcher/distributed_shell.py,sha256=XMcXt8g1f2DY5HYhhyiN5ehV2ihKULY5ng-sB0B7YaI,6933
+dpdispatcher/dp_cloud_server.py,sha256=dOoQk7GaUglnqmVFefuElLPjP_3pCw4asHz8PEUrpSw,10028
+dpdispatcher/dp_cloud_server_context.py,sha256=h_IBVyQ1Ddm70rSDd6AlWqTckqdd6oOCRZZXyGV23K4,11373
+dpdispatcher/dpdisp.py,sha256=_dyH8xEgUR-s2xKkB20D9FIYhSHUCmzc2PxWgo9ildQ,94
+dpdispatcher/fugaku.py,sha256=Nwj3tDMuv29wMZdZcQf7U9EkeoKxrdu0FZg_KuODgX8,3678
+dpdispatcher/hdfs_cli.py,sha256=eBU-1woqmFqVjJqhsDWLRHnU6YrYTptOgsexs9Iy-l4,5835
+dpdispatcher/hdfs_context.py,sha256=uuhDNudDf_7CZWN_1bY4jNJoKL7n8mNZwOWu7N57KwI,8893
+dpdispatcher/lazy_local_context.py,sha256=ZdWNqK3QF8SsoqnCjpFt3ZDRCIagjzJNlKPUYutRUC8,5692
+dpdispatcher/local_context.py,sha256=anYJqQASOnkcAhfckUcFD8_DcjNUZ1KE0GuksxR5Mxw,11772
+dpdispatcher/lsf.py,sha256=uBFWs25H1GZXCzben5ATstoGE8fayWVs1IsV9MkoMyo,7668
+dpdispatcher/machine.py,sha256=ogeQHJfslH4JWc9Mm5SW-Yo7mvbxoeBZQI9aMuvphkE,15235
+dpdispatcher/openapi.py,sha256=CGfYycov13EaBRJgz4cd-ezx8KLXNfdaOOIl8l4PtVc,7386
+dpdispatcher/openapi_context.py,sha256=8ji_ztA-wZdKpqOroXCWddu9FdbojCUM2h2t_dZpI8E,9420
+dpdispatcher/pbs.py,sha256=HGqoFympIpNNiAmtBZEKzsmVtt8-O3Xxp2rjYAUNMuM,6065
+dpdispatcher/shell.py,sha256=kEP7za-qN71y_21p0uBNkopZ5s63Adq54904hjUHv48,4141
+dpdispatcher/slurm.py,sha256=bNvFL6lCU5YveTGuSLbDdk-9jbDXyWbB-OBRFhWnfZg,14530
+dpdispatcher/ssh_context.py,sha256=7Xrm8biVA7tAEDJ6YJZzC3nbdQrVBr_5UOhQNQ7qJ2g,35032
+dpdispatcher/submission.py,sha256=GLbEfvcejQCzlK8V2U-tsZ4K3HrFffWDl9mIyPoHdgY,46295
+dpdispatcher/utils.py,sha256=RXUHJl3S2z26Em3SeltnxtdVM3kv7weXJKvBEjG6I34,5035
+dpdispatcher/dpcloudserver/__init__.py,sha256=FnX9HH-2dXADluNfucg98JPMfruMoBpN9ER9lZkVQvQ,49
+dpdispatcher/dpcloudserver/client.py,sha256=WaNYyuiBvzylyUlGWMNsKVKjnyfQmUn7Qi5kgLMiVfM,11588
+dpdispatcher/dpcloudserver/config.py,sha256=vBRtzExJXTGfXPeBObXrZNAhBNXoFFzMkzSuSrrjHEQ,635
+dpdispatcher/dpcloudserver/retcode.py,sha256=1qAF8gFZx55u2sO8KbtYSIIrjcO-IGufEUlwbkSfC1g,721
+dpdispatcher/dpcloudserver/temp_test.py,sha256=jklOSu7tZ_wW5gycGRiUsbBWMLZDqCBslSYOCb2hTHw,2932
+dpdispatcher/dpcloudserver/zip_file.py,sha256=f9WrlktwHW0YipaWg5Y0kxjMZlhD1cJYa6EUpvu4Cro,2611
+dpdispatcher-0.5.10.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
+dpdispatcher-0.5.10.dist-info/METADATA,sha256=9-GohvpxCFjtHxunG_nTBh6FMLyy7_j40xs_239pKVE,12595
+dpdispatcher-0.5.10.dist-info/WHEEL,sha256=AtBG6SXL3KF_v0NxLf0ehyVOh0cold-JbJYXNGorC6Q,92
+dpdispatcher-0.5.10.dist-info/entry_points.txt,sha256=3bKn6IB6SYhKOUbbcOdBBevz4gsDmhmbogKMVn4ptOQ,52
+dpdispatcher-0.5.10.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
+dpdispatcher-0.5.10.dist-info/RECORD,,
dpdispatcher-0.5.8.dist-info/RECORD
DELETED
@@ -1,34 +0,0 @@
-dpdispatcher/JobStatus.py,sha256=Eszs4TPLfszCuf6zLaFonf25feXDUguF28spYOjJpQE,233
-dpdispatcher/__init__.py,sha256=2GIz4niyzHTbxros1G7Mi4uBJbD3AMSnTPxXSJMJmUs,2907
-dpdispatcher/_version.py,sha256=iqWtoISytDDNpYe-atC8Kl-rZhTojPnDQKAEcFNtIhg,160
-dpdispatcher/arginfo.py,sha256=pNaxYIE6ahBidpR7OCKZdw8iGt003uTXGSlVzwiuvRg,188
-dpdispatcher/base_context.py,sha256=Hfri0x41XC4MRUjxc0-WMiZB_E4NvLp94ZYaHfYCWHM,3610
-dpdispatcher/distributed_shell.py,sha256=XMcXt8g1f2DY5HYhhyiN5ehV2ihKULY5ng-sB0B7YaI,6933
-dpdispatcher/dp_cloud_server.py,sha256=xVpDI0exBwHNSZECLJdfrQsvBzeUn5a0gx5Bzt9UAdU,9857
-dpdispatcher/dp_cloud_server_context.py,sha256=VfRRo4ruorWC8NVjW19EjmxQ0Rbz6XzxrHrJKl4cCZk,11255
-dpdispatcher/dpdisp.py,sha256=_dyH8xEgUR-s2xKkB20D9FIYhSHUCmzc2PxWgo9ildQ,94
-dpdispatcher/fugaku.py,sha256=wSjY0XB3TNNWAPKHgMpoPl5jyYJIlijBcEkYXp6nrZQ,3733
-dpdispatcher/hdfs_cli.py,sha256=9Vrf7Kz_kJgXP2xEdZqNVNxRGbui5RrtnLtEjxfcq9A,6047
-dpdispatcher/hdfs_context.py,sha256=1jT1nzx7VGJFJ42MHTXoFWhfEu4KBkMBJO84klRAnPI,8938
-dpdispatcher/lazy_local_context.py,sha256=ZdWNqK3QF8SsoqnCjpFt3ZDRCIagjzJNlKPUYutRUC8,5692
-dpdispatcher/local_context.py,sha256=anYJqQASOnkcAhfckUcFD8_DcjNUZ1KE0GuksxR5Mxw,11772
-dpdispatcher/lsf.py,sha256=zy-WEnC7f2Dy5hJGnRBl5jpjYZ_H3-KMcE0lxDG6ejo,7790
-dpdispatcher/machine.py,sha256=31xG5ksN8mBVwD8taLsk5KXLhjM0ZTjlHlbbPgiig1c,15296
-dpdispatcher/pbs.py,sha256=LiULEKNDuisrKmOpZyB1af6sGDQ35xrAhMh7VMwpFbY,6327
-dpdispatcher/shell.py,sha256=kEP7za-qN71y_21p0uBNkopZ5s63Adq54904hjUHv48,4141
-dpdispatcher/slurm.py,sha256=krlyjzxK8gIhSsqcKHFvNiUwVE7411wTUwuW9xGzS-E,14648
-dpdispatcher/ssh_context.py,sha256=7Xrm8biVA7tAEDJ6YJZzC3nbdQrVBr_5UOhQNQ7qJ2g,35032
-dpdispatcher/submission.py,sha256=r_F05nHTpN86b2os8RZAjZsCILNarDko2BjAEUYSntw,46643
-dpdispatcher/utils.py,sha256=RXUHJl3S2z26Em3SeltnxtdVM3kv7weXJKvBEjG6I34,5035
-dpdispatcher/dpcloudserver/__init__.py,sha256=FnX9HH-2dXADluNfucg98JPMfruMoBpN9ER9lZkVQvQ,49
-dpdispatcher/dpcloudserver/client.py,sha256=w1wQ8g-FMQlyh00LIAbJLE1xirGXocpp7zAnhbeM4V0,11152
-dpdispatcher/dpcloudserver/config.py,sha256=vBRtzExJXTGfXPeBObXrZNAhBNXoFFzMkzSuSrrjHEQ,635
-dpdispatcher/dpcloudserver/retcode.py,sha256=1qAF8gFZx55u2sO8KbtYSIIrjcO-IGufEUlwbkSfC1g,721
-dpdispatcher/dpcloudserver/temp_test.py,sha256=jklOSu7tZ_wW5gycGRiUsbBWMLZDqCBslSYOCb2hTHw,2932
-dpdispatcher/dpcloudserver/zip_file.py,sha256=f9WrlktwHW0YipaWg5Y0kxjMZlhD1cJYa6EUpvu4Cro,2611
-dpdispatcher-0.5.8.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
-dpdispatcher-0.5.8.dist-info/METADATA,sha256=o2oD8_6Ohc04mRTkJWi51-KOPamYqH0kvUD-E0iW-c0,12280
-dpdispatcher-0.5.8.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
-dpdispatcher-0.5.8.dist-info/entry_points.txt,sha256=3bKn6IB6SYhKOUbbcOdBBevz4gsDmhmbogKMVn4ptOQ,52
-dpdispatcher-0.5.8.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
-dpdispatcher-0.5.8.dist-info/RECORD,,
{dpdispatcher-0.5.8.dist-info → dpdispatcher-0.5.10.dist-info}/LICENSE
File without changes
{dpdispatcher-0.5.8.dist-info → dpdispatcher-0.5.10.dist-info}/entry_points.txt
File without changes
{dpdispatcher-0.5.8.dist-info → dpdispatcher-0.5.10.dist-info}/top_level.txt
File without changes