dpdispatcher 0.6.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dpdispatcher/_version.py +22 -4
- dpdispatcher/base_context.py +60 -1
- dpdispatcher/contexts/__init__.py +1 -0
- dpdispatcher/contexts/dp_cloud_server_context.py +8 -1
- dpdispatcher/contexts/hdfs_context.py +16 -11
- dpdispatcher/contexts/lazy_local_context.py +2 -19
- dpdispatcher/contexts/local_context.py +77 -43
- dpdispatcher/contexts/openapi_context.py +78 -14
- dpdispatcher/contexts/ssh_context.py +117 -98
- dpdispatcher/dlog.py +9 -5
- dpdispatcher/dpcloudserver/__init__.py +0 -0
- dpdispatcher/dpcloudserver/client.py +7 -0
- dpdispatcher/dpdisp.py +21 -0
- dpdispatcher/entrypoints/run.py +9 -0
- dpdispatcher/entrypoints/submission.py +21 -1
- dpdispatcher/machine.py +15 -4
- dpdispatcher/machines/JH_UniScheduler.py +171 -0
- dpdispatcher/machines/__init__.py +1 -0
- dpdispatcher/machines/distributed_shell.py +6 -10
- dpdispatcher/machines/fugaku.py +9 -12
- dpdispatcher/machines/lsf.py +3 -9
- dpdispatcher/machines/openapi.py +48 -15
- dpdispatcher/machines/pbs.py +183 -20
- dpdispatcher/machines/shell.py +7 -16
- dpdispatcher/machines/slurm.py +30 -42
- dpdispatcher/run.py +172 -0
- dpdispatcher/submission.py +5 -14
- dpdispatcher/utils/dpcloudserver/client.py +10 -6
- dpdispatcher/utils/hdfs_cli.py +10 -19
- dpdispatcher/utils/utils.py +21 -7
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/METADATA +35 -29
- dpdispatcher-1.0.0.dist-info/RECORD +49 -0
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/WHEEL +1 -1
- dpdispatcher-0.6.1.dist-info/RECORD +0 -44
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/entry_points.txt +0 -0
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info/licenses}/LICENSE +0 -0
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/top_level.txt +0 -0
dpdispatcher/_version.py
CHANGED
@@ -1,16 +1,34 @@
-# file generated by
+# file generated by setuptools-scm
 # don't change, don't track in version control
+
+__all__ = [
+    "__version__",
+    "__version_tuple__",
+    "version",
+    "version_tuple",
+    "__commit_id__",
+    "commit_id",
+]
+
 TYPE_CHECKING = False
 if TYPE_CHECKING:
-    from typing import Tuple, Union
+    from typing import Tuple
+    from typing import Union
+
     VERSION_TUPLE = Tuple[Union[int, str], ...]
+    COMMIT_ID = Union[str, None]
 else:
     VERSION_TUPLE = object
+    COMMIT_ID = object
 
 version: str
 __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
+commit_id: COMMIT_ID
+__commit_id__: COMMIT_ID
+
+__version__ = version = '1.0.0'
+__version_tuple__ = version_tuple = (1, 0, 0)
 
-__version__ = version = '0.6.1'
-__version_tuple__ = version_tuple = (0, 6, 1)
+__commit_id__ = commit_id = None
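The regenerated version module now exports a commit identifier alongside the version strings. A minimal sketch of reading the new attributes (the printed values in the comments are illustrative):

from dpdispatcher._version import __commit_id__, __version__, __version_tuple__

print(__version__)        # "1.0.0"
print(__version_tuple__)  # (1, 0, 0)
print(__commit_id__)      # None for this release build, per the diff above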
dpdispatcher/base_context.py
CHANGED
@@ -1,5 +1,5 @@
 from abc import ABCMeta, abstractmethod
-from typing import List, Tuple
+from typing import Any, List, Tuple
 
 from dargs import Argument
 
@@ -73,6 +73,65 @@ class BaseContext(metaclass=ABCMeta):
     def check_finish(self, proc):
         raise NotImplementedError("abstract method")
 
+    def block_checkcall(self, cmd, asynchronously=False) -> Tuple[Any, Any, Any]:
+        """Run command with arguments. Wait for command to complete.
+
+        Parameters
+        ----------
+        cmd : str
+            The command to run.
+        asynchronously : bool, optional, default=False
+            Run command asynchronously. If True, `nohup` will be used to run the command.
+
+        Returns
+        -------
+        stdin
+            standard inout
+        stdout
+            standard output
+        stderr
+            standard error
+
+        Raises
+        ------
+        RuntimeError
+            when the return code is not zero
+        """
+        if asynchronously:
+            cmd = f"nohup {cmd} >/dev/null &"
+        exit_status, stdin, stdout, stderr = self.block_call(cmd)
+        if exit_status != 0:
+            raise RuntimeError(
+                "Get error code {} in calling {} with job: {} . message: {}".format(
+                    exit_status,
+                    cmd,
+                    self.submission.submission_hash,
+                    stderr.read().decode("utf-8"),
+                )
+            )
+        return stdin, stdout, stderr
+
+    @abstractmethod
+    def block_call(self, cmd) -> Tuple[int, Any, Any, Any]:
+        """Run command with arguments. Wait for command to complete.
+
+        Parameters
+        ----------
+        cmd : str
+            The command to run.
+
+        Returns
+        -------
+        exit_status
+            exit code
+        stdin
+            standard inout
+        stdout
+            standard output
+        stderr
+            standard error
+        """
+
     @classmethod
     def machine_arginfo(cls) -> Argument:
         """Generate the machine arginfo.
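The shared `block_checkcall` in `BaseContext` now delegates to the abstract `block_call`, which must return `(exit_status, stdin, stdout, stderr)` with a `stderr` object that supports `.read()`, since the base class decodes it on failure. A minimal sketch of a conforming subclass; the `DemoContext` name and the subprocess backend are illustrative assumptions, not part of the package, and the other context methods are omitted:

import io
import subprocess as sp

from dpdispatcher.base_context import BaseContext


class DemoContext(BaseContext):  # hypothetical subclass, for illustration only
    def block_call(self, cmd):
        # Satisfy the new abstract method: run the command and return
        # (exit_status, stdin, stdout, stderr). stderr must support .read()
        # because block_checkcall decodes it when the exit status is nonzero.
        proc = sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
        out, err = proc.communicate()
        return proc.returncode, None, io.BytesIO(out), io.BytesIO(err)

With such an implementation, the inherited `block_checkcall` handles the error check and, when `asynchronously=True`, prefixes the command with `nohup`.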
dpdispatcher/contexts/dp_cloud_server_context.py
CHANGED
@@ -161,7 +161,9 @@ class BohriumContext(BaseContext):
         # return oss_task_zip
         # api.upload(self.oss_task_dir, zip_task_file)
 
-    def download(
+    def download(
+        self, submission, check_exists=False, mark_failure=True, back_error=False
+    ):
         jobs = submission.belonging_jobs
         job_hashs = {}
         job_infos = {}
@@ -335,6 +337,11 @@ class BohriumContext(BaseContext):
             )
         ]
 
+    def block_call(self, cmd):
+        raise RuntimeError(
+            "Unsupported method. You may use an unsupported combination of the machine and the context."
+        )
+
 
 DpCloudServerContext = BohriumContext
 LebesgueContext = BohriumContext
dpdispatcher/contexts/hdfs_context.py
CHANGED
@@ -88,7 +88,7 @@ class HDFSContext(BaseContext):
             for ff in task.forward_files:
                 abs_file_list = glob(os.path.join(local_job, ff))
                 if not abs_file_list:
-                    raise
+                    raise FileNotFoundError(
                         "cannot find upload file " + os.path.join(local_job, ff)
                     )
                 rel_file_list = [
@@ -100,7 +100,7 @@ class HDFSContext(BaseContext):
         for fc in submission.forward_common_files:
             abs_file_list = glob(os.path.join(local_job, fc))
             if not abs_file_list:
-                raise
+                raise FileNotFoundError(
                     "cannot find upload file " + os.path.join(local_job, fc)
                 )
             rel_file_list = [
@@ -138,7 +138,7 @@ class HDFSContext(BaseContext):
         shutil.rmtree(gz_dir, ignore_errors=True)
         os.mkdir(os.path.join(self.local_root, "tmp"))
         rfile_tgz = f"{self.remote_root}/{submission.submission_hash}_*_download.tar.gz"
-        lfile_tgz = "
+        lfile_tgz = f"{self.local_root}/tmp/"
         HDFS.copy_to_local(rfile_tgz, lfile_tgz)
 
         tgz_file_list = glob(os.path.join(self.local_root, "tmp/*_download.tar.gz"))
@@ -164,15 +164,17 @@ class HDFSContext(BaseContext):
                                 os.path.join(
                                     self.local_root,
                                     task.task_work_path,
-                                    "tag_failure_download_
+                                    f"tag_failure_download_{jj}",
                                 ),
                                 "w",
                             ) as fp:
                                 pass
                         else:
-                            raise
+                            raise FileNotFoundError(
+                                "do not find download file " + rfile
+                            )
                 else:
-                    raise
+                    raise FileNotFoundError("do not find download file " + rfile)
             else:
                 if os.path.exists(lfile):
                     dlog.info(f"find existing {lfile}, replacing by {rfile}")
@@ -196,16 +198,14 @@ class HDFSContext(BaseContext):
             if check_exists:
                 if mark_failure:
                     with open(
-                        os.path.join(
-                            self.local_root, "tag_failure_download_%s" % jj
-                        ),
+                        os.path.join(self.local_root, f"tag_failure_download_{jj}"),
                         "w",
                     ) as fp:
                         pass
                 else:
-                    raise
+                    raise FileNotFoundError("do not find download file " + rfile)
            else:
-                raise
+                raise FileNotFoundError("do not find download file " + rfile)
        else:
            if os.path.exists(lfile):
                dlog.info(f"find existing {lfile}, replacing by {rfile}")
@@ -244,3 +244,8 @@ class HDFSContext(BaseContext):
 
     def read_file(self, fname):
         return HDFS.read_hdfs_file(os.path.join(self.remote_root, fname))
+
+    def block_call(self, cmd):
+        raise RuntimeError(
+            "Unsupported method. You may use an unsupported combination of the machine and the context."
+        )
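Because the bare `raise` statements above are replaced with `FileNotFoundError`, missing upload or download files now surface as a concrete exception type. A hedged sketch of handling it; the `context` and `submission` objects are assumed to exist already:

# Illustrative only: callers can now catch the specific exception.
try:
    context.download(submission)
except FileNotFoundError as err:
    # e.g. "do not find download file <path>", per the messages in the diff
    print(f"missing file reported by the context: {err}")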
dpdispatcher/contexts/lazy_local_context.py
CHANGED
@@ -83,7 +83,7 @@ class LazyLocalContext(BaseContext):
 
     def upload(
         self,
-
+        submission,
         # local_up_files,
         dereference=True,
     ):
@@ -91,7 +91,7 @@ class LazyLocalContext(BaseContext):
 
     def download(
         self,
-
+        submission,
         # remote_down_files,
         check_exists=False,
         mark_failure=True,
@@ -112,23 +112,6 @@ class LazyLocalContext(BaseContext):
         # else:
         #     raise RuntimeError('do not find download file ' + fname)
 
-    def block_checkcall(self, cmd):
-        # script_dir = os.path.join(self.local_root, self.submission.work_base)
-        # os.chdir(script_dir)
-        proc = sp.Popen(
-            cmd, cwd=self.local_root, shell=True, stdout=sp.PIPE, stderr=sp.PIPE
-        )
-        o, e = proc.communicate()
-        stdout = SPRetObj(o)
-        stderr = SPRetObj(e)
-        code = proc.returncode
-        if code != 0:
-            raise RuntimeError(
-                "Get error code %d in locally calling %s with job: %s ",
-                (code, cmd, self.submission.submission_hash),
-            )
-        return None, stdout, stderr
-
     def block_call(self, cmd):
         proc = sp.Popen(
             cmd, cwd=self.local_root, shell=True, stdout=sp.PIPE, stderr=sp.PIPE
dpdispatcher/contexts/local_context.py
CHANGED
@@ -3,6 +3,9 @@ import shutil
 import subprocess as sp
 from glob import glob
 from subprocess import TimeoutExpired
+from typing import List
+
+from dargs import Argument
 
 from dpdispatcher.base_context import BaseContext
 from dpdispatcher.dlog import dlog
@@ -60,6 +63,7 @@ class LocalContext(BaseContext):
         self.temp_local_root = os.path.abspath(local_root)
         self.temp_remote_root = os.path.abspath(remote_root)
         self.remote_profile = remote_profile
+        self.symlink = remote_profile.get("symlink", True)
 
     @classmethod
     def load_from_dict(cls, context_dict):
@@ -83,6 +87,25 @@ class LocalContext(BaseContext):
             self.temp_remote_root, submission.submission_hash
         )
 
+    def _copy_from_local_to_remote(self, local_path, remote_path):
+        if not os.path.exists(local_path):
+            raise FileNotFoundError(
+                f"cannot find uploaded file {os.path.join(local_path)}"
+            )
+        if os.path.exists(remote_path):
+            os.remove(remote_path)
+        _check_file_path(remote_path)
+
+        if self.symlink:
+            # ensure the file exist
+            os.symlink(local_path, remote_path)
+        elif os.path.isfile(local_path):
+            shutil.copyfile(local_path, remote_path)
+        elif os.path.isdir(local_path):
+            shutil.copytree(local_path, remote_path)
+        else:
+            raise ValueError(f"Unknown file type: {local_path}")
+
     def upload(self, submission):
         os.makedirs(self.remote_root, exist_ok=True)
         for ii in submission.belonging_tasks:
@@ -94,7 +117,7 @@ class LocalContext(BaseContext):
             for kk in ii.forward_files:
                 abs_file_list = glob(os.path.join(local_job, kk))
                 if not abs_file_list:
-                    raise
+                    raise FileNotFoundError(
                         "cannot find upload file " + os.path.join(local_job, kk)
                     )
                 rel_file_list = [
@@ -103,14 +126,9 @@ class LocalContext(BaseContext):
                 file_list.extend(rel_file_list)
 
             for jj in file_list:
-
-
-
-                )
-                if os.path.exists(os.path.join(remote_job, jj)):
-                    os.remove(os.path.join(remote_job, jj))
-                _check_file_path(os.path.join(remote_job, jj))
-                os.symlink(os.path.join(local_job, jj), os.path.join(remote_job, jj))
+                self._copy_from_local_to_remote(
+                    os.path.join(local_job, jj), os.path.join(remote_job, jj)
+                )
 
         local_job = self.local_root
         remote_job = self.remote_root
@@ -119,7 +137,7 @@ class LocalContext(BaseContext):
         for kk in submission.forward_common_files:
             abs_file_list = glob(os.path.join(local_job, kk))
             if not abs_file_list:
-                raise
+                raise FileNotFoundError(
                     "cannot find upload file " + os.path.join(local_job, kk)
                 )
             rel_file_list = [
@@ -128,14 +146,9 @@ class LocalContext(BaseContext):
             file_list.extend(rel_file_list)
 
         for jj in file_list:
-
-
-
-            )
-            if os.path.exists(os.path.join(remote_job, jj)):
-                os.remove(os.path.join(remote_job, jj))
-            _check_file_path(os.path.join(remote_job, jj))
-            os.symlink(os.path.join(local_job, jj), os.path.join(remote_job, jj))
+            self._copy_from_local_to_remote(
+                os.path.join(local_job, jj), os.path.join(remote_job, jj)
+            )
 
     def download(
         self, submission, check_exists=False, mark_failure=True, back_error=False
@@ -153,14 +166,14 @@ class LocalContext(BaseContext):
                         tag_file_path = os.path.join(
                             self.local_root,
                             ii.task_work_path,
-                            "tag_failure_download_
+                            f"tag_failure_download_{kk}",
                         )
                         with open(tag_file_path, "w") as fp:
                             pass
                    else:
                        pass
                else:
-                    raise
+                    raise FileNotFoundError(
                        "cannot find download file " + os.path.join(remote_job, kk)
                    )
            rel_flist = [
@@ -181,14 +194,16 @@ class LocalContext(BaseContext):
                        tag_file_path = os.path.join(
                            self.local_root,
                            ii.task_work_path,
-                            "tag_failure_download_
+                            f"tag_failure_download_{jj}",
                        )
                        with open(tag_file_path, "w") as fp:
                            pass
                    else:
                        pass
                else:
-                    raise
+                    raise FileNotFoundError(
+                        "do not find download file " + rfile
+                    )
            elif (not os.path.exists(rfile)) and (os.path.exists(lfile)):
                # already downloaded
                pass
@@ -206,7 +221,10 @@ class LocalContext(BaseContext):
                        shutil.rmtree(lfile, ignore_errors=True)
                    elif os.path.isfile(lfile) or os.path.islink(lfile):
                        os.remove(lfile)
-
+                    if not os.path.islink(rfile):
+                        shutil.move(rfile, lfile)
+                    else:
+                        shutil.copyfile(rfile, lfile)
                else:
                    raise RuntimeError("should not reach here!")
            else:
@@ -222,14 +240,14 @@ class LocalContext(BaseContext):
            if check_exists:
                if mark_failure:
                    tag_file_path = os.path.join(
-                        self.local_root, "tag_failure_download_
+                        self.local_root, f"tag_failure_download_{kk}"
                    )
                    with open(tag_file_path, "w") as fp:
                        pass
                else:
                    pass
            else:
-                raise
+                raise FileNotFoundError(
                    "cannot find download file " + os.path.join(remote_job, kk)
                )
        rel_flist = [os.path.relpath(ii, start=remote_job) for ii in abs_flist_r]
@@ -247,7 +265,7 @@ class LocalContext(BaseContext):
                if mark_failure:
                    with open(
                        os.path.join(
-                            self.local_root, "tag_failure_download_
+                            self.local_root, f"tag_failure_download_{jj}"
                        ),
                        "w",
                    ) as fp:
@@ -255,7 +273,7 @@ class LocalContext(BaseContext):
                else:
                    pass
            else:
-                raise
+                raise FileNotFoundError("do not find download file " + rfile)
        elif (not os.path.exists(rfile)) and (os.path.exists(lfile)):
            # already downloaded
            pass
@@ -273,28 +291,16 @@ class LocalContext(BaseContext):
                    shutil.rmtree(lfile, ignore_errors=True)
                elif os.path.isfile(lfile) or os.path.islink(lfile):
                    os.remove(lfile)
-
+                if not os.path.islink(rfile):
+                    shutil.move(rfile, lfile)
+                else:
+                    shutil.copyfile(rfile, lfile)
            else:
                raise RuntimeError("should not reach here!")
        else:
            # no nothing in the case of linked files
            pass
 
-    def block_checkcall(self, cmd):
-        proc = sp.Popen(
-            cmd, cwd=self.remote_root, shell=True, stdout=sp.PIPE, stderr=sp.PIPE
-        )
-        o, e = proc.communicate()
-        stdout = SPRetObj(o)
-        stderr = SPRetObj(e)
-        code = proc.returncode
-        if code != 0:
-            raise RuntimeError(
-                "Get error code %d in locally calling %s with job: %s ",
-                (code, cmd, self.submission.submission_hash),
-            )
-        return None, stdout, stderr
-
     def block_call(self, cmd):
         proc = sp.Popen(
             cmd, cwd=self.remote_root, shell=True, stdout=sp.PIPE, stderr=sp.PIPE
@@ -343,3 +349,31 @@ class LocalContext(BaseContext):
             stdout = None
             stderr = None
         return ret, stdout, stderr
+
+    @classmethod
+    def machine_subfields(cls) -> List[Argument]:
+        """Generate the machine subfields.
+
+        Returns
+        -------
+        list[Argument]
+            machine subfields
+        """
+        doc_remote_profile = "The information used to maintain the local machine."
+        return [
+            Argument(
+                "remote_profile",
+                dict,
+                optional=True,
+                doc=doc_remote_profile,
+                sub_fields=[
+                    Argument(
+                        "symlink",
+                        bool,
+                        optional=True,
+                        default=True,
+                        doc="Whether to use symbolic links to replace copy. This option should be turned off if the local directory is not accessible on the Batch system.",
+                    ),
+                ],
+            )
+        ]
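`LocalContext` now honors a `symlink` key in `remote_profile`, declared in `machine_subfields` above: with the default `True`, forward files are symlinked into `remote_root`; with `False`, `_copy_from_local_to_remote` copies files and directories instead. A sketch of a machine dictionary using the new option; apart from `"symlink"`, the field values are placeholders:

machine_config = {
    "batch_type": "Shell",
    "context_type": "LocalContext",
    "local_root": "./work",
    "remote_root": "/tmp/dpdispatcher_remote",
    "remote_profile": {
        # copy instead of symlink, e.g. when the batch system
        # cannot see the local directory
        "symlink": False,
    },
}

Such a dictionary would be passed to `Machine.load_from_dict(machine_config)` in the usual way.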
dpdispatcher/contexts/openapi_context.py
CHANGED
@@ -1,18 +1,20 @@
+import glob
 import os
 import shutil
 import uuid
+from zipfile import ZipFile
 
 import tqdm
 
 try:
-    from
-    from
-
-    from bohriumsdk.util import Util
-except ModuleNotFoundError:
+    from bohrium import Bohrium
+    from bohrium.resources import Job, Tiefblue
+except ModuleNotFoundError as e:
     found_bohriumsdk = False
+    import_bohrium_error = e
 else:
     found_bohriumsdk = True
+    import_bohrium_error = None
 
 from dpdispatcher.base_context import BaseContext
 from dpdispatcher.dlog import dlog
@@ -23,6 +25,36 @@ DP_CLOUD_SERVER_HOME_DIR = os.path.join(
 )
 
 
+def unzip_file(zip_file, out_dir="./"):
+    obj = ZipFile(zip_file, "r")
+    for item in obj.namelist():
+        obj.extract(item, out_dir)
+
+
+def zip_file_list(root_path, zip_filename, file_list=[]):
+    out_zip_file = os.path.join(root_path, zip_filename)
+    # print('debug: file_list', file_list)
+    zip_obj = ZipFile(out_zip_file, "w")
+    for f in file_list:
+        matched_files = os.path.join(root_path, f)
+        for ii in glob.glob(matched_files):
+            # print('debug: matched_files:ii', ii)
+            if os.path.isdir(ii):
+                arcname = os.path.relpath(ii, start=root_path)
+                zip_obj.write(ii, arcname)
+                for root, dirs, files in os.walk(ii):
+                    for file in files:
+                        filename = os.path.join(root, file)
+                        arcname = os.path.relpath(filename, start=root_path)
+                        # print('debug: filename:arcname:root_path', filename, arcname, root_path)
+                        zip_obj.write(filename, arcname)
+            else:
+                arcname = os.path.relpath(ii, start=root_path)
+                zip_obj.write(ii, arcname)
+    zip_obj.close()
+    return out_zip_file
+
+
 class OpenAPIContext(BaseContext):
     def __init__(
         self,
@@ -35,16 +67,41 @@ class OpenAPIContext(BaseContext):
         if not found_bohriumsdk:
             raise ModuleNotFoundError(
                 "bohriumsdk not installed. Install dpdispatcher with `pip install dpdispatcher[bohrium]`"
-            )
+            ) from import_bohrium_error
         self.init_local_root = local_root
         self.init_remote_root = remote_root
         self.temp_local_root = os.path.abspath(local_root)
         self.remote_profile = remote_profile
-
-
-
-
+        access_key = (
+            remote_profile.get("access_key", None)
+            or os.getenv("BOHRIUM_ACCESS_KEY", None)
+            or os.getenv("ACCESS_KEY", None)
+        )
+        project_id = (
+            remote_profile.get("project_id", None)
+            or os.getenv("BOHRIUM_PROJECT_ID", None)
+            or os.getenv("PROJECT_ID", None)
+        )
+        app_key = (
+            remote_profile.get("app_key", None)
+            or os.getenv("BOHRIUM_APP_KEY", None)
+            or os.getenv("APP_KEY", None)
+        )
+        if access_key is None:
+            raise ValueError(
+                "remote_profile must contain 'access_key' or set environment variable 'BOHRIUM_ACCESS_KEY'"
+            )
+        if project_id is None:
+            raise ValueError(
+                "remote_profile must contain 'project_id' or set environment variable 'BOHRIUM_PROJECT_ID'"
+            )
+        self.client = Bohrium(  # type: ignore[reportPossiblyUnboundVariable]
+            access_key=access_key, project_id=project_id, app_key=app_key
+        )
+        self.storage = Tiefblue()  # type: ignore[reportPossiblyUnboundVariable]
+        self.job = Job(client=self.client)  # type: ignore[reportPossiblyUnboundVariable]
         self.jgid = None
+        os.makedirs(DP_CLOUD_SERVER_HOME_DIR, exist_ok=True)
 
     @classmethod
     def load_from_dict(cls, context_dict):
@@ -97,7 +154,7 @@ class OpenAPIContext(BaseContext):
             for file in task.forward_files:
                 upload_file_list.append(os.path.join(task.task_work_path, file))
 
-        upload_zip =
+        upload_zip = zip_file_list(
             self.local_root, zip_task_file, file_list=upload_file_list
         )
         project_id = self.remote_profile.get("project_id", 0)
@@ -113,7 +170,7 @@ class OpenAPIContext(BaseContext):
         object_key = os.path.join(data["storePath"], zip_filename)  # type: ignore
         job.upload_path = object_key
         job.job_id = data["jobId"]  # type: ignore
-        job.jgid = data
+        job.jgid = data.get("jobGroupId", "")  # type: ignore
         self.storage.upload_From_file_multi_part(
             object_key=object_key, file_path=upload_zip, token=token
         )
@@ -149,7 +206,9 @@ class OpenAPIContext(BaseContext):
         # return oss_task_zip
         # api.upload(self.oss_task_dir, zip_task_file)
 
-    def download(
+    def download(
+        self, submission, check_exists=False, mark_failure=True, back_error=False
+    ):
         jobs = submission.belonging_jobs
         job_hashs = {}
         job_infos = {}
@@ -189,7 +248,7 @@ class OpenAPIContext(BaseContext):
             ):
                 continue
             self.storage.download_from_url(info["resultUrl"], target_result_zip)
-
+            unzip_file(target_result_zip, out_dir=self.local_root)
             self._backup(self.local_root, target_result_zip)
             self._clean_backup(
                 self.local_root, keep_backup=self.remote_profile.get("keep_backup", True)
@@ -258,3 +317,8 @@ class OpenAPIContext(BaseContext):
         dir_to_be_removed = os.path.join(local_root, "backup")
         if os.path.exists(dir_to_be_removed):
             shutil.rmtree(dir_to_be_removed)
+
+    def block_call(self, cmd):
+        raise RuntimeError(
+            "Unsupported method. You may use an unsupported combination of the machine and the context."
+        )
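With the credential handling added to `OpenAPIContext.__init__` above, the access key, project id, and optional app key can come either from `remote_profile` or from environment variables, and a missing access key or project id now raises `ValueError`. A sketch of the environment-based setup; the values are placeholders:

import os

# Checked in this order per the diff: remote_profile entry, BOHRIUM_* variable, bare variable.
os.environ["BOHRIUM_ACCESS_KEY"] = "<your-access-key>"
os.environ["BOHRIUM_PROJECT_ID"] = "<your-project-id>"
# os.environ["BOHRIUM_APP_KEY"] = "<your-app-key>"  # optional

# remote_profile can then omit the credentials entirely:
remote_profile = {"keep_backup": True}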