dpdispatcher 0.6.3__py3-none-any.whl → 0.6.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dpdispatcher might be problematic. Click here for more details.
- dpdispatcher/_version.py +2 -2
- dpdispatcher/contexts/local_context.py +8 -2
- dpdispatcher/contexts/ssh_context.py +1 -1
- dpdispatcher/dpdisp.py +6 -0
- dpdispatcher/entrypoints/submission.py +21 -1
- dpdispatcher/machines/pbs.py +104 -0
- {dpdispatcher-0.6.3.dist-info → dpdispatcher-0.6.4.dist-info}/METADATA +1 -1
- {dpdispatcher-0.6.3.dist-info → dpdispatcher-0.6.4.dist-info}/RECORD +12 -12
- {dpdispatcher-0.6.3.dist-info → dpdispatcher-0.6.4.dist-info}/WHEEL +1 -1
- {dpdispatcher-0.6.3.dist-info → dpdispatcher-0.6.4.dist-info}/LICENSE +0 -0
- {dpdispatcher-0.6.3.dist-info → dpdispatcher-0.6.4.dist-info}/entry_points.txt +0 -0
- {dpdispatcher-0.6.3.dist-info → dpdispatcher-0.6.4.dist-info}/top_level.txt +0 -0
dpdispatcher/_version.py
CHANGED
|
@@ -208,7 +208,10 @@ class LocalContext(BaseContext):
|
|
|
208
208
|
shutil.rmtree(lfile, ignore_errors=True)
|
|
209
209
|
elif os.path.isfile(lfile) or os.path.islink(lfile):
|
|
210
210
|
os.remove(lfile)
|
|
211
|
-
|
|
211
|
+
if not os.path.islink(rfile):
|
|
212
|
+
shutil.move(rfile, lfile)
|
|
213
|
+
else:
|
|
214
|
+
shutil.copyfile(rfile, lfile)
|
|
212
215
|
else:
|
|
213
216
|
raise RuntimeError("should not reach here!")
|
|
214
217
|
else:
|
|
@@ -275,7 +278,10 @@ class LocalContext(BaseContext):
|
|
|
275
278
|
shutil.rmtree(lfile, ignore_errors=True)
|
|
276
279
|
elif os.path.isfile(lfile) or os.path.islink(lfile):
|
|
277
280
|
os.remove(lfile)
|
|
278
|
-
|
|
281
|
+
if not os.path.islink(rfile):
|
|
282
|
+
shutil.move(rfile, lfile)
|
|
283
|
+
else:
|
|
284
|
+
shutil.copyfile(rfile, lfile)
|
|
279
285
|
else:
|
|
280
286
|
raise RuntimeError("should not reach here!")
|
|
281
287
|
else:
|
|
@@ -295,7 +295,7 @@ class SSHSession:
|
|
|
295
295
|
assert self.ssh is not None
|
|
296
296
|
try:
|
|
297
297
|
return self.ssh.exec_command(cmd)
|
|
298
|
-
except (paramiko.ssh_exception.SSHException, socket.timeout) as e:
|
|
298
|
+
except (paramiko.ssh_exception.SSHException, socket.timeout, EOFError) as e:
|
|
299
299
|
# SSH session not active
|
|
300
300
|
# retry for up to 3 times
|
|
301
301
|
# ensure alive
|
dpdispatcher/dpdisp.py
CHANGED
|
@@ -54,6 +54,11 @@ def main_parser() -> argparse.ArgumentParser:
|
|
|
54
54
|
action="store_true",
|
|
55
55
|
help="Clean submission.",
|
|
56
56
|
)
|
|
57
|
+
parser_submission_action.add_argument(
|
|
58
|
+
"--reset-fail-count",
|
|
59
|
+
action="store_true",
|
|
60
|
+
help="Reset fail count of all jobs to zero.",
|
|
61
|
+
)
|
|
57
62
|
##########################################
|
|
58
63
|
# gui
|
|
59
64
|
parser_gui = subparsers.add_parser(
|
|
@@ -105,6 +110,7 @@ def main():
|
|
|
105
110
|
download_terminated_log=args.download_terminated_log,
|
|
106
111
|
download_finished_task=args.download_finished_task,
|
|
107
112
|
clean=args.clean,
|
|
113
|
+
reset_fail_count=args.reset_fail_count,
|
|
108
114
|
)
|
|
109
115
|
elif args.command == "gui":
|
|
110
116
|
start_dpgui(
|
|
@@ -12,6 +12,7 @@ def handle_submission(
|
|
|
12
12
|
download_terminated_log: bool = False,
|
|
13
13
|
download_finished_task: bool = False,
|
|
14
14
|
clean: bool = False,
|
|
15
|
+
reset_fail_count: bool = False,
|
|
15
16
|
):
|
|
16
17
|
"""Handle terminated submission.
|
|
17
18
|
|
|
@@ -25,13 +26,21 @@ def handle_submission(
|
|
|
25
26
|
Download finished tasks.
|
|
26
27
|
clean : bool, optional
|
|
27
28
|
Clean submission.
|
|
29
|
+
reset_fail_count : bool, optional
|
|
30
|
+
Reset fail count of all jobs to zero.
|
|
28
31
|
|
|
29
32
|
Raises
|
|
30
33
|
------
|
|
31
34
|
ValueError
|
|
32
35
|
At least one action should be specified.
|
|
33
36
|
"""
|
|
34
|
-
if int(download_terminated_log) + int(download_finished_task) + int(clean) == 0:
|
|
37
|
+
if (
|
|
38
|
+
int(download_terminated_log)
|
|
39
|
+
+ int(download_finished_task)
|
|
40
|
+
+ int(clean)
|
|
41
|
+
+ int(reset_fail_count)
|
|
42
|
+
== 0
|
|
43
|
+
):
|
|
35
44
|
raise ValueError("At least one action should be specified.")
|
|
36
45
|
|
|
37
46
|
submission_file = record.get_submission(submission_hash)
|
|
@@ -42,7 +51,18 @@ def handle_submission(
|
|
|
42
51
|
# TODO: for unclear reason, the submission_hash may be changed
|
|
43
52
|
submission.submission_hash = submission_hash
|
|
44
53
|
submission.machine.context.bind_submission(submission)
|
|
54
|
+
if reset_fail_count:
|
|
55
|
+
for job in submission.belonging_jobs:
|
|
56
|
+
job.fail_count = 0
|
|
57
|
+
# save to remote and local
|
|
58
|
+
submission.submission_to_json()
|
|
59
|
+
record.write(submission)
|
|
60
|
+
if int(download_terminated_log) + int(download_finished_task) + int(clean) == 0:
|
|
61
|
+
# if only reset_fail_count, no need to update submission state (expensive)
|
|
62
|
+
return
|
|
45
63
|
submission.update_submission_state()
|
|
64
|
+
submission.submission_to_json()
|
|
65
|
+
record.write(submission)
|
|
46
66
|
|
|
47
67
|
terminated_tasks = []
|
|
48
68
|
finished_tasks = []
|
dpdispatcher/machines/pbs.py
CHANGED
|
@@ -177,3 +177,107 @@ class Torque(PBS):
|
|
|
177
177
|
**pbs_script_header_dict
|
|
178
178
|
)
|
|
179
179
|
return pbs_script_header
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
sge_script_header_template = """
|
|
183
|
+
#!/bin/bash
|
|
184
|
+
#$ -N dpdispatcher_submit
|
|
185
|
+
{select_node_line}
|
|
186
|
+
#$ -cwd
|
|
187
|
+
|
|
188
|
+
"""
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
class SGE(PBS):
|
|
192
|
+
def __init__(
|
|
193
|
+
self,
|
|
194
|
+
batch_type=None,
|
|
195
|
+
context_type=None,
|
|
196
|
+
local_root=None,
|
|
197
|
+
remote_root=None,
|
|
198
|
+
remote_profile={},
|
|
199
|
+
*,
|
|
200
|
+
context=None,
|
|
201
|
+
):
|
|
202
|
+
super(PBS, self).__init__(
|
|
203
|
+
batch_type,
|
|
204
|
+
context_type,
|
|
205
|
+
local_root,
|
|
206
|
+
remote_root,
|
|
207
|
+
remote_profile,
|
|
208
|
+
context=context,
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
def gen_script_header(self, job):
|
|
212
|
+
resources = job.resources
|
|
213
|
+
sge_script_header_dict = {}
|
|
214
|
+
# resources.number_node is not used
|
|
215
|
+
sge_script_header_dict[
|
|
216
|
+
"select_node_line"
|
|
217
|
+
] = f"#$ -pe mpi {resources.cpu_per_node} "
|
|
218
|
+
# resources.queue_name is not necessary
|
|
219
|
+
sge_script_header = sge_script_header_template.format(**sge_script_header_dict)
|
|
220
|
+
return sge_script_header
|
|
221
|
+
|
|
222
|
+
def do_submit(self, job):
|
|
223
|
+
script_file_name = job.script_file_name
|
|
224
|
+
script_str = self.gen_script(job)
|
|
225
|
+
job_id_name = job.job_hash + "_job_id"
|
|
226
|
+
self.context.write_file(fname=script_file_name, write_str=script_str)
|
|
227
|
+
script_file_dir = self.context.remote_root
|
|
228
|
+
stdin, stdout, stderr = self.context.block_checkcall(
|
|
229
|
+
"cd {} && {} {}".format(script_file_dir, "qsub", script_file_name)
|
|
230
|
+
)
|
|
231
|
+
subret = stdout.readlines()
|
|
232
|
+
job_id = subret[0].split()[2]
|
|
233
|
+
self.context.write_file(job_id_name, job_id)
|
|
234
|
+
return job_id
|
|
235
|
+
|
|
236
|
+
def default_resources(self, resources):
|
|
237
|
+
pass
|
|
238
|
+
|
|
239
|
+
def check_status(self, job):
|
|
240
|
+
job_id = job.job_id
|
|
241
|
+
status_line = None
|
|
242
|
+
if job_id == "":
|
|
243
|
+
return JobStatus.unsubmitted
|
|
244
|
+
ret, stdin, stdout, stderr = self.context.block_call("qstat")
|
|
245
|
+
err_str = stderr.read().decode("utf-8")
|
|
246
|
+
if ret != 0:
|
|
247
|
+
raise RuntimeError(
|
|
248
|
+
"status command qstat fails to execute. erro info: %s return code %d"
|
|
249
|
+
% (err_str, ret)
|
|
250
|
+
)
|
|
251
|
+
status_text_list = stdout.read().decode("utf-8").split("\n")
|
|
252
|
+
for txt in status_text_list:
|
|
253
|
+
if job_id in txt:
|
|
254
|
+
status_line = txt
|
|
255
|
+
|
|
256
|
+
if status_line is None:
|
|
257
|
+
count = 0
|
|
258
|
+
while count <= 6:
|
|
259
|
+
if self.check_finish_tag(job=job):
|
|
260
|
+
return JobStatus.finished
|
|
261
|
+
dlog.info(
|
|
262
|
+
"not tag_finished detected, execute sync command and wait. count "
|
|
263
|
+
+ str(count)
|
|
264
|
+
)
|
|
265
|
+
self.context.block_call("sync")
|
|
266
|
+
import time
|
|
267
|
+
|
|
268
|
+
time.sleep(10)
|
|
269
|
+
count += 1
|
|
270
|
+
return JobStatus.terminated
|
|
271
|
+
else:
|
|
272
|
+
status_word = status_line.split()[4]
|
|
273
|
+
# dlog.info (status_word)
|
|
274
|
+
if status_word in ["qw"]:
|
|
275
|
+
return JobStatus.waiting
|
|
276
|
+
elif status_word in ["r"]:
|
|
277
|
+
return JobStatus.running
|
|
278
|
+
else:
|
|
279
|
+
return JobStatus.unknown
|
|
280
|
+
|
|
281
|
+
def check_finish_tag(self, job):
|
|
282
|
+
job_tag_finished = job.job_hash + "_job_tag_finished"
|
|
283
|
+
return self.context.check_file_exists(job_tag_finished)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: dpdispatcher
|
|
3
|
-
Version: 0.6.3
|
|
3
|
+
Version: 0.6.4
|
|
4
4
|
Summary: Generate HPC scheduler systems jobs input scripts, submit these scripts to HPC systems, and poke until they finish
|
|
5
5
|
Author: DeepModeling
|
|
6
6
|
License: GNU LESSER GENERAL PUBLIC LICENSE
|
|
@@ -1,31 +1,31 @@
|
|
|
1
1
|
dpdispatcher/__init__.py,sha256=CLZP_N5CTp14ujWCykEHuJjoIfKR6CwrclXhjWUgNoE,517
|
|
2
2
|
dpdispatcher/__main__.py,sha256=BFhG-mSBzVZUEezQJqXWZnt2WsnhAHT_zpT8Y6gpOz0,116
|
|
3
|
-
dpdispatcher/_version.py,sha256=
|
|
3
|
+
dpdispatcher/_version.py,sha256=YeKWh9qHxBSK-fURNyCejbICk3LuDCjfwlZrUuEgWgM,411
|
|
4
4
|
dpdispatcher/arginfo.py,sha256=pNaxYIE6ahBidpR7OCKZdw8iGt003uTXGSlVzwiuvRg,188
|
|
5
5
|
dpdispatcher/base_context.py,sha256=NvaC_RHyspxq412z-eCq4Zn8-szZxvn8K6OkXvx7l4Y,3615
|
|
6
6
|
dpdispatcher/dlog.py,sha256=ndh12teQBbJRybXd8UjEmAi6QTsAXajRicDj5mAH5h0,799
|
|
7
|
-
dpdispatcher/dpdisp.py,sha256=
|
|
7
|
+
dpdispatcher/dpdisp.py,sha256=p-f_KCJxALBqP4StpNK2X_-VkGZ0L43MhROxhCpFwE4,3712
|
|
8
8
|
dpdispatcher/machine.py,sha256=XFRH41gNCex_qs9gbg-S88_qab3_UAGfxKWUPxoipCM,16140
|
|
9
9
|
dpdispatcher/submission.py,sha256=mVAHBlT0a3_1PtsEvvhvwNPkAhgLiBXXemX64BcwizU,48447
|
|
10
10
|
dpdispatcher/contexts/__init__.py,sha256=s5M0ZJSrPttSyLdBwKD2m3W7a5AbYZdPB7IAND2j7EY,335
|
|
11
11
|
dpdispatcher/contexts/dp_cloud_server_context.py,sha256=6XK0B2sLGEDeZmV2SZzQdVrMcWAWYZVLLK-IaShEXIY,12245
|
|
12
12
|
dpdispatcher/contexts/hdfs_context.py,sha256=GbV_o3i0NL43B7dCPnArXS5DPkkET4EAiHw1VgsMcdE,9000
|
|
13
13
|
dpdispatcher/contexts/lazy_local_context.py,sha256=F8abWAJRY1Ewx1sErINKN1ltWerXzeCcJgjTvLvucKE,5696
|
|
14
|
-
dpdispatcher/contexts/local_context.py,sha256=
|
|
14
|
+
dpdispatcher/contexts/local_context.py,sha256=vhZtdtduPokw6hU0YbaWNuoCOO6Tio2w99Fi8AegIw0,14052
|
|
15
15
|
dpdispatcher/contexts/openapi_context.py,sha256=DXaMS10SXN3VKEeEdzQyfOgRwUyHRJVCJHd2fKKdsmA,9499
|
|
16
|
-
dpdispatcher/contexts/ssh_context.py,sha256=
|
|
16
|
+
dpdispatcher/contexts/ssh_context.py,sha256=zhBM_qH4zGMws7Yww5txNLhkK7b3maiFEMoewuAJou0,38612
|
|
17
17
|
dpdispatcher/dpcloudserver/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
18
|
dpdispatcher/dpcloudserver/client.py,sha256=k1niKjG6zFnMtHn_UuCjYoOcMju3o3PV-GdyVLr5-KM,165
|
|
19
19
|
dpdispatcher/entrypoints/__init__.py,sha256=exKSFT3j2oCerGwtI8WbHQK-D0K-CyifocRji1xntT4,20
|
|
20
20
|
dpdispatcher/entrypoints/gui.py,sha256=29lMXqbmSRbLj4rfBv7Jnw89NLU9syTB88IUP6IRJsU,830
|
|
21
|
-
dpdispatcher/entrypoints/submission.py,sha256=
|
|
21
|
+
dpdispatcher/entrypoints/submission.py,sha256=ikVwIZAQL0SsYO5xaMIdKXgO6qtc05w1vqmvtG7Nk5M,3401
|
|
22
22
|
dpdispatcher/machines/__init__.py,sha256=9kSYkz2w3flp00IrHWTEwvoFGrathQAT3tvbieye83c,335
|
|
23
23
|
dpdispatcher/machines/distributed_shell.py,sha256=7avNcoOzEj7UcJuKl6b1ka2bj5dixcJaMlZK-I-i_Tc,7571
|
|
24
24
|
dpdispatcher/machines/dp_cloud_server.py,sha256=SR69gsFb2BvOQCW1QnWfP3cQvu_qHLJNsycp5wzosJU,11706
|
|
25
25
|
dpdispatcher/machines/fugaku.py,sha256=9OP3qSaaruqypHAdcuBFQM_MUtFp3yrvhZ5bPyLwEEk,4308
|
|
26
26
|
dpdispatcher/machines/lsf.py,sha256=Qruot39cPEpBNbbPmDwb1Gyfgyw3N36O0hs9PNEXyVU,7997
|
|
27
27
|
dpdispatcher/machines/openapi.py,sha256=Gzzbo8YOAybXGTrgMutexErcaEi3ts7uTUNvOhThFS8,8858
|
|
28
|
-
dpdispatcher/machines/pbs.py,sha256=
|
|
28
|
+
dpdispatcher/machines/pbs.py,sha256=YK0rKIsd8GAZYfxGLezA9RdCQ6AOeuPML_v5OwmioTo,10331
|
|
29
29
|
dpdispatcher/machines/shell.py,sha256=qaia7mC_fz5Bqyelxmc1je-xg7NQ_6vQQ0qAjg2m4RQ,4796
|
|
30
30
|
dpdispatcher/machines/slurm.py,sha256=SP5rQiCPWzq4rqgUgp0IGJXXD_1DURWl4OBRAJ-Kng4,15611
|
|
31
31
|
dpdispatcher/utils/__init__.py,sha256=fwvwkMf7DFNQkNBiIce8Y8gRA6FhICwKjkKiXu_BEJg,13
|
|
@@ -38,9 +38,9 @@ dpdispatcher/utils/dpcloudserver/client.py,sha256=CLfXswvzI4inDrW2bYkfMQ6gQJFcZO
|
|
|
38
38
|
dpdispatcher/utils/dpcloudserver/config.py,sha256=NteQzf1OeEkz2UbkXHHQ0B72cUu23zLVzpM9Yh4v1Cc,559
|
|
39
39
|
dpdispatcher/utils/dpcloudserver/retcode.py,sha256=1qAF8gFZx55u2sO8KbtYSIIrjcO-IGufEUlwbkSfC1g,721
|
|
40
40
|
dpdispatcher/utils/dpcloudserver/zip_file.py,sha256=f9WrlktwHW0YipaWg5Y0kxjMZlhD1cJYa6EUpvu4Cro,2611
|
|
41
|
-
dpdispatcher-0.6.3.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
|
|
42
|
-
dpdispatcher-0.6.
|
|
43
|
-
dpdispatcher-0.6.
|
|
44
|
-
dpdispatcher-0.6.3.dist-info/entry_points.txt,sha256=NRHUV0IU_u7_XtcmmEDnVzAcUmurhiEAGwENckrajo4,233
|
|
45
|
-
dpdispatcher-0.6.3.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
|
|
46
|
-
dpdispatcher-0.6.3.dist-info/RECORD,,
|
|
41
|
+
dpdispatcher-0.6.4.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
|
|
42
|
+
dpdispatcher-0.6.4.dist-info/METADATA,sha256=9NoY3DjM_USxl-2nBiNNLMCzJ6DZadgEMS1e11Fj72c,12752
|
|
43
|
+
dpdispatcher-0.6.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
|
44
|
+
dpdispatcher-0.6.4.dist-info/entry_points.txt,sha256=NRHUV0IU_u7_XtcmmEDnVzAcUmurhiEAGwENckrajo4,233
|
|
45
|
+
dpdispatcher-0.6.4.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
|
|
46
|
+
dpdispatcher-0.6.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|