dpdispatcher 0.5.7-py3-none-any.whl → 0.5.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dpdispatcher might be problematic.
- dpdispatcher/__init__.py +2 -0
- dpdispatcher/_version.py +2 -2
- dpdispatcher/fugaku.py +94 -0
- dpdispatcher/submission.py +67 -3
- {dpdispatcher-0.5.7.dist-info → dpdispatcher-0.5.8.dist-info}/METADATA +1 -1
- {dpdispatcher-0.5.7.dist-info → dpdispatcher-0.5.8.dist-info}/RECORD +10 -9
- {dpdispatcher-0.5.7.dist-info → dpdispatcher-0.5.8.dist-info}/LICENSE +0 -0
- {dpdispatcher-0.5.7.dist-info → dpdispatcher-0.5.8.dist-info}/WHEEL +0 -0
- {dpdispatcher-0.5.7.dist-info → dpdispatcher-0.5.8.dist-info}/entry_points.txt +0 -0
- {dpdispatcher-0.5.7.dist-info → dpdispatcher-0.5.8.dist-info}/top_level.txt +0 -0
dpdispatcher/__init__.py
CHANGED
@@ -43,6 +43,7 @@ except ImportError:
 from .distributed_shell import DistributedShell
 from .dp_cloud_server import DpCloudServer, Lebesgue
 from .dp_cloud_server_context import DpCloudServerContext, LebesgueContext
+from .fugaku import Fugaku
 from .hdfs_context import HDFSContext
 from .lazy_local_context import LazyLocalContext
 from .local_context import LocalContext
@@ -85,6 +86,7 @@ __all__ = [
     "PBS",
     "Shell",
     "Slurm",
+    "Fugaku",
     "SSHContext",
     "Submission",
     "Task",
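
With this export in place, the new batch type is importable from the package root alongside the existing machines. A minimal sanity check, assuming dpdispatcher 0.5.8 is installed (nothing beyond the two added lines above is involved):

from dpdispatcher import Fugaku

# Fugaku subclasses dpdispatcher.machine.Machine; see the new fugaku.py below.
print(Fugaku.__mro__)
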
dpdispatcher/_version.py
CHANGED
dpdispatcher/fugaku.py
ADDED
@@ -0,0 +1,94 @@
+import shlex
+
+from dpdispatcher import dlog
+from dpdispatcher.JobStatus import JobStatus
+from dpdispatcher.machine import Machine
+
+fugaku_script_header_template = """\
+{queue_name_line}
+{fugaku_node_number_line}
+{fugaku_ntasks_per_node_line}
+"""
+
+
+class Fugaku(Machine):
+    def gen_script(self, job):
+        fugaku_script = super().gen_script(job)
+        return fugaku_script
+
+    def gen_script_header(self, job):
+        resources = job.resources
+        fugaku_script_header_dict = {}
+        fugaku_script_header_dict[
+            "fugaku_node_number_line"
+        ] = f'#PJM -L "node={resources.number_node}" '
+        fugaku_script_header_dict[
+            "fugaku_ntasks_per_node_line"
+        ] = '#PJM --mpi "max-proc-per-node={cpu_per_node}"'.format(
+            cpu_per_node=resources.cpu_per_node
+        )
+        fugaku_script_header_dict[
+            "queue_name_line"
+        ] = f'#PJM -L "rscgrp={resources.queue_name}"'
+        fugaku_script_header = fugaku_script_header_template.format(
+            **fugaku_script_header_dict
+        )
+        return fugaku_script_header
+
+    def do_submit(self, job):
+        script_file_name = job.script_file_name
+        script_str = self.gen_script(job)
+        job_id_name = job.job_hash + "_job_id"
+        # script_str = self.sub_script(job_dirs, cmd, args=args, resources=resources, outlog=outlog, errlog=errlog)
+        self.context.write_file(fname=script_file_name, write_str=script_str)
+        # self.context.write_file(fname=os.path.join(self.context.submission.work_base, script_file_name), write_str=script_str)
+        # script_file_dir = os.path.join(self.context.submission.work_base)
+        script_file_dir = self.context.remote_root
+        # stdin, stdout, stderr = self.context.block_checkcall('cd %s && %s %s' % (self.context.remote_root, 'pjsub', script_file_name))
+
+        stdin, stdout, stderr = self.context.block_checkcall(
+            "cd {} && {} {}".format(
+                shlex.quote(script_file_dir), "pjsub", shlex.quote(script_file_name)
+            )
+        )
+        subret = stdout.readlines()
+        job_id = subret[0].split()[5]
+        self.context.write_file(job_id_name, job_id)
+        return job_id
+
+    def default_resources(self, resources):
+        pass
+
+    def check_status(self, job):
+        job_id = job.job_id
+        if job_id == "":
+            return JobStatus.unsubmitted
+        ret, stdin, stdout, stderr = self.context.block_call("pjstat " + job_id)
+        err_str = stderr.read().decode("utf-8")
+        try:
+            status_line = stdout.read().decode("utf-8").split("\n")[-2]
+            # pjstat only retrun 0 if the job is not waiting or running
+        except Exception:
+            ret, stdin, stdout, stderr = self.context.block_call("pjstat -H " + job_id)
+            status_line = stdout.read().decode("utf-8").split("\n")[-2]
+            status_word = status_line.split()[3]
+            if status_word in ["EXT", "CCL", "ERR"]:
+                if self.check_finish_tag(job):
+                    dlog.info(f"job: {job.job_hash} {job.job_id} finished")
+                    return JobStatus.finished
+                else:
+                    return JobStatus.terminated
+            else:
+                return JobStatus.unknown
+        status_word = status_line.split()[3]
+        # dlog.info (status_word)
+        if status_word in ["QUE", "HLD", "RNA", "SPD"]:
+            return JobStatus.waiting
+        elif status_word in ["RUN", "RNE"]:
+            return JobStatus.running
+        else:
+            return JobStatus.unknown
+
+    def check_finish_tag(self, job):
+        job_tag_finished = job.job_hash + "_job_tag_finished"
+        return self.context.check_file_exists(job_tag_finished)
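
The header template above maps dpdispatcher's generic resource fields onto Fugaku's PJM directives (queue_name → rscgrp, number_node → node, cpu_per_node → max-proc-per-node), and do_submit writes the generated script to remote_root and hands it to pjsub. A minimal sketch of selecting this batch type through the dict-based constructors; Machine.load_from_dict and Resources.load_from_dict are the pre-existing dpdispatcher API, and the host name, paths, and resource values are placeholders rather than anything taken from this release:

from dpdispatcher import Machine, Resources, Submission, Task

# Hypothetical configuration; adjust host, paths, and resource group for a real account.
machine = Machine.load_from_dict(
    {
        "batch_type": "Fugaku",        # dispatches to the new Fugaku class
        "context_type": "SSHContext",  # script is written to remote_root and submitted with pjsub
        "local_root": "./work",
        "remote_root": "/home/user/work",
        "remote_profile": {"hostname": "login.fugaku.example", "username": "user"},
    }
)
resources = Resources.load_from_dict(
    {
        "number_node": 4,       # rendered as '#PJM -L "node=4"'
        "cpu_per_node": 48,     # rendered as '#PJM --mpi "max-proc-per-node=48"'
        "gpu_per_node": 0,
        "queue_name": "small",  # rendered as '#PJM -L "rscgrp=small"'
        "group_size": 1,
    }
)
task = Task(
    command="echo hello", task_work_path="./", forward_files=[], backward_files=[]
)
submission = Submission(
    work_base="./", machine=machine, resources=resources, task_list=[task]
)
# check_interval is the new keyword argument added to run_submission in this release.
submission.run_submission(check_interval=60)
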
dpdispatcher/submission.py
CHANGED
@@ -1,5 +1,7 @@
 # %%
+import asyncio
 import copy
+import functools
 import json
 import os
 import pathlib
@@ -199,7 +201,9 @@ class Submission:
         self.local_root = machine.context.temp_local_root
         return self
 
-    def run_submission(
+    def run_submission(
+        self, *, dry_run=False, exit_on_submit=False, clean=True, check_interval=30
+    ):
         """Main method to execute the submission.
         First, check whether old Submission exists on the remote machine, and try to recover from it.
         Second, upload the local files to the remote machine where the tasks to be executed.
@@ -240,7 +244,7 @@
                 break
 
             try:
-                time.sleep(
+                time.sleep(check_interval)
             except (Exception, KeyboardInterrupt, SystemExit) as e:
                 self.submission_to_json()
                 dlog.exception(e)
@@ -254,12 +258,72 @@
             finally:
                 pass
         self.handle_unexpected_submission_state()
-        self.
+        self.try_download_result()
         self.submission_to_json()
         if clean:
             self.clean_jobs()
         return self.serialize()
 
+    def try_download_result(self):
+        start_time = time.time()
+        retry_interval = 60  # retry once per minute
+        success = False
+        while not success:
+            try:
+                self.download_jobs()
+                success = True
+            except (EOFError, Exception) as e:
+                dlog.exception(e)
+                elapsed_time = time.time() - start_time
+                if elapsed_time < 3600:  # within the first hour
+                    dlog.info("Retrying in 1 minute...")
+                    time.sleep(retry_interval)
+                elif elapsed_time < 86400:  # after 1 hour, but within 24 hours
+                    retry_interval = 600  # retry once every 10 minutes
+                    dlog.info("Retrying in 10 minutes...")
+                    time.sleep(retry_interval)
+                else:  # more than 24 hours
+                    dlog.info("Maximum retries time reached. Exiting.")
+                    break
+
+    async def async_run_submission(self, **kwargs):
+        """Async interface of run_submission.
+
+        Examples
+        --------
+        >>> import asyncio
+        >>> from dpdispacher import Machine, Resource, Submission
+        >>> async def run_jobs():
+        ...     backgroud_task = set()
+        ...     # task1
+        ...     task1 = Task(...)
+        ...     submission1 = Submission(..., task_list=[task1])
+        ...     background_task = asyncio.create_task(
+        ...         submission1.async_run_submission(check_interval=2, clean=False)
+        ...     )
+        ...     # task2
+        ...     task2 = Task(...)
+        ...     submission2 = Submission(..., task_list=[task1])
+        ...     background_task = asyncio.create_task(
+        ...         submission2.async_run_submission(check_interval=2, clean=False)
+        ...     )
+        ...     background_tasks.add(background_task)
+        ...     result = await asyncio.gather(*background_tasks)
+        ...     return result
+        >>> run_jobs()
+
+        May raise Error if pass `clean=True` explicitly when submit to pbs or slurm.
+        """
+        kwargs = {**{"clean": False}, **kwargs}
+        if kwargs["clean"]:
+            dlog.warning(
+                "Using async submission with `clean=True`, "
+                "job may fail in queue system"
+            )
+        loop = asyncio.get_event_loop()
+        wrapped_submission = functools.partial(self.run_submission, **kwargs)
+        return await loop.run_in_executor(None, wrapped_submission)
+
     def update_submission_state(self):
         """Check whether all the jobs in the submission.
 
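
The docstring example above carries a few rough edges from the released source (the "dpdispacher" and "backgroud_task" spellings, and submission2 reusing task1). A cleaned-up sketch of the same pattern follows; since async_run_submission simply wraps the blocking run_submission in loop.run_in_executor, the point is to let several submissions poll their queues concurrently. The Machine/Resources/Task arguments are placeholders:

import asyncio

from dpdispatcher import Submission, Task

async def run_two_submissions(machine, resources):
    # Two independent submissions, each polled in its own executor thread.
    task1 = Task(command="echo job1", task_work_path="./")
    task2 = Task(command="echo job2", task_work_path="./")
    submission1 = Submission(
        work_base="./", machine=machine, resources=resources, task_list=[task1]
    )
    submission2 = Submission(
        work_base="./", machine=machine, resources=resources, task_list=[task2]
    )
    background_tasks = {
        asyncio.create_task(submission1.async_run_submission(check_interval=2, clean=False)),
        asyncio.create_task(submission2.async_run_submission(check_interval=2, clean=False)),
    }
    # run_submission returns the serialized submission, so gather yields both results.
    return await asyncio.gather(*background_tasks)

# asyncio.run(run_two_submissions(machine, resources))  # machine/resources as in the Fugaku sketch above
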
{dpdispatcher-0.5.7.dist-info → dpdispatcher-0.5.8.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dpdispatcher
-Version: 0.5.7
+Version: 0.5.8
 Summary: Generate HPC scheduler systems jobs input scripts, submit these scripts to HPC systems, and poke until they finish
 Author: DeepModeling
 License: GNU LESSER GENERAL PUBLIC LICENSE
{dpdispatcher-0.5.7.dist-info → dpdispatcher-0.5.8.dist-info}/RECORD
CHANGED
@@ -1,12 +1,13 @@
 dpdispatcher/JobStatus.py,sha256=Eszs4TPLfszCuf6zLaFonf25feXDUguF28spYOjJpQE,233
-dpdispatcher/__init__.py,sha256=
-dpdispatcher/_version.py,sha256=
+dpdispatcher/__init__.py,sha256=2GIz4niyzHTbxros1G7Mi4uBJbD3AMSnTPxXSJMJmUs,2907
+dpdispatcher/_version.py,sha256=iqWtoISytDDNpYe-atC8Kl-rZhTojPnDQKAEcFNtIhg,160
 dpdispatcher/arginfo.py,sha256=pNaxYIE6ahBidpR7OCKZdw8iGt003uTXGSlVzwiuvRg,188
 dpdispatcher/base_context.py,sha256=Hfri0x41XC4MRUjxc0-WMiZB_E4NvLp94ZYaHfYCWHM,3610
 dpdispatcher/distributed_shell.py,sha256=XMcXt8g1f2DY5HYhhyiN5ehV2ihKULY5ng-sB0B7YaI,6933
 dpdispatcher/dp_cloud_server.py,sha256=xVpDI0exBwHNSZECLJdfrQsvBzeUn5a0gx5Bzt9UAdU,9857
 dpdispatcher/dp_cloud_server_context.py,sha256=VfRRo4ruorWC8NVjW19EjmxQ0Rbz6XzxrHrJKl4cCZk,11255
 dpdispatcher/dpdisp.py,sha256=_dyH8xEgUR-s2xKkB20D9FIYhSHUCmzc2PxWgo9ildQ,94
+dpdispatcher/fugaku.py,sha256=wSjY0XB3TNNWAPKHgMpoPl5jyYJIlijBcEkYXp6nrZQ,3733
 dpdispatcher/hdfs_cli.py,sha256=9Vrf7Kz_kJgXP2xEdZqNVNxRGbui5RrtnLtEjxfcq9A,6047
 dpdispatcher/hdfs_context.py,sha256=1jT1nzx7VGJFJ42MHTXoFWhfEu4KBkMBJO84klRAnPI,8938
 dpdispatcher/lazy_local_context.py,sha256=ZdWNqK3QF8SsoqnCjpFt3ZDRCIagjzJNlKPUYutRUC8,5692
@@ -17,7 +18,7 @@ dpdispatcher/pbs.py,sha256=LiULEKNDuisrKmOpZyB1af6sGDQ35xrAhMh7VMwpFbY,6327
 dpdispatcher/shell.py,sha256=kEP7za-qN71y_21p0uBNkopZ5s63Adq54904hjUHv48,4141
 dpdispatcher/slurm.py,sha256=krlyjzxK8gIhSsqcKHFvNiUwVE7411wTUwuW9xGzS-E,14648
 dpdispatcher/ssh_context.py,sha256=7Xrm8biVA7tAEDJ6YJZzC3nbdQrVBr_5UOhQNQ7qJ2g,35032
-dpdispatcher/submission.py,sha256=
+dpdispatcher/submission.py,sha256=r_F05nHTpN86b2os8RZAjZsCILNarDko2BjAEUYSntw,46643
 dpdispatcher/utils.py,sha256=RXUHJl3S2z26Em3SeltnxtdVM3kv7weXJKvBEjG6I34,5035
 dpdispatcher/dpcloudserver/__init__.py,sha256=FnX9HH-2dXADluNfucg98JPMfruMoBpN9ER9lZkVQvQ,49
 dpdispatcher/dpcloudserver/client.py,sha256=w1wQ8g-FMQlyh00LIAbJLE1xirGXocpp7zAnhbeM4V0,11152
@@ -25,9 +26,9 @@ dpdispatcher/dpcloudserver/config.py,sha256=vBRtzExJXTGfXPeBObXrZNAhBNXoFFzMkzSu
 dpdispatcher/dpcloudserver/retcode.py,sha256=1qAF8gFZx55u2sO8KbtYSIIrjcO-IGufEUlwbkSfC1g,721
 dpdispatcher/dpcloudserver/temp_test.py,sha256=jklOSu7tZ_wW5gycGRiUsbBWMLZDqCBslSYOCb2hTHw,2932
 dpdispatcher/dpcloudserver/zip_file.py,sha256=f9WrlktwHW0YipaWg5Y0kxjMZlhD1cJYa6EUpvu4Cro,2611
-dpdispatcher-0.5.
-dpdispatcher-0.5.
-dpdispatcher-0.5.
-dpdispatcher-0.5.
-dpdispatcher-0.5.
-dpdispatcher-0.5.
+dpdispatcher-0.5.8.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
+dpdispatcher-0.5.8.dist-info/METADATA,sha256=o2oD8_6Ohc04mRTkJWi51-KOPamYqH0kvUD-E0iW-c0,12280
+dpdispatcher-0.5.8.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
+dpdispatcher-0.5.8.dist-info/entry_points.txt,sha256=3bKn6IB6SYhKOUbbcOdBBevz4gsDmhmbogKMVn4ptOQ,52
+dpdispatcher-0.5.8.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
+dpdispatcher-0.5.8.dist-info/RECORD,,
{dpdispatcher-0.5.7.dist-info → dpdispatcher-0.5.8.dist-info}/LICENSE
File without changes
{dpdispatcher-0.5.7.dist-info → dpdispatcher-0.5.8.dist-info}/WHEEL
File without changes
{dpdispatcher-0.5.7.dist-info → dpdispatcher-0.5.8.dist-info}/entry_points.txt
File without changes
{dpdispatcher-0.5.7.dist-info → dpdispatcher-0.5.8.dist-info}/top_level.txt
File without changes