dpdispatcher 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dpdispatcher has been flagged as potentially problematic; review the changes below before upgrading.

dpdispatcher/__init__.py CHANGED
@@ -43,6 +43,7 @@ except ImportError:
43
43
  from .distributed_shell import DistributedShell
44
44
  from .dp_cloud_server import DpCloudServer, Lebesgue
45
45
  from .dp_cloud_server_context import DpCloudServerContext, LebesgueContext
46
+ from .fugaku import Fugaku
46
47
  from .hdfs_context import HDFSContext
47
48
  from .lazy_local_context import LazyLocalContext
48
49
  from .local_context import LocalContext
@@ -85,6 +86,7 @@ __all__ = [
85
86
  "PBS",
86
87
  "Shell",
87
88
  "Slurm",
89
+ "Fugaku",
88
90
  "SSHContext",
89
91
  "Submission",
90
92
  "Task",
dpdispatcher/_version.py CHANGED
@@ -1,4 +1,4 @@
1
1
  # file generated by setuptools_scm
2
2
  # don't change, don't track in version control
3
- __version__ = version = '0.5.7'
4
- __version_tuple__ = version_tuple = (0, 5, 7)
3
+ __version__ = version = '0.5.8'
4
+ __version_tuple__ = version_tuple = (0, 5, 8)
dpdispatcher/fugaku.py ADDED
@@ -0,0 +1,94 @@
1
+ import shlex
2
+
3
+ from dpdispatcher import dlog
4
+ from dpdispatcher.JobStatus import JobStatus
5
+ from dpdispatcher.machine import Machine
6
+
7
# Header template for Fugaku (PJM) batch scripts. Each placeholder is one
# complete "#PJM ..." directive line, filled in by Fugaku.gen_script_header.
fugaku_script_header_template = """\
{queue_name_line}
{fugaku_node_number_line}
{fugaku_ntasks_per_node_line}
"""
12
+
13
+
14
class Fugaku(Machine):
    """Machine implementation for the Fugaku supercomputer's PJM scheduler
    (``pjsub``/``pjstat``)."""

    def gen_script(self, job):
        """Generate the full batch script for *job* via the generic template."""
        return super().gen_script(job)

    def gen_script_header(self, job):
        """Render the ``#PJM`` header directives from the job's resources."""
        resources = job.resources
        header_values = {
            "queue_name_line": f'#PJM -L "rscgrp={resources.queue_name}"',
            "fugaku_node_number_line": f'#PJM -L "node={resources.number_node}" ',
            "fugaku_ntasks_per_node_line": (
                f'#PJM --mpi "max-proc-per-node={resources.cpu_per_node}"'
            ),
        }
        return fugaku_script_header_template.format(**header_values)

    def do_submit(self, job):
        """Write the job script to the remote root, submit it with ``pjsub``,
        persist the job id next to the script, and return the id."""
        script_str = self.gen_script(job)
        job_id_name = job.job_hash + "_job_id"
        self.context.write_file(fname=job.script_file_name, write_str=script_str)
        submit_command = "cd {} && {} {}".format(
            shlex.quote(self.context.remote_root),
            "pjsub",
            shlex.quote(job.script_file_name),
        )
        stdin, stdout, stderr = self.context.block_checkcall(submit_command)
        # pjsub prints a single line such as
        # "[INFO] PJM 0000 pjsub Job <id> submitted." -- the id is token 5.
        job_id = stdout.readlines()[0].split()[5]
        self.context.write_file(job_id_name, job_id)
        return job_id

    def default_resources(self, resources):
        """No scheduler-specific defaults are applied for Fugaku."""
        pass

    def check_status(self, job):
        """Map ``pjstat`` output for *job* onto a JobStatus value."""
        job_id = job.job_id
        if job_id == "":
            return JobStatus.unsubmitted
        ret, stdin, stdout, stderr = self.context.block_call("pjstat " + job_id)
        err_str = stderr.read().decode("utf-8")
        try:
            # Last non-empty line of the pjstat table holds this job's row.
            status_line = stdout.read().decode("utf-8").split("\n")[-2]
            # pjstat only returns output while the job is waiting or running;
            # once it has left the queue we must query the history (-H) instead.
        except Exception:
            ret, stdin, stdout, stderr = self.context.block_call(
                "pjstat -H " + job_id
            )
            status_line = stdout.read().decode("utf-8").split("\n")[-2]
            finished_word = status_line.split()[3]
            if finished_word in ["EXT", "CCL", "ERR"]:
                if self.check_finish_tag(job):
                    dlog.info(f"job: {job.job_hash} {job.job_id} finished")
                    return JobStatus.finished
                return JobStatus.terminated
            return JobStatus.unknown
        status_word = status_line.split()[3]
        if status_word in ["QUE", "HLD", "RNA", "SPD"]:
            return JobStatus.waiting
        if status_word in ["RUN", "RNE"]:
            return JobStatus.running
        return JobStatus.unknown

    def check_finish_tag(self, job):
        """Return True when the remote finish-tag file for *job* exists."""
        job_tag_finished = job.job_hash + "_job_tag_finished"
        return self.context.check_file_exists(job_tag_finished)
@@ -1,5 +1,7 @@
1
1
  # %%
2
+ import asyncio
2
3
  import copy
4
+ import functools
3
5
  import json
4
6
  import os
5
7
  import pathlib
@@ -199,7 +201,9 @@ class Submission:
199
201
  self.local_root = machine.context.temp_local_root
200
202
  return self
201
203
 
202
- def run_submission(self, *, dry_run=False, exit_on_submit=False, clean=True):
204
+ def run_submission(
205
+ self, *, dry_run=False, exit_on_submit=False, clean=True, check_interval=30
206
+ ):
203
207
  """Main method to execute the submission.
204
208
  First, check whether old Submission exists on the remote machine, and try to recover from it.
205
209
  Second, upload the local files to the remote machine where the tasks to be executed.
@@ -240,7 +244,7 @@ class Submission:
240
244
  break
241
245
 
242
246
  try:
243
- time.sleep(30)
247
+ time.sleep(check_interval)
244
248
  except (Exception, KeyboardInterrupt, SystemExit) as e:
245
249
  self.submission_to_json()
246
250
  dlog.exception(e)
@@ -254,12 +258,72 @@ class Submission:
254
258
  finally:
255
259
  pass
256
260
  self.handle_unexpected_submission_state()
257
- self.download_jobs()
261
+ self.try_download_result()
258
262
  self.submission_to_json()
259
263
  if clean:
260
264
  self.clean_jobs()
261
265
  return self.serialize()
262
266
 
267
+ def try_download_result(self):
268
+ start_time = time.time()
269
+ retry_interval = 60 # 每1分钟重试一次
270
+ success = False
271
+ while not success:
272
+ try:
273
+ self.download_jobs()
274
+ success = True
275
+ except (EOFError, Exception) as e:
276
+ dlog.exception(e)
277
+ elapsed_time = time.time() - start_time
278
+ if elapsed_time < 3600: # 1小时内
279
+ dlog.info("Retrying in 1 minute...")
280
+ time.sleep(retry_interval)
281
+ elif elapsed_time < 86400: # 1小时后,但在24小时内
282
+ retry_interval = 600 # 每10分钟重试一次
283
+ dlog.info("Retrying in 10 minutes...")
284
+ time.sleep(retry_interval)
285
+ else: # 超过24小时
286
+ dlog.info("Maximum retries time reached. Exiting.")
287
+ break
288
+
289
+ async def async_run_submission(self, **kwargs):
290
+ """Async interface of run_submission.
291
+
292
+ Examples
293
+ --------
294
+ >>> import asyncio
295
+ >>> from dpdispacher import Machine, Resource, Submission
296
+ >>> async def run_jobs():
297
+ ... backgroud_task = set()
298
+ ... # task1
299
+ ... task1 = Task(...)
300
+ ... submission1 = Submission(..., task_list=[task1])
301
+ ... background_task = asyncio.create_task(
302
+ ... submission1.async_run_submission(check_interval=2, clean=False)
303
+ ... )
304
+ ... # task2
305
+ ... task2 = Task(...)
306
+ ... submission2 = Submission(..., task_list=[task1])
307
+ ... background_task = asyncio.create_task(
308
+ ... submission2.async_run_submission(check_interval=2, clean=False)
309
+ ... )
310
+ ... background_tasks.add(background_task)
311
+ ... result = await asyncio.gather(*background_tasks)
312
+ ... return result
313
+ >>> run_jobs()
314
+
315
+ May raise Error if pass `clean=True` explicitly when submit to pbs or slurm.
316
+ """
317
+ kwargs = {**{"clean": False}, **kwargs}
318
+ if kwargs["clean"]:
319
+ dlog.warning(
320
+ "Using async submission with `clean=True`, "
321
+ "job may fail in queue system"
322
+ )
323
+ loop = asyncio.get_event_loop()
324
+ wrapped_submission = functools.partial(self.run_submission, **kwargs)
325
+ return await loop.run_in_executor(None, wrapped_submission)
326
+
263
327
  def update_submission_state(self):
264
328
  """Check whether all the jobs in the submission.
265
329
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dpdispatcher
3
- Version: 0.5.7
3
+ Version: 0.5.8
4
4
  Summary: Generate HPC scheduler systems jobs input scripts, submit these scripts to HPC systems, and poke until they finish
5
5
  Author: DeepModeling
6
6
  License: GNU LESSER GENERAL PUBLIC LICENSE
@@ -1,12 +1,13 @@
1
1
  dpdispatcher/JobStatus.py,sha256=Eszs4TPLfszCuf6zLaFonf25feXDUguF28spYOjJpQE,233
2
- dpdispatcher/__init__.py,sha256=U8OLDjSGHxILiz8XH-HYBxjIlhD429HEqqxQ-vVK1a4,2866
3
- dpdispatcher/_version.py,sha256=JZGzq-Js5LHjsQ55qR1dohZK98vvZwStS3mcoNYbvzE,160
2
+ dpdispatcher/__init__.py,sha256=2GIz4niyzHTbxros1G7Mi4uBJbD3AMSnTPxXSJMJmUs,2907
3
+ dpdispatcher/_version.py,sha256=iqWtoISytDDNpYe-atC8Kl-rZhTojPnDQKAEcFNtIhg,160
4
4
  dpdispatcher/arginfo.py,sha256=pNaxYIE6ahBidpR7OCKZdw8iGt003uTXGSlVzwiuvRg,188
5
5
  dpdispatcher/base_context.py,sha256=Hfri0x41XC4MRUjxc0-WMiZB_E4NvLp94ZYaHfYCWHM,3610
6
6
  dpdispatcher/distributed_shell.py,sha256=XMcXt8g1f2DY5HYhhyiN5ehV2ihKULY5ng-sB0B7YaI,6933
7
7
  dpdispatcher/dp_cloud_server.py,sha256=xVpDI0exBwHNSZECLJdfrQsvBzeUn5a0gx5Bzt9UAdU,9857
8
8
  dpdispatcher/dp_cloud_server_context.py,sha256=VfRRo4ruorWC8NVjW19EjmxQ0Rbz6XzxrHrJKl4cCZk,11255
9
9
  dpdispatcher/dpdisp.py,sha256=_dyH8xEgUR-s2xKkB20D9FIYhSHUCmzc2PxWgo9ildQ,94
10
+ dpdispatcher/fugaku.py,sha256=wSjY0XB3TNNWAPKHgMpoPl5jyYJIlijBcEkYXp6nrZQ,3733
10
11
  dpdispatcher/hdfs_cli.py,sha256=9Vrf7Kz_kJgXP2xEdZqNVNxRGbui5RrtnLtEjxfcq9A,6047
11
12
  dpdispatcher/hdfs_context.py,sha256=1jT1nzx7VGJFJ42MHTXoFWhfEu4KBkMBJO84klRAnPI,8938
12
13
  dpdispatcher/lazy_local_context.py,sha256=ZdWNqK3QF8SsoqnCjpFt3ZDRCIagjzJNlKPUYutRUC8,5692
@@ -17,7 +18,7 @@ dpdispatcher/pbs.py,sha256=LiULEKNDuisrKmOpZyB1af6sGDQ35xrAhMh7VMwpFbY,6327
17
18
  dpdispatcher/shell.py,sha256=kEP7za-qN71y_21p0uBNkopZ5s63Adq54904hjUHv48,4141
18
19
  dpdispatcher/slurm.py,sha256=krlyjzxK8gIhSsqcKHFvNiUwVE7411wTUwuW9xGzS-E,14648
19
20
  dpdispatcher/ssh_context.py,sha256=7Xrm8biVA7tAEDJ6YJZzC3nbdQrVBr_5UOhQNQ7qJ2g,35032
20
- dpdispatcher/submission.py,sha256=V8vdPSYgFJULbFs1H-K5Xj9_ynWC3vSmMekgngGPs44,44077
21
+ dpdispatcher/submission.py,sha256=r_F05nHTpN86b2os8RZAjZsCILNarDko2BjAEUYSntw,46643
21
22
  dpdispatcher/utils.py,sha256=RXUHJl3S2z26Em3SeltnxtdVM3kv7weXJKvBEjG6I34,5035
22
23
  dpdispatcher/dpcloudserver/__init__.py,sha256=FnX9HH-2dXADluNfucg98JPMfruMoBpN9ER9lZkVQvQ,49
23
24
  dpdispatcher/dpcloudserver/client.py,sha256=w1wQ8g-FMQlyh00LIAbJLE1xirGXocpp7zAnhbeM4V0,11152
@@ -25,9 +26,9 @@ dpdispatcher/dpcloudserver/config.py,sha256=vBRtzExJXTGfXPeBObXrZNAhBNXoFFzMkzSu
25
26
  dpdispatcher/dpcloudserver/retcode.py,sha256=1qAF8gFZx55u2sO8KbtYSIIrjcO-IGufEUlwbkSfC1g,721
26
27
  dpdispatcher/dpcloudserver/temp_test.py,sha256=jklOSu7tZ_wW5gycGRiUsbBWMLZDqCBslSYOCb2hTHw,2932
27
28
  dpdispatcher/dpcloudserver/zip_file.py,sha256=f9WrlktwHW0YipaWg5Y0kxjMZlhD1cJYa6EUpvu4Cro,2611
28
- dpdispatcher-0.5.7.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
29
- dpdispatcher-0.5.7.dist-info/METADATA,sha256=f-kzhWa-DzQnSOHvFxoT6A9rIZy7iOfFrZQpNvQCpPY,12280
30
- dpdispatcher-0.5.7.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
31
- dpdispatcher-0.5.7.dist-info/entry_points.txt,sha256=3bKn6IB6SYhKOUbbcOdBBevz4gsDmhmbogKMVn4ptOQ,52
32
- dpdispatcher-0.5.7.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
33
- dpdispatcher-0.5.7.dist-info/RECORD,,
29
+ dpdispatcher-0.5.8.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
30
+ dpdispatcher-0.5.8.dist-info/METADATA,sha256=o2oD8_6Ohc04mRTkJWi51-KOPamYqH0kvUD-E0iW-c0,12280
31
+ dpdispatcher-0.5.8.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
32
+ dpdispatcher-0.5.8.dist-info/entry_points.txt,sha256=3bKn6IB6SYhKOUbbcOdBBevz4gsDmhmbogKMVn4ptOQ,52
33
+ dpdispatcher-0.5.8.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
34
+ dpdispatcher-0.5.8.dist-info/RECORD,,