dpdispatcher 0.5.11__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- dpdispatcher/__init__.py +7 -89
- dpdispatcher/__main__.py +8 -0
- dpdispatcher/_version.py +14 -2
- dpdispatcher/base_context.py +1 -1
- dpdispatcher/contexts/__init__.py +11 -0
- dpdispatcher/{dp_cloud_server_context.py → contexts/dp_cloud_server_context.py} +7 -3
- dpdispatcher/{hdfs_context.py → contexts/hdfs_context.py} +2 -2
- dpdispatcher/{local_context.py → contexts/local_context.py} +51 -14
- dpdispatcher/{openapi_context.py → contexts/openapi_context.py} +3 -2
- dpdispatcher/{ssh_context.py → contexts/ssh_context.py} +113 -34
- dpdispatcher/dlog.py +31 -0
- dpdispatcher/dpdisp.py +113 -1
- dpdispatcher/entrypoints/__init__.py +1 -0
- dpdispatcher/entrypoints/gui.py +31 -0
- dpdispatcher/entrypoints/submission.py +83 -0
- dpdispatcher/machine.py +18 -4
- dpdispatcher/machines/__init__.py +11 -0
- dpdispatcher/{distributed_shell.py → machines/distributed_shell.py} +20 -4
- dpdispatcher/{dp_cloud_server.py → machines/dp_cloud_server.py} +21 -5
- dpdispatcher/{fugaku.py → machines/fugaku.py} +18 -5
- dpdispatcher/{lsf.py → machines/lsf.py} +20 -4
- dpdispatcher/{openapi.py → machines/openapi.py} +23 -4
- dpdispatcher/{pbs.py → machines/pbs.py} +30 -4
- dpdispatcher/{shell.py → machines/shell.py} +17 -3
- dpdispatcher/{slurm.py → machines/slurm.py} +37 -6
- dpdispatcher/submission.py +83 -39
- dpdispatcher/utils/__init__.py +1 -0
- dpdispatcher/{dpcloudserver → utils/dpcloudserver}/client.py +1 -1
- dpdispatcher/{hdfs_cli.py → utils/hdfs_cli.py} +1 -1
- dpdispatcher/utils/record.py +79 -0
- dpdispatcher/{utils.py → utils/utils.py} +14 -2
- {dpdispatcher-0.5.11.dist-info → dpdispatcher-0.6.1.dist-info}/METADATA +7 -2
- dpdispatcher-0.6.1.dist-info/RECORD +44 -0
- {dpdispatcher-0.5.11.dist-info → dpdispatcher-0.6.1.dist-info}/WHEEL +1 -1
- dpdispatcher-0.6.1.dist-info/entry_points.txt +7 -0
- dpdispatcher/dpcloudserver/temp_test.py +0 -90
- dpdispatcher-0.5.11.dist-info/RECORD +0 -36
- dpdispatcher-0.5.11.dist-info/entry_points.txt +0 -2
- /dpdispatcher/{lazy_local_context.py → contexts/lazy_local_context.py} +0 -0
- /dpdispatcher/{dpcloudserver → utils/dpcloudserver}/__init__.py +0 -0
- /dpdispatcher/{dpcloudserver → utils/dpcloudserver}/config.py +0 -0
- /dpdispatcher/{dpcloudserver → utils/dpcloudserver}/retcode.py +0 -0
- /dpdispatcher/{dpcloudserver → utils/dpcloudserver}/zip_file.py +0 -0
- /dpdispatcher/{JobStatus.py → utils/job_status.py} +0 -0
- {dpdispatcher-0.5.11.dist-info → dpdispatcher-0.6.1.dist-info}/LICENSE +0 -0
- {dpdispatcher-0.5.11.dist-info → dpdispatcher-0.6.1.dist-info}/top_level.txt +0 -0
dpdispatcher/{slurm.py → machines/slurm.py}
RENAMED
@@ -5,10 +5,14 @@ from typing import List
 
 from dargs import Argument
 
-from dpdispatcher import dlog
-from dpdispatcher.JobStatus import JobStatus
+from dpdispatcher.dlog import dlog
 from dpdispatcher.machine import Machine, script_command_template
-from dpdispatcher.utils import
+from dpdispatcher.utils.job_status import JobStatus
+from dpdispatcher.utils.utils import (
+    RetrySignal,
+    customized_script_header_template,
+    retry,
+)
 
 # from dpdispatcher.submission import Resources
 
@@ -20,6 +24,12 @@ slurm_script_header_template = """\
 {slurm_number_gpu_line}
 {slurm_partition_line}"""
 
+slurm_job_array_script_end_template = """
+wait
+
+{append_script_part}
+"""
+
 
 class Slurm(Machine):
     def gen_script(self, job):
@@ -48,7 +58,18 @@ class Slurm(Machine):
             ] = f"#SBATCH --partition {resources.queue_name}"
         else:
             script_header_dict["slurm_partition_line"] = ""
-        slurm_script_header = slurm_script_header_template.format(**script_header_dict)
+        if (
+            resources["strategy"].get("customized_script_header_template_file")
+            is not None
+        ):
+            slurm_script_header = customized_script_header_template(
+                resources["strategy"]["customized_script_header_template_file"],
+                resources,
+            )
+        else:
+            slurm_script_header = slurm_script_header_template.format(
+                **script_header_dict
+            )
         return slurm_script_header
 
     @retry()
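The new strategy key is read straight from the serialized resources, and the template file itself is plain text formatted with `Resources.serialize()`. A minimal sketch of wiring it up (the file name `custom_slurm_header.txt` and all values are illustrative, not taken from this diff):

```python
from dpdispatcher import Resources

# Hypothetical header template; placeholders such as {number_node} and
# {queue_name} are filled from Resources.serialize() by
# customized_script_header_template (see dpdispatcher/utils/utils.py below).
with open("custom_slurm_header.txt", "w") as f:
    f.write("#!/bin/bash\n#SBATCH -N {number_node}\n#SBATCH -p {queue_name}\n")

resources = Resources(
    number_node=1,
    cpu_per_node=4,
    gpu_per_node=0,
    queue_name="debug",  # illustrative queue name
    group_size=1,
    strategy={
        # new in 0.6: overrides slurm_script_header_template when set
        "customized_script_header_template_file": "custom_slurm_header.txt",
    },
)
```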
@@ -58,6 +79,9 @@ class Slurm(Machine):
         job_id_name = job.job_hash + "_job_id"
         # script_str = self.sub_script(job_dirs, cmd, args=args, resources=resources, outlog=outlog, errlog=errlog)
         self.context.write_file(fname=script_file_name, write_str=script_str)
+        script_run_str = self.gen_script_command(job)
+        script_run_file_name = f"{job.script_file_name}.run"
+        self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
         # self.context.write_file(fname=os.path.join(self.context.submission.work_base, script_file_name), write_str=script_str)
         ret, stdin, stdout, stderr = self.context.block_call(
             "cd {} && {} {}".format(
@@ -255,6 +279,7 @@ class SlurmJobArray(Slurm):
             log_err_part += f"2>>{shlex.quote(task.errlog)} "
 
         flag_if_job_task_fail = job.job_hash + "_flag_if_job_task_fail"
+        last_err_file = job.job_hash + "_last_err_file"
         single_script_command = script_command_template.format(
             flag_if_job_task_fail=flag_if_job_task_fail,
             command_env=command_env,
@@ -264,6 +289,8 @@ class SlurmJobArray(Slurm):
             command=task.command,
             task_tag_finished=task_tag_finished,
             log_err_part=log_err_part,
+            err_file=shlex.quote(task.errlog),
+            last_err_file=shlex.quote(last_err_file),
         )
         if ii % slurm_job_size == 0:
             script_command += f"{ii // slurm_job_size})\n"
@@ -279,9 +306,13 @@ class SlurmJobArray(Slurm):
         return script_command
 
     def gen_script_end(self, job):
-        # We cannot
+        # We cannot touch tag for job array
         # we may check task tag instead
-        return ""
+        append_script = job.resources.append_script
+        append_script_part = "\n".join(append_script)
+        return slurm_job_array_script_end_template.format(
+            append_script_part=append_script_part,
+        )
 
     @retry()
     def check_status(self, job):
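To make the job-array end script concrete, here is a toy sketch of what `gen_script_end` now produces (the `append_script` lines are illustrative):

```python
# Mirrors slurm_job_array_script_end_template from the hunk above.
slurm_job_array_script_end_template = """
wait

{append_script_part}
"""

append_script = ["echo all-tasks-done", "touch all_finished_tag"]  # hypothetical lines
print(
    slurm_job_array_script_end_template.format(
        append_script_part="\n".join(append_script)
    )
)
```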
dpdispatcher/submission.py
CHANGED
@@ -9,14 +9,16 @@ import random
 import time
 import uuid
 from hashlib import sha1
+from typing import List, Optional
 
+import yaml
 from dargs.dargs import Argument, Variant
 
-from dpdispatcher import dlog
-from dpdispatcher.JobStatus import JobStatus
+from dpdispatcher.dlog import dlog
 from dpdispatcher.machine import Machine
+from dpdispatcher.utils.job_status import JobStatus
+from dpdispatcher.utils.record import record
 
-# from dpdispatcher.slurm import SlurmResources
 # %%
 default_strategy = dict(if_cuda_multi_devices=False, ratio_unfinished=0.0)
 
@@ -247,9 +249,11 @@ class Submission:
                 time.sleep(check_interval)
             except (Exception, KeyboardInterrupt, SystemExit) as e:
                 self.submission_to_json()
+                record_path = record.write(self)
                 dlog.exception(e)
                 dlog.info(f"submission exit: {self.submission_hash}")
                 dlog.info(f"at {self.machine.context.remote_root}")
+                dlog.info(f"Submission information is saved in {str(record_path)}.")
                 dlog.debug(self.serialize())
                 raise e
             else:
@@ -272,6 +276,9 @@ class Submission:
             try:
                 self.download_jobs()
                 success = True
+            except FileNotFoundError as e:
+                # retry will never success if the file is not found
+                raise e
             except (EOFError, Exception) as e:
                 dlog.exception(e)
                 elapsed_time = time.time() - start_time
@@ -339,7 +346,6 @@ class Submission:
             dlog.debug(
                 f"debug:update_submission_state: job: {job.job_hash}, {job.job_id}, {job.job_state}"
             )
-        # self.submission_to_json()
 
     def handle_unexpected_submission_state(self):
         """Handle unexpected job state of the submission.
@@ -352,25 +358,16 @@ class Submission:
                 job.handle_unexpected_job_state()
             except Exception as e:
                 self.submission_to_json()
+                record_path = record.write(self)
                 raise RuntimeError(
                     f"Meet errors will handle unexpected submission state.\n"
                     f"Debug information: remote_root=={self.machine.context.remote_root}.\n"
                     f"Debug information: submission_hash=={self.submission_hash}.\n"
-                    f"Please check
-                    f"The
+                    f"Please check error messages above and in remote_root. "
+                    f"The submission information is saved in {str(record_path)}.\n"
+                    f"For furthur actions, run the following command with proper flags: dpdisp submission {self.submission_hash}"
                 ) from e
 
-    # not used here, submitting job is in handle_unexpected_submission_state.
-
-    # def submit_submission(self):
-    #     """submit the job belonging to the submission.
-    #     """
-    #     for job in self.belonging_jobs:
-    #         job.submit_job()
-    #         self.get_submission_state()
-
-    # def update_submi
-
     def check_ratio_unfinished(self, ratio_unfinished: float) -> bool:
         """Calculate the ratio of unfinished tasks in the submission.
 
@@ -505,6 +502,8 @@ class Submission:
 
     def clean_jobs(self):
         self.machine.context.clean()
+        assert self.submission_hash is not None
+        record.remove(self.submission_hash)
 
     def submission_to_json(self):
         # self.update_submission_state()
@@ -614,6 +613,13 @@ class Task:
             task_dict = json.load(f)
         return cls.load_from_dict(task_dict)
 
+    @classmethod
+    def load_from_yaml(cls, yaml_file):
+        with open(yaml_file) as f:
+            task_dict = yaml.safe_load(f)
+        task = cls.load_from_dict(task_dict=task_dict)
+        return task
+
     @classmethod
     def load_from_dict(cls, task_dict: dict) -> "Task":
         # check dict
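A quick sketch of the new YAML loader (the file name and contents are illustrative; the keys mirror the Task arguments shown below):

```python
from dpdispatcher import Task

# Hypothetical task.yaml:
#   command: "echo hello"
#   task_work_path: "./"
# forward_files, backward_files, outlog, and errlog may be omitted
# now that this diff makes them optional with defaults.
task = Task.load_from_yaml("task.yaml")  # parallels the existing load_from_json
```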
@@ -669,26 +675,30 @@ class Task:
             Argument("command", str, optional=False, doc=doc_command),
             Argument("task_work_path", str, optional=False, doc=doc_task_work_path),
             Argument(
-                "forward_files",
+                "forward_files",
+                List[str],
+                optional=True,
+                doc=doc_forward_files,
+                default=[],
             ),
             Argument(
                 "backward_files",
-                list,
-                optional=
+                List[str],
+                optional=True,
                 doc=doc_backward_files,
                 default=[],
             ),
             Argument(
                 "outlog",
                 [type(None), str],
-                optional=
+                optional=True,
                 doc=doc_outlog,
                 default="log",
             ),
             Argument(
                 "errlog",
                 [type(None), str],
-                optional=
+                optional=True,
                 doc=doc_errlog,
                 default="err",
             ),
@@ -835,17 +845,21 @@ class Job:
         if job_state == JobStatus.terminated:
             self.fail_count += 1
             dlog.info(
-                f"job: {self.job_hash} {self.job_id} terminated;"
+                f"job: {self.job_hash} {self.job_id} terminated; "
                 f"fail_cout is {self.fail_count}; resubmitting job"
             )
             retry_count = 3
             assert self.machine is not None
-            if hasattr(self.machine, "retry_count") and self.machine.retry_count
+            if hasattr(self.machine, "retry_count") and self.machine.retry_count >= 0:
                 retry_count = self.machine.retry_count + 1
             if (self.fail_count) > 0 and (self.fail_count % retry_count == 0):
-                raise RuntimeError(
-
+                last_error_message = self.get_last_error_message()
+                err_msg = (
+                    f"job:{self.job_hash} {self.job_id} failed {self.fail_count} times."
                 )
+                if last_error_message is not None:
+                    err_msg += f"\nPossible remote error message: {last_error_message}"
+                raise RuntimeError(err_msg)
             self.submit_job()
             if self.job_state != JobStatus.unsubmitted:
                 dlog.info(
@@ -923,6 +937,16 @@ class Job:
             self.job_hash + "_job.json", write_str=write_str
         )
 
+    def get_last_error_message(self) -> Optional[str]:
+        """Get last error message when the job is terminated."""
+        assert self.machine is not None
+        last_err_file = self.job_hash + "_last_err_file"
+        if self.machine.context.check_file_exists(last_err_file):
+            last_error_message = self.machine.context.read_file(last_err_file)
+            # red color
+            last_error_message = "\033[31m" + last_error_message + "\033[0m"
+            return last_error_message
+
 
 class Resources:
     """Resources is used to describe the machine resources we need to do calculations.
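Sketching the round trip with illustrative values: the job-array script writes a failing task's error log to `{job_hash}_last_err_file` on the remote side via the new `err_file`/`last_err_file` template fields in slurm.py (the shell logic lives in `script_command_template` in machine.py, which is not shown in this diff), and `get_last_error_message` reads it back wrapped in ANSI red:

```python
# Illustrative only: the ANSI wrapping used by get_last_error_message.
last_error_message = "slurmstepd: error: task 0 failed"  # hypothetical file content
print("\033[31m" + last_error_message + "\033[0m")  # renders red on ANSI terminals
```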
@@ -949,6 +973,9 @@ class Resources:
         Usually, this option will be used with Task.task_need_resources variable simultaneously.
     ratio_unfinished : float
         The ratio of `task` that can be unfinished.
+    customized_script_header_template_file : str
+        The customized template file to generate job submitting script header,
+        which overrides the default file.
     para_deg : int
         Decide how many tasks will be run in parallel.
         Usually run with `strategy['if_cuda_multi_devices']`
@@ -1007,12 +1034,8 @@ class Resources:
         # if self.gpu_per_node > 1:
         # self.in_para_task_num = 0
 
-        if "if_cuda_multi_devices" not in self.strategy:
-            self.strategy["if_cuda_multi_devices"] = default_strategy.get(
-                "if_cuda_multi_devices"
-            )
-        if "ratio_unfinished" not in self.strategy:
-            self.strategy["ratio_unfinished"] = default_strategy.get("ratio_unfinished")
+        for kk, value in default_strategy.items():
+            self.strategy.setdefault(kk, value)
         if self.strategy["if_cuda_multi_devices"] is True:
             if gpu_per_node < 1:
                 raise RuntimeError(
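The refactor collapses the per-key defaulting into one loop; `dict.setdefault` fills only the keys the user did not supply:

```python
default_strategy = dict(if_cuda_multi_devices=False, ratio_unfinished=0.0)
strategy = {"ratio_unfinished": 0.5}  # user-supplied value survives
for kk, value in default_strategy.items():
    strategy.setdefault(kk, value)
print(strategy)  # {'ratio_unfinished': 0.5, 'if_cuda_multi_devices': False}
```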
@@ -1080,7 +1103,14 @@ class Resources:
     def load_from_json(cls, json_file):
         with open(json_file) as f:
             resources_dict = json.load(f)
-        resources = cls.
+        resources = cls.load_from_dict(resources_dict=resources_dict)
+        return resources
+
+    @classmethod
+    def load_from_yaml(cls, yaml_file):
+        with open(yaml_file) as f:
+            resources_dict = yaml.safe_load(f)
+        resources = cls.load_from_dict(resources_dict=resources_dict)
         return resources
 
     @classmethod
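`Resources` gains the same YAML entry point; a minimal sketch (file name and values illustrative):

```python
from dpdispatcher import Resources

# Hypothetical resources.yaml:
#   number_node: 1
#   cpu_per_node: 4
#   gpu_per_node: 0
#   queue_name: "debug"
#   group_size: 1
resources = Resources.load_from_yaml("resources.yaml")
```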
@@ -1121,6 +1151,10 @@ class Resources:
             "Usually, this option will be used with Task.task_need_resources variable simultaneously."
         )
         doc_ratio_unfinished = "The ratio of `tasks` that can be unfinished."
+        doc_customized_script_header_template_file = (
+            "The customized template file to generate job submitting script header, "
+            "which overrides the default file."
+        )
 
         strategy_args = [
             Argument(
@@ -1137,6 +1171,12 @@ class Resources:
                 default=0.0,
                 doc=doc_ratio_unfinished,
             ),
+            Argument(
+                "customized_script_header_template_file",
+                str,
+                optional=True,
+                doc=doc_customized_script_header_template_file,
+            ),
         ]
         doc_strategy = "strategies we use to generation job submitting scripts."
         strategy_format = Argument(
@@ -1153,36 +1193,40 @@ class Resources:
             ),
             Argument("queue_name", str, optional=True, doc=doc_queue_name, default=""),
             Argument("group_size", int, optional=False, doc=doc_group_size),
-            Argument("custom_flags",
+            Argument("custom_flags", List[str], optional=True, doc=doc_custom_flags),
             # Argument("strategy", dict, optional=True, doc=doc_strategy,default=default_strategy),
             strategy_format,
             Argument("para_deg", int, optional=True, doc=doc_para_deg, default=1),
             Argument(
-                "source_list",
+                "source_list", List[str], optional=True, doc=doc_source_list, default=[]
             ),
             Argument(
                 "module_purge", bool, optional=True, doc=doc_module_purge, default=False
             ),
             Argument(
                 "module_unload_list",
-                list,
+                List[str],
                 optional=True,
                 doc=doc_module_unload_list,
                 default=[],
             ),
             Argument(
-                "module_list",
+                "module_list", List[str], optional=True, doc=doc_module_list, default=[]
             ),
             Argument("envs", dict, optional=True, doc=doc_envs, default={}),
             Argument(
                 "prepend_script",
-                list,
+                List[str],
                 optional=True,
                 doc=doc_prepend_script,
                 default=[],
             ),
             Argument(
-                "append_script",
+                "append_script",
+                List[str],
+                optional=True,
+                doc=doc_append_script,
+                default=[],
             ),
             Argument(
                 "wait_time", [int, float], optional=True, doc=doc_wait_time, default=0
dpdispatcher/utils/__init__.py
ADDED
@@ -0,0 +1 @@
+"""Utils."""
dpdispatcher/utils/record.py
ADDED
@@ -0,0 +1,79 @@
+import json
+from pathlib import Path
+from typing import List
+
+
+class Record:
+    """Record failed or canceled submissions."""
+
+    def __init__(self) -> None:
+        self.record_directory = Path.home() / ".dpdispatcher" / "submission"
+        self.record_directory.mkdir(parents=True, exist_ok=True)
+
+    def get_submissions(self) -> List[str]:
+        """Get all stored submission hashes.
+
+        Returns
+        -------
+        list[str]
+            List of submission hashes.
+        """
+        return [
+            f.stem
+            for f in self.record_directory.iterdir()
+            if (f.is_file() and f.suffix == ".json")
+        ]
+
+    def write(self, submission) -> Path:
+        """Write submission data to file.
+
+        Parameters
+        ----------
+        submission : dpdispatcher.Submission
+            Submission data.
+
+        Returns
+        -------
+        pathlib.Path
+            Path to submission data.
+        """
+        submission_path = self.record_directory / f"{submission.submission_hash}.json"
+        submission_path.write_text(json.dumps(submission.serialize(), indent=2))
+        return submission_path
+
+    def get_submission(self, hash: str, not_exist_ok: bool = False) -> Path:
+        """Get submission data by hash.
+
+        Parameters
+        ----------
+        hash : str
+            Hash of submission data.
+
+        Returns
+        -------
+        pathlib.Path
+            Path to submission data.
+        """
+        submission_file = self.record_directory / f"{hash}.json"
+        if not not_exist_ok and not submission_file.is_file():
+            raise FileNotFoundError(f"Submission file not found: {submission_file}")
+        return submission_file
+
+    def remove(self, hash: str):
+        """Remove submission data by hash.
+
+        Call this method when the remote directory is cleaned.
+
+        Parameters
+        ----------
+        hash : str
+            Hash of submission data.
+        """
+        path = self.get_submission(hash, not_exist_ok=True)
+        if path.is_file():
+            path.unlink()
+
+
+# the record object can be globally used
+record = Record()
+__all__ = ["record"]
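A brief sketch of the global `record` object in use (the hash is hypothetical; the directory is fixed to `~/.dpdispatcher/submission/`):

```python
from dpdispatcher.utils.record import record

# run_submission writes a record on failure or interrupt; clean_jobs removes it.
print(record.get_submissions())  # hashes of saved submissions, e.g. ['0123abcd']
path = record.get_submission("0123abcd", not_exist_ok=True)  # hypothetical hash
print(path)  # ~/.dpdispatcher/submission/0123abcd.json
```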
dpdispatcher/{utils.py → utils/utils.py}
RENAMED
@@ -1,12 +1,16 @@
 import base64
 import hashlib
 import hmac
+import os
 import struct
 import subprocess
 import time
-from typing import Callable, Optional, Type, Union
+from typing import TYPE_CHECKING, Callable, Optional, Type, Union
 
-from dpdispatcher import dlog
+from dpdispatcher.dlog import dlog
+
+if TYPE_CHECKING:
+    from dpdispatcher import Resources
 
 
 def get_sha256(filename):
@@ -193,3 +197,11 @@ def retry(
         return wrapper
 
     return decorator
+
+
+def customized_script_header_template(
+    filename: os.PathLike, resources: "Resources"
+) -> str:
+    with open(filename) as f:
+        template = f.read()
+    return template.format(**resources.serialize())
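Since the helper is just `str.format` over `Resources.serialize()`, any serialized field can appear as a placeholder. In miniature, with illustrative values:

```python
# What customized_script_header_template does once the file is read.
template = "#!/bin/bash\n#SBATCH -N {number_node}\n#SBATCH -p {queue_name}\n"
print(template.format(**{"number_node": 2, "queue_name": "debug"}))
```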
{dpdispatcher-0.5.11.dist-info → dpdispatcher-0.6.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dpdispatcher
-Version: 0.5.11
+Version: 0.6.1
 Summary: Generate HPC scheduler systems jobs input scripts, submit these scripts to HPC systems, and poke until they finish
 Author: DeepModeling
 License: GNU LESSER GENERAL PUBLIC LICENSE
@@ -186,9 +186,10 @@ Requires-Python: >=3.7
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: paramiko
-Requires-Dist: dargs >=0.
+Requires-Dist: dargs >=0.4.1
 Requires-Dist: requests
 Requires-Dist: tqdm >=4.9.0
+Requires-Dist: pyyaml
 Requires-Dist: typing-extensions ; python_version < "3.7"
 Provides-Extra: bohrium
 Requires-Dist: oss2 ; extra == 'bohrium'
@@ -205,7 +206,11 @@ Requires-Dist: sphinx-rtd-theme >=1.0.0rc1 ; extra == 'docs'
 Requires-Dist: numpydoc ; extra == 'docs'
 Requires-Dist: deepmodeling-sphinx >=0.1.1 ; extra == 'docs'
 Requires-Dist: dargs >=0.3.1 ; extra == 'docs'
+Requires-Dist: sphinx-argparse ; extra == 'docs'
+Provides-Extra: gui
+Requires-Dist: dpgui ; extra == 'gui'
 Provides-Extra: test
+Requires-Dist: dpgui ; extra == 'test'
 
 # DPDispatcher
 
dpdispatcher-0.6.1.dist-info/RECORD
ADDED
@@ -0,0 +1,44 @@
+dpdispatcher/__init__.py,sha256=CLZP_N5CTp14ujWCykEHuJjoIfKR6CwrclXhjWUgNoE,517
+dpdispatcher/__main__.py,sha256=BFhG-mSBzVZUEezQJqXWZnt2WsnhAHT_zpT8Y6gpOz0,116
+dpdispatcher/_version.py,sha256=lgiCYGSijhLK71WmuudWf_AyhNAutwQWx2V8bV6a5VQ,411
+dpdispatcher/arginfo.py,sha256=pNaxYIE6ahBidpR7OCKZdw8iGt003uTXGSlVzwiuvRg,188
+dpdispatcher/base_context.py,sha256=NvaC_RHyspxq412z-eCq4Zn8-szZxvn8K6OkXvx7l4Y,3615
+dpdispatcher/dlog.py,sha256=ndh12teQBbJRybXd8UjEmAi6QTsAXajRicDj5mAH5h0,799
+dpdispatcher/dpdisp.py,sha256=YuGb-HWLsDfSO2c7GH0eM20ciojGbx3yq9oZHP7u4yc,3498
+dpdispatcher/machine.py,sha256=XFRH41gNCex_qs9gbg-S88_qab3_UAGfxKWUPxoipCM,16140
+dpdispatcher/submission.py,sha256=mVAHBlT0a3_1PtsEvvhvwNPkAhgLiBXXemX64BcwizU,48447
+dpdispatcher/contexts/__init__.py,sha256=s5M0ZJSrPttSyLdBwKD2m3W7a5AbYZdPB7IAND2j7EY,335
+dpdispatcher/contexts/dp_cloud_server_context.py,sha256=6XK0B2sLGEDeZmV2SZzQdVrMcWAWYZVLLK-IaShEXIY,12245
+dpdispatcher/contexts/hdfs_context.py,sha256=GJs_vmDCjTsnbfTdXpFTfpWTYXnZTDkEO2UJIdpV5F4,8908
+dpdispatcher/contexts/lazy_local_context.py,sha256=F8abWAJRY1Ewx1sErINKN1ltWerXzeCcJgjTvLvucKE,5696
+dpdispatcher/contexts/local_context.py,sha256=7CoGzcX-RU6cpmSYcf4wMwncYaFVUb8Ljj4ksfXcx4s,13678
+dpdispatcher/contexts/openapi_context.py,sha256=DXaMS10SXN3VKEeEdzQyfOgRwUyHRJVCJHd2fKKdsmA,9499
+dpdispatcher/contexts/ssh_context.py,sha256=1UbMIFpSGBcg3H1My4cx5vjAALvaxlZxWOcXwoX6Ff0,38597
+dpdispatcher/entrypoints/__init__.py,sha256=exKSFT3j2oCerGwtI8WbHQK-D0K-CyifocRji1xntT4,20
+dpdispatcher/entrypoints/gui.py,sha256=29lMXqbmSRbLj4rfBv7Jnw89NLU9syTB88IUP6IRJsU,830
+dpdispatcher/entrypoints/submission.py,sha256=R2DXUGXTsk0Jw2y5Wjby40R5dxzXeqr4gn33ov6mdAI,2751
+dpdispatcher/machines/__init__.py,sha256=9kSYkz2w3flp00IrHWTEwvoFGrathQAT3tvbieye83c,335
+dpdispatcher/machines/distributed_shell.py,sha256=7avNcoOzEj7UcJuKl6b1ka2bj5dixcJaMlZK-I-i_Tc,7571
+dpdispatcher/machines/dp_cloud_server.py,sha256=SR69gsFb2BvOQCW1QnWfP3cQvu_qHLJNsycp5wzosJU,11706
+dpdispatcher/machines/fugaku.py,sha256=9OP3qSaaruqypHAdcuBFQM_MUtFp3yrvhZ5bPyLwEEk,4308
+dpdispatcher/machines/lsf.py,sha256=Qruot39cPEpBNbbPmDwb1Gyfgyw3N36O0hs9PNEXyVU,7997
+dpdispatcher/machines/openapi.py,sha256=Gzzbo8YOAybXGTrgMutexErcaEi3ts7uTUNvOhThFS8,8858
+dpdispatcher/machines/pbs.py,sha256=HGBUf96AJ7hWOQPrENP5tFIDnEm4cb9deqJJ8wExbms,7079
+dpdispatcher/machines/shell.py,sha256=qaia7mC_fz5Bqyelxmc1je-xg7NQ_6vQQ0qAjg2m4RQ,4796
+dpdispatcher/machines/slurm.py,sha256=SP5rQiCPWzq4rqgUgp0IGJXXD_1DURWl4OBRAJ-Kng4,15611
+dpdispatcher/utils/__init__.py,sha256=fwvwkMf7DFNQkNBiIce8Y8gRA6FhICwKjkKiXu_BEJg,13
+dpdispatcher/utils/hdfs_cli.py,sha256=Fy36JTrfdhuxGbaHe1hYY0KrlNp06Tbjwo5wpj4ph-8,5434
+dpdispatcher/utils/job_status.py,sha256=Eszs4TPLfszCuf6zLaFonf25feXDUguF28spYOjJpQE,233
+dpdispatcher/utils/record.py,sha256=c8jdPmCuLzRmFo_jOjR0j9zFR1EWX3NSHVuPEIYCycg,2147
+dpdispatcher/utils/utils.py,sha256=1One9eW-v3ejDcL6PB9PSCMZQkalnbxq0DfJoUwQaLs,5334
+dpdispatcher/utils/dpcloudserver/__init__.py,sha256=FnX9HH-2dXADluNfucg98JPMfruMoBpN9ER9lZkVQvQ,49
+dpdispatcher/utils/dpcloudserver/client.py,sha256=CLfXswvzI4inDrW2bYkfMQ6gQJFcZOgLhiXBz_EI17M,12029
+dpdispatcher/utils/dpcloudserver/config.py,sha256=NteQzf1OeEkz2UbkXHHQ0B72cUu23zLVzpM9Yh4v1Cc,559
+dpdispatcher/utils/dpcloudserver/retcode.py,sha256=1qAF8gFZx55u2sO8KbtYSIIrjcO-IGufEUlwbkSfC1g,721
+dpdispatcher/utils/dpcloudserver/zip_file.py,sha256=f9WrlktwHW0YipaWg5Y0kxjMZlhD1cJYa6EUpvu4Cro,2611
+dpdispatcher-0.6.1.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
+dpdispatcher-0.6.1.dist-info/METADATA,sha256=TKf52k3Vt9neBJNWXCimoMSmEkvC5ubCjVqUCPOa8_8,12752
+dpdispatcher-0.6.1.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
+dpdispatcher-0.6.1.dist-info/entry_points.txt,sha256=NRHUV0IU_u7_XtcmmEDnVzAcUmurhiEAGwENckrajo4,233
+dpdispatcher-0.6.1.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
+dpdispatcher-0.6.1.dist-info/RECORD,,