dpdispatcher 0.6.0__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
Files changed (45)
  1. dpdispatcher/__init__.py +7 -89
  2. dpdispatcher/__main__.py +8 -0
  3. dpdispatcher/_version.py +2 -2
  4. dpdispatcher/base_context.py +1 -1
  5. dpdispatcher/contexts/__init__.py +11 -0
  6. dpdispatcher/{dp_cloud_server_context.py → contexts/dp_cloud_server_context.py} +6 -3
  7. dpdispatcher/{hdfs_context.py → contexts/hdfs_context.py} +2 -2
  8. dpdispatcher/{local_context.py → contexts/local_context.py} +1 -10
  9. dpdispatcher/{openapi_context.py → contexts/openapi_context.py} +2 -2
  10. dpdispatcher/{ssh_context.py → contexts/ssh_context.py} +32 -11
  11. dpdispatcher/dlog.py +31 -0
  12. dpdispatcher/dpdisp.py +41 -2
  13. dpdispatcher/entrypoints/__init__.py +1 -0
  14. dpdispatcher/entrypoints/submission.py +83 -0
  15. dpdispatcher/machine.py +9 -1
  16. dpdispatcher/machines/__init__.py +11 -0
  17. dpdispatcher/{distributed_shell.py → machines/distributed_shell.py} +3 -3
  18. dpdispatcher/{dp_cloud_server.py → machines/dp_cloud_server.py} +5 -5
  19. dpdispatcher/{fugaku.py → machines/fugaku.py} +3 -3
  20. dpdispatcher/{lsf.py → machines/lsf.py} +7 -3
  21. dpdispatcher/{openapi.py → machines/openapi.py} +6 -4
  22. dpdispatcher/{pbs.py → machines/pbs.py} +3 -3
  23. dpdispatcher/{shell.py → machines/shell.py} +3 -3
  24. dpdispatcher/{slurm.py → machines/slurm.py} +19 -5
  25. dpdispatcher/submission.py +55 -33
  26. dpdispatcher/utils/__init__.py +1 -0
  27. dpdispatcher/{dpcloudserver → utils/dpcloudserver}/client.py +1 -1
  28. dpdispatcher/{hdfs_cli.py → utils/hdfs_cli.py} +1 -1
  29. dpdispatcher/utils/record.py +79 -0
  30. dpdispatcher/{utils.py → utils/utils.py} +1 -1
  31. {dpdispatcher-0.6.0.dist-info → dpdispatcher-0.6.1.dist-info}/METADATA +3 -2
  32. dpdispatcher-0.6.1.dist-info/RECORD +44 -0
  33. {dpdispatcher-0.6.0.dist-info → dpdispatcher-0.6.1.dist-info}/WHEEL +1 -1
  34. dpdispatcher/dpcloudserver/temp_test.py +0 -90
  35. dpdispatcher-0.6.0.dist-info/RECORD +0 -37
  36. /dpdispatcher/{lazy_local_context.py → contexts/lazy_local_context.py} +0 -0
  37. /dpdispatcher/{gui.py → entrypoints/gui.py} +0 -0
  38. /dpdispatcher/{dpcloudserver → utils/dpcloudserver}/__init__.py +0 -0
  39. /dpdispatcher/{dpcloudserver → utils/dpcloudserver}/config.py +0 -0
  40. /dpdispatcher/{dpcloudserver → utils/dpcloudserver}/retcode.py +0 -0
  41. /dpdispatcher/{dpcloudserver → utils/dpcloudserver}/zip_file.py +0 -0
  42. /dpdispatcher/{JobStatus.py → utils/job_status.py} +0 -0
  43. {dpdispatcher-0.6.0.dist-info → dpdispatcher-0.6.1.dist-info}/LICENSE +0 -0
  44. {dpdispatcher-0.6.0.dist-info → dpdispatcher-0.6.1.dist-info}/entry_points.txt +0 -0
  45. {dpdispatcher-0.6.0.dist-info → dpdispatcher-0.6.1.dist-info}/top_level.txt +0 -0
dpdispatcher/__init__.py CHANGED
@@ -1,103 +1,21 @@
- import logging
- import os
- import sys
- import warnings
-
- ROOT_PATH = tuple(__path__)[0]
- dlog = logging.getLogger(__name__)
- dlog.propagate = False
- dlog.setLevel(logging.INFO)
- try:
-     dlogf = logging.FileHandler(
-         os.getcwd() + os.sep + "dpdispatcher" + ".log", delay=True
-     )
- except PermissionError:
-     warnings.warn(
-         "dpdispatcher.log meet permission error. redirect the log to ~/dpdispatcher.log"
-     )
-     dlogf = logging.FileHandler(
-         os.path.join(os.path.expanduser("~"), "dpdispatcher.log")
-     )
-
- # dlogf = logging.FileHandler('./'+os.sep+SHORT_CMD+'.log')
- # dlogf = logging.FileHandler(os.path.join(os.environ['HOME'], SHORT_CMD+'.log'))
- # dlogf = logging.FileHandler(os.path.join(os.path.expanduser('~'), SHORT_CMD+'.log'))
- # dlogf = logging.FileHandler(os.path.join("/tmp/", SHORT_CMD+'.log'))
- dlogf_formatter = logging.Formatter("%(asctime)s - %(levelname)s : %(message)s")
- # dlogf_formatter=logging.Formatter('%(asctime)s - %(name)s - [%(filename)s:%(funcName)s - %(lineno)d ] - %(levelname)s \n %(message)s')
- dlogf.setFormatter(dlogf_formatter)
- dlog.addHandler(dlogf)
-
- dlog_stdout = logging.StreamHandler(sys.stdout)
- dlog_stdout.setFormatter(dlogf_formatter)
- dlog.addHandler(dlog_stdout)
-
  __author__ = "DeepModeling Team"
- __copyright__ = "Copyright 2019"
- __status__ = "Development"
+ __copyright__ = "Copyright 2019-2023, DeepModeling"
+ __status__ = "Production"

  try:
      from ._version import version as __version__
  except ImportError:
-     __version__ = "unkown"
-
- from .distributed_shell import DistributedShell
- from .dp_cloud_server import DpCloudServer, Lebesgue
- from .dp_cloud_server_context import DpCloudServerContext, LebesgueContext
- from .fugaku import Fugaku
- from .hdfs_context import HDFSContext
- from .lazy_local_context import LazyLocalContext
- from .local_context import LocalContext
- from .lsf import LSF
- from .machine import Machine
- from .openapi import OpenAPI
- from .openapi_context import OpenAPIContext
- from .pbs import PBS, Torque
- from .shell import Shell
- from .slurm import Slurm
- from .ssh_context import SSHContext
- from .submission import Job, Resources, Submission, Task
-
-
- def info():
-     """Show basic information about dpdispatcher, its location and version."""
-     print("DeepModeling\n------------")
-     print("Version: " + __version__)
-     print("Path: " + ROOT_PATH)
-     print("")
-     print("Dependency")
-     print("------------")
-     for modui in ["psutil", "paramiko", "dargs", "oss2"]:
-         try:
-             mm = __import__(modui)
-             print("%10s %10s %s" % (modui, mm.__version__, mm.__path__[0]))
-         except ImportError:
-             print("%10s %10s Not Found" % (modui, ""))
-     print()
+     __version__ = "unknown"

+ import dpdispatcher.contexts  # noqa: F401
+ import dpdispatcher.machines  # noqa: F401
+ from dpdispatcher.machine import Machine
+ from dpdispatcher.submission import Job, Resources, Submission, Task

  __all__ = [
      "__version__",
-     "DistributedShell",
-     "DpCloudServer",
-     "OpenAPI",
-     "OpenAPIContext",
-     "DpCloudServerContext",
-     "HDFSContext",
-     "LazyLocalContext",
-     "LocalContext",
-     "LSF",
      "Machine",
-     "PBS",
-     "Shell",
-     "Slurm",
-     "Fugaku",
-     "SSHContext",
      "Submission",
      "Task",
-     "Torque",
-     "info",
-     "Lebesgue",
-     "LebesgueContext",
      "Job",
      "Resources",
  ]
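
After this change the top-level package exports only the core classes; the concrete contexts and machines register themselves when the new dpdispatcher.contexts and dpdispatcher.machines subpackages are imported. A minimal sketch of the 0.6.1 import style (the machine dict values below are placeholders, not taken from this diff):

    from dpdispatcher import Machine, Resources, Submission, Task

    # Importing dpdispatcher already imported dpdispatcher.contexts and
    # dpdispatcher.machines, so batch/context types such as "Shell" and
    # "LazyLocalContext" resolve without importing their modules directly.
    machine = Machine.load_from_dict(
        {
            "batch_type": "Shell",
            "context_type": "LazyLocalContext",
            "local_root": "./",
        }
    )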
dpdispatcher/__main__.py ADDED
@@ -0,0 +1,8 @@
+ """Package dp entry point."""
+
+ from dpdispatcher.dpdisp import (
+     main,
+ )
+
+ if __name__ == "__main__":
+     main()
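
With this file in place, python -m dpdispatcher behaves like the dpdisp console script; for example, python -m dpdispatcher --help lists the subcommands, including the new submission subcommand added in dpdisp.py below.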
dpdispatcher/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE

- __version__ = version = '0.6.0'
- __version_tuple__ = version_tuple = (0, 6, 0)
+ __version__ = version = '0.6.1'
+ __version_tuple__ = version_tuple = (0, 6, 1)
dpdispatcher/base_context.py CHANGED
@@ -3,7 +3,7 @@ from typing import List, Tuple

  from dargs import Argument

- from dpdispatcher import dlog
+ from dpdispatcher.dlog import dlog


  class BaseContext(metaclass=ABCMeta):
dpdispatcher/contexts/__init__.py ADDED
@@ -0,0 +1,11 @@
+ """Contexts."""
+ import importlib
+ from pathlib import Path
+
+ PACKAGE_BASE = "dpdispatcher.contexts"
+ NOT_LOADABLE = ("__init__.py",)
+
+ for module_file in Path(__file__).parent.glob("*.py"):
+     if module_file.name not in NOT_LOADABLE:
+         module_name = f".{module_file.stem}"
+         importlib.import_module(module_name, PACKAGE_BASE)
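
Both new subpackages (contexts here, machines below) use the same convention: every module in the directory is imported when the package is imported, so a new subclass is discovered simply by dropping its file into the directory. A hedged sketch of what such a drop-in module could look like (the file and class names are hypothetical):

    # dpdispatcher/contexts/my_context.py -- hypothetical drop-in module
    from dpdispatcher.base_context import BaseContext


    class MyContext(BaseContext):
        # contexts/__init__.py globs "*.py" and imports every module, so
        # this file is imported automatically and the subclass defined here
        # becomes visible to dpdispatcher without editing any import list.
        ...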
dpdispatcher/{dp_cloud_server_context.py → contexts/dp_cloud_server_context.py} RENAMED
@@ -8,13 +8,16 @@ from typing import List
  import tqdm
  from dargs.dargs import Argument

- from dpdispatcher import dlog
  from dpdispatcher.base_context import BaseContext
- from dpdispatcher.dpcloudserver.config import ALI_STS_BUCKET_NAME, ALI_STS_ENDPOINT
+ from dpdispatcher.dlog import dlog

  # from dpdispatcher.submission import Machine
  # from . import dlog
- from .dpcloudserver import Client, zip_file
+ from dpdispatcher.utils.dpcloudserver import Client, zip_file
+ from dpdispatcher.utils.dpcloudserver.config import (
+     ALI_STS_BUCKET_NAME,
+     ALI_STS_ENDPOINT,
+ )

  # from zip_file import zip_files

dpdispatcher/{hdfs_context.py → contexts/hdfs_context.py} RENAMED
@@ -3,9 +3,9 @@ import shutil
  import tarfile
  from glob import glob

- from dpdispatcher import dlog
  from dpdispatcher.base_context import BaseContext
- from dpdispatcher.hdfs_cli import HDFS
+ from dpdispatcher.dlog import dlog
+ from dpdispatcher.utils.hdfs_cli import HDFS


  class HDFSContext(BaseContext):
dpdispatcher/{local_context.py → contexts/local_context.py} RENAMED
@@ -1,12 +1,11 @@
- import hashlib
  import os
  import shutil
  import subprocess as sp
  from glob import glob
  from subprocess import TimeoutExpired

- from dpdispatcher import dlog
  from dpdispatcher.base_context import BaseContext
+ from dpdispatcher.dlog import dlog


  class SPRetObj:
@@ -30,14 +29,6 @@ def _check_file_path(fname):
      os.makedirs(dirname, exist_ok=True)


- def _identical_files(fname0, fname1):
-     with open(fname0) as fp:
-         code0 = hashlib.sha1(fp.read().encode("utf-8")).hexdigest()
-     with open(fname1) as fp:
-         code1 = hashlib.sha1(fp.read().encode("utf-8")).hexdigest()
-     return code0 == code1
-
-
  class LocalContext(BaseContext):
      """Run jobs in the local server and remote directory.

dpdispatcher/{openapi_context.py → contexts/openapi_context.py} RENAMED
@@ -14,9 +14,9 @@ except ModuleNotFoundError:
  else:
      found_bohriumsdk = True

- from dpdispatcher import dlog
  from dpdispatcher.base_context import BaseContext
- from dpdispatcher.JobStatus import JobStatus
+ from dpdispatcher.dlog import dlog
+ from dpdispatcher.utils.job_status import JobStatus

  DP_CLOUD_SERVER_HOME_DIR = os.path.join(
      os.path.expanduser("~"), ".dpdispatcher/", "dp_cloud_server/"
dpdispatcher/{ssh_context.py → contexts/ssh_context.py} RENAMED
@@ -18,11 +18,17 @@ import paramiko
  import paramiko.ssh_exception
  from dargs.dargs import Argument

- from dpdispatcher import dlog
  from dpdispatcher.base_context import BaseContext
+ from dpdispatcher.dlog import dlog

  # from dpdispatcher.submission import Machine
- from dpdispatcher.utils import RetrySignal, generate_totp, get_sha256, retry, rsync
+ from dpdispatcher.utils.utils import (
+     RetrySignal,
+     generate_totp,
+     get_sha256,
+     retry,
+     rsync,
+ )


  class SSHSession:
@@ -140,6 +146,7 @@ class SSHSession:
          # Make a Paramiko Transport object using the socket
          ts = paramiko.Transport(sock)
          ts.banner_timeout = 60
+         ts.auth_timeout = self.timeout + 20
          ts.use_compression(compress=True)

          # Tell Paramiko that the Transport is going to be used as a client
@@ -939,21 +946,35 @@ class SSHContext(BaseContext):
          per_nfile = 100
          ntar = len(files) // per_nfile + 1
          if ntar <= 1:
-             self.block_checkcall(
-                 "tar {} {} {}".format(
-                     tar_command,
-                     shlex.quote(of),
-                     " ".join([shlex.quote(file) for file in files]),
+             try:
+                 self.block_checkcall(
+                     "tar {} {} {}".format(
+                         tar_command,
+                         shlex.quote(of),
+                         " ".join([shlex.quote(file) for file in files]),
+                     )
                  )
-             )
+             except RuntimeError as e:
+                 if "No such file or directory" in str(e):
+                     raise FileNotFoundError(
+                         "Any of the backward files does not exist in the remote directory."
+                     ) from e
+                 raise e
          else:
              file_list_file = os.path.join(
                  self.remote_root, ".tmp.tar." + str(uuid.uuid4())
              )
              self.write_file(file_list_file, "\n".join(files))
-             self.block_checkcall(
-                 f"tar {tar_command} {shlex.quote(of)} -T {shlex.quote(file_list_file)}"
-             )
+             try:
+                 self.block_checkcall(
+                     f"tar {tar_command} {shlex.quote(of)} -T {shlex.quote(file_list_file)}"
+                 )
+             except RuntimeError as e:
+                 if "No such file or directory" in str(e):
+                     raise FileNotFoundError(
+                         "Any of the backward files does not exist in the remote directory."
+                     ) from e
+                 raise e
          # trans
          from_f = pathlib.PurePath(os.path.join(self.remote_root, of)).as_posix()
          to_f = pathlib.PurePath(os.path.join(self.local_root, of)).as_posix()
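
Together with the matching FileNotFoundError handling added in submission.py below, this lets a genuinely missing backward file fail fast instead of being retried as a transient transfer error. A sketch of catching it at the call site (the submission object is assumed to be built and bound elsewhere):

    def download_or_report(submission) -> None:
        try:
            submission.download_jobs()
        except FileNotFoundError as e:
            # Raised when a declared backward file never appeared in the
            # remote task directory; retrying cannot succeed.
            print(f"Missing backward file: {e}")
            raise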
dpdispatcher/dlog.py ADDED
@@ -0,0 +1,31 @@
+ import logging
+ import os
+ import sys
+ import warnings
+
+ dlog = logging.getLogger("dpdispatcher")
+ dlog.propagate = False
+ dlog.setLevel(logging.INFO)
+ try:
+     dlogf = logging.FileHandler(
+         os.getcwd() + os.sep + "dpdispatcher" + ".log", delay=True
+     )
+ except PermissionError:
+     warnings.warn(
+         "dpdispatcher.log meet permission error. redirect the log to ~/dpdispatcher.log"
+     )
+     dlogf = logging.FileHandler(
+         os.path.join(os.path.expanduser("~"), "dpdispatcher.log"), delay=True
+     )
+
+ dlogf_formatter = logging.Formatter("%(asctime)s - %(levelname)s : %(message)s")
+ dlogf.setFormatter(dlogf_formatter)
+ dlog.addHandler(dlogf)
+
+ dlog_stdout = logging.StreamHandler(sys.stdout)
+ dlog_stdout.setFormatter(dlogf_formatter)
+ dlog.addHandler(dlog_stdout)
+
+ __all__ = [
+     "dlog",
+ ]
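
Because the handler setup now lives in its own module and hangs off the fixed logger name "dpdispatcher", downstream code can tune the logging with the standard library alone. A small sketch:

    import logging

    # dlog.py attaches its file and stdout handlers on the first import of
    # dpdispatcher; afterwards the named logger behaves like any other.
    logging.getLogger("dpdispatcher").setLevel(logging.DEBUG)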
dpdispatcher/dpdisp.py CHANGED
@@ -2,7 +2,8 @@
  import argparse
  from typing import List, Optional

- from dpdispatcher.gui import start_dpgui
+ from dpdispatcher.entrypoints.gui import start_dpgui
+ from dpdispatcher.entrypoints.submission import handle_submission


  def main_parser() -> argparse.ArgumentParser:
@@ -23,6 +24,37 @@ def main_parser() -> argparse.ArgumentParser:
      )
      subparsers = parser.add_subparsers(title="Valid subcommands", dest="command")
      ##########################################
+     # backward
+     parser_submission = subparsers.add_parser(
+         "submission",
+         help="Handle terminated submission.",
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+     )
+     parser_submission.add_argument(
+         "SUBMISSION_HASH",
+         type=str,
+         help="Submission hash to download.",
+     )
+     parser_submission_action = parser_submission.add_argument_group(
+         "Actions",
+         description="One or more actions to take on submission.",
+     )
+     parser_submission_action.add_argument(
+         "--download-terminated-log",
+         action="store_true",
+         help="Download log files of terminated tasks.",
+     )
+     parser_submission_action.add_argument(
+         "--download-finished-task",
+         action="store_true",
+         help="Download finished tasks.",
+     )
+     parser_submission_action.add_argument(
+         "--clean",
+         action="store_true",
+         help="Clean submission.",
+     )
+     ##########################################
      # gui
      parser_gui = subparsers.add_parser(
          "gui",
@@ -67,7 +99,14 @@ def parse_args(args: Optional[List[str]] = None):

  def main():
      args = parse_args()
-     if args.command == "gui":
+     if args.command == "submission":
+         handle_submission(
+             submission_hash=args.SUBMISSION_HASH,
+             download_terminated_log=args.download_terminated_log,
+             download_finished_task=args.download_finished_task,
+             clean=args.clean,
+         )
+     elif args.command == "gui":
          start_dpgui(
              port=args.port,
              bind_all=args.bind_all,
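
The new subcommand operates on the submission records written when a run fails (see utils/record.py below). Typical invocations, with the hash taken from the log of a failed run:

    dpdisp submission <SUBMISSION_HASH> --download-terminated-log
    dpdisp submission <SUBMISSION_HASH> --download-finished-task --clean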
dpdispatcher/entrypoints/__init__.py ADDED
@@ -0,0 +1 @@
+ """Entry points."""
dpdispatcher/entrypoints/submission.py ADDED
@@ -0,0 +1,83 @@
+ from pathlib import Path
+
+ from dpdispatcher.dlog import dlog
+ from dpdispatcher.submission import Submission
+ from dpdispatcher.utils.job_status import JobStatus
+ from dpdispatcher.utils.record import record
+
+
+ def handle_submission(
+     *,
+     submission_hash: str,
+     download_terminated_log: bool = False,
+     download_finished_task: bool = False,
+     clean: bool = False,
+ ):
+     """Handle terminated submission.
+
+     Parameters
+     ----------
+     submission_hash : str
+         Submission hash to download.
+     download_terminated_log : bool, optional
+         Download log files of terminated tasks.
+     download_finished_task : bool, optional
+         Download finished tasks.
+     clean : bool, optional
+         Clean submission.
+
+     Raises
+     ------
+     ValueError
+         At least one action should be specified.
+     """
+     if int(download_terminated_log) + int(download_finished_task) + int(clean) == 0:
+         raise ValueError("At least one action should be specified.")
+
+     submission_file = record.get_submission(submission_hash)
+     submission = Submission.submission_from_json(str(submission_file))
+     submission.belonging_tasks = [
+         task for job in submission.belonging_jobs for task in job.job_task_list
+     ]
+     # TODO: for unclear reason, the submission_hash may be changed
+     submission.submission_hash = submission_hash
+     submission.machine.context.bind_submission(submission)
+     submission.update_submission_state()
+
+     terminated_tasks = []
+     finished_tasks = []
+     for task in submission.belonging_tasks:
+         task.get_task_state(submission.machine.context)
+         if task.task_state == JobStatus.terminated:
+             terminated_tasks.append(task)
+         elif task.task_state == JobStatus.finished:
+             finished_tasks.append(task)
+     submission.belonging_tasks = []
+
+     if download_terminated_log:
+         for task in terminated_tasks:
+             task.backward_files = [task.outlog, task.errlog]
+         submission.belonging_tasks += terminated_tasks
+     if download_finished_task:
+         submission.belonging_tasks += finished_tasks
+
+     submission.download_jobs()
+
+     if download_terminated_log:
+         terminated_log_files = []
+         for task in terminated_tasks:
+             assert submission.local_root is not None
+             terminated_log_files.append(
+                 Path(submission.local_root) / task.task_work_path / task.outlog
+             )
+             terminated_log_files.append(
+                 Path(submission.local_root) / task.task_work_path / task.errlog
+             )
+
+         dlog.info(
+             "Terminated logs are downloaded into:\n "
+             + "\n ".join([str(f) for f in terminated_log_files])
+         )
+
+     if clean:
+         submission.clean_jobs()
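
The same recovery path is callable from Python; the keyword-only parameters mirror the CLI flags. A short sketch (the hash value is a placeholder):

    from dpdispatcher.entrypoints.submission import handle_submission

    # Raises ValueError unless at least one action is requested, and
    # FileNotFoundError if no record exists for the given hash.
    handle_submission(
        submission_hash="<submission hash>",  # placeholder
        download_terminated_log=True,
    )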
dpdispatcher/machine.py CHANGED
@@ -4,10 +4,11 @@ import shlex
  from abc import ABCMeta, abstractmethod
  from typing import List, Tuple

+ import yaml
  from dargs import Argument, Variant

- from dpdispatcher import dlog
  from dpdispatcher.base_context import BaseContext
+ from dpdispatcher.dlog import dlog

  script_template = """\
  {script_header}
@@ -124,6 +125,13 @@ class Machine(metaclass=ABCMeta):
          machine = cls.load_from_dict(machine_dict=machine_dict)
          return machine

+     @classmethod
+     def load_from_yaml(cls, yaml_path):
+         with open(yaml_path) as f:
+             machine_dict = yaml.safe_load(f)
+         machine = cls.load_from_dict(machine_dict=machine_dict)
+         return machine
+
      @classmethod
      def load_from_dict(cls, machine_dict):
          batch_type = machine_dict["batch_type"]
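
load_from_yaml reads the same schema as the existing JSON loader (the new pyyaml requirement in METADATA below supports it); Task and Resources gain matching methods in submission.py. A hedged sketch, with placeholder field values:

    from dpdispatcher import Machine

    # machine.yaml -- same keys as the JSON machine file; the values below
    # are placeholders:
    #   batch_type: Shell
    #   context_type: LazyLocalContext
    #   local_root: ./
    machine = Machine.load_from_yaml("machine.yaml")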
dpdispatcher/machines/__init__.py ADDED
@@ -0,0 +1,11 @@
+ """Machines."""
+ import importlib
+ from pathlib import Path
+
+ PACKAGE_BASE = "dpdispatcher.machines"
+ NOT_LOADABLE = ("__init__.py",)
+
+ for module_file in Path(__file__).parent.glob("*.py"):
+     if module_file.name not in NOT_LOADABLE:
+         module_name = f".{module_file.stem}"
+         importlib.import_module(module_name, PACKAGE_BASE)
dpdispatcher/{distributed_shell.py → machines/distributed_shell.py} RENAMED
@@ -1,7 +1,7 @@
- from dpdispatcher import dlog
- from dpdispatcher.JobStatus import JobStatus
+ from dpdispatcher.dlog import dlog
  from dpdispatcher.machine import Machine
- from dpdispatcher.utils import (
+ from dpdispatcher.utils.job_status import JobStatus
+ from dpdispatcher.utils.utils import (
      customized_script_header_template,
      run_cmd_with_all_output,
  )
dpdispatcher/{dp_cloud_server.py → machines/dp_cloud_server.py} RENAMED
@@ -4,12 +4,12 @@ import time
  import uuid
  import warnings

- from dpdispatcher import dlog
- from dpdispatcher.dpcloudserver import Client, zip_file
- from dpdispatcher.dpcloudserver.config import ALI_OSS_BUCKET_URL
- from dpdispatcher.JobStatus import JobStatus
+ from dpdispatcher.dlog import dlog
  from dpdispatcher.machine import Machine
- from dpdispatcher.utils import customized_script_header_template
+ from dpdispatcher.utils.dpcloudserver import Client, zip_file
+ from dpdispatcher.utils.dpcloudserver.config import ALI_OSS_BUCKET_URL
+ from dpdispatcher.utils.job_status import JobStatus
+ from dpdispatcher.utils.utils import customized_script_header_template

  shell_script_header_template = """
  #!/bin/bash -l
dpdispatcher/{fugaku.py → machines/fugaku.py} RENAMED
@@ -1,9 +1,9 @@
  import shlex

- from dpdispatcher import dlog
- from dpdispatcher.JobStatus import JobStatus
+ from dpdispatcher.dlog import dlog
  from dpdispatcher.machine import Machine
- from dpdispatcher.utils import customized_script_header_template
+ from dpdispatcher.utils.job_status import JobStatus
+ from dpdispatcher.utils.utils import customized_script_header_template

  fugaku_script_header_template = """\
  {queue_name_line}
dpdispatcher/{lsf.py → machines/lsf.py} RENAMED
@@ -3,10 +3,14 @@ from typing import List

  from dargs import Argument

- from dpdispatcher import dlog
- from dpdispatcher.JobStatus import JobStatus
+ from dpdispatcher.dlog import dlog
  from dpdispatcher.machine import Machine
- from dpdispatcher.utils import RetrySignal, customized_script_header_template, retry
+ from dpdispatcher.utils.job_status import JobStatus
+ from dpdispatcher.utils.utils import (
+     RetrySignal,
+     customized_script_header_template,
+     retry,
+ )

  lsf_script_header_template = """\
  #!/bin/bash -l
dpdispatcher/{openapi.py → machines/openapi.py} RENAMED
@@ -2,7 +2,7 @@ import os
  import shutil
  import time

- from dpdispatcher.utils import customized_script_header_template
+ from dpdispatcher.utils.utils import customized_script_header_template

  try:
      from bohriumsdk.client import Client
@@ -14,9 +14,9 @@ except ModuleNotFoundError:
  else:
      found_bohriumsdk = True

- from dpdispatcher import dlog
- from dpdispatcher.JobStatus import JobStatus
+ from dpdispatcher.dlog import dlog
  from dpdispatcher.machine import Machine
+ from dpdispatcher.utils.job_status import JobStatus

  shell_script_header_template = """
  #!/bin/bash -l
@@ -147,7 +147,9 @@ class OpenAPI(Machine):
          )

          job_state = self.map_dp_job_state(
-             dp_job_status, check_return.get("exitCode", 0), self.ignore_exit_code  # type: ignore
+             dp_job_status,
+             check_return.get("exitCode", 0),  # type: ignore
+             self.ignore_exit_code,
          )
          if job_state == JobStatus.finished:
              job_log = self.job.log(job_id)
dpdispatcher/{pbs.py → machines/pbs.py} RENAMED
@@ -1,9 +1,9 @@
  import shlex

- from dpdispatcher import dlog
- from dpdispatcher.JobStatus import JobStatus
+ from dpdispatcher.dlog import dlog
  from dpdispatcher.machine import Machine
- from dpdispatcher.utils import customized_script_header_template
+ from dpdispatcher.utils.job_status import JobStatus
+ from dpdispatcher.utils.utils import customized_script_header_template

  pbs_script_header_template = """
  #!/bin/bash -l
dpdispatcher/{shell.py → machines/shell.py} RENAMED
@@ -1,9 +1,9 @@
  import shlex

- from dpdispatcher import dlog
- from dpdispatcher.JobStatus import JobStatus
+ from dpdispatcher.dlog import dlog
  from dpdispatcher.machine import Machine
- from dpdispatcher.utils import customized_script_header_template
+ from dpdispatcher.utils.job_status import JobStatus
+ from dpdispatcher.utils.utils import customized_script_header_template

  shell_script_header_template = """
  #!/bin/bash -l
dpdispatcher/{slurm.py → machines/slurm.py} RENAMED
@@ -5,10 +5,14 @@ from typing import List

  from dargs import Argument

- from dpdispatcher import dlog
- from dpdispatcher.JobStatus import JobStatus
+ from dpdispatcher.dlog import dlog
  from dpdispatcher.machine import Machine, script_command_template
- from dpdispatcher.utils import RetrySignal, customized_script_header_template, retry
+ from dpdispatcher.utils.job_status import JobStatus
+ from dpdispatcher.utils.utils import (
+     RetrySignal,
+     customized_script_header_template,
+     retry,
+ )

  # from dpdispatcher.submission import Resources

@@ -20,6 +24,12 @@ slurm_script_header_template = """\
  {slurm_number_gpu_line}
  {slurm_partition_line}"""

+ slurm_job_array_script_end_template = """
+ wait
+
+ {append_script_part}
+ """
+

  class Slurm(Machine):
      def gen_script(self, job):
@@ -296,9 +306,13 @@ class SlurmJobArray(Slurm):
          return script_command

      def gen_script_end(self, job):
-         # We cannot have a end script for job array
+         # We cannot touch tag for job array
          # we may check task tag instead
-         return ""
+         append_script = job.resources.append_script
+         append_script_part = "\n".join(append_script)
+         return slurm_job_array_script_end_template.format(
+             append_script_part=append_script_part,
+         )

      @retry()
      def check_status(self, job):
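
With this change, append_script from the resources is honored for SlurmJobArray as well: it is emitted after the wait line of the generated job-array script. A sketch of the relevant resources fields (values are placeholders):

    resources_dict = {
        "number_node": 1,
        "cpu_per_node": 4,
        "gpu_per_node": 0,
        "group_size": 10,
        # now emitted after `wait` in the SlurmJobArray script end
        "append_script": ["echo all-tasks-done"],
    }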
dpdispatcher/submission.py CHANGED
@@ -9,15 +9,16 @@ import random
  import time
  import uuid
  from hashlib import sha1
- from typing import Optional
+ from typing import List, Optional

+ import yaml
  from dargs.dargs import Argument, Variant

- from dpdispatcher import dlog
- from dpdispatcher.JobStatus import JobStatus
+ from dpdispatcher.dlog import dlog
  from dpdispatcher.machine import Machine
+ from dpdispatcher.utils.job_status import JobStatus
+ from dpdispatcher.utils.record import record

- # from dpdispatcher.slurm import SlurmResources
  # %%
  default_strategy = dict(if_cuda_multi_devices=False, ratio_unfinished=0.0)

@@ -248,9 +249,11 @@ class Submission:
                  time.sleep(check_interval)
          except (Exception, KeyboardInterrupt, SystemExit) as e:
              self.submission_to_json()
+             record_path = record.write(self)
              dlog.exception(e)
              dlog.info(f"submission exit: {self.submission_hash}")
              dlog.info(f"at {self.machine.context.remote_root}")
+             dlog.info(f"Submission information is saved in {str(record_path)}.")
              dlog.debug(self.serialize())
              raise e
          else:
@@ -273,6 +276,9 @@ class Submission:
              try:
                  self.download_jobs()
                  success = True
+             except FileNotFoundError as e:
+                 # retry will never success if the file is not found
+                 raise e
              except (EOFError, Exception) as e:
                  dlog.exception(e)
                  elapsed_time = time.time() - start_time
@@ -340,7 +346,6 @@ class Submission:
              dlog.debug(
                  f"debug:update_submission_state: job: {job.job_hash}, {job.job_id}, {job.job_state}"
              )
-         # self.submission_to_json()

      def handle_unexpected_submission_state(self):
          """Handle unexpected job state of the submission.
@@ -353,25 +358,16 @@ class Submission:
                  job.handle_unexpected_job_state()
          except Exception as e:
              self.submission_to_json()
+             record_path = record.write(self)
              raise RuntimeError(
                  f"Meet errors will handle unexpected submission state.\n"
                  f"Debug information: remote_root=={self.machine.context.remote_root}.\n"
                  f"Debug information: submission_hash=={self.submission_hash}.\n"
-                 f"Please check the dirs and scripts in remote_root. "
-                 f"The job information mentioned above may help."
+                 f"Please check error messages above and in remote_root. "
+                 f"The submission information is saved in {str(record_path)}.\n"
+                 f"For furthur actions, run the following command with proper flags: dpdisp submission {self.submission_hash}"
              ) from e

-     # not used here, submitting job is in handle_unexpected_submission_state.
-
-     # def submit_submission(self):
-     #     """submit the job belonging to the submission.
-     #     """
-     #     for job in self.belonging_jobs:
-     #         job.submit_job()
-     #     self.get_submission_state()
-
-     # def update_submi
-
      def check_ratio_unfinished(self, ratio_unfinished: float) -> bool:
          """Calculate the ratio of unfinished tasks in the submission.

@@ -506,6 +502,8 @@ class Submission:

      def clean_jobs(self):
          self.machine.context.clean()
+         assert self.submission_hash is not None
+         record.remove(self.submission_hash)

      def submission_to_json(self):
          # self.update_submission_state()
@@ -615,6 +613,13 @@ class Task:
              task_dict = json.load(f)
          return cls.load_from_dict(task_dict)

+     @classmethod
+     def load_from_yaml(cls, yaml_file):
+         with open(yaml_file) as f:
+             task_dict = yaml.safe_load(f)
+         task = cls.load_from_dict(task_dict=task_dict)
+         return task
+
      @classmethod
      def load_from_dict(cls, task_dict: dict) -> "Task":
          # check dict
@@ -670,26 +675,30 @@ class Task:
          Argument("command", str, optional=False, doc=doc_command),
          Argument("task_work_path", str, optional=False, doc=doc_task_work_path),
          Argument(
-             "forward_files", list, optional=False, doc=doc_forward_files, default=[]
+             "forward_files",
+             List[str],
+             optional=True,
+             doc=doc_forward_files,
+             default=[],
          ),
          Argument(
              "backward_files",
-             list,
-             optional=False,
+             List[str],
+             optional=True,
              doc=doc_backward_files,
              default=[],
          ),
          Argument(
              "outlog",
              [type(None), str],
-             optional=False,
+             optional=True,
              doc=doc_outlog,
              default="log",
          ),
          Argument(
              "errlog",
              [type(None), str],
-             optional=False,
+             optional=True,
              doc=doc_errlog,
              default="err",
          ),
@@ -836,16 +845,18 @@ class Job:
          if job_state == JobStatus.terminated:
              self.fail_count += 1
              dlog.info(
-                 f"job: {self.job_hash} {self.job_id} terminated;"
+                 f"job: {self.job_hash} {self.job_id} terminated; "
                  f"fail_cout is {self.fail_count}; resubmitting job"
              )
              retry_count = 3
              assert self.machine is not None
-             if hasattr(self.machine, "retry_count") and self.machine.retry_count > 0:
+             if hasattr(self.machine, "retry_count") and self.machine.retry_count >= 0:
                  retry_count = self.machine.retry_count + 1
              if (self.fail_count) > 0 and (self.fail_count % retry_count == 0):
                  last_error_message = self.get_last_error_message()
-                 err_msg = f"job:{self.job_hash} {self.job_id} failed {self.fail_count} times. job_detail:{self}"
+                 err_msg = (
+                     f"job:{self.job_hash} {self.job_id} failed {self.fail_count} times."
+                 )
                  if last_error_message is not None:
                      err_msg += f"\nPossible remote error message: {last_error_message}"
                  raise RuntimeError(err_msg)
@@ -1092,7 +1103,14 @@ class Resources:
      def load_from_json(cls, json_file):
          with open(json_file) as f:
              resources_dict = json.load(f)
-         resources = cls.deserialize(resources_dict=resources_dict)
+         resources = cls.load_from_dict(resources_dict=resources_dict)
+         return resources
+
+     @classmethod
+     def load_from_yaml(cls, yaml_file):
+         with open(yaml_file) as f:
+             resources_dict = yaml.safe_load(f)
+         resources = cls.load_from_dict(resources_dict=resources_dict)
          return resources

      @classmethod
@@ -1175,36 +1193,40 @@
          ),
          Argument("queue_name", str, optional=True, doc=doc_queue_name, default=""),
          Argument("group_size", int, optional=False, doc=doc_group_size),
-         Argument("custom_flags", list, optional=True, doc=doc_custom_flags),
+         Argument("custom_flags", List[str], optional=True, doc=doc_custom_flags),
          # Argument("strategy", dict, optional=True, doc=doc_strategy,default=default_strategy),
          strategy_format,
          Argument("para_deg", int, optional=True, doc=doc_para_deg, default=1),
          Argument(
-             "source_list", list, optional=True, doc=doc_source_list, default=[]
+             "source_list", List[str], optional=True, doc=doc_source_list, default=[]
          ),
          Argument(
              "module_purge", bool, optional=True, doc=doc_module_purge, default=False
          ),
          Argument(
              "module_unload_list",
-             list,
+             List[str],
              optional=True,
              doc=doc_module_unload_list,
              default=[],
          ),
          Argument(
-             "module_list", list, optional=True, doc=doc_module_list, default=[]
+             "module_list", List[str], optional=True, doc=doc_module_list, default=[]
          ),
          Argument("envs", dict, optional=True, doc=doc_envs, default={}),
          Argument(
              "prepend_script",
-             list,
+             List[str],
              optional=True,
              doc=doc_prepend_script,
              default=[],
          ),
          Argument(
-             "append_script", list, optional=True, doc=doc_append_script, default=[]
+             "append_script",
+             List[str],
+             optional=True,
+             doc=doc_append_script,
+             default=[],
          ),
          Argument(
              "wait_time", [int, float], optional=True, doc=doc_wait_time, default=0
dpdispatcher/utils/__init__.py ADDED
@@ -0,0 +1 @@
+ """Utils."""
dpdispatcher/{dpcloudserver → utils/dpcloudserver}/client.py RENAMED
@@ -6,7 +6,7 @@ from urllib.parse import urljoin

  import requests

- from dpdispatcher import dlog
+ from dpdispatcher.dlog import dlog

  from .config import API_HOST, API_LOGGER_STACK_INFO, HTTP_TIME_OUT
  from .retcode import RETCODE
dpdispatcher/{hdfs_cli.py → utils/hdfs_cli.py} RENAMED
@@ -2,7 +2,7 @@

  import os

- from dpdispatcher.utils import run_cmd_with_all_output
+ from dpdispatcher.utils.utils import run_cmd_with_all_output


  class HDFS:
dpdispatcher/utils/record.py ADDED
@@ -0,0 +1,79 @@
+ import json
+ from pathlib import Path
+ from typing import List
+
+
+ class Record:
+     """Record failed or canceled submissions."""
+
+     def __init__(self) -> None:
+         self.record_directory = Path.home() / ".dpdispatcher" / "submission"
+         self.record_directory.mkdir(parents=True, exist_ok=True)
+
+     def get_submissions(self) -> List[str]:
+         """Get all stored submission hashes.
+
+         Returns
+         -------
+         list[str]
+             List of submission hashes.
+         """
+         return [
+             f.stem
+             for f in self.record_directory.iterdir()
+             if (f.is_file() and f.suffix == ".json")
+         ]
+
+     def write(self, submission) -> Path:
+         """Write submission data to file.
+
+         Parameters
+         ----------
+         submission : dpdispatcher.Submission
+             Submission data.
+
+         Returns
+         -------
+         pathlib.Path
+             Path to submission data.
+         """
+         submission_path = self.record_directory / f"{submission.submission_hash}.json"
+         submission_path.write_text(json.dumps(submission.serialize(), indent=2))
+         return submission_path
+
+     def get_submission(self, hash: str, not_exist_ok: bool = False) -> Path:
+         """Get submission data by hash.
+
+         Parameters
+         ----------
+         hash : str
+             Hash of submission data.
+
+         Returns
+         -------
+         pathlib.Path
+             Path to submission data.
+         """
+         submission_file = self.record_directory / f"{hash}.json"
+         if not not_exist_ok and not submission_file.is_file():
+             raise FileNotFoundError(f"Submission file not found: {submission_file}")
+         return submission_file
+
+     def remove(self, hash: str):
+         """Remove submission data by hash.
+
+         Call this method when the remote directory is cleaned.
+
+         Parameters
+         ----------
+         hash : str
+             Hash of submission data.
+         """
+         path = self.get_submission(hash, not_exist_ok=True)
+         if path.is_file():
+             path.unlink()
+
+
+ # the record object can be globally used
+ record = Record()
+ __all__ = ["record"]
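
The module-level record instance backs both the exception paths in submission.py above and the dpdisp submission subcommand. Querying it directly is straightforward:

    from dpdispatcher.utils.record import record

    # Hashes of submissions persisted under ~/.dpdispatcher/submission/
    for submission_hash in record.get_submissions():
        print(submission_hash, record.get_submission(submission_hash))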
dpdispatcher/{utils.py → utils/utils.py} RENAMED
@@ -7,7 +7,7 @@ import subprocess
  import time
  from typing import TYPE_CHECKING, Callable, Optional, Type, Union

- from dpdispatcher import dlog
+ from dpdispatcher.dlog import dlog

  if TYPE_CHECKING:
      from dpdispatcher import Resources
{dpdispatcher-0.6.0.dist-info → dpdispatcher-0.6.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: dpdispatcher
- Version: 0.6.0
+ Version: 0.6.1
  Summary: Generate HPC scheduler systems jobs input scripts, submit these scripts to HPC systems, and poke until they finish
  Author: DeepModeling
  License: GNU LESSER GENERAL PUBLIC LICENSE
@@ -186,9 +186,10 @@ Requires-Python: >=3.7
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: paramiko
- Requires-Dist: dargs >=0.2.9
+ Requires-Dist: dargs >=0.4.1
  Requires-Dist: requests
  Requires-Dist: tqdm >=4.9.0
+ Requires-Dist: pyyaml
  Requires-Dist: typing-extensions ; python_version < "3.7"
  Provides-Extra: bohrium
  Requires-Dist: oss2 ; extra == 'bohrium'
dpdispatcher-0.6.1.dist-info/RECORD ADDED
@@ -0,0 +1,44 @@
+ dpdispatcher/__init__.py,sha256=CLZP_N5CTp14ujWCykEHuJjoIfKR6CwrclXhjWUgNoE,517
+ dpdispatcher/__main__.py,sha256=BFhG-mSBzVZUEezQJqXWZnt2WsnhAHT_zpT8Y6gpOz0,116
+ dpdispatcher/_version.py,sha256=lgiCYGSijhLK71WmuudWf_AyhNAutwQWx2V8bV6a5VQ,411
+ dpdispatcher/arginfo.py,sha256=pNaxYIE6ahBidpR7OCKZdw8iGt003uTXGSlVzwiuvRg,188
+ dpdispatcher/base_context.py,sha256=NvaC_RHyspxq412z-eCq4Zn8-szZxvn8K6OkXvx7l4Y,3615
+ dpdispatcher/dlog.py,sha256=ndh12teQBbJRybXd8UjEmAi6QTsAXajRicDj5mAH5h0,799
+ dpdispatcher/dpdisp.py,sha256=YuGb-HWLsDfSO2c7GH0eM20ciojGbx3yq9oZHP7u4yc,3498
+ dpdispatcher/machine.py,sha256=XFRH41gNCex_qs9gbg-S88_qab3_UAGfxKWUPxoipCM,16140
+ dpdispatcher/submission.py,sha256=mVAHBlT0a3_1PtsEvvhvwNPkAhgLiBXXemX64BcwizU,48447
+ dpdispatcher/contexts/__init__.py,sha256=s5M0ZJSrPttSyLdBwKD2m3W7a5AbYZdPB7IAND2j7EY,335
+ dpdispatcher/contexts/dp_cloud_server_context.py,sha256=6XK0B2sLGEDeZmV2SZzQdVrMcWAWYZVLLK-IaShEXIY,12245
+ dpdispatcher/contexts/hdfs_context.py,sha256=GJs_vmDCjTsnbfTdXpFTfpWTYXnZTDkEO2UJIdpV5F4,8908
+ dpdispatcher/contexts/lazy_local_context.py,sha256=F8abWAJRY1Ewx1sErINKN1ltWerXzeCcJgjTvLvucKE,5696
+ dpdispatcher/contexts/local_context.py,sha256=7CoGzcX-RU6cpmSYcf4wMwncYaFVUb8Ljj4ksfXcx4s,13678
+ dpdispatcher/contexts/openapi_context.py,sha256=DXaMS10SXN3VKEeEdzQyfOgRwUyHRJVCJHd2fKKdsmA,9499
+ dpdispatcher/contexts/ssh_context.py,sha256=1UbMIFpSGBcg3H1My4cx5vjAALvaxlZxWOcXwoX6Ff0,38597
+ dpdispatcher/entrypoints/__init__.py,sha256=exKSFT3j2oCerGwtI8WbHQK-D0K-CyifocRji1xntT4,20
+ dpdispatcher/entrypoints/gui.py,sha256=29lMXqbmSRbLj4rfBv7Jnw89NLU9syTB88IUP6IRJsU,830
+ dpdispatcher/entrypoints/submission.py,sha256=R2DXUGXTsk0Jw2y5Wjby40R5dxzXeqr4gn33ov6mdAI,2751
+ dpdispatcher/machines/__init__.py,sha256=9kSYkz2w3flp00IrHWTEwvoFGrathQAT3tvbieye83c,335
+ dpdispatcher/machines/distributed_shell.py,sha256=7avNcoOzEj7UcJuKl6b1ka2bj5dixcJaMlZK-I-i_Tc,7571
+ dpdispatcher/machines/dp_cloud_server.py,sha256=SR69gsFb2BvOQCW1QnWfP3cQvu_qHLJNsycp5wzosJU,11706
+ dpdispatcher/machines/fugaku.py,sha256=9OP3qSaaruqypHAdcuBFQM_MUtFp3yrvhZ5bPyLwEEk,4308
+ dpdispatcher/machines/lsf.py,sha256=Qruot39cPEpBNbbPmDwb1Gyfgyw3N36O0hs9PNEXyVU,7997
+ dpdispatcher/machines/openapi.py,sha256=Gzzbo8YOAybXGTrgMutexErcaEi3ts7uTUNvOhThFS8,8858
+ dpdispatcher/machines/pbs.py,sha256=HGBUf96AJ7hWOQPrENP5tFIDnEm4cb9deqJJ8wExbms,7079
+ dpdispatcher/machines/shell.py,sha256=qaia7mC_fz5Bqyelxmc1je-xg7NQ_6vQQ0qAjg2m4RQ,4796
+ dpdispatcher/machines/slurm.py,sha256=SP5rQiCPWzq4rqgUgp0IGJXXD_1DURWl4OBRAJ-Kng4,15611
+ dpdispatcher/utils/__init__.py,sha256=fwvwkMf7DFNQkNBiIce8Y8gRA6FhICwKjkKiXu_BEJg,13
+ dpdispatcher/utils/hdfs_cli.py,sha256=Fy36JTrfdhuxGbaHe1hYY0KrlNp06Tbjwo5wpj4ph-8,5434
+ dpdispatcher/utils/job_status.py,sha256=Eszs4TPLfszCuf6zLaFonf25feXDUguF28spYOjJpQE,233
+ dpdispatcher/utils/record.py,sha256=c8jdPmCuLzRmFo_jOjR0j9zFR1EWX3NSHVuPEIYCycg,2147
+ dpdispatcher/utils/utils.py,sha256=1One9eW-v3ejDcL6PB9PSCMZQkalnbxq0DfJoUwQaLs,5334
+ dpdispatcher/utils/dpcloudserver/__init__.py,sha256=FnX9HH-2dXADluNfucg98JPMfruMoBpN9ER9lZkVQvQ,49
+ dpdispatcher/utils/dpcloudserver/client.py,sha256=CLfXswvzI4inDrW2bYkfMQ6gQJFcZOgLhiXBz_EI17M,12029
+ dpdispatcher/utils/dpcloudserver/config.py,sha256=NteQzf1OeEkz2UbkXHHQ0B72cUu23zLVzpM9Yh4v1Cc,559
+ dpdispatcher/utils/dpcloudserver/retcode.py,sha256=1qAF8gFZx55u2sO8KbtYSIIrjcO-IGufEUlwbkSfC1g,721
+ dpdispatcher/utils/dpcloudserver/zip_file.py,sha256=f9WrlktwHW0YipaWg5Y0kxjMZlhD1cJYa6EUpvu4Cro,2611
+ dpdispatcher-0.6.1.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
+ dpdispatcher-0.6.1.dist-info/METADATA,sha256=TKf52k3Vt9neBJNWXCimoMSmEkvC5ubCjVqUCPOa8_8,12752
+ dpdispatcher-0.6.1.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
+ dpdispatcher-0.6.1.dist-info/entry_points.txt,sha256=NRHUV0IU_u7_XtcmmEDnVzAcUmurhiEAGwENckrajo4,233
+ dpdispatcher-0.6.1.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
+ dpdispatcher-0.6.1.dist-info/RECORD,,
{dpdispatcher-0.6.0.dist-info → dpdispatcher-0.6.1.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.41.2)
+ Generator: bdist_wheel (0.41.3)
  Root-Is-Purelib: true
  Tag: py3-none-any

dpdispatcher/dpcloudserver/temp_test.py DELETED
@@ -1,90 +0,0 @@
- # %%
- import os
- import sys
- import unittest
- import uuid
-
- from dpdispatcher.dpcloudserver import api
- from dpdispatcher.dpcloudserver.zip_file import zip_files
-
- # import api
- # from zip_file import zip_files
-
-
- # %%
- class DPTest(unittest.TestCase):
-     test_data = {
-         "job_type": "indicate",
-         "log_file": "mylog",
-         "command": "( echo aa && lmp -i input.lammps && sleep 900 ) > dp.log 2>&1",
-         "backward_files": [],
-         "job_name": "dpdispatcher_lammps_test",
-         "machine": {
-             "platform": "ali",
-             "resources": {
-                 "gpu_type": "1 * NVIDIA P100",
-                 "cpu_num": 4,
-                 "mem_limit": 28,
-                 "time_limit": "2:00:00",
-                 "image_name": "yfb-deepmd-kit-1.2.4-cuda10",
-             },
-         },
-         "job_resources": "http://dpcloudserver.oss-cn-shenzhen.aliyuncs.com/dpcloudserver/indicate/a657ff49722839f1ee54edeb3e9b1beb0ee5cc0e/a657ff49722839f1ee54edeb3e9b1beb0ee5cc0e.zip",
-     }
-
-     username = ""
-     password = ""
-
-     ENDPOINT = "http://oss-cn-shenzhen.aliyuncs.com"
-     BUCKET_NAME = "dpcloudserver"
-
-     @classmethod
-     def setUpClass(cls):
-         print("execute", sys._getframe().f_code.co_name)
-
-     @classmethod
-     def tearDownClass(cls):
-         print("execute", sys._getframe().f_code.co_name)
-
-     def setUp(self):
-         print("execute", sys._getframe().f_code.co_name)
-         api.login(self.username, self.password)
-
-     def test_commit_job(self):
-         print("----------", sys._getframe().f_code.co_name)
-         file_uuid = uuid.uuid1().hex
-         oss_task_zip = os.path.join(
-             "{}/{}/{}.zip".format("indicate", file_uuid, file_uuid)
-         )
-         zip_path = "/home/felix/workplace/22_dpdispatcher/dpdispatcher-yfb/dpdispatcher/dpcloudserver/t.txt"
-         zip_task_file = zip_path + ".zip"
-         zip_files(zip_path, zip_task_file, [])
-         api.upload(oss_task_zip, zip_task_file, self.ENDPOINT, self.BUCKET_NAME)
-         job_id = api.job_create(
-             self.test_data["job_type"], self.test_data["job_resources"], self.test_data
-         )
-         tasks = api.get_job_detail(job_id)
-         print(tasks)
-
-     def test_get_tasks(self):
-         print("----------", sys._getframe().f_code.co_name)
-         jobs = api.get_jobs()
-         for j in jobs:
-             tasks = api.get_job_detail(j["id"])
-             print(tasks)
-
-     # def test_download(self):
-     #     print('----------', sys._getframe().f_code.co_name)
-     #     oss_path = 'dpcloudserver/indicate/abe0febc92ce11eb990800163e094dc5/abe0febc92ce11eb990800163e094dc5.zip'
-     #     api.download(oss_path, "out.zip", self.ENDPOINT, self.BUCKET_NAME)
-
-
- if __name__ == "__main__":
-     suite = unittest.TestSuite()
-
-     suite.addTest(DPTest("test_commit_job"))
-     # suite.addTest(DPTest("test_get_tasks"))
-     # suite.addTest(DPTest("test_download"))
-
-     runner = unittest.TextTestRunner()
-     runner.run(suite)
dpdispatcher-0.6.0.dist-info/RECORD DELETED
@@ -1,37 +0,0 @@
- dpdispatcher/JobStatus.py,sha256=Eszs4TPLfszCuf6zLaFonf25feXDUguF28spYOjJpQE,233
- dpdispatcher/__init__.py,sha256=i33piTZkPDYIm_qiTEI_J3bmHlFayEN1T8T8TyzYdqg,3017
- dpdispatcher/_version.py,sha256=2JKwcA-YQ0okV2N-gwTWy_n51igWrPcsKQFm0cnqsvw,411
- dpdispatcher/arginfo.py,sha256=pNaxYIE6ahBidpR7OCKZdw8iGt003uTXGSlVzwiuvRg,188
- dpdispatcher/base_context.py,sha256=Hfri0x41XC4MRUjxc0-WMiZB_E4NvLp94ZYaHfYCWHM,3610
- dpdispatcher/distributed_shell.py,sha256=ZLz-1GKrjHjeW6_GXKfFufFbMaFuy3RpJz0hoWRRZP0,7553
- dpdispatcher/dp_cloud_server.py,sha256=I0yHJtovINmro0LPjWeHnbGahIpS3OkB4ooq4IgjELo,11676
- dpdispatcher/dp_cloud_server_context.py,sha256=S81jLpwdur9Z4TptkmLs_7vuleLVeRXyeD1nvUgDodY,12203
- dpdispatcher/dpdisp.py,sha256=M5yaAmpyQJ6q_kcclWGu7ixXm5qYN9XbXpqzrboVGwI,2138
- dpdispatcher/fugaku.py,sha256=_F-dRI8JniTLgldpfwj5lsX10YR6BfH-x95OPSPF4G4,4290
- dpdispatcher/gui.py,sha256=29lMXqbmSRbLj4rfBv7Jnw89NLU9syTB88IUP6IRJsU,830
- dpdispatcher/hdfs_cli.py,sha256=SkCA-dgZbcS2a6Z96XzUB3-T3Barkak_5S7wP7WF55o,5428
- dpdispatcher/hdfs_context.py,sha256=mO12yMoASGSh8erumVxMOPnHGdhOZuyaPyir9hfY0ZY,8897
- dpdispatcher/lazy_local_context.py,sha256=F8abWAJRY1Ewx1sErINKN1ltWerXzeCcJgjTvLvucKE,5696
- dpdispatcher/local_context.py,sha256=yCmEDsxTUmDGsH9iPGamk7KOME2Yb7jI2reyDDpAsFI,13948
- dpdispatcher/lsf.py,sha256=fLJ1FaaJpSrdYxs-h2RGkOkIMPOnT_owNMwlmMbK_0g,7962
- dpdispatcher/machine.py,sha256=oUptUUhWjVfRe82WGGu0WEMovlIB5YqCgxQOAdx6wbw,15898
- dpdispatcher/openapi.py,sha256=RGBDQ2KOICm0rDJOtf9MydGQ6VhrsecGfPGGUCVlURo,8815
- dpdispatcher/openapi_context.py,sha256=IboZ5Y7pwKfrxe9tot1JroxQuo85tbVsmlgkWeKbJAs,9487
- dpdispatcher/pbs.py,sha256=JitABpPC84m14Gu5TJ6eyYXtFTN1Yc5WbMKWsAjy8OI,7061
- dpdispatcher/shell.py,sha256=_iKGxLcjEZViqjLNfXXMx_DywSnRHCdGjzg7176kND0,4778
- dpdispatcher/slurm.py,sha256=wqcFrP0oxXneZuLjMb3k4JYatfxlFo4dKGIHAqysRyc,15302
- dpdispatcher/ssh_context.py,sha256=r2Xn-yFGCtTTia-NpILgqiSeK_SecJRGGNdxwy1q1_k,37871
- dpdispatcher/submission.py,sha256=N9tEC6qMQBYoY4t3q7GZNaz6SvoWo6EyN1aRJOGT8yY,47588
- dpdispatcher/utils.py,sha256=9gYNpTkWoNXCcDakizlWFO5S5p_1eBHUglie_ZjeJoc,5329
- dpdispatcher/dpcloudserver/__init__.py,sha256=FnX9HH-2dXADluNfucg98JPMfruMoBpN9ER9lZkVQvQ,49
- dpdispatcher/dpcloudserver/client.py,sha256=52d_ftHk_8kpC3TNcGjXkWXLNQ0U8yYZ2l11L0DzxVg,12024
- dpdispatcher/dpcloudserver/config.py,sha256=NteQzf1OeEkz2UbkXHHQ0B72cUu23zLVzpM9Yh4v1Cc,559
- dpdispatcher/dpcloudserver/retcode.py,sha256=1qAF8gFZx55u2sO8KbtYSIIrjcO-IGufEUlwbkSfC1g,721
- dpdispatcher/dpcloudserver/temp_test.py,sha256=jklOSu7tZ_wW5gycGRiUsbBWMLZDqCBslSYOCb2hTHw,2932
- dpdispatcher/dpcloudserver/zip_file.py,sha256=f9WrlktwHW0YipaWg5Y0kxjMZlhD1cJYa6EUpvu4Cro,2611
- dpdispatcher-0.6.0.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
- dpdispatcher-0.6.0.dist-info/METADATA,sha256=AmajK6Ht5EY_RD_cMwnK0Gnnd2zECCo0E_xS_r_ysHg,12730
- dpdispatcher-0.6.0.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
- dpdispatcher-0.6.0.dist-info/entry_points.txt,sha256=NRHUV0IU_u7_XtcmmEDnVzAcUmurhiEAGwENckrajo4,233
- dpdispatcher-0.6.0.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
- dpdispatcher-0.6.0.dist-info/RECORD,,