dpdispatcher 0.6.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. dpdispatcher/_version.py +22 -4
  2. dpdispatcher/base_context.py +60 -1
  3. dpdispatcher/contexts/__init__.py +1 -0
  4. dpdispatcher/contexts/dp_cloud_server_context.py +8 -1
  5. dpdispatcher/contexts/hdfs_context.py +16 -11
  6. dpdispatcher/contexts/lazy_local_context.py +2 -19
  7. dpdispatcher/contexts/local_context.py +77 -43
  8. dpdispatcher/contexts/openapi_context.py +78 -14
  9. dpdispatcher/contexts/ssh_context.py +117 -98
  10. dpdispatcher/dlog.py +9 -5
  11. dpdispatcher/dpcloudserver/__init__.py +0 -0
  12. dpdispatcher/dpcloudserver/client.py +7 -0
  13. dpdispatcher/dpdisp.py +21 -0
  14. dpdispatcher/entrypoints/run.py +9 -0
  15. dpdispatcher/entrypoints/submission.py +21 -1
  16. dpdispatcher/machine.py +15 -4
  17. dpdispatcher/machines/JH_UniScheduler.py +171 -0
  18. dpdispatcher/machines/__init__.py +1 -0
  19. dpdispatcher/machines/distributed_shell.py +6 -10
  20. dpdispatcher/machines/fugaku.py +9 -12
  21. dpdispatcher/machines/lsf.py +3 -9
  22. dpdispatcher/machines/openapi.py +48 -15
  23. dpdispatcher/machines/pbs.py +183 -20
  24. dpdispatcher/machines/shell.py +7 -16
  25. dpdispatcher/machines/slurm.py +30 -42
  26. dpdispatcher/run.py +172 -0
  27. dpdispatcher/submission.py +5 -14
  28. dpdispatcher/utils/dpcloudserver/client.py +10 -6
  29. dpdispatcher/utils/hdfs_cli.py +10 -19
  30. dpdispatcher/utils/utils.py +21 -7
  31. {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/METADATA +35 -29
  32. dpdispatcher-1.0.0.dist-info/RECORD +49 -0
  33. {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/WHEEL +1 -1
  34. dpdispatcher-0.6.1.dist-info/RECORD +0 -44
  35. {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/entry_points.txt +0 -0
  36. {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info/licenses}/LICENSE +0 -0
  37. {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/top_level.txt +0 -0
dpdispatcher/machines/JH_UniScheduler.py (new file)
@@ -0,0 +1,171 @@
+ import shlex
+ from typing import List
+
+ from dargs import Argument
+
+ from dpdispatcher.dlog import dlog
+ from dpdispatcher.machine import Machine
+ from dpdispatcher.utils.job_status import JobStatus
+ from dpdispatcher.utils.utils import (
+     RetrySignal,
+     customized_script_header_template,
+     retry,
+ )
+
+ JH_UniScheduler_script_header_template = """\
+ #!/bin/bash -l
+ #JSUB -e %J.err
+ #JSUB -o %J.out
+ {JH_UniScheduler_nodes_line}
+ {JH_UniScheduler_ptile_line}
+ {JH_UniScheduler_partition_line}
+ {JH_UniScheduler_number_gpu_line}"""
+
+
+ class JH_UniScheduler(Machine):
+     """JH_UniScheduler batch."""
+
+     def gen_script(self, job):
+         JH_UniScheduler_script = super().gen_script(job)
+         return JH_UniScheduler_script
+
+     def gen_script_header(self, job):
+         resources = job.resources
+         script_header_dict = {
+             "JH_UniScheduler_nodes_line": f"#JSUB -n {resources.number_node * resources.cpu_per_node}",
+             "JH_UniScheduler_ptile_line": f"#JSUB -R 'span[ptile={resources.cpu_per_node}]'",
+             "JH_UniScheduler_partition_line": f"#JSUB -q {resources.queue_name}",
+         }
+         custom_gpu_line = resources.kwargs.get("custom_gpu_line", None)
+         if not custom_gpu_line:
+             script_header_dict["JH_UniScheduler_number_gpu_line"] = (
+                 f"#JSUB -gpgpu {resources.gpu_per_node}"
+             )
+         else:
+             script_header_dict["JH_UniScheduler_number_gpu_line"] = custom_gpu_line
+         if (
+             resources["strategy"].get("customized_script_header_template_file")
+             is not None
+         ):
+             JH_UniScheduler_script_header = customized_script_header_template(
+                 resources["strategy"]["customized_script_header_template_file"],
+                 resources,
+             )
+         else:
+             JH_UniScheduler_script_header = (
+                 JH_UniScheduler_script_header_template.format(**script_header_dict)
+             )
+
+         return JH_UniScheduler_script_header
+
+     @retry()
+     def do_submit(self, job):
+         script_file_name = job.script_file_name
+         script_str = self.gen_script(job)
+         job_id_name = job.job_hash + "_job_id"
+         self.context.write_file(fname=script_file_name, write_str=script_str)
+         script_run_str = self.gen_script_command(job)
+         script_run_file_name = f"{job.script_file_name}.run"
+         self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
+
+         try:
+             stdin, stdout, stderr = self.context.block_checkcall(
+                 "cd {} && {} {}".format(
+                     shlex.quote(self.context.remote_root),
+                     "jsub < ",
+                     shlex.quote(script_file_name),
+                 )
+             )
+         except RuntimeError as err:
+             raise RetrySignal(err) from err
+
+         subret = stdout.readlines()
+         job_id = subret[0].split()[1][1:-1]
+         self.context.write_file(job_id_name, job_id)
+         return job_id
+
+     @retry()
+     def check_status(self, job):
+         try:
+             job_id = job.job_id
+         except AttributeError:
+             return JobStatus.terminated
+         if job_id == "":
+             return JobStatus.unsubmitted
+         ret, stdin, stdout, stderr = self.context.block_call("jjobs " + job_id)
+         err_str = stderr.read().decode("utf-8")
+         if (f"Job <{job_id}> is not found") in err_str:
+             if self.check_finish_tag(job):
+                 return JobStatus.finished
+             else:
+                 return JobStatus.terminated
+         elif ret != 0:
+             # just retry when any unknown error raised.
+             raise RetrySignal(
+                 f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
+             )
+         status_out = stdout.read().decode("utf-8").split("\n")
+         if len(status_out) < 2:
+             return JobStatus.unknown
+         else:
+             status_line = status_out[1]
+             status_word = status_line.split()[2]
+
+             if status_word in ["PEND"]:
+                 return JobStatus.waiting
+             elif status_word in ["RUN", "PSUSP", "SSUSP", "USUSP"]:
+                 return JobStatus.running
+             elif status_word in ["DONE", "EXIT"]:
+                 if self.check_finish_tag(job):
+                     dlog.info(f"job: {job.job_hash} {job.job_id} finished")
+                     return JobStatus.finished
+                 else:
+                     return JobStatus.terminated
+             else:
+                 return JobStatus.unknown
+
+     def check_finish_tag(self, job):
+         job_tag_finished = job.job_hash + "_job_tag_finished"
+         return self.context.check_file_exists(job_tag_finished)
+
+     @classmethod
+     def resources_subfields(cls) -> List[Argument]:
+         """Generate the resources subfields.
+
+         Returns
+         -------
+         list[Argument]
+             resources subfields
+         """
+         doc_custom_gpu_line = "Custom GPU configuration, starting with #JSUB"
+
+         return [
+             Argument(
+                 "kwargs",
+                 dict,
+                 [
+                     Argument(
+                         "custom_gpu_line",
+                         str,
+                         optional=True,
+                         default=None,
+                         doc=doc_custom_gpu_line,
+                     ),
+                 ],
+                 optional=False,
+                 doc="Extra arguments.",
+             )
+         ]
+
+     def kill(self, job):
+         """Kill the job.
+
+         Parameters
+         ----------
+         job : Job
+             job
+         """
+         job_id = job.job_id
+         ret, stdin, stdout, stderr = self.context.block_call(
+             "jctrl kill " + str(job_id)
+         )
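The new class follows the standard dpdispatcher flow: `gen_script_header` renders the resources fields into `#JSUB` directives, `do_submit` pipes the script to `jsub`, and `check_status` polls `jjobs` until the `<job_hash>_job_tag_finished` marker appears. A minimal sketch of selecting this batch type is shown below; it is not taken from the diff, the host, paths, queue name, and command are placeholders, and it assumes the usual convention that `batch_type` matches the Machine subclass name.

```python
# Sketch only: submit one placeholder task through the JH_UniScheduler batch type.
from dpdispatcher import Machine, Resources, Submission, Task

machine = Machine.load_from_dict(
    {
        "batch_type": "JH_UniScheduler",
        "context_type": "SSHContext",  # or LocalContext when running on the cluster itself
        "local_root": "./work",
        "remote_root": "/home/user/dpdispatcher_work",  # placeholder path
        "remote_profile": {"hostname": "login-node", "username": "user"},  # placeholders
    }
)
resources = Resources.load_from_dict(
    {
        "number_node": 1,
        "cpu_per_node": 8,
        "gpu_per_node": 1,
        "queue_name": "gpu_queue",  # placeholder; rendered as "#JSUB -q gpu_queue"
        "group_size": 1,
        # optional: replaces the generated "#JSUB -gpgpu ..." line, as declared
        # in JH_UniScheduler.resources_subfields()
        "kwargs": {"custom_gpu_line": "#JSUB -gpgpu 1"},
    }
)
task = Task(command="echo hello", task_work_path="./", forward_files=[], backward_files=[])
submission = Submission(
    work_base="./", machine=machine, resources=resources, task_list=[task]
)
submission.run_submission()
```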
dpdispatcher/machines/__init__.py
@@ -1,4 +1,5 @@
  """Machines."""
+
  import importlib
  from pathlib import Path

dpdispatcher/machines/distributed_shell.py
@@ -64,7 +64,7 @@ class DistributedShell(Machine):

          source_list = job.resources.source_list
          for ii in source_list:
-             line = "{ source %s; } \n" % ii
+             line = f"{{ source {ii}; }} \n"
              source_files_part += line

          export_envs_part = ""
@@ -96,7 +96,7 @@ class DistributedShell(Machine):
      def gen_script_end(self, job):
          all_task_dirs = ""
          for task in job.job_task_list:
-             all_task_dirs += "%s " % task.task_work_path
+             all_task_dirs += f"{task.task_work_path} "
          job_tag_finished = job.job_hash + "_job_tag_finished"
          flag_if_job_task_fail = job.job_hash + "_flag_if_job_task_fail"

@@ -173,18 +173,15 @@ class DistributedShell(Machine):
              )
          )

-         cmd = "{{ nohup {} 1>{} 2>{} & }} && echo $!".format(
-             submit_command,
-             output_name,
-             output_name,
+         cmd = (
+             f"{{ nohup {submit_command} 1>{output_name} 2>{output_name} & }} && echo $!"
          )
          ret, stdout, stderr = run_cmd_with_all_output(cmd)

          if ret != 0:
              err_str = stderr.decode("utf-8")
              raise RuntimeError(
-                 "Command squeue fails to execute, error message:%s\nreturn code %d\n"
-                 % (err_str, ret)
+                 f"Command {cmd} fails to execute, error message:{err_str}\nreturn code {ret}\n"
              )
          job_id = int(stdout.decode("utf-8").strip())

@@ -202,8 +199,7 @@ class DistributedShell(Machine):
          if ret != 0:
              err_str = stderr.decode("utf-8")
              raise RuntimeError(
-                 "Command fails to execute, error message:%s\nreturn code %d\n"
-                 % (err_str, ret)
+                 f"Command fails to execute, error message:{err_str}\nreturn code {ret}\n"
              )

          if_job_exists = bool(stdout.decode("utf-8").strip())
dpdispatcher/machines/fugaku.py
@@ -20,15 +20,15 @@ class Fugaku(Machine):
      def gen_script_header(self, job):
          resources = job.resources
          fugaku_script_header_dict = {}
-         fugaku_script_header_dict[
-             "fugaku_node_number_line"
-         ] = f'#PJM -L "node={resources.number_node}" '
-         fugaku_script_header_dict[
-             "fugaku_ntasks_per_node_line"
-         ] = f'#PJM --mpi "max-proc-per-node={resources.cpu_per_node}"'
-         fugaku_script_header_dict[
-             "queue_name_line"
-         ] = f'#PJM -L "rscgrp={resources.queue_name}"'
+         fugaku_script_header_dict["fugaku_node_number_line"] = (
+             f'#PJM -L "node={resources.number_node}" '
+         )
+         fugaku_script_header_dict["fugaku_ntasks_per_node_line"] = (
+             f'#PJM --mpi "max-proc-per-node={resources.cpu_per_node}"'
+         )
+         fugaku_script_header_dict["queue_name_line"] = (
+             f'#PJM -L "rscgrp={resources.queue_name}"'
+         )
          if (
              resources["strategy"].get("customized_script_header_template_file")
              is not None
@@ -67,9 +67,6 @@ class Fugaku(Machine):
          self.context.write_file(job_id_name, job_id)
          return job_id

-     def default_resources(self, resources):
-         pass
-
      def check_status(self, job):
          job_id = job.job_id
          if job_id == "":
dpdispatcher/machines/lsf.py
@@ -32,9 +32,7 @@ class LSF(Machine):
      def gen_script_header(self, job):
          resources = job.resources
          script_header_dict = {
-             "lsf_nodes_line": "#BSUB -n {number_cores}".format(
-                 number_cores=resources.number_node * resources.cpu_per_node
-             ),
+             "lsf_nodes_line": f"#BSUB -n {resources.number_node * resources.cpu_per_node}",
              "lsf_ptile_line": f"#BSUB -R 'span[ptile={resources.cpu_per_node}]'",
              "lsf_partition_line": f"#BSUB -q {resources.queue_name}",
          }
@@ -104,9 +102,6 @@ class LSF(Machine):
          return job_id

      # TODO: derive abstract methods
-     def default_resources(self, resources):
-         pass
-
      def sub_script_cmd(self, res):
          pass

@@ -123,7 +118,7 @@ class LSF(Machine):
              return JobStatus.unsubmitted
          ret, stdin, stdout, stderr = self.context.block_call("bjobs " + job_id)
          err_str = stderr.read().decode("utf-8")
-         if ("Job <%s> is not found" % job_id) in err_str:
+         if (f"Job <{job_id}> is not found") in err_str:
              if self.check_finish_tag(job):
                  return JobStatus.finished
              else:
@@ -131,8 +126,7 @@ class LSF(Machine):
          elif ret != 0:
              # just retry when any unknown error raised.
              raise RetrySignal(
-                 "Get error code %d in checking status through ssh with job: %s . message: %s"
-                 % (ret, job.job_hash, err_str)
+                 f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
              )
          status_out = stdout.read().decode("utf-8").split("\n")
          if len(status_out) < 2:
dpdispatcher/machines/openapi.py
@@ -1,14 +1,13 @@
  import os
  import shutil
  import time
+ from zipfile import ZipFile

  from dpdispatcher.utils.utils import customized_script_header_template

  try:
-     from bohriumsdk.client import Client
-     from bohriumsdk.job import Job
-     from bohriumsdk.storage import Storage
-     from bohriumsdk.util import Util
+     from bohrium import Bohrium
+     from bohrium.resources import Job, Tiefblue
  except ModuleNotFoundError:
      found_bohriumsdk = False
  else:
@@ -23,6 +22,12 @@ shell_script_header_template = """
  """


+ def unzip_file(zip_file, out_dir="./"):
+     obj = ZipFile(zip_file, "r")
+     for item in obj.namelist():
+         obj.extract(item, out_dir)
+
+
  class OpenAPI(Machine):
      def __init__(self, context):
          if not found_bohriumsdk:
@@ -35,9 +40,35 @@ class OpenAPI(Machine):
          self.grouped = self.remote_profile.get("grouped", True)
          self.retry_count = self.remote_profile.get("retry_count", 3)
          self.ignore_exit_code = context.remote_profile.get("ignore_exit_code", True)
-         self.client = Client()
-         self.job = Job(client=self.client)
-         self.storage = Storage(client=self.client)
+
+         access_key = (
+             self.remote_profile.get("access_key", None)
+             or os.getenv("BOHRIUM_ACCESS_KEY", None)
+             or os.getenv("ACCESS_KEY", None)
+         )
+         project_id = (
+             self.remote_profile.get("project_id", None)
+             or os.getenv("BOHRIUM_PROJECT_ID", None)
+             or os.getenv("PROJECT_ID", None)
+         )
+         app_key = (
+             self.remote_profile.get("app_key", None)
+             or os.getenv("BOHRIUM_APP_KEY", None)
+             or os.getenv("APP_KEY", None)
+         )
+         if access_key is None:
+             raise ValueError(
+                 "remote_profile must contain 'access_key' or set environment variable 'BOHRIUM_ACCESS_KEY'"
+             )
+         if project_id is None:
+             raise ValueError(
+                 "remote_profile must contain 'project_id' or set environment variable 'BOHRIUM_PROJECT_ID'"
+             )
+         self.client = Bohrium(  # type: ignore[reportPossiblyUnboundVariable]
+             access_key=access_key, project_id=project_id, app_key=app_key
+         )
+         self.storage = Tiefblue()  # type: ignore[reportPossiblyUnboundVariable]
+         self.job = Job(client=self.client)  # type: ignore[reportPossiblyUnboundVariable]
          self.group_id = None

      def gen_script(self, job):
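With the switch from `bohriumsdk` to the `bohrium` SDK, `OpenAPI.__init__` now resolves credentials itself: a key set in `remote_profile` wins, then the `BOHRIUM_*` environment variable, then the legacy unprefixed one, and a missing `access_key` or `project_id` raises `ValueError` at machine construction. The sketch below shows two ways a caller might supply them; all key values are placeholders, and the `OpenAPIContext` context-type string is assumed from the package layout (dpdispatcher/contexts/openapi_context.py), not stated in this diff.

```python
# Sketch only: two ways to hand Bohrium credentials to the OpenAPI machine.
# Every value here is a placeholder.
import os

# (a) environment variables, picked up as a fallback by OpenAPI.__init__
os.environ["BOHRIUM_ACCESS_KEY"] = "<access-key>"
os.environ["BOHRIUM_PROJECT_ID"] = "<project-id>"

# (b) or explicit keys in the machine configuration's remote_profile,
#     which take precedence over the environment
machine_dict = {
    "batch_type": "OpenAPI",
    "context_type": "OpenAPIContext",  # assumed name for the matching context class
    "local_root": "./work",
    "remote_profile": {
        "access_key": "<access-key>",
        "project_id": "<project-id>",
        "image_address": "registry/image:tag",  # forwarded to job.insert as image_name
        "platform": "ali",
    },
    # other machine fields omitted in this sketch
}
```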
@@ -98,11 +129,13 @@ class OpenAPI(Machine):
              ),
              "out_files": self._gen_backward_files_list(job),
              "platform": self.remote_profile.get("platform", "ali"),
-             "image_address": self.remote_profile.get("image_address", ""),
+             "image_name": self.remote_profile.get("image_address", ""),
          }
-         if job.job_state == JobStatus.unsubmitted:
-             openapi_params["job_id"] = job.job_id
-
+         if "real_user_id" in self.remote_profile:
+             openapi_params["real_user_id"] = self.remote_profile["real_user_id"]
+         if "session_id" in self.remote_profile:
+             openapi_params["session_id"] = self.remote_profile["session_id"]
+         openapi_params["job_id"] = job.job_id
          data = self.job.insert(**openapi_params)

          job.job_id = data.get("jobId", 0)  # type: ignore
@@ -152,8 +185,8 @@ class OpenAPI(Machine):
              self.ignore_exit_code,
          )
          if job_state == JobStatus.finished:
-             job_log = self.job.log(job_id)
              if self.remote_profile.get("output_log"):
+                 job_log = self.job.log(job_id)
                  print(job_log, end="")
              self._download_job(job)
          elif self.remote_profile.get("output_log") and job_state == JobStatus.running:
@@ -163,14 +196,14 @@ class OpenAPI(Machine):

      def _download_job(self, job):
          data = self.job.detail(job.job_id)
-         job_url = data["jobFiles"]["outFiles"][0]["url"]  # type: ignore
+         job_url = data["resultUrl"]  # type: ignore
          if not job_url:
              return
          job_hash = job.job_hash
          result_filename = job_hash + "_back.zip"
          target_result_zip = os.path.join(self.context.local_root, result_filename)
          self.storage.download_from_url(job_url, target_result_zip)
-         Util.unzip_file(target_result_zip, out_dir=self.context.local_root)
+         unzip_file(target_result_zip, out_dir=self.context.local_root)
          try:
              os.makedirs(os.path.join(self.context.local_root, "backup"), exist_ok=True)
              shutil.move(
@@ -213,7 +246,7 @@ class OpenAPI(Machine):
          if status not in map_dict:
              dlog.error(f"unknown job status {status}")
              return JobStatus.unknown
-         if status == -1 and exit_code != 0 and ignore_exit_code:
+         if status == -1 and ignore_exit_code:
              return JobStatus.finished
          return map_dict[status]

dpdispatcher/machines/pbs.py
@@ -1,4 +1,7 @@
  import shlex
+ from typing import List
+
+ from dargs import Argument

  from dpdispatcher.dlog import dlog
  from dpdispatcher.machine import Machine
@@ -21,13 +24,13 @@ class PBS(Machine):
      def gen_script_header(self, job):
          resources = job.resources
          pbs_script_header_dict = {}
-         pbs_script_header_dict[
-             "select_node_line"
-         ] = f"#PBS -l select={resources.number_node}:ncpus={resources.cpu_per_node}"
+         pbs_script_header_dict["select_node_line"] = (
+             f"#PBS -l select={resources.number_node}:ncpus={resources.cpu_per_node}"
+         )
          if resources.gpu_per_node != 0:
-             pbs_script_header_dict[
-                 "select_node_line"
-             ] += f":ngpus={resources.gpu_per_node}"
+             pbs_script_header_dict["select_node_line"] += (
+                 f":ngpus={resources.gpu_per_node}"
+             )
          pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}"
          if (
              resources["strategy"].get("customized_script_header_template_file")
@@ -66,14 +69,12 @@ class PBS(Machine):
          self.context.write_file(job_id_name, job_id)
          return job_id

-     def default_resources(self, resources):
-         pass
-
      def check_status(self, job):
          job_id = job.job_id
          if job_id == "":
              return JobStatus.unsubmitted
-         ret, stdin, stdout, stderr = self.context.block_call("qstat -x " + job_id)
+         command = "qstat -x " + job_id
+         ret, stdin, stdout, stderr = self.context.block_call(command)
          err_str = stderr.read().decode("utf-8")
          if ret != 0:
              if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
@@ -83,8 +84,7 @@ class PBS(Machine):
                      return JobStatus.terminated
              else:
                  raise RuntimeError(
-                     "status command qstat fails to execute. erro info: %s return code %d"
-                     % (err_str, ret)
+                     f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
                  )
          status_line = stdout.read().decode("utf-8").split("\n")[-2]
          status_word = status_line.split()[-2]
@@ -123,7 +123,8 @@ class Torque(PBS):
          job_id = job.job_id
          if job_id == "":
              return JobStatus.unsubmitted
-         ret, stdin, stdout, stderr = self.context.block_call("qstat -l " + job_id)
+         command = "qstat -l " + job_id
+         ret, stdin, stdout, stderr = self.context.block_call(command)
          err_str = stderr.read().decode("utf-8")
          if ret != 0:
              if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
@@ -133,8 +134,7 @@ class Torque(PBS):
                      return JobStatus.terminated
              else:
                  raise RuntimeError(
-                     "status command qstat fails to execute. erro info: %s return code %d"
-                     % (err_str, ret)
+                     f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
                  )
          status_line = stdout.read().decode("utf-8").split("\n")[-2]
          status_word = status_line.split()[-2]
@@ -156,12 +156,12 @@ class Torque(PBS):
          # ref: https://support.adaptivecomputing.com/wp-content/uploads/2021/02/torque/torque.htm#topics/torque/2-jobs/requestingRes.htm
          resources = job.resources
          pbs_script_header_dict = {}
-         pbs_script_header_dict[
-             "select_node_line"
-         ] = f"#PBS -l nodes={resources.number_node}:ppn={resources.cpu_per_node}"
+         pbs_script_header_dict["select_node_line"] = (
+             f"#PBS -l nodes={resources.number_node}:ppn={resources.cpu_per_node}"
+         )
          if resources.gpu_per_node != 0:
-             pbs_script_header_dict["select_node_line"] += ":gpus={gpu_per_node}".format(
-                 gpu_per_node=resources.gpu_per_node
+             pbs_script_header_dict["select_node_line"] += (
+                 f":gpus={resources.gpu_per_node}"
              )
          pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}"
          if (
@@ -177,3 +177,166 @@ class Torque(PBS):
              **pbs_script_header_dict
          )
          return pbs_script_header
+
+
+ sge_script_header_template = """
+ #!/bin/bash
+ #$ -S /bin/bash
+ #$ -cwd
+ {select_node_line}
+ """
+
+
+ class SGE(PBS):
+     def __init__(
+         self,
+         batch_type=None,
+         context_type=None,
+         local_root=None,
+         remote_root=None,
+         remote_profile={},
+         *,
+         context=None,
+     ):
+         super(PBS, self).__init__(
+             batch_type,
+             context_type,
+             local_root,
+             remote_root,
+             remote_profile,
+             context=context,
+         )
+
+     def gen_script_header(self, job):
+         ### Ref:https://softpanorama.org/HPC/PBS_and_derivatives/Reference/pbs_command_vs_sge_commands.shtml
+         # resources.number_node is not used in SGE
+         resources = job.resources
+         job_name = resources.kwargs.get("job_name", "wDPjob")
+         pe_name = resources.kwargs.get("pe_name", "mpi")
+         sge_script_header_dict = {}
+         sge_script_header_dict["select_node_line"] = f"#$ -N {job_name}\n"
+         sge_script_header_dict["select_node_line"] += (
+             f"#$ -pe {pe_name} {resources.cpu_per_node}\n"
+         )
+
+         if resources.queue_name != "":
+             sge_script_header_dict["select_node_line"] += (
+                 f"#$ -q {resources.queue_name}"
+             )
+         if (
+             resources["strategy"].get("customized_script_header_template_file")
+             is not None
+         ):
+             file_name = resources["strategy"]["customized_script_header_template_file"]
+             sge_script_header = customized_script_header_template(file_name, resources)
+         else:
+             sge_script_header = sge_script_header_template.format(
+                 **sge_script_header_dict
+             )
+         return sge_script_header
+
+     def do_submit(self, job):
+         script_file_name = job.script_file_name
+         script_str = self.gen_script(job)
+         job_id_name = job.job_hash + "_job_id"
+         self.context.write_file(fname=script_file_name, write_str=script_str)
+         script_run_str = self.gen_script_command(job)
+         script_run_file_name = f"{job.script_file_name}.run"
+         self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
+         script_file_dir = self.context.remote_root
+         stdin, stdout, stderr = self.context.block_checkcall(
+             "cd {} && {} {}".format(script_file_dir, "qsub", script_file_name)
+         )
+         subret = stdout.readlines()
+         job_id = subret[0].split()[2]
+         self.context.write_file(job_id_name, job_id)
+         return job_id

+     def check_status(self, job):
+         ### https://softpanorama.org/HPC/Grid_engine/Queues/queue_states.shtml
+         job_id = job.job_id
+         status_line = None
+         if job_id == "":
+             return JobStatus.unsubmitted
+         command = "qstat"
+         ret, stdin, stdout, stderr = self.context.block_call(command)
+         err_str = stderr.read().decode("utf-8")
+         if ret != 0:
+             raise RuntimeError(
+                 f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
+             )
+         status_text_list = stdout.read().decode("utf-8").split("\n")
+         for txt in status_text_list:
+             if job_id in txt:
+                 status_line = txt
+
+         if status_line is None:
+             count = 0
+             while count <= 6:
+                 if self.check_finish_tag(job=job):
+                     return JobStatus.finished
+                 dlog.info(
+                     f"not tag_finished detected, execute sync command and wait. count {count}"
+                 )
+                 self.context.block_call("sync")
+                 import time
+
+                 time.sleep(10)
+                 count += 1
+             return JobStatus.terminated
+         else:
+             status_word = status_line.split()[4]
+             # dlog.info (status_word)
+             if status_word in ["qw", "hqw", "t"]:
+                 return JobStatus.waiting
+             elif status_word in ["r", "Rr"]:
+                 return JobStatus.running
+             elif status_word in ["Eqw", "dr", "dt"]:
+                 return JobStatus.terminated
+             else:
+                 return JobStatus.unknown
+
+     def check_finish_tag(self, job):
+         job_tag_finished = job.job_hash + "_job_tag_finished"
+         return self.context.check_file_exists(job_tag_finished)
+
+     @classmethod
+     def resources_subfields(cls) -> List[Argument]:
+         """Generate the resources subfields.
+
+         pe_name : str
+             The parallel environment name of SGE.
+
+         Returns
+         -------
+         list[Argument]
+             resources subfields
+         """
+         doc_pe_name = "The parallel environment name of SGE system."
+         doc_job_name = "The name of SGE's job."
+
+         return [
+             Argument(
+                 "kwargs",
+                 dict,
+                 [
+                     Argument(
+                         "pe_name",
+                         str,
+                         optional=True,
+                         default="mpi",
+                         doc=doc_pe_name,
+                         alias=["sge_pe_name"],
+                     ),
+                     Argument(
+                         "job_name",
+                         str,
+                         optional=True,
+                         default="wDPjob",
+                         doc=doc_job_name,
+                     ),
+                 ],
+                 optional=False,
+                 doc="Extra arguments.",
+             )
+         ]
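The new SGE class reads two scheduler-specific knobs from `resources.kwargs`: `pe_name` (rendered as `#$ -pe`) and `job_name` (rendered as `#$ -N`), both declared in `SGE.resources_subfields()`. Below is a hedged sketch of a resources dict that exercises them; the queue name and core counts are placeholders, not values from the diff.

```python
# Sketch only: a resources dict for the new SGE batch type; all values are placeholders.
from dpdispatcher import Resources

resources = Resources.load_from_dict(
    {
        "number_node": 1,       # noted in SGE.gen_script_header as unused by SGE
        "cpu_per_node": 16,     # rendered as "#$ -pe <pe_name> 16"
        "gpu_per_node": 0,
        "queue_name": "all.q",  # rendered as "#$ -q all.q"; an empty string drops the line
        "group_size": 1,
        "kwargs": {"pe_name": "mpi", "job_name": "wDPjob"},
    }
)
```

Per the diff, `SGE.check_status` polls plain `qstat` output; if the job has vanished from the listing, it retries up to six 10-second sync/poll rounds for the `<job_hash>_job_tag_finished` marker before reporting the job as terminated.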