dpdispatcher 0.6.6__py3-none-any.whl → 0.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dpdispatcher might be problematic; see the registry's advisory page for more details.

dpdispatcher/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '0.6.6'
16
- __version_tuple__ = version_tuple = (0, 6, 6)
15
+ __version__ = version = '0.6.7'
16
+ __version_tuple__ = version_tuple = (0, 6, 7)
@@ -1,5 +1,5 @@
1
1
  from abc import ABCMeta, abstractmethod
2
- from typing import List, Tuple
2
+ from typing import Any, List, Tuple
3
3
 
4
4
  from dargs import Argument
5
5
 
@@ -73,6 +73,66 @@ class BaseContext(metaclass=ABCMeta):
73
73
  def check_finish(self, proc):
74
74
  raise NotImplementedError("abstract method")
75
75
 
76
+ def block_checkcall(self, cmd, asynchronously=False) -> Tuple[Any, Any, Any]:
77
+ """Run command with arguments. Wait for command to complete.
78
+
79
+ Parameters
80
+ ----------
81
+ cmd : str
82
+ The command to run.
83
+ asynchronously : bool, optional, default=False
84
+ Run command asynchronously. If True, `nohup` will be used to run the command.
85
+
86
+ Returns
87
+ -------
88
+ stdin
89
+ standard inout
90
+ stdout
91
+ standard output
92
+ stderr
93
+ standard error
94
+
95
+ Raises
96
+ ------
97
+ RuntimeError
98
+ when the return code is not zero
99
+ """
100
+ if asynchronously:
101
+ cmd = f"nohup {cmd} >/dev/null &"
102
+ exit_status, stdin, stdout, stderr = self.block_call(cmd)
103
+ if exit_status != 0:
104
+ raise RuntimeError(
105
+ "Get error code %d in calling %s with job: %s . message: %s"
106
+ % (
107
+ exit_status,
108
+ cmd,
109
+ self.submission.submission_hash,
110
+ stderr.read().decode("utf-8"),
111
+ )
112
+ )
113
+ return stdin, stdout, stderr
114
+
115
+ @abstractmethod
116
+ def block_call(self, cmd) -> Tuple[int, Any, Any, Any]:
117
+ """Run command with arguments. Wait for command to complete.
118
+
119
+ Parameters
120
+ ----------
121
+ cmd : str
122
+ The command to run.
123
+
124
+ Returns
125
+ -------
126
+ exit_status
127
+ exit code
128
+ stdin
129
+ standard inout
130
+ stdout
131
+ standard output
132
+ stderr
133
+ standard error
134
+ """
135
+
76
136
  @classmethod
77
137
  def machine_arginfo(cls) -> Argument:
78
138
  """Generate the machine arginfo.
@@ -335,6 +335,11 @@ class BohriumContext(BaseContext):
335
335
  )
336
336
  ]
337
337
 
338
+ def block_call(self, cmd):
339
+ raise RuntimeError(
340
+ "Unsupported method. You may use an unsupported combination of the machine and the context."
341
+ )
342
+
338
343
 
339
344
  DpCloudServerContext = BohriumContext
340
345
  LebesgueContext = BohriumContext
@@ -244,3 +244,8 @@ class HDFSContext(BaseContext):
244
244
 
245
245
  def read_file(self, fname):
246
246
  return HDFS.read_hdfs_file(os.path.join(self.remote_root, fname))
247
+
248
+ def block_call(self, cmd):
249
+ raise RuntimeError(
250
+ "Unsupported method. You may use an unsupported combination of the machine and the context."
251
+ )
@@ -112,23 +112,6 @@ class LazyLocalContext(BaseContext):
112
112
  # else:
113
113
  # raise RuntimeError('do not find download file ' + fname)
114
114
 
115
- def block_checkcall(self, cmd):
116
- # script_dir = os.path.join(self.local_root, self.submission.work_base)
117
- # os.chdir(script_dir)
118
- proc = sp.Popen(
119
- cmd, cwd=self.local_root, shell=True, stdout=sp.PIPE, stderr=sp.PIPE
120
- )
121
- o, e = proc.communicate()
122
- stdout = SPRetObj(o)
123
- stderr = SPRetObj(e)
124
- code = proc.returncode
125
- if code != 0:
126
- raise RuntimeError(
127
- "Get error code %d in locally calling %s with job: %s ",
128
- (code, cmd, self.submission.submission_hash),
129
- )
130
- return None, stdout, stderr
131
-
132
115
  def block_call(self, cmd):
133
116
  proc = sp.Popen(
134
117
  cmd, cwd=self.local_root, shell=True, stdout=sp.PIPE, stderr=sp.PIPE
@@ -3,6 +3,9 @@ import shutil
3
3
  import subprocess as sp
4
4
  from glob import glob
5
5
  from subprocess import TimeoutExpired
6
+ from typing import List
7
+
8
+ from dargs import Argument
6
9
 
7
10
  from dpdispatcher.base_context import BaseContext
8
11
  from dpdispatcher.dlog import dlog
@@ -60,6 +63,7 @@ class LocalContext(BaseContext):
60
63
  self.temp_local_root = os.path.abspath(local_root)
61
64
  self.temp_remote_root = os.path.abspath(remote_root)
62
65
  self.remote_profile = remote_profile
66
+ self.symlink = remote_profile.get("symlink", True)
63
67
 
64
68
  @classmethod
65
69
  def load_from_dict(cls, context_dict):
@@ -83,6 +87,25 @@ class LocalContext(BaseContext):
83
87
  self.temp_remote_root, submission.submission_hash
84
88
  )
85
89
 
90
+ def _copy_from_local_to_remote(self, local_path, remote_path):
91
+ if not os.path.exists(local_path):
92
+ raise FileNotFoundError(
93
+ f"cannot find uploaded file {os.path.join(local_path)}"
94
+ )
95
+ if os.path.exists(remote_path):
96
+ os.remove(remote_path)
97
+ _check_file_path(remote_path)
98
+
99
+ if self.symlink:
100
+ # ensure the file exist
101
+ os.symlink(local_path, remote_path)
102
+ elif os.path.isfile(local_path):
103
+ shutil.copyfile(local_path, remote_path)
104
+ elif os.path.isdir(local_path):
105
+ shutil.copytree(local_path, remote_path)
106
+ else:
107
+ raise ValueError(f"Unknown file type: {local_path}")
108
+
86
109
  def upload(self, submission):
87
110
  os.makedirs(self.remote_root, exist_ok=True)
88
111
  for ii in submission.belonging_tasks:
@@ -103,14 +126,9 @@ class LocalContext(BaseContext):
103
126
  file_list.extend(rel_file_list)
104
127
 
105
128
  for jj in file_list:
106
- if not os.path.exists(os.path.join(local_job, jj)):
107
- raise FileNotFoundError(
108
- "cannot find upload file " + os.path.join(local_job, jj)
109
- )
110
- if os.path.exists(os.path.join(remote_job, jj)):
111
- os.remove(os.path.join(remote_job, jj))
112
- _check_file_path(os.path.join(remote_job, jj))
113
- os.symlink(os.path.join(local_job, jj), os.path.join(remote_job, jj))
129
+ self._copy_from_local_to_remote(
130
+ os.path.join(local_job, jj), os.path.join(remote_job, jj)
131
+ )
114
132
 
115
133
  local_job = self.local_root
116
134
  remote_job = self.remote_root
@@ -128,14 +146,9 @@ class LocalContext(BaseContext):
128
146
  file_list.extend(rel_file_list)
129
147
 
130
148
  for jj in file_list:
131
- if not os.path.exists(os.path.join(local_job, jj)):
132
- raise FileNotFoundError(
133
- "cannot find upload file " + os.path.join(local_job, jj)
134
- )
135
- if os.path.exists(os.path.join(remote_job, jj)):
136
- os.remove(os.path.join(remote_job, jj))
137
- _check_file_path(os.path.join(remote_job, jj))
138
- os.symlink(os.path.join(local_job, jj), os.path.join(remote_job, jj))
149
+ self._copy_from_local_to_remote(
150
+ os.path.join(local_job, jj), os.path.join(remote_job, jj)
151
+ )
139
152
 
140
153
  def download(
141
154
  self, submission, check_exists=False, mark_failure=True, back_error=False
@@ -288,21 +301,6 @@ class LocalContext(BaseContext):
288
301
  # no nothing in the case of linked files
289
302
  pass
290
303
 
291
- def block_checkcall(self, cmd):
292
- proc = sp.Popen(
293
- cmd, cwd=self.remote_root, shell=True, stdout=sp.PIPE, stderr=sp.PIPE
294
- )
295
- o, e = proc.communicate()
296
- stdout = SPRetObj(o)
297
- stderr = SPRetObj(e)
298
- code = proc.returncode
299
- if code != 0:
300
- raise RuntimeError(
301
- f"Get error code {code} in locally calling {cmd} with job: {self.submission.submission_hash}"
302
- f"\nStandard error: {stderr}"
303
- )
304
- return None, stdout, stderr
305
-
306
304
  def block_call(self, cmd):
307
305
  proc = sp.Popen(
308
306
  cmd, cwd=self.remote_root, shell=True, stdout=sp.PIPE, stderr=sp.PIPE
@@ -351,3 +349,31 @@ class LocalContext(BaseContext):
351
349
  stdout = None
352
350
  stderr = None
353
351
  return ret, stdout, stderr
352
+
353
+ @classmethod
354
+ def machine_subfields(cls) -> List[Argument]:
355
+ """Generate the machine subfields.
356
+
357
+ Returns
358
+ -------
359
+ list[Argument]
360
+ machine subfields
361
+ """
362
+ doc_remote_profile = "The information used to maintain the local machine."
363
+ return [
364
+ Argument(
365
+ "remote_profile",
366
+ dict,
367
+ optional=True,
368
+ doc=doc_remote_profile,
369
+ sub_fields=[
370
+ Argument(
371
+ "symlink",
372
+ bool,
373
+ optional=True,
374
+ default=True,
375
+ doc="Whether to use symbolic links to replace copy. This option should be turned off if the local directory is not accessible on the Batch system.",
376
+ ),
377
+ ],
378
+ )
379
+ ]
@@ -258,3 +258,8 @@ class OpenAPIContext(BaseContext):
258
258
  dir_to_be_removed = os.path.join(local_root, "backup")
259
259
  if os.path.exists(dir_to_be_removed):
260
260
  shutil.rmtree(dir_to_be_removed)
261
+
262
+ def block_call(self, cmd):
263
+ raise RuntimeError(
264
+ "Unsupported method. You may use an unsupported combination of the machine and the context."
265
+ )
@@ -44,6 +44,7 @@ class SSHSession:
44
44
  totp_secret=None,
45
45
  tar_compress=True,
46
46
  look_for_keys=True,
47
+ execute_command=None,
47
48
  ):
48
49
  self.hostname = hostname
49
50
  self.username = username
@@ -56,6 +57,7 @@ class SSHSession:
56
57
  self.ssh = None
57
58
  self.tar_compress = tar_compress
58
59
  self.look_for_keys = look_for_keys
60
+ self.execute_command = execute_command
59
61
  self._keyboard_interactive_auth = False
60
62
  self._setup_ssh()
61
63
 
@@ -237,6 +239,8 @@ class SSHSession:
237
239
  self.ssh._transport = ts # type: ignore
238
240
  # reset sftp
239
241
  self._sftp = None
242
+ if self.execute_command is not None:
243
+ self.exec_command(self.execute_command)
240
244
 
241
245
  def inter_handler(self, title, instructions, prompt_list):
242
246
  """inter_handler: the callback for paramiko.transport.auth_interactive.
@@ -338,6 +342,7 @@ class SSHSession:
338
342
  doc_look_for_keys = (
339
343
  "enable searching for discoverable private key files in ~/.ssh/"
340
344
  )
345
+ doc_execute_command = "execute command after ssh connection is established."
341
346
  ssh_remote_profile_args = [
342
347
  Argument("hostname", str, optional=False, doc=doc_hostname),
343
348
  Argument("username", str, optional=False, doc=doc_username),
@@ -379,6 +384,13 @@ class SSHSession:
379
384
  default=True,
380
385
  doc=doc_look_for_keys,
381
386
  ),
387
+ Argument(
388
+ "execute_command",
389
+ str,
390
+ optional=True,
391
+ default=None,
392
+ doc=doc_execute_command,
393
+ ),
382
394
  ]
383
395
  ssh_remote_profile_format = Argument(
384
396
  "ssh_session", dict, ssh_remote_profile_args
@@ -755,41 +767,6 @@ class SSHContext(BaseContext):
755
767
  tar_compress=self.remote_profile.get("tar_compress", None),
756
768
  )
757
769
 
758
- def block_checkcall(self, cmd, asynchronously=False, stderr_whitelist=None):
759
- """Run command with arguments. Wait for command to complete. If the return code
760
- was zero then return, otherwise raise RuntimeError.
761
-
762
- Parameters
763
- ----------
764
- cmd : str
765
- The command to run.
766
- asynchronously : bool, optional, default=False
767
- Run command asynchronously. If True, `nohup` will be used to run the command.
768
- stderr_whitelist : list of str, optional, default=None
769
- If not None, the stderr will be checked against the whitelist. If the stderr
770
- contains any of the strings in the whitelist, the command will be considered
771
- successful.
772
- """
773
- assert self.remote_root is not None
774
- self.ssh_session.ensure_alive()
775
- if asynchronously:
776
- cmd = f"nohup {cmd} >/dev/null &"
777
- stdin, stdout, stderr = self.ssh_session.exec_command(
778
- (f"cd {shlex.quote(self.remote_root)} ;") + cmd
779
- )
780
- exit_status = stdout.channel.recv_exit_status()
781
- if exit_status != 0:
782
- raise RuntimeError(
783
- "Get error code %d in calling %s through ssh with job: %s . message: %s"
784
- % (
785
- exit_status,
786
- cmd,
787
- self.submission.submission_hash,
788
- stderr.read().decode("utf-8"),
789
- )
790
- )
791
- return stdin, stdout, stderr
792
-
793
770
  def block_call(self, cmd):
794
771
  assert self.remote_root is not None
795
772
  self.ssh_session.ensure_alive()
dpdispatcher/machine.py CHANGED
@@ -161,6 +161,9 @@ class Machine(metaclass=ABCMeta):
161
161
  machine_dict["remote_profile"] = self.context.remote_profile
162
162
  else:
163
163
  machine_dict["remote_profile"] = {}
164
+ # normalize the dict
165
+ base = self.arginfo()
166
+ machine_dict = base.normalize_value(machine_dict, trim_pattern="_*")
164
167
  return machine_dict
165
168
 
166
169
  def __eq__(self, other):
@@ -265,6 +268,15 @@ class Machine(metaclass=ABCMeta):
265
268
 
266
269
  export_envs_part = ""
267
270
  envs = job.resources.envs
271
+ envs = {
272
+ # export resources information to the environment variables
273
+ "DPDISPATCHER_NUMBER_NODE": job.resources.number_node,
274
+ "DPDISPATCHER_CPU_PER_NODE": job.resources.cpu_per_node,
275
+ "DPDISPATCHER_GPU_PER_NODE": job.resources.gpu_per_node,
276
+ "DPDISPATCHER_QUEUE_NAME": job.resources.queue_name,
277
+ "DPDISPATCHER_GROUP_SIZE": job.resources.group_size,
278
+ **envs,
279
+ }
268
280
  for k, v in envs.items():
269
281
  if isinstance(v, list):
270
282
  for each_value in v:
@@ -105,7 +105,7 @@ class JH_UniScheduler(Machine):
105
105
  elif ret != 0:
106
106
  # just retry when any unknown error raised.
107
107
  raise RetrySignal(
108
- "Get error code %d in checking status through ssh with job: %s . message: %s"
108
+ "Get error code %d in checking status with job: %s . message: %s"
109
109
  % (ret, job.job_hash, err_str)
110
110
  )
111
111
  status_out = stdout.read().decode("utf-8").split("\n")
@@ -181,8 +181,8 @@ class DistributedShell(Machine):
181
181
  if ret != 0:
182
182
  err_str = stderr.decode("utf-8")
183
183
  raise RuntimeError(
184
- "Command squeue fails to execute, error message:%s\nreturn code %d\n"
185
- % (err_str, ret)
184
+ "Command %s fails to execute, error message:%s\nreturn code %d\n"
185
+ % (cmd, err_str, ret)
186
186
  )
187
187
  job_id = int(stdout.decode("utf-8").strip())
188
188
 
@@ -129,7 +129,7 @@ class LSF(Machine):
129
129
  elif ret != 0:
130
130
  # just retry when any unknown error raised.
131
131
  raise RetrySignal(
132
- "Get error code %d in checking status through ssh with job: %s . message: %s"
132
+ "Get error code %d in checking status with job: %s . message: %s"
133
133
  % (ret, job.job_hash, err_str)
134
134
  )
135
135
  status_out = stdout.read().decode("utf-8").split("\n")
@@ -76,7 +76,8 @@ class PBS(Machine):
76
76
  job_id = job.job_id
77
77
  if job_id == "":
78
78
  return JobStatus.unsubmitted
79
- ret, stdin, stdout, stderr = self.context.block_call("qstat -x " + job_id)
79
+ command = "qstat -x " + job_id
80
+ ret, stdin, stdout, stderr = self.context.block_call(command)
80
81
  err_str = stderr.read().decode("utf-8")
81
82
  if ret != 0:
82
83
  if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
@@ -86,8 +87,8 @@ class PBS(Machine):
86
87
  return JobStatus.terminated
87
88
  else:
88
89
  raise RuntimeError(
89
- "status command qstat fails to execute. erro info: %s return code %d"
90
- % (err_str, ret)
90
+ "status command %s fails to execute. erro info: %s return code %d"
91
+ % (command, err_str, ret)
91
92
  )
92
93
  status_line = stdout.read().decode("utf-8").split("\n")[-2]
93
94
  status_word = status_line.split()[-2]
@@ -126,7 +127,8 @@ class Torque(PBS):
126
127
  job_id = job.job_id
127
128
  if job_id == "":
128
129
  return JobStatus.unsubmitted
129
- ret, stdin, stdout, stderr = self.context.block_call("qstat -l " + job_id)
130
+ command = "qstat -l " + job_id
131
+ ret, stdin, stdout, stderr = self.context.block_call(command)
130
132
  err_str = stderr.read().decode("utf-8")
131
133
  if ret != 0:
132
134
  if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
@@ -136,8 +138,8 @@ class Torque(PBS):
136
138
  return JobStatus.terminated
137
139
  else:
138
140
  raise RuntimeError(
139
- "status command qstat fails to execute. erro info: %s return code %d"
140
- % (err_str, ret)
141
+ "status command %s fails to execute. erro info: %s return code %d"
142
+ % (command, err_str, ret)
141
143
  )
142
144
  status_line = stdout.read().decode("utf-8").split("\n")[-2]
143
145
  status_word = status_line.split()[-2]
@@ -263,11 +265,12 @@ class SGE(PBS):
263
265
  status_line = None
264
266
  if job_id == "":
265
267
  return JobStatus.unsubmitted
266
- ret, stdin, stdout, stderr = self.context.block_call("qstat")
268
+ command = "qstat"
269
+ ret, stdin, stdout, stderr = self.context.block_call(command)
267
270
  err_str = stderr.read().decode("utf-8")
268
271
  if ret != 0:
269
272
  raise RuntimeError(
270
- f"status command qstat fails to execute. erro info: {err_str} return code {ret}"
273
+ f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
271
274
  )
272
275
  status_text_list = stdout.read().decode("utf-8").split("\n")
273
276
  for txt in status_text_list:
@@ -38,14 +38,13 @@ class Shell(Machine):
38
38
  script_run_str = self.gen_script_command(job)
39
39
  script_run_file_name = f"{job.script_file_name}.run"
40
40
  self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
41
- ret, stdin, stdout, stderr = self.context.block_call(
42
- f"cd {shlex.quote(self.context.remote_root)} && {{ nohup bash {script_file_name} 1>>{output_name} 2>>{output_name} & }} && echo $!"
43
- )
41
+ cmd = f"cd {shlex.quote(self.context.remote_root)} && {{ nohup bash {script_file_name} 1>>{output_name} 2>>{output_name} & }} && echo $!"
42
+ ret, stdin, stdout, stderr = self.context.block_call(cmd)
44
43
  if ret != 0:
45
44
  err_str = stderr.read().decode("utf-8")
46
45
  raise RuntimeError(
47
- "status command squeue fails to execute\nerror message:%s\nreturn code %d\n"
48
- % (err_str, ret)
46
+ "status command %s fails to execute\nerror message:%s\nreturn code %d\n"
47
+ % (cmd, err_str, ret)
49
48
  )
50
49
  job_id = int(stdout.read().decode("utf-8").strip())
51
50
  self.context.write_file(job_id_name, str(job_id))
@@ -73,14 +72,16 @@ class Shell(Machine):
73
72
  return JobStatus.unsubmitted
74
73
 
75
74
  # mark defunct process as terminated
76
- ret, stdin, stdout, stderr = self.context.block_call(
75
+ cmd = (
76
+ r"""command -v ps >/dev/null 2>&1 || { echo >&2 "I require ps but it's not installed. Aborting."; exit 1; };"""
77
77
  f"if ps -p {job_id} > /dev/null && ! (ps -o command -p {job_id} | grep defunct >/dev/null) ; then echo 1; fi"
78
78
  )
79
+ ret, stdin, stdout, stderr = self.context.block_call(cmd)
79
80
  if ret != 0:
80
81
  err_str = stderr.read().decode("utf-8")
81
82
  raise RuntimeError(
82
- "status command squeue fails to execute\nerror message:%s\nreturn code %d\n"
83
- % (err_str, ret)
83
+ "status command %s fails to execute\nerror message:%s\nreturn code %d\n"
84
+ % (cmd, err_str, ret)
84
85
  )
85
86
 
86
87
  if_job_exists = bool(stdout.read().decode("utf-8").strip())
@@ -83,13 +83,12 @@ class Slurm(Machine):
83
83
  script_run_file_name = f"{job.script_file_name}.run"
84
84
  self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
85
85
  # self.context.write_file(fname=os.path.join(self.context.submission.work_base, script_file_name), write_str=script_str)
86
- ret, stdin, stdout, stderr = self.context.block_call(
87
- "cd {} && {} {}".format(
88
- shlex.quote(self.context.remote_root),
89
- "sbatch",
90
- shlex.quote(script_file_name),
91
- )
86
+ command = "cd {} && {} {}".format(
87
+ shlex.quote(self.context.remote_root),
88
+ "sbatch",
89
+ shlex.quote(script_file_name),
92
90
  )
91
+ ret, stdin, stdout, stderr = self.context.block_call(command)
93
92
  if ret != 0:
94
93
  err_str = stderr.read().decode("utf-8")
95
94
  if (
@@ -98,7 +97,7 @@ class Slurm(Machine):
98
97
  ):
99
98
  # server network error, retry 3 times
100
99
  raise RetrySignal(
101
- "Get error code %d in submitting through ssh with job: %s . message: %s"
100
+ "Get error code %d in submitting with job: %s . message: %s"
102
101
  % (ret, job.job_hash, err_str)
103
102
  )
104
103
  elif (
@@ -110,8 +109,8 @@ class Slurm(Machine):
110
109
  # job number exceeds, skip the submitting
111
110
  return ""
112
111
  raise RuntimeError(
113
- "status command squeue fails to execute\nerror message:%s\nreturn code %d\n"
114
- % (err_str, ret)
112
+ "command %s fails to execute\nerror message:%s\nreturn code %d\n"
113
+ % (command, err_str, ret)
115
114
  )
116
115
  subret = stdout.readlines()
117
116
  # --parsable
@@ -129,9 +128,8 @@ class Slurm(Machine):
129
128
  job_id = job.job_id
130
129
  if job_id == "":
131
130
  return JobStatus.unsubmitted
132
- ret, stdin, stdout, stderr = self.context.block_call(
133
- 'squeue -o "%.18i %.2t" -j ' + job_id
134
- )
131
+ command = 'squeue -o "%.18i %.2t" -j ' + job_id
132
+ ret, stdin, stdout, stderr = self.context.block_call(command)
135
133
  if ret != 0:
136
134
  err_str = stderr.read().decode("utf-8")
137
135
  if "Invalid job id specified" in err_str:
@@ -147,13 +145,13 @@ class Slurm(Machine):
147
145
  ):
148
146
  # retry 3 times
149
147
  raise RetrySignal(
150
- "Get error code %d in checking status through ssh with job: %s . message: %s"
148
+ "Get error code %d in checking status with job: %s . message: %s"
151
149
  % (ret, job.job_hash, err_str)
152
150
  )
153
151
  raise RuntimeError(
154
- "status command squeue fails to execute."
152
+ "status command %s fails to execute."
155
153
  "job_id:%s \n error message:%s\n return code %d\n"
156
- % (job_id, err_str, ret)
154
+ % (command, job_id, err_str, ret)
157
155
  )
158
156
  status_line = stdout.read().decode("utf-8").split("\n")[-2]
159
157
  status_word = status_line.split()[-1]
@@ -319,9 +317,8 @@ class SlurmJobArray(Slurm):
319
317
  job_id = job.job_id
320
318
  if job_id == "":
321
319
  return JobStatus.unsubmitted
322
- ret, stdin, stdout, stderr = self.context.block_call(
323
- 'squeue -h -o "%.18i %.2t" -j ' + job_id
324
- )
320
+ command = 'squeue -h -o "%.18i %.2t" -j ' + job_id
321
+ ret, stdin, stdout, stderr = self.context.block_call(command)
325
322
  if ret != 0:
326
323
  err_str = stderr.read().decode("utf-8")
327
324
  if "Invalid job id specified" in err_str:
@@ -336,13 +333,13 @@ class SlurmJobArray(Slurm):
336
333
  ):
337
334
  # retry 3 times
338
335
  raise RetrySignal(
339
- "Get error code %d in checking status through ssh with job: %s . message: %s"
336
+ "Get error code %d in checking status with job: %s . message: %s"
340
337
  % (ret, job.job_hash, err_str)
341
338
  )
342
339
  raise RuntimeError(
343
- "status command squeue fails to execute."
340
+ "status command %s fails to execute."
344
341
  "job_id:%s \n error message:%s\n return code %d\n"
345
- % (job_id, err_str, ret)
342
+ % (command, job_id, err_str, ret)
346
343
  )
347
344
  status_lines = stdout.read().decode("utf-8").split("\n")[:-1]
348
345
  status = []
@@ -1122,9 +1122,9 @@ class Resources:
1122
1122
 
1123
1123
  @staticmethod
1124
1124
  def arginfo(detail_kwargs=True):
1125
- doc_number_node = "The number of node need for each `job`"
1126
- doc_cpu_per_node = "cpu numbers of each node assigned to each job."
1127
- doc_gpu_per_node = "gpu numbers of each node assigned to each job."
1125
+ doc_number_node = "The number of nodes required for each `job`."
1126
+ doc_cpu_per_node = "CPU numbers of each node assigned to each job."
1127
+ doc_gpu_per_node = "GPU numbers of each node assigned to each job."
1128
1128
  doc_queue_name = "The queue name of batch job scheduler system."
1129
1129
  doc_group_size = "The number of `tasks` in a `job`. 0 means infinity."
1130
1130
  doc_custom_flags = "The extra lines pass to job submitting script header"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dpdispatcher
3
- Version: 0.6.6
3
+ Version: 0.6.7
4
4
  Summary: Generate HPC scheduler systems jobs input scripts, submit these scripts to HPC systems, and poke until they finish
5
5
  Author: DeepModeling
6
6
  License: GNU LESSER GENERAL PUBLIC LICENSE
@@ -1,36 +1,36 @@
1
1
  dpdispatcher/__init__.py,sha256=CLZP_N5CTp14ujWCykEHuJjoIfKR6CwrclXhjWUgNoE,517
2
2
  dpdispatcher/__main__.py,sha256=BFhG-mSBzVZUEezQJqXWZnt2WsnhAHT_zpT8Y6gpOz0,116
3
- dpdispatcher/_version.py,sha256=A5NOPsDJAvtNjXOWXcGEBcGThUtYnfklnJHouP0KaiU,411
3
+ dpdispatcher/_version.py,sha256=iLXz9haw4jSV4Xm2-5_V8999GBAYoJkXg9-YOwMJpLY,411
4
4
  dpdispatcher/arginfo.py,sha256=pNaxYIE6ahBidpR7OCKZdw8iGt003uTXGSlVzwiuvRg,188
5
- dpdispatcher/base_context.py,sha256=NvaC_RHyspxq412z-eCq4Zn8-szZxvn8K6OkXvx7l4Y,3615
5
+ dpdispatcher/base_context.py,sha256=FDie50yHuLxTwV-k3B_sgAYqR5rLLLVOwk24wSiu4SQ,5254
6
6
  dpdispatcher/dlog.py,sha256=QJKAwB6gV3Zb6zQUL9dZ_uIoTIEy9Z7ecmVQ-8WNmD8,1081
7
7
  dpdispatcher/dpdisp.py,sha256=jhuTmwPY7KBF4WukaQomEwZcfYoISaMbKwuxdDGSluc,4206
8
- dpdispatcher/machine.py,sha256=EXrOckVsW9ZFOBc88eaSt2_WzDqNtjDTkGjOBFKWG04,16106
8
+ dpdispatcher/machine.py,sha256=k53ycs_v7xrl4D93URc5ht0shoO9NPrVl0rYr4v5OiU,16696
9
9
  dpdispatcher/run.py,sha256=tFHbJAioXXpgHTE5bhRRAuc8w7cX1ET9SBbiAg3Rw-I,5382
10
- dpdispatcher/submission.py,sha256=0_PCpRyiUwCHwYAzdXs-3rzq8YzZs0VZBU6tS7SixG0,48361
10
+ dpdispatcher/submission.py,sha256=NaljgA88NLv0rvxoSMZvUMq0sQEggkgKlcT8gXUnqFs,48367
11
11
  dpdispatcher/contexts/__init__.py,sha256=jlvcIppmUnS39yBlkZEDvIQFV-j_BR75ZTbZALF_RB0,336
12
- dpdispatcher/contexts/dp_cloud_server_context.py,sha256=6XK0B2sLGEDeZmV2SZzQdVrMcWAWYZVLLK-IaShEXIY,12245
13
- dpdispatcher/contexts/hdfs_context.py,sha256=B6pjGUD8Xaa0G_Zrnoci2DZnEXxojE9fAcexMMvAZCM,8930
14
- dpdispatcher/contexts/lazy_local_context.py,sha256=F8abWAJRY1Ewx1sErINKN1ltWerXzeCcJgjTvLvucKE,5696
15
- dpdispatcher/contexts/local_context.py,sha256=AsIfOT24FV0_bNlD2xU-pqAJy-XHZ6XTsbll4Vt6bMM,14065
16
- dpdispatcher/contexts/openapi_context.py,sha256=DXaMS10SXN3VKEeEdzQyfOgRwUyHRJVCJHd2fKKdsmA,9499
17
- dpdispatcher/contexts/ssh_context.py,sha256=ApFhzK0c7zxclOSESEswpy_RsM1zLkeEYJ_hCtrALmQ,38682
12
+ dpdispatcher/contexts/dp_cloud_server_context.py,sha256=PGRMef3q2hfK-o5dNIWWvzPca2NK1HrWEgungM4L9Go,12420
13
+ dpdispatcher/contexts/hdfs_context.py,sha256=mYQzXMZ4A9EjjWBAH3Ba6HOErUhMMwCsKxOjpd5R57Y,9105
14
+ dpdispatcher/contexts/lazy_local_context.py,sha256=FAClbLD2F4LizUqFzMOg3t0Z6NLeTDLJy7NkRcDELFs,5070
15
+ dpdispatcher/contexts/local_context.py,sha256=VbaSXGAc_EDMT0K5WV_flBF0bX87ntrwO_hq_Bkcb04,14590
16
+ dpdispatcher/contexts/openapi_context.py,sha256=M7L9axpjOrzvdTpLMDuEzZqe4ZuKIxjS0bzZUv8W2IQ,9674
17
+ dpdispatcher/contexts/ssh_context.py,sha256=s0K-gSKPSykq2PyOzAt4yNEczAdsVGvQ1QmPJpZ4_Vo,37648
18
18
  dpdispatcher/dpcloudserver/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
19
  dpdispatcher/dpcloudserver/client.py,sha256=k1niKjG6zFnMtHn_UuCjYoOcMju3o3PV-GdyVLr5-KM,165
20
20
  dpdispatcher/entrypoints/__init__.py,sha256=exKSFT3j2oCerGwtI8WbHQK-D0K-CyifocRji1xntT4,20
21
21
  dpdispatcher/entrypoints/gui.py,sha256=29lMXqbmSRbLj4rfBv7Jnw89NLU9syTB88IUP6IRJsU,830
22
22
  dpdispatcher/entrypoints/run.py,sha256=tRkHfeAktV6gF31yb2MVOSTlpNGZFw3N0jHBmM1YfIg,175
23
23
  dpdispatcher/entrypoints/submission.py,sha256=ikVwIZAQL0SsYO5xaMIdKXgO6qtc05w1vqmvtG7Nk5M,3401
24
- dpdispatcher/machines/JH_UniScheduler.py,sha256=f7Vs9_m4Th1GVSgsJTy9_nMAY8g9n0ZewnPY2DFECfI,5795
24
+ dpdispatcher/machines/JH_UniScheduler.py,sha256=B-LGldr9H8qPQYdCYoEaXFCEFBPmjFEi0fwEWp0wdR0,5783
25
25
  dpdispatcher/machines/__init__.py,sha256=tOQuPUlW1Ab4qcC0oSAIyDjZA_WyE67h_EIxPCWGhys,336
26
- dpdispatcher/machines/distributed_shell.py,sha256=LvWl6ktPlgmJ7rk90VWxp4douve8hYmuRf-B0saFBds,7534
26
+ dpdispatcher/machines/distributed_shell.py,sha256=TVnXFNqQmBgWk3s34rKSZo0S5N5KPZVmAG3Xbu_kuBo,7535
27
27
  dpdispatcher/machines/dp_cloud_server.py,sha256=SR69gsFb2BvOQCW1QnWfP3cQvu_qHLJNsycp5wzosJU,11706
28
28
  dpdispatcher/machines/fugaku.py,sha256=oY2hD2ldL2dztwtJ9WNisdsfPnaX-5yTRXewIT9r60I,4314
29
- dpdispatcher/machines/lsf.py,sha256=Q6IE4nCkNEKcW0AdBTKPOYgmCJAeXWmUVxZ9sQFkxos,7932
29
+ dpdispatcher/machines/lsf.py,sha256=fOZoOTpFn1nKx79lYkvZQOhNwz39YAIEytxICd56AFU,7920
30
30
  dpdispatcher/machines/openapi.py,sha256=Gzzbo8YOAybXGTrgMutexErcaEi3ts7uTUNvOhThFS8,8858
31
- dpdispatcher/machines/pbs.py,sha256=xPbdnT-g8pDMbq-yuI8G7TA0AZqn9gLXuqfWabQ2Whk,12437
32
- dpdispatcher/machines/shell.py,sha256=DnqMNb2nmBc3gVx8tA8oiUWdnWHKJwpIPs660i3Eq7A,4703
33
- dpdispatcher/machines/slurm.py,sha256=YM2Mv55jAFtDIiJoJLkD6p1Wi1ujjH6t4WlU8EtlbCw,15592
31
+ dpdispatcher/machines/pbs.py,sha256=XeeFQMZoH9DscsrJ_Ykv6fNUtc9TBp4epuFqbUyr3dk,12531
32
+ dpdispatcher/machines/shell.py,sha256=ONaUJpszsCwCcbyVLvC6VoJ-ig2QTU9JQdA-nlgXnu8,4845
33
+ dpdispatcher/machines/slurm.py,sha256=HLYk9E1dChnTeHjOOWNG854AWdlUJVYYmgwaiVswPQ8,15560
34
34
  dpdispatcher/utils/__init__.py,sha256=fwvwkMf7DFNQkNBiIce8Y8gRA6FhICwKjkKiXu_BEJg,13
35
35
  dpdispatcher/utils/hdfs_cli.py,sha256=n3EIfFIralsISlaEewawD35f0P8mabo-u8D8UW3k_7Y,5308
36
36
  dpdispatcher/utils/job_status.py,sha256=Eszs4TPLfszCuf6zLaFonf25feXDUguF28spYOjJpQE,233
@@ -41,9 +41,9 @@ dpdispatcher/utils/dpcloudserver/client.py,sha256=CLfXswvzI4inDrW2bYkfMQ6gQJFcZO
41
41
  dpdispatcher/utils/dpcloudserver/config.py,sha256=NteQzf1OeEkz2UbkXHHQ0B72cUu23zLVzpM9Yh4v1Cc,559
42
42
  dpdispatcher/utils/dpcloudserver/retcode.py,sha256=1qAF8gFZx55u2sO8KbtYSIIrjcO-IGufEUlwbkSfC1g,721
43
43
  dpdispatcher/utils/dpcloudserver/zip_file.py,sha256=f9WrlktwHW0YipaWg5Y0kxjMZlhD1cJYa6EUpvu4Cro,2611
44
- dpdispatcher-0.6.6.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
45
- dpdispatcher-0.6.6.dist-info/METADATA,sha256=0sYP0wVNFK9e2SMke4jpCbjpBEDA691quZj60MO3p6k,12828
46
- dpdispatcher-0.6.6.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
47
- dpdispatcher-0.6.6.dist-info/entry_points.txt,sha256=NRHUV0IU_u7_XtcmmEDnVzAcUmurhiEAGwENckrajo4,233
48
- dpdispatcher-0.6.6.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
49
- dpdispatcher-0.6.6.dist-info/RECORD,,
44
+ dpdispatcher-0.6.7.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
45
+ dpdispatcher-0.6.7.dist-info/METADATA,sha256=lNsC7Ruo7GmUOQl1TadlThoLvrOETbZ1s0-sXmrRYL4,12828
46
+ dpdispatcher-0.6.7.dist-info/WHEEL,sha256=UvcQYKBHoFqaQd6LKyqHw9fxEolWLQnlzP0h_LgJAfI,91
47
+ dpdispatcher-0.6.7.dist-info/entry_points.txt,sha256=NRHUV0IU_u7_XtcmmEDnVzAcUmurhiEAGwENckrajo4,233
48
+ dpdispatcher-0.6.7.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
49
+ dpdispatcher-0.6.7.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (71.1.0)
2
+ Generator: setuptools (74.0.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5