dpdispatcher 0.6.5__py3-none-any.whl → 0.6.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dpdispatcher/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '0.6.5'
16
- __version_tuple__ = version_tuple = (0, 6, 5)
15
+ __version__ = version = '0.6.6'
16
+ __version_tuple__ = version_tuple = (0, 6, 6)
@@ -295,7 +295,11 @@ class SSHSession:
295
295
  assert self.ssh is not None
296
296
  try:
297
297
  return self.ssh.exec_command(cmd)
298
- except (paramiko.ssh_exception.SSHException, socket.timeout, EOFError) as e:
298
+ except (
299
+ paramiko.ssh_exception.SSHException,
300
+ socket.timeout,
301
+ EOFError,
302
+ ) as e:
299
303
  # SSH session not active
300
304
  # retry for up to 3 times
301
305
  # ensure alive
@@ -355,10 +359,18 @@ class SSHSession:
355
359
  ),
356
360
  Argument("timeout", int, optional=True, default=10, doc=doc_timeout),
357
361
  Argument(
358
- "totp_secret", str, optional=True, default=None, doc=doc_totp_secret
362
+ "totp_secret",
363
+ str,
364
+ optional=True,
365
+ default=None,
366
+ doc=doc_totp_secret,
359
367
  ),
360
368
  Argument(
361
- "tar_compress", bool, optional=True, default=True, doc=doc_tar_compress
369
+ "tar_compress",
370
+ bool,
371
+ optional=True,
372
+ default=True,
373
+ doc=doc_tar_compress,
362
374
  ),
363
375
  Argument(
364
376
  "look_for_keys",
@@ -603,7 +615,10 @@ class SSHContext(BaseContext):
603
615
  directory_list,
604
616
  )
605
617
  self._walk_directory(
606
- submission.forward_common_files, self.local_root, file_list, directory_list
618
+ submission.forward_common_files,
619
+ self.local_root,
620
+ file_list,
621
+ directory_list,
607
622
  )
608
623
 
609
624
  # convert to relative path to local_root
@@ -621,9 +636,9 @@ class SSHContext(BaseContext):
621
636
  ).as_posix()
622
637
  sha256_list.append(f"{sha256} {jj_rel}")
623
638
  # write to remote
624
- sha256_file = os.path.join(
625
- self.remote_root, ".tmp.sha256." + str(uuid.uuid4())
626
- )
639
+ sha256_file = pathlib.PurePath(
640
+ os.path.join(self.remote_root, ".tmp.sha256." + str(uuid.uuid4()))
641
+ ).as_posix()
627
642
  self.write_file(sha256_file, "\n".join(sha256_list))
628
643
  # check sha256
629
644
  # `:` means pass: https://stackoverflow.com/a/2421592/9567349
@@ -736,7 +751,8 @@ class SSHContext(BaseContext):
736
751
  file_list.extend(submission.backward_common_files)
737
752
  if len(file_list) > 0:
738
753
  self._get_files(
739
- file_list, tar_compress=self.remote_profile.get("tar_compress", None)
754
+ file_list,
755
+ tar_compress=self.remote_profile.get("tar_compress", None),
740
756
  )
741
757
 
742
758
  def block_checkcall(self, cmd, asynchronously=False, stderr_whitelist=None):
@@ -793,18 +809,23 @@ class SSHContext(BaseContext):
793
809
  fname = pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix()
794
810
  # to prevent old file from being overwritten but cancelled, create a temporary file first
795
811
  # when it is fully written, rename it to the original file name
796
- with self.sftp.open(fname + "~", "w") as fp:
797
- fp.write(write_str)
812
+ temp_fname = fname + "_tmp"
813
+ try:
814
+ with self.sftp.open(temp_fname, "w") as fp:
815
+ fp.write(write_str)
816
+ # Rename the temporary file
817
+ self.block_checkcall(f"mv {shlex.quote(temp_fname)} {shlex.quote(fname)}")
798
818
  # sftp.rename may throw OSError
799
- self.block_checkcall(
800
- "mv {} {}".format(shlex.quote(fname + "~"), shlex.quote(fname))
801
- )
819
+ except OSError as e:
820
+ dlog.exception(f"Error writing to file {fname}")
821
+ raise e
802
822
 
803
823
  def read_file(self, fname):
804
824
  assert self.remote_root is not None
805
825
  self.ssh_session.ensure_alive()
806
826
  with self.sftp.open(
807
- pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix(), "r"
827
+ pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix(),
828
+ "r",
808
829
  ) as fp:
809
830
  ret = fp.read().decode("utf-8")
810
831
  return ret
@@ -945,36 +966,28 @@ class SSHContext(BaseContext):
945
966
  per_nfile = 100
946
967
  ntar = len(files) // per_nfile + 1
947
968
  if ntar <= 1:
948
- try:
949
- self.block_checkcall(
950
- "tar {} {} {}".format(
951
- tar_command,
952
- shlex.quote(of),
953
- " ".join([shlex.quote(file) for file in files]),
954
- )
955
- )
956
- except RuntimeError as e:
957
- if "No such file or directory" in str(e):
958
- raise FileNotFoundError(
959
- "Any of the backward files does not exist in the remote directory."
960
- ) from e
961
- raise e
969
+ file_list = " ".join([shlex.quote(file) for file in files])
970
+ tar_cmd = f"tar {tar_command} {shlex.quote(of)} {file_list}"
962
971
  else:
963
- file_list_file = os.path.join(
964
- self.remote_root, ".tmp.tar." + str(uuid.uuid4())
965
- )
972
+ file_list_file = pathlib.PurePath(
973
+ os.path.join(self.remote_root, f".tmp_tar_{uuid.uuid4()}")
974
+ ).as_posix()
966
975
  self.write_file(file_list_file, "\n".join(files))
967
- try:
968
- self.block_checkcall(
969
- f"tar {tar_command} {shlex.quote(of)} -T {shlex.quote(file_list_file)}"
970
- )
971
- except RuntimeError as e:
972
- if "No such file or directory" in str(e):
973
- raise FileNotFoundError(
974
- "Any of the backward files does not exist in the remote directory."
975
- ) from e
976
- raise e
977
- # trans
976
+ tar_cmd = (
977
+ f"tar {tar_command} {shlex.quote(of)} -T {shlex.quote(file_list_file)}"
978
+ )
979
+
980
+ # Execute the tar command remotely
981
+ try:
982
+ self.block_checkcall(tar_cmd)
983
+ except RuntimeError as e:
984
+ if "No such file or directory" in str(e):
985
+ raise FileNotFoundError(
986
+ "Backward files do not exist in the remote directory."
987
+ ) from e
988
+ raise e
989
+
990
+ # Transfer the archive from remote to local
978
991
  from_f = pathlib.PurePath(os.path.join(self.remote_root, of)).as_posix()
979
992
  to_f = pathlib.PurePath(os.path.join(self.local_root, of)).as_posix()
980
993
  if os.path.isfile(to_f):
dpdispatcher/machine.py CHANGED
@@ -261,8 +261,7 @@ class Machine(metaclass=ABCMeta):
261
261
 
262
262
  source_list = job.resources.source_list
263
263
  for ii in source_list:
264
- line = f"{{ source {ii}; }} \n"
265
- source_files_part += line
264
+ source_files_part += f"source {ii}\n"
266
265
 
267
266
  export_envs_part = ""
268
267
  envs = job.resources.envs
@@ -1,4 +1,7 @@
1
1
  import shlex
2
+ from typing import List
3
+
4
+ from dargs import Argument
2
5
 
3
6
  from dpdispatcher.dlog import dlog
4
7
  from dpdispatcher.machine import Machine
@@ -181,10 +184,9 @@ class Torque(PBS):
181
184
 
182
185
  sge_script_header_template = """
183
186
  #!/bin/bash
184
- #$ -N dpdispatcher_submit
185
- {select_node_line}
187
+ #$ -S /bin/bash
186
188
  #$ -cwd
187
-
189
+ {select_node_line}
188
190
  """
189
191
 
190
192
 
@@ -209,14 +211,31 @@ class SGE(PBS):
209
211
  )
210
212
 
211
213
  def gen_script_header(self, job):
214
+ ### Ref:https://softpanorama.org/HPC/PBS_and_derivatives/Reference/pbs_command_vs_sge_commands.shtml
215
+ # resources.number_node is not used in SGE
212
216
  resources = job.resources
217
+ job_name = resources.kwargs.get("job_name", "wDPjob")
218
+ pe_name = resources.kwargs.get("pe_name", "mpi")
213
219
  sge_script_header_dict = {}
214
- # resources.number_node is not used
215
- sge_script_header_dict["select_node_line"] = (
216
- f"#$ -pe mpi {resources.cpu_per_node} "
220
+ sge_script_header_dict["select_node_line"] = f"#$ -N {job_name}\n"
221
+ sge_script_header_dict["select_node_line"] += (
222
+ f"#$ -pe {pe_name} {resources.cpu_per_node}\n"
217
223
  )
218
- # resources.queue_name is not necessary
219
- sge_script_header = sge_script_header_template.format(**sge_script_header_dict)
224
+
225
+ if resources.queue_name != "":
226
+ sge_script_header_dict["select_node_line"] += (
227
+ f"#$ -q {resources.queue_name}"
228
+ )
229
+ if (
230
+ resources["strategy"].get("customized_script_header_template_file")
231
+ is not None
232
+ ):
233
+ file_name = resources["strategy"]["customized_script_header_template_file"]
234
+ sge_script_header = customized_script_header_template(file_name, resources)
235
+ else:
236
+ sge_script_header = sge_script_header_template.format(
237
+ **sge_script_header_dict
238
+ )
220
239
  return sge_script_header
221
240
 
222
241
  def do_submit(self, job):
@@ -224,6 +243,9 @@ class SGE(PBS):
224
243
  script_str = self.gen_script(job)
225
244
  job_id_name = job.job_hash + "_job_id"
226
245
  self.context.write_file(fname=script_file_name, write_str=script_str)
246
+ script_run_str = self.gen_script_command(job)
247
+ script_run_file_name = f"{job.script_file_name}.run"
248
+ self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
227
249
  script_file_dir = self.context.remote_root
228
250
  stdin, stdout, stderr = self.context.block_checkcall(
229
251
  "cd {} && {} {}".format(script_file_dir, "qsub", script_file_name)
@@ -245,8 +267,7 @@ class SGE(PBS):
245
267
  err_str = stderr.read().decode("utf-8")
246
268
  if ret != 0:
247
269
  raise RuntimeError(
248
- "status command qstat fails to execute. erro info: %s return code %d"
249
- % (err_str, ret)
270
+ f"status command qstat fails to execute. erro info: {err_str} return code {ret}"
250
271
  )
251
272
  status_text_list = stdout.read().decode("utf-8").split("\n")
252
273
  for txt in status_text_list:
@@ -259,8 +280,7 @@ class SGE(PBS):
259
280
  if self.check_finish_tag(job=job):
260
281
  return JobStatus.finished
261
282
  dlog.info(
262
- "not tag_finished detected, execute sync command and wait. count "
263
- + str(count)
283
+ f"not tag_finished detected, execute sync command and wait. count {count}"
264
284
  )
265
285
  self.context.block_call("sync")
266
286
  import time
@@ -281,3 +301,44 @@ class SGE(PBS):
281
301
  def check_finish_tag(self, job):
282
302
  job_tag_finished = job.job_hash + "_job_tag_finished"
283
303
  return self.context.check_file_exists(job_tag_finished)
304
+
305
+ @classmethod
306
+ def resources_subfields(cls) -> List[Argument]:
307
+ """Generate the resources subfields.
308
+
309
+ pe_name : str
310
+ The parallel environment name of SGE.
311
+
312
+ Returns
313
+ -------
314
+ list[Argument]
315
+ resources subfields
316
+ """
317
+ doc_pe_name = "The parallel environment name of SGE system."
318
+ doc_job_name = "The name of SGE's job."
319
+
320
+ return [
321
+ Argument(
322
+ "kwargs",
323
+ dict,
324
+ [
325
+ Argument(
326
+ "pe_name",
327
+ str,
328
+ optional=True,
329
+ default="mpi",
330
+ doc=doc_pe_name,
331
+ alias=["sge_pe_name"],
332
+ ),
333
+ Argument(
334
+ "job_name",
335
+ str,
336
+ optional=True,
337
+ default="wDPjob",
338
+ doc=doc_job_name,
339
+ ),
340
+ ],
341
+ optional=False,
342
+ doc="Extra arguments.",
343
+ )
344
+ ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dpdispatcher
3
- Version: 0.6.5
3
+ Version: 0.6.6
4
4
  Summary: Generate HPC scheduler systems jobs input scripts, submit these scripts to HPC systems, and poke until they finish
5
5
  Author: DeepModeling
6
6
  License: GNU LESSER GENERAL PUBLIC LICENSE
@@ -207,7 +207,7 @@ Requires-Dist: sphinx-rtd-theme >=1.0.0rc1 ; extra == 'docs'
207
207
  Requires-Dist: numpydoc ; extra == 'docs'
208
208
  Requires-Dist: deepmodeling-sphinx >=0.1.1 ; extra == 'docs'
209
209
  Requires-Dist: dargs >=0.3.1 ; extra == 'docs'
210
- Requires-Dist: sphinx-argparse ; extra == 'docs'
210
+ Requires-Dist: sphinx-argparse <0.5.0 ; extra == 'docs'
211
211
  Provides-Extra: gui
212
212
  Requires-Dist: dpgui ; extra == 'gui'
213
213
  Provides-Extra: test
@@ -1,11 +1,11 @@
1
1
  dpdispatcher/__init__.py,sha256=CLZP_N5CTp14ujWCykEHuJjoIfKR6CwrclXhjWUgNoE,517
2
2
  dpdispatcher/__main__.py,sha256=BFhG-mSBzVZUEezQJqXWZnt2WsnhAHT_zpT8Y6gpOz0,116
3
- dpdispatcher/_version.py,sha256=PuC6q1U5hHaOMp2tDNeTKt6ExeuO2V9ihjqjMYIsVUo,411
3
+ dpdispatcher/_version.py,sha256=A5NOPsDJAvtNjXOWXcGEBcGThUtYnfklnJHouP0KaiU,411
4
4
  dpdispatcher/arginfo.py,sha256=pNaxYIE6ahBidpR7OCKZdw8iGt003uTXGSlVzwiuvRg,188
5
5
  dpdispatcher/base_context.py,sha256=NvaC_RHyspxq412z-eCq4Zn8-szZxvn8K6OkXvx7l4Y,3615
6
6
  dpdispatcher/dlog.py,sha256=QJKAwB6gV3Zb6zQUL9dZ_uIoTIEy9Z7ecmVQ-8WNmD8,1081
7
7
  dpdispatcher/dpdisp.py,sha256=jhuTmwPY7KBF4WukaQomEwZcfYoISaMbKwuxdDGSluc,4206
8
- dpdispatcher/machine.py,sha256=z5D0eLAPfdo5SZdO6NLvWBUUePE0VHRMWurRMzEV0U0,16138
8
+ dpdispatcher/machine.py,sha256=EXrOckVsW9ZFOBc88eaSt2_WzDqNtjDTkGjOBFKWG04,16106
9
9
  dpdispatcher/run.py,sha256=tFHbJAioXXpgHTE5bhRRAuc8w7cX1ET9SBbiAg3Rw-I,5382
10
10
  dpdispatcher/submission.py,sha256=0_PCpRyiUwCHwYAzdXs-3rzq8YzZs0VZBU6tS7SixG0,48361
11
11
  dpdispatcher/contexts/__init__.py,sha256=jlvcIppmUnS39yBlkZEDvIQFV-j_BR75ZTbZALF_RB0,336
@@ -14,7 +14,7 @@ dpdispatcher/contexts/hdfs_context.py,sha256=B6pjGUD8Xaa0G_Zrnoci2DZnEXxojE9fAce
14
14
  dpdispatcher/contexts/lazy_local_context.py,sha256=F8abWAJRY1Ewx1sErINKN1ltWerXzeCcJgjTvLvucKE,5696
15
15
  dpdispatcher/contexts/local_context.py,sha256=AsIfOT24FV0_bNlD2xU-pqAJy-XHZ6XTsbll4Vt6bMM,14065
16
16
  dpdispatcher/contexts/openapi_context.py,sha256=DXaMS10SXN3VKEeEdzQyfOgRwUyHRJVCJHd2fKKdsmA,9499
17
- dpdispatcher/contexts/ssh_context.py,sha256=baMiD_1KlrksqNKCkpx7apovLW_qdfU9U1KRDNTjCz0,38578
17
+ dpdispatcher/contexts/ssh_context.py,sha256=ApFhzK0c7zxclOSESEswpy_RsM1zLkeEYJ_hCtrALmQ,38682
18
18
  dpdispatcher/dpcloudserver/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
19
  dpdispatcher/dpcloudserver/client.py,sha256=k1niKjG6zFnMtHn_UuCjYoOcMju3o3PV-GdyVLr5-KM,165
20
20
  dpdispatcher/entrypoints/__init__.py,sha256=exKSFT3j2oCerGwtI8WbHQK-D0K-CyifocRji1xntT4,20
@@ -28,7 +28,7 @@ dpdispatcher/machines/dp_cloud_server.py,sha256=SR69gsFb2BvOQCW1QnWfP3cQvu_qHLJN
28
28
  dpdispatcher/machines/fugaku.py,sha256=oY2hD2ldL2dztwtJ9WNisdsfPnaX-5yTRXewIT9r60I,4314
29
29
  dpdispatcher/machines/lsf.py,sha256=Q6IE4nCkNEKcW0AdBTKPOYgmCJAeXWmUVxZ9sQFkxos,7932
30
30
  dpdispatcher/machines/openapi.py,sha256=Gzzbo8YOAybXGTrgMutexErcaEi3ts7uTUNvOhThFS8,8858
31
- dpdispatcher/machines/pbs.py,sha256=KjJcLpQr748ZgOwFfWmJ_LG1q6Jm1UF24YCSLiDfcac,10308
31
+ dpdispatcher/machines/pbs.py,sha256=xPbdnT-g8pDMbq-yuI8G7TA0AZqn9gLXuqfWabQ2Whk,12437
32
32
  dpdispatcher/machines/shell.py,sha256=DnqMNb2nmBc3gVx8tA8oiUWdnWHKJwpIPs660i3Eq7A,4703
33
33
  dpdispatcher/machines/slurm.py,sha256=YM2Mv55jAFtDIiJoJLkD6p1Wi1ujjH6t4WlU8EtlbCw,15592
34
34
  dpdispatcher/utils/__init__.py,sha256=fwvwkMf7DFNQkNBiIce8Y8gRA6FhICwKjkKiXu_BEJg,13
@@ -41,9 +41,9 @@ dpdispatcher/utils/dpcloudserver/client.py,sha256=CLfXswvzI4inDrW2bYkfMQ6gQJFcZO
41
41
  dpdispatcher/utils/dpcloudserver/config.py,sha256=NteQzf1OeEkz2UbkXHHQ0B72cUu23zLVzpM9Yh4v1Cc,559
42
42
  dpdispatcher/utils/dpcloudserver/retcode.py,sha256=1qAF8gFZx55u2sO8KbtYSIIrjcO-IGufEUlwbkSfC1g,721
43
43
  dpdispatcher/utils/dpcloudserver/zip_file.py,sha256=f9WrlktwHW0YipaWg5Y0kxjMZlhD1cJYa6EUpvu4Cro,2611
44
- dpdispatcher-0.6.5.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
45
- dpdispatcher-0.6.5.dist-info/METADATA,sha256=eLIZlw1J6l08VjltNG2O3Z7kWK_TNVJR08aaGlfwESc,12821
46
- dpdispatcher-0.6.5.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
47
- dpdispatcher-0.6.5.dist-info/entry_points.txt,sha256=NRHUV0IU_u7_XtcmmEDnVzAcUmurhiEAGwENckrajo4,233
48
- dpdispatcher-0.6.5.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
49
- dpdispatcher-0.6.5.dist-info/RECORD,,
44
+ dpdispatcher-0.6.6.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
45
+ dpdispatcher-0.6.6.dist-info/METADATA,sha256=0sYP0wVNFK9e2SMke4jpCbjpBEDA691quZj60MO3p6k,12828
46
+ dpdispatcher-0.6.6.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
47
+ dpdispatcher-0.6.6.dist-info/entry_points.txt,sha256=NRHUV0IU_u7_XtcmmEDnVzAcUmurhiEAGwENckrajo4,233
48
+ dpdispatcher-0.6.6.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
49
+ dpdispatcher-0.6.6.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.43.0)
2
+ Generator: setuptools (71.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5