dpdispatcher 0.6.3__py3-none-any.whl → 0.6.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dpdispatcher might be problematic. Click here for more details.

dpdispatcher/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '0.6.3'
16
- __version_tuple__ = version_tuple = (0, 6, 3)
15
+ __version__ = version = '0.6.4'
16
+ __version_tuple__ = version_tuple = (0, 6, 4)
dpdispatcher/contexts/local_context.py CHANGED
@@ -208,7 +208,10 @@ class LocalContext(BaseContext):
208
208
  shutil.rmtree(lfile, ignore_errors=True)
209
209
  elif os.path.isfile(lfile) or os.path.islink(lfile):
210
210
  os.remove(lfile)
211
- shutil.copyfile(rfile, lfile)
211
+ if not os.path.islink(rfile):
212
+ shutil.move(rfile, lfile)
213
+ else:
214
+ shutil.copyfile(rfile, lfile)
212
215
  else:
213
216
  raise RuntimeError("should not reach here!")
214
217
  else:
@@ -275,7 +278,10 @@ class LocalContext(BaseContext):
275
278
  shutil.rmtree(lfile, ignore_errors=True)
276
279
  elif os.path.isfile(lfile) or os.path.islink(lfile):
277
280
  os.remove(lfile)
278
- shutil.copyfile(rfile, lfile)
281
+ if not os.path.islink(rfile):
282
+ shutil.move(rfile, lfile)
283
+ else:
284
+ shutil.copyfile(rfile, lfile)
279
285
  else:
280
286
  raise RuntimeError("should not reach here!")
281
287
  else:
dpdispatcher/contexts/ssh_context.py CHANGED
@@ -295,7 +295,7 @@ class SSHSession:
295
295
  assert self.ssh is not None
296
296
  try:
297
297
  return self.ssh.exec_command(cmd)
298
- except (paramiko.ssh_exception.SSHException, socket.timeout) as e:
298
+ except (paramiko.ssh_exception.SSHException, socket.timeout, EOFError) as e:
299
299
  # SSH session not active
300
300
  # retry for up to 3 times
301
301
  # ensure alive
dpdispatcher/dpdisp.py CHANGED
@@ -54,6 +54,11 @@ def main_parser() -> argparse.ArgumentParser:
54
54
  action="store_true",
55
55
  help="Clean submission.",
56
56
  )
57
+ parser_submission_action.add_argument(
58
+ "--reset-fail-count",
59
+ action="store_true",
60
+ help="Reset fail count of all jobs to zero.",
61
+ )
57
62
  ##########################################
58
63
  # gui
59
64
  parser_gui = subparsers.add_parser(
@@ -105,6 +110,7 @@ def main():
105
110
  download_terminated_log=args.download_terminated_log,
106
111
  download_finished_task=args.download_finished_task,
107
112
  clean=args.clean,
113
+ reset_fail_count=args.reset_fail_count,
108
114
  )
109
115
  elif args.command == "gui":
110
116
  start_dpgui(
dpdispatcher/entrypoints/submission.py CHANGED
@@ -12,6 +12,7 @@ def handle_submission(
12
12
  download_terminated_log: bool = False,
13
13
  download_finished_task: bool = False,
14
14
  clean: bool = False,
15
+ reset_fail_count: bool = False,
15
16
  ):
16
17
  """Handle terminated submission.
17
18
 
@@ -25,13 +26,21 @@ def handle_submission(
25
26
  Download finished tasks.
26
27
  clean : bool, optional
27
28
  Clean submission.
29
+ reset_fail_count : bool, optional
30
+ Reset fail count of all jobs to zero.
28
31
 
29
32
  Raises
30
33
  ------
31
34
  ValueError
32
35
  At least one action should be specified.
33
36
  """
34
- if int(download_terminated_log) + int(download_finished_task) + int(clean) == 0:
37
+ if (
38
+ int(download_terminated_log)
39
+ + int(download_finished_task)
40
+ + int(clean)
41
+ + int(reset_fail_count)
42
+ == 0
43
+ ):
35
44
  raise ValueError("At least one action should be specified.")
36
45
 
37
46
  submission_file = record.get_submission(submission_hash)
@@ -42,7 +51,18 @@ def handle_submission(
42
51
  # TODO: for unclear reason, the submission_hash may be changed
43
52
  submission.submission_hash = submission_hash
44
53
  submission.machine.context.bind_submission(submission)
54
+ if reset_fail_count:
55
+ for job in submission.belonging_jobs:
56
+ job.fail_count = 0
57
+ # save to remote and local
58
+ submission.submission_to_json()
59
+ record.write(submission)
60
+ if int(download_terminated_log) + int(download_finished_task) + int(clean) == 0:
61
+ # if only reset_fail_count, no need to update submission state (expensive)
62
+ return
45
63
  submission.update_submission_state()
64
+ submission.submission_to_json()
65
+ record.write(submission)
46
66
 
47
67
  terminated_tasks = []
48
68
  finished_tasks = []
dpdispatcher/machines/pbs.py CHANGED
@@ -177,3 +177,107 @@ class Torque(PBS):
177
177
  **pbs_script_header_dict
178
178
  )
179
179
  return pbs_script_header
180
+
181
+
182
+ sge_script_header_template = """
183
+ #!/bin/bash
184
+ #$ -N dpdispatcher_submit
185
+ {select_node_line}
186
+ #$ -cwd
187
+
188
+ """
189
+
190
+
191
+ class SGE(PBS):
192
+ def __init__(
193
+ self,
194
+ batch_type=None,
195
+ context_type=None,
196
+ local_root=None,
197
+ remote_root=None,
198
+ remote_profile={},
199
+ *,
200
+ context=None,
201
+ ):
202
+ super(PBS, self).__init__(
203
+ batch_type,
204
+ context_type,
205
+ local_root,
206
+ remote_root,
207
+ remote_profile,
208
+ context=context,
209
+ )
210
+
211
+ def gen_script_header(self, job):
212
+ resources = job.resources
213
+ sge_script_header_dict = {}
214
+ # resources.number_node is not used
215
+ sge_script_header_dict[
216
+ "select_node_line"
217
+ ] = f"#$ -pe mpi {resources.cpu_per_node} "
218
+ # resources.queue_name is not necessary
219
+ sge_script_header = sge_script_header_template.format(**sge_script_header_dict)
220
+ return sge_script_header
221
+
222
+ def do_submit(self, job):
223
+ script_file_name = job.script_file_name
224
+ script_str = self.gen_script(job)
225
+ job_id_name = job.job_hash + "_job_id"
226
+ self.context.write_file(fname=script_file_name, write_str=script_str)
227
+ script_file_dir = self.context.remote_root
228
+ stdin, stdout, stderr = self.context.block_checkcall(
229
+ "cd {} && {} {}".format(script_file_dir, "qsub", script_file_name)
230
+ )
231
+ subret = stdout.readlines()
232
+ job_id = subret[0].split()[2]
233
+ self.context.write_file(job_id_name, job_id)
234
+ return job_id
235
+
236
+ def default_resources(self, resources):
237
+ pass
238
+
239
+ def check_status(self, job):
240
+ job_id = job.job_id
241
+ status_line = None
242
+ if job_id == "":
243
+ return JobStatus.unsubmitted
244
+ ret, stdin, stdout, stderr = self.context.block_call("qstat")
245
+ err_str = stderr.read().decode("utf-8")
246
+ if ret != 0:
247
+ raise RuntimeError(
248
+ "status command qstat fails to execute. erro info: %s return code %d"
249
+ % (err_str, ret)
250
+ )
251
+ status_text_list = stdout.read().decode("utf-8").split("\n")
252
+ for txt in status_text_list:
253
+ if job_id in txt:
254
+ status_line = txt
255
+
256
+ if status_line is None:
257
+ count = 0
258
+ while count <= 6:
259
+ if self.check_finish_tag(job=job):
260
+ return JobStatus.finished
261
+ dlog.info(
262
+ "not tag_finished detected, execute sync command and wait. count "
263
+ + str(count)
264
+ )
265
+ self.context.block_call("sync")
266
+ import time
267
+
268
+ time.sleep(10)
269
+ count += 1
270
+ return JobStatus.terminated
271
+ else:
272
+ status_word = status_line.split()[4]
273
+ # dlog.info (status_word)
274
+ if status_word in ["qw"]:
275
+ return JobStatus.waiting
276
+ elif status_word in ["r"]:
277
+ return JobStatus.running
278
+ else:
279
+ return JobStatus.unknown
280
+
281
+ def check_finish_tag(self, job):
282
+ job_tag_finished = job.job_hash + "_job_tag_finished"
283
+ return self.context.check_file_exists(job_tag_finished)
dpdispatcher-0.6.4.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dpdispatcher
3
- Version: 0.6.3
3
+ Version: 0.6.4
4
4
  Summary: Generate HPC scheduler systems jobs input scripts, submit these scripts to HPC systems, and poke until they finish
5
5
  Author: DeepModeling
6
6
  License: GNU LESSER GENERAL PUBLIC LICENSE
dpdispatcher-0.6.4.dist-info/RECORD CHANGED
@@ -1,31 +1,31 @@
1
1
  dpdispatcher/__init__.py,sha256=CLZP_N5CTp14ujWCykEHuJjoIfKR6CwrclXhjWUgNoE,517
2
2
  dpdispatcher/__main__.py,sha256=BFhG-mSBzVZUEezQJqXWZnt2WsnhAHT_zpT8Y6gpOz0,116
3
- dpdispatcher/_version.py,sha256=ztAzoNjubRfmXpwvac6nvVotoNE53YlNbwgehkHg0Ss,411
3
+ dpdispatcher/_version.py,sha256=YeKWh9qHxBSK-fURNyCejbICk3LuDCjfwlZrUuEgWgM,411
4
4
  dpdispatcher/arginfo.py,sha256=pNaxYIE6ahBidpR7OCKZdw8iGt003uTXGSlVzwiuvRg,188
5
5
  dpdispatcher/base_context.py,sha256=NvaC_RHyspxq412z-eCq4Zn8-szZxvn8K6OkXvx7l4Y,3615
6
6
  dpdispatcher/dlog.py,sha256=ndh12teQBbJRybXd8UjEmAi6QTsAXajRicDj5mAH5h0,799
7
- dpdispatcher/dpdisp.py,sha256=YuGb-HWLsDfSO2c7GH0eM20ciojGbx3yq9oZHP7u4yc,3498
7
+ dpdispatcher/dpdisp.py,sha256=p-f_KCJxALBqP4StpNK2X_-VkGZ0L43MhROxhCpFwE4,3712
8
8
  dpdispatcher/machine.py,sha256=XFRH41gNCex_qs9gbg-S88_qab3_UAGfxKWUPxoipCM,16140
9
9
  dpdispatcher/submission.py,sha256=mVAHBlT0a3_1PtsEvvhvwNPkAhgLiBXXemX64BcwizU,48447
10
10
  dpdispatcher/contexts/__init__.py,sha256=s5M0ZJSrPttSyLdBwKD2m3W7a5AbYZdPB7IAND2j7EY,335
11
11
  dpdispatcher/contexts/dp_cloud_server_context.py,sha256=6XK0B2sLGEDeZmV2SZzQdVrMcWAWYZVLLK-IaShEXIY,12245
12
12
  dpdispatcher/contexts/hdfs_context.py,sha256=GbV_o3i0NL43B7dCPnArXS5DPkkET4EAiHw1VgsMcdE,9000
13
13
  dpdispatcher/contexts/lazy_local_context.py,sha256=F8abWAJRY1Ewx1sErINKN1ltWerXzeCcJgjTvLvucKE,5696
14
- dpdispatcher/contexts/local_context.py,sha256=1RbApz-w0Syxghgk9tdCkELQNXmi5oU-s8J_0UTi-lU,13780
14
+ dpdispatcher/contexts/local_context.py,sha256=vhZtdtduPokw6hU0YbaWNuoCOO6Tio2w99Fi8AegIw0,14052
15
15
  dpdispatcher/contexts/openapi_context.py,sha256=DXaMS10SXN3VKEeEdzQyfOgRwUyHRJVCJHd2fKKdsmA,9499
16
- dpdispatcher/contexts/ssh_context.py,sha256=155S3xDsfTPfbD7-oBUK4PLgpGD1vMEcU0MDThePJtQ,38602
16
+ dpdispatcher/contexts/ssh_context.py,sha256=zhBM_qH4zGMws7Yww5txNLhkK7b3maiFEMoewuAJou0,38612
17
17
  dpdispatcher/dpcloudserver/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  dpdispatcher/dpcloudserver/client.py,sha256=k1niKjG6zFnMtHn_UuCjYoOcMju3o3PV-GdyVLr5-KM,165
19
19
  dpdispatcher/entrypoints/__init__.py,sha256=exKSFT3j2oCerGwtI8WbHQK-D0K-CyifocRji1xntT4,20
20
20
  dpdispatcher/entrypoints/gui.py,sha256=29lMXqbmSRbLj4rfBv7Jnw89NLU9syTB88IUP6IRJsU,830
21
- dpdispatcher/entrypoints/submission.py,sha256=R2DXUGXTsk0Jw2y5Wjby40R5dxzXeqr4gn33ov6mdAI,2751
21
+ dpdispatcher/entrypoints/submission.py,sha256=ikVwIZAQL0SsYO5xaMIdKXgO6qtc05w1vqmvtG7Nk5M,3401
22
22
  dpdispatcher/machines/__init__.py,sha256=9kSYkz2w3flp00IrHWTEwvoFGrathQAT3tvbieye83c,335
23
23
  dpdispatcher/machines/distributed_shell.py,sha256=7avNcoOzEj7UcJuKl6b1ka2bj5dixcJaMlZK-I-i_Tc,7571
24
24
  dpdispatcher/machines/dp_cloud_server.py,sha256=SR69gsFb2BvOQCW1QnWfP3cQvu_qHLJNsycp5wzosJU,11706
25
25
  dpdispatcher/machines/fugaku.py,sha256=9OP3qSaaruqypHAdcuBFQM_MUtFp3yrvhZ5bPyLwEEk,4308
26
26
  dpdispatcher/machines/lsf.py,sha256=Qruot39cPEpBNbbPmDwb1Gyfgyw3N36O0hs9PNEXyVU,7997
27
27
  dpdispatcher/machines/openapi.py,sha256=Gzzbo8YOAybXGTrgMutexErcaEi3ts7uTUNvOhThFS8,8858
28
- dpdispatcher/machines/pbs.py,sha256=HGBUf96AJ7hWOQPrENP5tFIDnEm4cb9deqJJ8wExbms,7079
28
+ dpdispatcher/machines/pbs.py,sha256=YK0rKIsd8GAZYfxGLezA9RdCQ6AOeuPML_v5OwmioTo,10331
29
29
  dpdispatcher/machines/shell.py,sha256=qaia7mC_fz5Bqyelxmc1je-xg7NQ_6vQQ0qAjg2m4RQ,4796
30
30
  dpdispatcher/machines/slurm.py,sha256=SP5rQiCPWzq4rqgUgp0IGJXXD_1DURWl4OBRAJ-Kng4,15611
31
31
  dpdispatcher/utils/__init__.py,sha256=fwvwkMf7DFNQkNBiIce8Y8gRA6FhICwKjkKiXu_BEJg,13
@@ -38,9 +38,9 @@ dpdispatcher/utils/dpcloudserver/client.py,sha256=CLfXswvzI4inDrW2bYkfMQ6gQJFcZO
38
38
  dpdispatcher/utils/dpcloudserver/config.py,sha256=NteQzf1OeEkz2UbkXHHQ0B72cUu23zLVzpM9Yh4v1Cc,559
39
39
  dpdispatcher/utils/dpcloudserver/retcode.py,sha256=1qAF8gFZx55u2sO8KbtYSIIrjcO-IGufEUlwbkSfC1g,721
40
40
  dpdispatcher/utils/dpcloudserver/zip_file.py,sha256=f9WrlktwHW0YipaWg5Y0kxjMZlhD1cJYa6EUpvu4Cro,2611
41
- dpdispatcher-0.6.3.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
42
- dpdispatcher-0.6.3.dist-info/METADATA,sha256=Y-EwJVwhTAOszfKFejHSPTsU1DH931Qrhwbdmkie6Mc,12752
43
- dpdispatcher-0.6.3.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
44
- dpdispatcher-0.6.3.dist-info/entry_points.txt,sha256=NRHUV0IU_u7_XtcmmEDnVzAcUmurhiEAGwENckrajo4,233
45
- dpdispatcher-0.6.3.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
46
- dpdispatcher-0.6.3.dist-info/RECORD,,
41
+ dpdispatcher-0.6.4.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
42
+ dpdispatcher-0.6.4.dist-info/METADATA,sha256=9NoY3DjM_USxl-2nBiNNLMCzJ6DZadgEMS1e11Fj72c,12752
43
+ dpdispatcher-0.6.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
44
+ dpdispatcher-0.6.4.dist-info/entry_points.txt,sha256=NRHUV0IU_u7_XtcmmEDnVzAcUmurhiEAGwENckrajo4,233
45
+ dpdispatcher-0.6.4.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
46
+ dpdispatcher-0.6.4.dist-info/RECORD,,
dpdispatcher-0.6.4.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.41.3)
2
+ Generator: bdist_wheel (0.42.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5