dpdispatcher 0.6.2-py3-none-any.whl → 0.6.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

dpdispatcher/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
- __version__ = version = '0.6.2'
- __version_tuple__ = version_tuple = (0, 6, 2)
+ __version__ = version = '0.6.4'
+ __version_tuple__ = version_tuple = (0, 6, 4)
dpdispatcher/contexts/hdfs_context.py CHANGED
@@ -88,7 +88,7 @@ class HDFSContext(BaseContext):
 for ff in task.forward_files:
 abs_file_list = glob(os.path.join(local_job, ff))
 if not abs_file_list:
- raise RuntimeError(
+ raise FileNotFoundError(
 "cannot find upload file " + os.path.join(local_job, ff)
 )
 rel_file_list = [
@@ -100,7 +100,7 @@ class HDFSContext(BaseContext):
 for fc in submission.forward_common_files:
 abs_file_list = glob(os.path.join(local_job, fc))
 if not abs_file_list:
- raise RuntimeError(
+ raise FileNotFoundError(
 "cannot find upload file " + os.path.join(local_job, fc)
 )
 rel_file_list = [
@@ -170,9 +170,11 @@ class HDFSContext(BaseContext):
 ) as fp:
 pass
 else:
- raise RuntimeError("do not find download file " + rfile)
+ raise FileNotFoundError(
+ "do not find download file " + rfile
+ )
 else:
- raise RuntimeError("do not find download file " + rfile)
+ raise FileNotFoundError("do not find download file " + rfile)
 else:
 if os.path.exists(lfile):
 dlog.info(f"find existing {lfile}, replacing by {rfile}")
@@ -203,9 +205,9 @@ class HDFSContext(BaseContext):
 ) as fp:
 pass
 else:
- raise RuntimeError("do not find download file " + rfile)
+ raise FileNotFoundError("do not find download file " + rfile)
 else:
- raise RuntimeError("do not find download file " + rfile)
+ raise FileNotFoundError("do not find download file " + rfile)
 else:
 if os.path.exists(lfile):
 dlog.info(f"find existing {lfile}, replacing by {rfile}")
dpdispatcher/contexts/local_context.py CHANGED
@@ -94,7 +94,7 @@ class LocalContext(BaseContext):
 for kk in ii.forward_files:
 abs_file_list = glob(os.path.join(local_job, kk))
 if not abs_file_list:
- raise RuntimeError(
+ raise FileNotFoundError(
 "cannot find upload file " + os.path.join(local_job, kk)
 )
 rel_file_list = [
@@ -104,7 +104,7 @@ class LocalContext(BaseContext):
 
 for jj in file_list:
 if not os.path.exists(os.path.join(local_job, jj)):
- raise RuntimeError(
+ raise FileNotFoundError(
 "cannot find upload file " + os.path.join(local_job, jj)
 )
 if os.path.exists(os.path.join(remote_job, jj)):
@@ -119,7 +119,7 @@ class LocalContext(BaseContext):
 for kk in submission.forward_common_files:
 abs_file_list = glob(os.path.join(local_job, kk))
 if not abs_file_list:
- raise RuntimeError(
+ raise FileNotFoundError(
 "cannot find upload file " + os.path.join(local_job, kk)
 )
 rel_file_list = [
@@ -129,7 +129,7 @@ class LocalContext(BaseContext):
 
 for jj in file_list:
 if not os.path.exists(os.path.join(local_job, jj)):
- raise RuntimeError(
+ raise FileNotFoundError(
 "cannot find upload file " + os.path.join(local_job, jj)
 )
 if os.path.exists(os.path.join(remote_job, jj)):
@@ -160,7 +160,7 @@ class LocalContext(BaseContext):
 else:
 pass
 else:
- raise RuntimeError(
+ raise FileNotFoundError(
 "cannot find download file " + os.path.join(remote_job, kk)
 )
 rel_flist = [
@@ -188,7 +188,9 @@ class LocalContext(BaseContext):
 else:
 pass
 else:
- raise RuntimeError("do not find download file " + rfile)
+ raise FileNotFoundError(
+ "do not find download file " + rfile
+ )
 elif (not os.path.exists(rfile)) and (os.path.exists(lfile)):
 # already downloaded
 pass
@@ -206,7 +208,10 @@ class LocalContext(BaseContext):
 shutil.rmtree(lfile, ignore_errors=True)
 elif os.path.isfile(lfile) or os.path.islink(lfile):
 os.remove(lfile)
- shutil.copyfile(rfile, lfile)
+ if not os.path.islink(rfile):
+ shutil.move(rfile, lfile)
+ else:
+ shutil.copyfile(rfile, lfile)
 else:
 raise RuntimeError("should not reach here!")
 else:
@@ -229,7 +234,7 @@ class LocalContext(BaseContext):
 else:
 pass
 else:
- raise RuntimeError(
+ raise FileNotFoundError(
 "cannot find download file " + os.path.join(remote_job, kk)
 )
 rel_flist = [os.path.relpath(ii, start=remote_job) for ii in abs_flist_r]
@@ -255,7 +260,7 @@ class LocalContext(BaseContext):
 else:
 pass
 else:
- raise RuntimeError("do not find download file " + rfile)
+ raise FileNotFoundError("do not find download file " + rfile)
 elif (not os.path.exists(rfile)) and (os.path.exists(lfile)):
 # already downloaded
 pass
@@ -273,7 +278,10 @@ class LocalContext(BaseContext):
 shutil.rmtree(lfile, ignore_errors=True)
 elif os.path.isfile(lfile) or os.path.islink(lfile):
 os.remove(lfile)
- shutil.copyfile(rfile, lfile)
+ if not os.path.islink(rfile):
+ shutil.move(rfile, lfile)
+ else:
+ shutil.copyfile(rfile, lfile)
 else:
 raise RuntimeError("should not reach here!")
 else:
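
Note the behavioural change in the two download branches above: when the remote file is a regular file, LocalContext now moves it into place instead of copying it (on the same filesystem this is a cheap rename rather than a byte-for-byte copy), while symlinked remote files are still materialised with copyfile. A minimal standalone sketch of that decision, with a hypothetical helper name:

import os
import shutil


def fetch_back(rfile: str, lfile: str) -> None:
    # Hypothetical helper mirroring the branch above: move regular files,
    # but copy through symlinks so lfile receives the link target's contents.
    if not os.path.islink(rfile):
        shutil.move(rfile, lfile)
    else:
        shutil.copyfile(rfile, lfile)
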
dpdispatcher/contexts/ssh_context.py CHANGED
@@ -295,7 +295,7 @@ class SSHSession:
 assert self.ssh is not None
 try:
 return self.ssh.exec_command(cmd)
- except (paramiko.ssh_exception.SSHException, socket.timeout) as e:
+ except (paramiko.ssh_exception.SSHException, socket.timeout, EOFError) as e:
 # SSH session not active
 # retry for up to 3 times
 # ensure alive
@@ -569,7 +569,7 @@ class SSHContext(BaseContext):
 rel_file_list, work_path, file_list, directory_list
 )
 else:
- raise RuntimeError(f"cannot find upload file {work_path} {jj}")
+ raise FileNotFoundError(f"cannot find upload file {work_path} {jj}")
 
 def upload(
 self,
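
Across the HDFS, local, and SSH contexts above, missing upload or download files now raise FileNotFoundError (an OSError subclass) instead of RuntimeError, so downstream code that only caught RuntimeError around upload()/download() will no longer see these failures. A minimal sketch of the adjustment, assuming the usual BaseContext/Submission import paths and using a hypothetical helper name:

from dpdispatcher.base_context import BaseContext
from dpdispatcher.submission import Submission


def upload_or_report(context: BaseContext, submission: Submission) -> bool:
    # Missing forward files now surface as FileNotFoundError, not RuntimeError.
    try:
        context.upload(submission)
        return True
    except FileNotFoundError as err:
        print(f"missing forward file: {err}")
        return False
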
dpdispatcher/dpdisp.py CHANGED
@@ -54,6 +54,11 @@ def main_parser() -> argparse.ArgumentParser:
 action="store_true",
 help="Clean submission.",
 )
+ parser_submission_action.add_argument(
+ "--reset-fail-count",
+ action="store_true",
+ help="Reset fail count of all jobs to zero.",
+ )
 ##########################################
 # gui
 parser_gui = subparsers.add_parser(
@@ -105,6 +110,7 @@ def main():
 download_terminated_log=args.download_terminated_log,
 download_finished_task=args.download_finished_task,
 clean=args.clean,
+ reset_fail_count=args.reset_fail_count,
 )
 elif args.command == "gui":
 start_dpgui(
dpdispatcher/entrypoints/submission.py CHANGED
@@ -12,6 +12,7 @@ def handle_submission(
 download_terminated_log: bool = False,
 download_finished_task: bool = False,
 clean: bool = False,
+ reset_fail_count: bool = False,
 ):
 """Handle terminated submission.
 
@@ -25,13 +26,21 @@
 Download finished tasks.
 clean : bool, optional
 Clean submission.
+ reset_fail_count : bool, optional
+ Reset fail count of all jobs to zero.
 
 Raises
 ------
 ValueError
 At least one action should be specified.
 """
- if int(download_terminated_log) + int(download_finished_task) + int(clean) == 0:
+ if (
+ int(download_terminated_log)
+ + int(download_finished_task)
+ + int(clean)
+ + int(reset_fail_count)
+ == 0
+ ):
 raise ValueError("At least one action should be specified.")
 
 submission_file = record.get_submission(submission_hash)
@@ -42,7 +51,18 @@
 # TODO: for unclear reason, the submission_hash may be changed
 submission.submission_hash = submission_hash
 submission.machine.context.bind_submission(submission)
+ if reset_fail_count:
+ for job in submission.belonging_jobs:
+ job.fail_count = 0
+ # save to remote and local
+ submission.submission_to_json()
+ record.write(submission)
+ if int(download_terminated_log) + int(download_finished_task) + int(clean) == 0:
+ # if only reset_fail_count, no need to update submission state (expensive)
+ return
 submission.update_submission_state()
+ submission.submission_to_json()
+ record.write(submission)
 
 terminated_tasks = []
 finished_tasks = []
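
Together with the new --reset-fail-count flag wired up in dpdisp.py above, handle_submission() can now zero the fail count of every job in a recorded submission and, when no other action is requested, return early without the expensive state update. A minimal sketch of calling it directly, assuming the hash is passed as the submission_hash keyword (the value below is a placeholder):

from dpdispatcher.entrypoints.submission import handle_submission

# Roughly what `dpdisp submission <submission_hash> --reset-fail-count` does.
handle_submission(
    submission_hash="put-your-submission-hash-here",  # placeholder
    reset_fail_count=True,
)
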
dpdispatcher/machines/pbs.py CHANGED
@@ -177,3 +177,107 @@ class Torque(PBS):
 **pbs_script_header_dict
 )
 return pbs_script_header
+
+
+ sge_script_header_template = """
+ #!/bin/bash
+ #$ -N dpdispatcher_submit
+ {select_node_line}
+ #$ -cwd
+
+ """
+
+
+ class SGE(PBS):
+ def __init__(
+ self,
+ batch_type=None,
+ context_type=None,
+ local_root=None,
+ remote_root=None,
+ remote_profile={},
+ *,
+ context=None,
+ ):
+ super(PBS, self).__init__(
+ batch_type,
+ context_type,
+ local_root,
+ remote_root,
+ remote_profile,
+ context=context,
+ )
+
+ def gen_script_header(self, job):
+ resources = job.resources
+ sge_script_header_dict = {}
+ # resources.number_node is not used
+ sge_script_header_dict[
+ "select_node_line"
+ ] = f"#$ -pe mpi {resources.cpu_per_node} "
+ # resources.queue_name is not necessary
+ sge_script_header = sge_script_header_template.format(**sge_script_header_dict)
+ return sge_script_header
+
+ def do_submit(self, job):
+ script_file_name = job.script_file_name
+ script_str = self.gen_script(job)
+ job_id_name = job.job_hash + "_job_id"
+ self.context.write_file(fname=script_file_name, write_str=script_str)
+ script_file_dir = self.context.remote_root
+ stdin, stdout, stderr = self.context.block_checkcall(
+ "cd {} && {} {}".format(script_file_dir, "qsub", script_file_name)
+ )
+ subret = stdout.readlines()
+ job_id = subret[0].split()[2]
+ self.context.write_file(job_id_name, job_id)
+ return job_id
+
+ def default_resources(self, resources):
+ pass
+
+ def check_status(self, job):
+ job_id = job.job_id
+ status_line = None
+ if job_id == "":
+ return JobStatus.unsubmitted
+ ret, stdin, stdout, stderr = self.context.block_call("qstat")
+ err_str = stderr.read().decode("utf-8")
+ if ret != 0:
+ raise RuntimeError(
+ "status command qstat fails to execute. erro info: %s return code %d"
+ % (err_str, ret)
+ )
+ status_text_list = stdout.read().decode("utf-8").split("\n")
+ for txt in status_text_list:
+ if job_id in txt:
+ status_line = txt
+
+ if status_line is None:
+ count = 0
+ while count <= 6:
+ if self.check_finish_tag(job=job):
+ return JobStatus.finished
+ dlog.info(
+ "not tag_finished detected, execute sync command and wait. count "
+ + str(count)
+ )
+ self.context.block_call("sync")
+ import time
+
+ time.sleep(10)
+ count += 1
+ return JobStatus.terminated
+ else:
+ status_word = status_line.split()[4]
+ # dlog.info (status_word)
+ if status_word in ["qw"]:
+ return JobStatus.waiting
+ elif status_word in ["r"]:
+ return JobStatus.running
+ else:
+ return JobStatus.unknown
+
+ def check_finish_tag(self, job):
+ job_tag_finished = job.job_hash + "_job_tag_finished"
+ return self.context.check_file_exists(job_tag_finished)
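
The new SGE batch class reuses the PBS scaffolding but writes #$ header directives, submits with qsub, and maps qstat's qw/r states to waiting/running. A minimal sketch of selecting it from a machine dictionary, assuming the usual Machine.load_from_dict entry point and a local context (both paths are placeholders):

from dpdispatcher.machine import Machine

# "SGE" resolves to the batch class added above; the remaining keys mirror
# its __init__ signature. Both paths are placeholders.
machine = Machine.load_from_dict(
    {
        "batch_type": "SGE",
        "context_type": "LocalContext",
        "local_root": "./work",
        "remote_root": "/tmp/dpdispatcher_remote",
    }
)
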
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dpdispatcher
- Version: 0.6.2
+ Version: 0.6.4
 Summary: Generate HPC scheduler systems jobs input scripts, submit these scripts to HPC systems, and poke until they finish
 Author: DeepModeling
 License: GNU LESSER GENERAL PUBLIC LICENSE
@@ -1,31 +1,31 @@
 dpdispatcher/__init__.py,sha256=CLZP_N5CTp14ujWCykEHuJjoIfKR6CwrclXhjWUgNoE,517
 dpdispatcher/__main__.py,sha256=BFhG-mSBzVZUEezQJqXWZnt2WsnhAHT_zpT8Y6gpOz0,116
- dpdispatcher/_version.py,sha256=PMhMumGW6FY6KtkQm1bSNdEAeyrHlBqpSQ7WpjsTyws,411
+ dpdispatcher/_version.py,sha256=YeKWh9qHxBSK-fURNyCejbICk3LuDCjfwlZrUuEgWgM,411
 dpdispatcher/arginfo.py,sha256=pNaxYIE6ahBidpR7OCKZdw8iGt003uTXGSlVzwiuvRg,188
 dpdispatcher/base_context.py,sha256=NvaC_RHyspxq412z-eCq4Zn8-szZxvn8K6OkXvx7l4Y,3615
 dpdispatcher/dlog.py,sha256=ndh12teQBbJRybXd8UjEmAi6QTsAXajRicDj5mAH5h0,799
- dpdispatcher/dpdisp.py,sha256=YuGb-HWLsDfSO2c7GH0eM20ciojGbx3yq9oZHP7u4yc,3498
+ dpdispatcher/dpdisp.py,sha256=p-f_KCJxALBqP4StpNK2X_-VkGZ0L43MhROxhCpFwE4,3712
 dpdispatcher/machine.py,sha256=XFRH41gNCex_qs9gbg-S88_qab3_UAGfxKWUPxoipCM,16140
 dpdispatcher/submission.py,sha256=mVAHBlT0a3_1PtsEvvhvwNPkAhgLiBXXemX64BcwizU,48447
 dpdispatcher/contexts/__init__.py,sha256=s5M0ZJSrPttSyLdBwKD2m3W7a5AbYZdPB7IAND2j7EY,335
 dpdispatcher/contexts/dp_cloud_server_context.py,sha256=6XK0B2sLGEDeZmV2SZzQdVrMcWAWYZVLLK-IaShEXIY,12245
- dpdispatcher/contexts/hdfs_context.py,sha256=GJs_vmDCjTsnbfTdXpFTfpWTYXnZTDkEO2UJIdpV5F4,8908
+ dpdispatcher/contexts/hdfs_context.py,sha256=GbV_o3i0NL43B7dCPnArXS5DPkkET4EAiHw1VgsMcdE,9000
 dpdispatcher/contexts/lazy_local_context.py,sha256=F8abWAJRY1Ewx1sErINKN1ltWerXzeCcJgjTvLvucKE,5696
- dpdispatcher/contexts/local_context.py,sha256=7CoGzcX-RU6cpmSYcf4wMwncYaFVUb8Ljj4ksfXcx4s,13678
+ dpdispatcher/contexts/local_context.py,sha256=vhZtdtduPokw6hU0YbaWNuoCOO6Tio2w99Fi8AegIw0,14052
 dpdispatcher/contexts/openapi_context.py,sha256=DXaMS10SXN3VKEeEdzQyfOgRwUyHRJVCJHd2fKKdsmA,9499
- dpdispatcher/contexts/ssh_context.py,sha256=1UbMIFpSGBcg3H1My4cx5vjAALvaxlZxWOcXwoX6Ff0,38597
+ dpdispatcher/contexts/ssh_context.py,sha256=zhBM_qH4zGMws7Yww5txNLhkK7b3maiFEMoewuAJou0,38612
 dpdispatcher/dpcloudserver/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dpdispatcher/dpcloudserver/client.py,sha256=k1niKjG6zFnMtHn_UuCjYoOcMju3o3PV-GdyVLr5-KM,165
 dpdispatcher/entrypoints/__init__.py,sha256=exKSFT3j2oCerGwtI8WbHQK-D0K-CyifocRji1xntT4,20
 dpdispatcher/entrypoints/gui.py,sha256=29lMXqbmSRbLj4rfBv7Jnw89NLU9syTB88IUP6IRJsU,830
- dpdispatcher/entrypoints/submission.py,sha256=R2DXUGXTsk0Jw2y5Wjby40R5dxzXeqr4gn33ov6mdAI,2751
+ dpdispatcher/entrypoints/submission.py,sha256=ikVwIZAQL0SsYO5xaMIdKXgO6qtc05w1vqmvtG7Nk5M,3401
 dpdispatcher/machines/__init__.py,sha256=9kSYkz2w3flp00IrHWTEwvoFGrathQAT3tvbieye83c,335
 dpdispatcher/machines/distributed_shell.py,sha256=7avNcoOzEj7UcJuKl6b1ka2bj5dixcJaMlZK-I-i_Tc,7571
 dpdispatcher/machines/dp_cloud_server.py,sha256=SR69gsFb2BvOQCW1QnWfP3cQvu_qHLJNsycp5wzosJU,11706
 dpdispatcher/machines/fugaku.py,sha256=9OP3qSaaruqypHAdcuBFQM_MUtFp3yrvhZ5bPyLwEEk,4308
 dpdispatcher/machines/lsf.py,sha256=Qruot39cPEpBNbbPmDwb1Gyfgyw3N36O0hs9PNEXyVU,7997
 dpdispatcher/machines/openapi.py,sha256=Gzzbo8YOAybXGTrgMutexErcaEi3ts7uTUNvOhThFS8,8858
- dpdispatcher/machines/pbs.py,sha256=HGBUf96AJ7hWOQPrENP5tFIDnEm4cb9deqJJ8wExbms,7079
+ dpdispatcher/machines/pbs.py,sha256=YK0rKIsd8GAZYfxGLezA9RdCQ6AOeuPML_v5OwmioTo,10331
 dpdispatcher/machines/shell.py,sha256=qaia7mC_fz5Bqyelxmc1je-xg7NQ_6vQQ0qAjg2m4RQ,4796
 dpdispatcher/machines/slurm.py,sha256=SP5rQiCPWzq4rqgUgp0IGJXXD_1DURWl4OBRAJ-Kng4,15611
 dpdispatcher/utils/__init__.py,sha256=fwvwkMf7DFNQkNBiIce8Y8gRA6FhICwKjkKiXu_BEJg,13
@@ -38,9 +38,9 @@ dpdispatcher/utils/dpcloudserver/client.py,sha256=CLfXswvzI4inDrW2bYkfMQ6gQJFcZO
 dpdispatcher/utils/dpcloudserver/config.py,sha256=NteQzf1OeEkz2UbkXHHQ0B72cUu23zLVzpM9Yh4v1Cc,559
 dpdispatcher/utils/dpcloudserver/retcode.py,sha256=1qAF8gFZx55u2sO8KbtYSIIrjcO-IGufEUlwbkSfC1g,721
 dpdispatcher/utils/dpcloudserver/zip_file.py,sha256=f9WrlktwHW0YipaWg5Y0kxjMZlhD1cJYa6EUpvu4Cro,2611
- dpdispatcher-0.6.2.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
- dpdispatcher-0.6.2.dist-info/METADATA,sha256=1JRjYaFcAF0_NBxNInlFOM2YmpbBKp7P5EAnvX99f-4,12752
- dpdispatcher-0.6.2.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
- dpdispatcher-0.6.2.dist-info/entry_points.txt,sha256=NRHUV0IU_u7_XtcmmEDnVzAcUmurhiEAGwENckrajo4,233
- dpdispatcher-0.6.2.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
- dpdispatcher-0.6.2.dist-info/RECORD,,
+ dpdispatcher-0.6.4.dist-info/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
+ dpdispatcher-0.6.4.dist-info/METADATA,sha256=9NoY3DjM_USxl-2nBiNNLMCzJ6DZadgEMS1e11Fj72c,12752
+ dpdispatcher-0.6.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+ dpdispatcher-0.6.4.dist-info/entry_points.txt,sha256=NRHUV0IU_u7_XtcmmEDnVzAcUmurhiEAGwENckrajo4,233
+ dpdispatcher-0.6.4.dist-info/top_level.txt,sha256=35jAQoXY-b-e9fJ1_mxhZUiaCoJNt1ZI7mpFRf07Qjs,13
+ dpdispatcher-0.6.4.dist-info/RECORD,,
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
- Generator: bdist_wheel (0.41.3)
+ Generator: bdist_wheel (0.42.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 