dpdispatcher 0.6.6__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
dpdispatcher/machine.py CHANGED
@@ -161,6 +161,9 @@ class Machine(metaclass=ABCMeta):
             machine_dict["remote_profile"] = self.context.remote_profile
         else:
             machine_dict["remote_profile"] = {}
+        # normalize the dict
+        base = self.arginfo()
+        machine_dict = base.normalize_value(machine_dict, trim_pattern="_*")
         return machine_dict
 
     def __eq__(self, other):
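
The added normalization relies on dargs: `self.arginfo()` returns an `Argument` schema for the machine dict, and `normalize_value(..., trim_pattern="_*")` fills in defaults and drops keys matching the pattern (by convention, underscore-prefixed comment keys). A minimal sketch of that behaviour with a made-up two-field schema (not dpdispatcher's actual machine schema):

    from dargs import Argument

    # hypothetical schema, for illustration only
    base = Argument(
        "machine",
        dict,
        [
            Argument("batch_type", str),
            Argument("clean_asynchronously", bool, optional=True, default=False),
        ],
    )
    raw = {"batch_type": "Shell", "_comment": "dropped by trim_pattern"}
    normalized = base.normalize_value(raw, trim_pattern="_*")
    # -> {"batch_type": "Shell", "clean_asynchronously": False}
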
@@ -224,7 +227,7 @@ class Machine(metaclass=ABCMeta):
         return if_recover
 
     @abstractmethod
-    def check_finish_tag(self, **kwargs):
+    def check_finish_tag(self, job):
         raise NotImplementedError(
             "abstract method check_finish_tag should be implemented by derived class"
         )
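
The abstract method now receives the job object explicitly instead of `**kwargs`. For illustration, a derived machine could implement it roughly as below; this is a sketch following the finish-tag-file pattern used by the shell-style machines, and the tag file name is an assumption:

    def check_finish_tag(self, job):
        # hypothetical: look for the per-job finish tag written by the run script
        job_tag_finished = job.job_hash + "_job_tag_finished"
        return self.context.check_file_exists(job_tag_finished)
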
@@ -265,6 +268,15 @@ class Machine(metaclass=ABCMeta):
 
         export_envs_part = ""
         envs = job.resources.envs
+        envs = {
+            # export resources information to the environment variables
+            "DPDISPATCHER_NUMBER_NODE": job.resources.number_node,
+            "DPDISPATCHER_CPU_PER_NODE": job.resources.cpu_per_node,
+            "DPDISPATCHER_GPU_PER_NODE": job.resources.gpu_per_node,
+            "DPDISPATCHER_QUEUE_NAME": job.resources.queue_name,
+            "DPDISPATCHER_GROUP_SIZE": job.resources.group_size,
+            **envs,
+        }
         for k, v in envs.items():
             if isinstance(v, list):
                 for each_value in v:
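
With this change the generated job script exports the resource settings as `DPDISPATCHER_*` environment variables, so a running task can inspect them. A minimal illustrative sketch of a task-side script (not part of the package):

    import os

    # values exported by the generated job script; the fallbacks are placeholders
    number_node = int(os.environ.get("DPDISPATCHER_NUMBER_NODE", "1"))
    cpu_per_node = int(os.environ.get("DPDISPATCHER_CPU_PER_NODE", "1"))
    gpu_per_node = int(os.environ.get("DPDISPATCHER_GPU_PER_NODE", "0"))
    print(f"{number_node} node(s), {cpu_per_node} CPU(s), {gpu_per_node} GPU(s) per node")
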
@@ -39,7 +39,7 @@ class JH_UniScheduler(Machine):
         custom_gpu_line = resources.kwargs.get("custom_gpu_line", None)
         if not custom_gpu_line:
             script_header_dict["JH_UniScheduler_number_gpu_line"] = (
-                "" f"#JSUB -gpgpu {resources.gpu_per_node}"
+                f"#JSUB -gpgpu {resources.gpu_per_node}"
             )
         else:
             script_header_dict["JH_UniScheduler_number_gpu_line"] = custom_gpu_line
@@ -84,9 +84,6 @@ class JH_UniScheduler(Machine):
         self.context.write_file(job_id_name, job_id)
         return job_id
 
-    def default_resources(self, resources):
-        pass
-
     @retry()
     def check_status(self, job):
         try:
@@ -105,8 +102,7 @@ class JH_UniScheduler(Machine):
         elif ret != 0:
             # just retry when any unknown error raised.
             raise RetrySignal(
-                "Get error code %d in checking status through ssh with job: %s . message: %s"
-                % (ret, job.job_hash, err_str)
+                f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
             )
         status_out = stdout.read().decode("utf-8").split("\n")
         if len(status_out) < 2:
@@ -181,8 +181,7 @@ class DistributedShell(Machine):
         if ret != 0:
             err_str = stderr.decode("utf-8")
             raise RuntimeError(
-                "Command squeue fails to execute, error message:%s\nreturn code %d\n"
-                % (err_str, ret)
+                f"Command {cmd} fails to execute, error message:{err_str}\nreturn code {ret}\n"
             )
         job_id = int(stdout.decode("utf-8").strip())
 
@@ -200,8 +199,7 @@ class DistributedShell(Machine):
         if ret != 0:
             err_str = stderr.decode("utf-8")
             raise RuntimeError(
-                "Command fails to execute, error message:%s\nreturn code %d\n"
-                % (err_str, ret)
+                f"Command fails to execute, error message:{err_str}\nreturn code {ret}\n"
             )
 
         if_job_exists = bool(stdout.decode("utf-8").strip())
@@ -67,9 +67,6 @@ class Fugaku(Machine):
         self.context.write_file(job_id_name, job_id)
         return job_id
 
-    def default_resources(self, resources):
-        pass
-
     def check_status(self, job):
         job_id = job.job_id
         if job_id == "":
@@ -102,9 +102,6 @@ class LSF(Machine):
         return job_id
 
     # TODO: derive abstract methods
-    def default_resources(self, resources):
-        pass
-
     def sub_script_cmd(self, res):
         pass
 
@@ -129,8 +126,7 @@ class LSF(Machine):
         elif ret != 0:
             # just retry when any unknown error raised.
             raise RetrySignal(
-                "Get error code %d in checking status through ssh with job: %s . message: %s"
-                % (ret, job.job_hash, err_str)
+                f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
             )
         status_out = stdout.read().decode("utf-8").split("\n")
         if len(status_out) < 2:
@@ -1,14 +1,13 @@
 import os
 import shutil
 import time
+from zipfile import ZipFile
 
 from dpdispatcher.utils.utils import customized_script_header_template
 
 try:
-    from bohriumsdk.client import Client
-    from bohriumsdk.job import Job
-    from bohriumsdk.storage import Storage
-    from bohriumsdk.util import Util
+    from bohrium import Bohrium
+    from bohrium.resources import Job, Tiefblue
 except ModuleNotFoundError:
     found_bohriumsdk = False
 else:
@@ -23,6 +22,12 @@ shell_script_header_template = """
 """
 
 
+def unzip_file(zip_file, out_dir="./"):
+    obj = ZipFile(zip_file, "r")
+    for item in obj.namelist():
+        obj.extract(item, out_dir)
+
+
 class OpenAPI(Machine):
     def __init__(self, context):
         if not found_bohriumsdk:
@@ -35,9 +40,35 @@ class OpenAPI(Machine):
         self.grouped = self.remote_profile.get("grouped", True)
         self.retry_count = self.remote_profile.get("retry_count", 3)
         self.ignore_exit_code = context.remote_profile.get("ignore_exit_code", True)
-        self.client = Client()
-        self.job = Job(client=self.client)
-        self.storage = Storage(client=self.client)
+
+        access_key = (
+            self.remote_profile.get("access_key", None)
+            or os.getenv("BOHRIUM_ACCESS_KEY", None)
+            or os.getenv("ACCESS_KEY", None)
+        )
+        project_id = (
+            self.remote_profile.get("project_id", None)
+            or os.getenv("BOHRIUM_PROJECT_ID", None)
+            or os.getenv("PROJECT_ID", None)
+        )
+        app_key = (
+            self.remote_profile.get("app_key", None)
+            or os.getenv("BOHRIUM_APP_KEY", None)
+            or os.getenv("APP_KEY", None)
+        )
+        if access_key is None:
+            raise ValueError(
+                "remote_profile must contain 'access_key' or set environment variable 'BOHRIUM_ACCESS_KEY'"
+            )
+        if project_id is None:
+            raise ValueError(
+                "remote_profile must contain 'project_id' or set environment variable 'BOHRIUM_PROJECT_ID'"
+            )
+        self.client = Bohrium(  # type: ignore[reportPossiblyUnboundVariable]
+            access_key=access_key, project_id=project_id, app_key=app_key
+        )
+        self.storage = Tiefblue()  # type: ignore[reportPossiblyUnboundVariable]
+        self.job = Job(client=self.client)  # type: ignore[reportPossiblyUnboundVariable]
         self.group_id = None
 
     def gen_script(self, job):
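
Credentials are now taken from `remote_profile` or from environment variables (`BOHRIUM_ACCESS_KEY`, `BOHRIUM_PROJECT_ID`, `BOHRIUM_APP_KEY`). A hedged sketch of a matching machine configuration; the `batch_type`/`context_type` values and the placeholder credentials are assumptions for illustration:

    machine_config = {
        "batch_type": "OpenAPI",           # assumed registered name of this machine
        "context_type": "OpenAPIContext",  # assumed matching context
        "remote_profile": {
            "access_key": "<bohrium-access-key>",  # or export BOHRIUM_ACCESS_KEY
            "project_id": 12345,                   # or export BOHRIUM_PROJECT_ID
            "app_key": "<optional-app-key>",       # or export BOHRIUM_APP_KEY
        },
    }
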
@@ -98,11 +129,13 @@ class OpenAPI(Machine):
             ),
             "out_files": self._gen_backward_files_list(job),
             "platform": self.remote_profile.get("platform", "ali"),
-            "image_address": self.remote_profile.get("image_address", ""),
+            "image_name": self.remote_profile.get("image_address", ""),
         }
-        if job.job_state == JobStatus.unsubmitted:
-            openapi_params["job_id"] = job.job_id
-
+        if "real_user_id" in self.remote_profile:
+            openapi_params["real_user_id"] = self.remote_profile["real_user_id"]
+        if "session_id" in self.remote_profile:
+            openapi_params["session_id"] = self.remote_profile["session_id"]
+        openapi_params["job_id"] = job.job_id
         data = self.job.insert(**openapi_params)
 
         job.job_id = data.get("jobId", 0)  # type: ignore
@@ -152,8 +185,8 @@ class OpenAPI(Machine):
             self.ignore_exit_code,
         )
         if job_state == JobStatus.finished:
-            job_log = self.job.log(job_id)
             if self.remote_profile.get("output_log"):
+                job_log = self.job.log(job_id)
                 print(job_log, end="")
             self._download_job(job)
         elif self.remote_profile.get("output_log") and job_state == JobStatus.running:
@@ -163,14 +196,14 @@ class OpenAPI(Machine):
 
     def _download_job(self, job):
         data = self.job.detail(job.job_id)
-        job_url = data["jobFiles"]["outFiles"][0]["url"]  # type: ignore
+        job_url = data["resultUrl"]  # type: ignore
         if not job_url:
             return
         job_hash = job.job_hash
         result_filename = job_hash + "_back.zip"
         target_result_zip = os.path.join(self.context.local_root, result_filename)
         self.storage.download_from_url(job_url, target_result_zip)
-        Util.unzip_file(target_result_zip, out_dir=self.context.local_root)
+        unzip_file(target_result_zip, out_dir=self.context.local_root)
         try:
             os.makedirs(os.path.join(self.context.local_root, "backup"), exist_ok=True)
             shutil.move(
@@ -213,7 +246,7 @@ class OpenAPI(Machine):
         if status not in map_dict:
             dlog.error(f"unknown job status {status}")
             return JobStatus.unknown
-        if status == -1 and exit_code != 0 and ignore_exit_code:
+        if status == -1 and ignore_exit_code:
             return JobStatus.finished
         return map_dict[status]
 
@@ -69,14 +69,12 @@ class PBS(Machine):
         self.context.write_file(job_id_name, job_id)
         return job_id
 
-    def default_resources(self, resources):
-        pass
-
     def check_status(self, job):
         job_id = job.job_id
         if job_id == "":
             return JobStatus.unsubmitted
-        ret, stdin, stdout, stderr = self.context.block_call("qstat -x " + job_id)
+        command = "qstat -x " + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         err_str = stderr.read().decode("utf-8")
         if ret != 0:
             if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
@@ -86,8 +84,7 @@ class PBS(Machine):
                 return JobStatus.terminated
             else:
                 raise RuntimeError(
-                    "status command qstat fails to execute. erro info: %s return code %d"
-                    % (err_str, ret)
+                    f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
                 )
         status_line = stdout.read().decode("utf-8").split("\n")[-2]
         status_word = status_line.split()[-2]
@@ -126,7 +123,8 @@ class Torque(PBS):
         job_id = job.job_id
         if job_id == "":
             return JobStatus.unsubmitted
-        ret, stdin, stdout, stderr = self.context.block_call("qstat -l " + job_id)
+        command = "qstat -l " + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         err_str = stderr.read().decode("utf-8")
         if ret != 0:
             if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
@@ -136,8 +134,7 @@ class Torque(PBS):
                 return JobStatus.terminated
             else:
                 raise RuntimeError(
-                    "status command qstat fails to execute. erro info: %s return code %d"
-                    % (err_str, ret)
+                    f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
                 )
         status_line = stdout.read().decode("utf-8").split("\n")[-2]
         status_word = status_line.split()[-2]
@@ -255,19 +252,18 @@ class SGE(PBS):
         self.context.write_file(job_id_name, job_id)
         return job_id
 
-    def default_resources(self, resources):
-        pass
-
     def check_status(self, job):
+        ### https://softpanorama.org/HPC/Grid_engine/Queues/queue_states.shtml
         job_id = job.job_id
         status_line = None
         if job_id == "":
             return JobStatus.unsubmitted
-        ret, stdin, stdout, stderr = self.context.block_call("qstat")
+        command = "qstat"
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         err_str = stderr.read().decode("utf-8")
         if ret != 0:
             raise RuntimeError(
-                f"status command qstat fails to execute. erro info: {err_str} return code {ret}"
+                f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
             )
         status_text_list = stdout.read().decode("utf-8").split("\n")
         for txt in status_text_list:
@@ -291,10 +287,12 @@ class SGE(PBS):
         else:
             status_word = status_line.split()[4]
         # dlog.info (status_word)
-        if status_word in ["qw"]:
+        if status_word in ["qw", "hqw", "t"]:
             return JobStatus.waiting
-        elif status_word in ["r"]:
+        elif status_word in ["r", "Rr"]:
             return JobStatus.running
+        elif status_word in ["Eqw", "dr", "dt"]:
+            return JobStatus.terminated
         else:
             return JobStatus.unknown
 
@@ -38,14 +38,12 @@ class Shell(Machine):
         script_run_str = self.gen_script_command(job)
         script_run_file_name = f"{job.script_file_name}.run"
         self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
-        ret, stdin, stdout, stderr = self.context.block_call(
-            f"cd {shlex.quote(self.context.remote_root)} && {{ nohup bash {script_file_name} 1>>{output_name} 2>>{output_name} & }} && echo $!"
-        )
+        cmd = f"cd {shlex.quote(self.context.remote_root)} && {{ nohup bash {script_file_name} 1>>{output_name} 2>>{output_name} & }} && echo $!"
+        ret, stdin, stdout, stderr = self.context.block_call(cmd)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             raise RuntimeError(
-                "status command squeue fails to execute\nerror message:%s\nreturn code %d\n"
-                % (err_str, ret)
+                f"status command {cmd} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
             )
         job_id = int(stdout.read().decode("utf-8").strip())
         self.context.write_file(job_id_name, str(job_id))
@@ -62,9 +60,6 @@ class Shell(Machine):
         # self.context.write_file(job_id_name, job_id)
         # return job_id
 
-    def default_resources(self, resources):
-        pass
-
     def check_status(self, job):
         job_id = job.job_id
         # print('shell.check_status.job_id', job_id)
@@ -73,14 +68,15 @@ class Shell(Machine):
             return JobStatus.unsubmitted
 
         # mark defunct process as terminated
-        ret, stdin, stdout, stderr = self.context.block_call(
+        cmd = (
+            r"""command -v ps >/dev/null 2>&1 || { echo >&2 "I require ps but it's not installed. Aborting."; exit 1; };"""
             f"if ps -p {job_id} > /dev/null && ! (ps -o command -p {job_id} | grep defunct >/dev/null) ; then echo 1; fi"
         )
+        ret, stdin, stdout, stderr = self.context.block_call(cmd)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             raise RuntimeError(
-                "status command squeue fails to execute\nerror message:%s\nreturn code %d\n"
-                % (err_str, ret)
+                f"status command {cmd} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
            )
 
         if_job_exists = bool(stdout.read().decode("utf-8").strip())
@@ -83,13 +83,12 @@ class Slurm(Machine):
         script_run_file_name = f"{job.script_file_name}.run"
         self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
         # self.context.write_file(fname=os.path.join(self.context.submission.work_base, script_file_name), write_str=script_str)
-        ret, stdin, stdout, stderr = self.context.block_call(
-            "cd {} && {} {}".format(
-                shlex.quote(self.context.remote_root),
-                "sbatch",
-                shlex.quote(script_file_name),
-            )
+        command = "cd {} && {} {}".format(
+            shlex.quote(self.context.remote_root),
+            "sbatch --parsable",
+            shlex.quote(script_file_name),
         )
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             if (
@@ -98,8 +97,7 @@ class Slurm(Machine):
             ):
                 # server network error, retry 3 times
                 raise RetrySignal(
-                    "Get error code %d in submitting through ssh with job: %s . message: %s"
-                    % (ret, job.job_hash, err_str)
+                    f"Get error code {ret} in submitting with job: {job.job_hash} . message: {err_str}"
                 )
             elif (
                 "Job violates accounting/QOS policy" in err_str
@@ -110,8 +108,7 @@ class Slurm(Machine):
                 # job number exceeds, skip the submitting
                 return ""
             raise RuntimeError(
-                "status command squeue fails to execute\nerror message:%s\nreturn code %d\n"
-                % (err_str, ret)
+                f"command {command} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
             )
         subret = stdout.readlines()
         # --parsable
@@ -121,17 +118,13 @@ class Slurm(Machine):
         self.context.write_file(job_id_name, job_id)
         return job_id
 
-    def default_resources(self, resources):
-        pass
-
     @retry()
     def check_status(self, job):
         job_id = job.job_id
         if job_id == "":
             return JobStatus.unsubmitted
-        ret, stdin, stdout, stderr = self.context.block_call(
-            'squeue -o "%.18i %.2t" -j ' + job_id
-        )
+        command = 'squeue -o "%.18i %.2t" -j ' + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             if "Invalid job id specified" in err_str:
@@ -147,13 +140,11 @@ class Slurm(Machine):
             ):
                 # retry 3 times
                 raise RetrySignal(
-                    "Get error code %d in checking status through ssh with job: %s . message: %s"
-                    % (ret, job.job_hash, err_str)
+                    f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
                 )
             raise RuntimeError(
-                "status command squeue fails to execute."
-                "job_id:%s \n error message:%s\n return code %d\n"
-                % (job_id, err_str, ret)
+                f"status command {command} fails to execute."
+                f"job_id:{job_id} \n error message:{err_str}\n return code {ret}\n"
             )
         status_line = stdout.read().decode("utf-8").split("\n")[-2]
         status_word = status_line.split()[-1]
@@ -257,7 +248,7 @@ class SlurmJobArray(Slurm):
             return super().gen_script_header(job) + "\n#SBATCH --array={}".format(
                 ",".join(map(str, job_array))
             )
-        return super().gen_script_header(job) + "\n#SBATCH --array=0-%d" % (
+        return super().gen_script_header(job) + "\n#SBATCH --array=0-%s" % (
             math.ceil(len(job.job_task_list) / slurm_job_size) - 1
         )
 
@@ -319,9 +310,8 @@ class SlurmJobArray(Slurm):
         job_id = job.job_id
         if job_id == "":
             return JobStatus.unsubmitted
-        ret, stdin, stdout, stderr = self.context.block_call(
-            'squeue -h -o "%.18i %.2t" -j ' + job_id
-        )
+        command = 'squeue -h -o "%.18i %.2t" -j ' + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             if "Invalid job id specified" in err_str:
@@ -336,13 +326,11 @@ class SlurmJobArray(Slurm):
             ):
                 # retry 3 times
                 raise RetrySignal(
-                    "Get error code %d in checking status through ssh with job: %s . message: %s"
-                    % (ret, job.job_hash, err_str)
+                    f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
                 )
             raise RuntimeError(
-                "status command squeue fails to execute."
-                "job_id:%s \n error message:%s\n return code %d\n"
-                % (job_id, err_str, ret)
+                f"status command {command} fails to execute."
+                f"job_id:{job_id} \n error message:{err_str}\n return code {ret}\n"
             )
         status_lines = stdout.read().decode("utf-8").split("\n")[:-1]
         status = []
@@ -55,7 +55,6 @@ class Submission:
         *,
         task_list=[],
     ):
-        # self.submission_list = submission_list
         self.local_root = None
         self.work_base = work_base
         self._abs_work_base = os.path.abspath(work_base)
@@ -324,8 +323,7 @@ class Submission:
         kwargs = {**{"clean": False}, **kwargs}
         if kwargs["clean"]:
             dlog.warning(
-                "Using async submission with `clean=True`, "
-                "job may fail in queue system"
+                "Using async submission with `clean=True`, job may fail in queue system"
             )
         loop = asyncio.get_event_loop()
         wrapped_submission = functools.partial(self.run_submission, **kwargs)
@@ -515,12 +513,9 @@ class Submission:
     def submission_from_json(cls, json_file_name="submission.json"):
         with open(json_file_name) as f:
             submission_dict = json.load(f)
-        # submission_dict = machine.context.read_file(json_file_name)
         submission = cls.deserialize(submission_dict=submission_dict, machine=None)
         return submission
 
-    # def check_if_recover()
-
     def try_recover_from_json(self):
         submission_file_name = f"{self.submission_hash}.json"
         if_recover = self.machine.context.check_file_exists(submission_file_name)
@@ -545,7 +540,6 @@ class Submission:
                     f"machine.context.remote_root:{self.machine.context.remote_root}; "
                     f"submission.work_base:{submission.work_base};"
                 )
-                # self = submission.bind_machine(machine=self.machine)
             else:
                 print(self.serialize())
                 print(submission.serialize())
@@ -759,7 +753,6 @@ class Job:
         self.fail_count = 0
         self.job_uuid = uuid.uuid4()
 
-        # self.job_hash = self.get_hash()
         self.job_hash = self.get_hash()
         self.script_file_name = self.job_hash + ".sub"
 
@@ -1122,9 +1115,9 @@ class Resources:
 
     @staticmethod
     def arginfo(detail_kwargs=True):
-        doc_number_node = "The number of node need for each `job`"
-        doc_cpu_per_node = "cpu numbers of each node assigned to each job."
-        doc_gpu_per_node = "gpu numbers of each node assigned to each job."
+        doc_number_node = "The number of nodes required for each `job`."
+        doc_cpu_per_node = "CPU numbers of each node assigned to each job."
+        doc_gpu_per_node = "GPU numbers of each node assigned to each job."
         doc_queue_name = "The queue name of batch job scheduler system."
         doc_group_size = "The number of `tasks` in a `job`. 0 means infinity."
         doc_custom_flags = "The extra lines pass to job submitting script header"
@@ -142,10 +142,10 @@ class Client:
         res = self.get("/data/get_sts_token", {})
         # print('debug>>>>>>>>>>>>>', res)
         dlog.debug(f"debug: _get_oss_bucket: res:{res}")
-        auth = oss2.StsAuth(
+        auth = oss2.StsAuth(  # type: ignore[reportPossiblyUnboundVariable]
             res["AccessKeyId"], res["AccessKeySecret"], res["SecurityToken"]
         )
-        return oss2.Bucket(auth, endpoint, bucket_name)
+        return oss2.Bucket(auth, endpoint, bucket_name)  # type: ignore[reportPossiblyUnboundVariable]
 
     def download(self, oss_file, save_file, endpoint, bucket_name):
         bucket = self._get_oss_bucket(endpoint, bucket_name)
@@ -184,7 +184,7 @@ class Client:
         )
         bucket = self._get_oss_bucket(endpoint, bucket_name)
         total_size = os.path.getsize(zip_task_file)
-        part_size = determine_part_size(total_size, preferred_size=1000 * 1024)
+        part_size = determine_part_size(total_size, preferred_size=1000 * 1024)  # type: ignore[reportPossiblyUnboundVariable]
         upload_id = bucket.init_multipart_upload(oss_task_zip).upload_id
         parts = []
         with open(zip_task_file, "rb") as fileobj:
@@ -196,9 +196,9 @@ class Client:
                     oss_task_zip,
                     upload_id,
                     part_number,
-                    SizedFileAdapter(fileobj, num_to_upload),
+                    SizedFileAdapter(fileobj, num_to_upload),  # type: ignore[reportPossiblyUnboundVariable]
                 )
-                parts.append(PartInfo(part_number, result.etag))
+                parts.append(PartInfo(part_number, result.etag))  # type: ignore[reportPossiblyUnboundVariable]
                 offset += num_to_upload
                 part_number += 1
         # result = bucket.complete_multipart_upload(oss_task_zip, upload_id, parts)
@@ -278,7 +278,11 @@ class Client:
             return ""
         resp = requests.get(url, headers={"Range": f"bytes={self.last_log_offset}-"})
         self.last_log_offset += len(resp.content)
-        return resp.content.decode("utf-8")
+        try:
+            return resp.content.decode("utf-8")
+        except Exception as e:
+            dlog.error(f"Error decoding job log: {e}", stack_info=ENABLE_STACK)
+            return ""
 
     def _get_job_log(self, job_id):
         ret = self.get(
@@ -28,7 +28,7 @@ class HDFS:
             )
         except Exception as e:
             raise RuntimeError(
-                f"Cannot check existence of hdfs uri[{uri}] " f"with cmd[{cmd}]"
+                f"Cannot check existence of hdfs uri[{uri}] with cmd[{cmd}]"
             ) from e
 
     @staticmethod
@@ -48,9 +48,7 @@ class HDFS:
                     f"with cmd[{cmd}]; ret[{ret}] output[{out}] stderr[{err}]"
                 )
         except Exception as e:
-            raise RuntimeError(
-                f"Cannot remove hdfs uri[{uri}] " f"with cmd[{cmd}]"
-            ) from e
+            raise RuntimeError(f"Cannot remove hdfs uri[{uri}] with cmd[{cmd}]") from e
 
     @staticmethod
     def mkdir(uri):
@@ -70,7 +68,7 @@ class HDFS:
             )
         except Exception as e:
             raise RuntimeError(
-                f"Cannot mkdir of hdfs uri[{uri}] " f"with cmd[{cmd}]"
+                f"Cannot mkdir of hdfs uri[{uri}] with cmd[{cmd}]"
             ) from e
 
     @staticmethod
@@ -80,7 +78,7 @@ class HDFS:
         """
         # Make sure local_path is accessible
         if not os.path.exists(local_path) or not os.access(local_path, os.R_OK):
-            raise RuntimeError(f"try to access local_path[{local_path}] " "but failed")
+            raise RuntimeError(f"try to access local_path[{local_path}] but failed")
         cmd = f"hadoop fs -copyFromLocal -f {local_path} {to_uri}"
         try:
             ret, out, err = run_cmd_with_all_output(cmd)
@@ -132,9 +130,7 @@ class HDFS:
                     f"cmd [{cmd}] ret[{ret}] output[{out}] stderr[{err}]"
                 )
         except Exception as e:
-            raise RuntimeError(
-                f"Cannot read text from uri[{uri}]" f"cmd [{cmd}]"
-            ) from e
+            raise RuntimeError(f"Cannot read text from uri[{uri}]cmd [{cmd}]") from e
 
     @staticmethod
     def move(from_uri, to_uri):
@@ -151,6 +147,5 @@ class HDFS:
             )
         except Exception as e:
             raise RuntimeError(
-                f"Cannot move from_uri[{from_uri}] to "
-                f"to_uri[{to_uri}] with cmd[{cmd}]"
+                f"Cannot move from_uri[{from_uri}] to to_uri[{to_uri}] with cmd[{cmd}]"
             ) from e