dpdispatcher 0.6.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. dpdispatcher/_version.py +22 -4
  2. dpdispatcher/base_context.py +60 -1
  3. dpdispatcher/contexts/__init__.py +1 -0
  4. dpdispatcher/contexts/dp_cloud_server_context.py +8 -1
  5. dpdispatcher/contexts/hdfs_context.py +16 -11
  6. dpdispatcher/contexts/lazy_local_context.py +2 -19
  7. dpdispatcher/contexts/local_context.py +77 -43
  8. dpdispatcher/contexts/openapi_context.py +78 -14
  9. dpdispatcher/contexts/ssh_context.py +117 -98
  10. dpdispatcher/dlog.py +9 -5
  11. dpdispatcher/dpcloudserver/__init__.py +0 -0
  12. dpdispatcher/dpcloudserver/client.py +7 -0
  13. dpdispatcher/dpdisp.py +21 -0
  14. dpdispatcher/entrypoints/run.py +9 -0
  15. dpdispatcher/entrypoints/submission.py +21 -1
  16. dpdispatcher/machine.py +15 -4
  17. dpdispatcher/machines/JH_UniScheduler.py +171 -0
  18. dpdispatcher/machines/__init__.py +1 -0
  19. dpdispatcher/machines/distributed_shell.py +6 -10
  20. dpdispatcher/machines/fugaku.py +9 -12
  21. dpdispatcher/machines/lsf.py +3 -9
  22. dpdispatcher/machines/openapi.py +48 -15
  23. dpdispatcher/machines/pbs.py +183 -20
  24. dpdispatcher/machines/shell.py +7 -16
  25. dpdispatcher/machines/slurm.py +30 -42
  26. dpdispatcher/run.py +172 -0
  27. dpdispatcher/submission.py +5 -14
  28. dpdispatcher/utils/dpcloudserver/client.py +10 -6
  29. dpdispatcher/utils/hdfs_cli.py +10 -19
  30. dpdispatcher/utils/utils.py +21 -7
  31. {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/METADATA +35 -29
  32. dpdispatcher-1.0.0.dist-info/RECORD +49 -0
  33. {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/WHEEL +1 -1
  34. dpdispatcher-0.6.1.dist-info/RECORD +0 -44
  35. {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/entry_points.txt +0 -0
  36. {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info/licenses}/LICENSE +0 -0
  37. {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/top_level.txt +0 -0
@@ -38,19 +38,12 @@ class Shell(Machine):
  script_run_str = self.gen_script_command(job)
  script_run_file_name = f"{job.script_file_name}.run"
  self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
- ret, stdin, stdout, stderr = self.context.block_call(
- "cd {} && {{ nohup bash {} 1>>{} 2>>{} & }} && echo $!".format(
- shlex.quote(self.context.remote_root),
- script_file_name,
- output_name,
- output_name,
- )
- )
+ cmd = f"cd {shlex.quote(self.context.remote_root)} && {{ nohup bash {script_file_name} 1>>{output_name} 2>>{output_name} & }} && echo $!"
+ ret, stdin, stdout, stderr = self.context.block_call(cmd)
  if ret != 0:
  err_str = stderr.read().decode("utf-8")
  raise RuntimeError(
- "status command squeue fails to execute\nerror message:%s\nreturn code %d\n"
- % (err_str, ret)
+ f"status command {cmd} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
  )
  job_id = int(stdout.read().decode("utf-8").strip())
  self.context.write_file(job_id_name, str(job_id))
@@ -67,9 +60,6 @@ class Shell(Machine):
  # self.context.write_file(job_id_name, job_id)
  # return job_id

- def default_resources(self, resources):
- pass
-
  def check_status(self, job):
  job_id = job.job_id
  # print('shell.check_status.job_id', job_id)
@@ -78,14 +68,15 @@ class Shell(Machine):
  return JobStatus.unsubmitted

  # mark defunct process as terminated
- ret, stdin, stdout, stderr = self.context.block_call(
+ cmd = (
+ r"""command -v ps >/dev/null 2>&1 || { echo >&2 "I require ps but it's not installed. Aborting."; exit 1; };"""
  f"if ps -p {job_id} > /dev/null && ! (ps -o command -p {job_id} | grep defunct >/dev/null) ; then echo 1; fi"
  )
+ ret, stdin, stdout, stderr = self.context.block_call(cmd)
  if ret != 0:
  err_str = stderr.read().decode("utf-8")
  raise RuntimeError(
- "status command squeue fails to execute\nerror message:%s\nreturn code %d\n"
- % (err_str, ret)
+ f"status command {cmd} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
  )

  if_job_exists = bool(stdout.read().decode("utf-8").strip())
@@ -39,23 +39,23 @@ class Slurm(Machine):
  def gen_script_header(self, job):
  resources = job.resources
  script_header_dict = {}
- script_header_dict["slurm_nodes_line"] = "#SBATCH --nodes {number_node}".format(
- number_node=resources.number_node
+ script_header_dict["slurm_nodes_line"] = (
+ f"#SBATCH --nodes {resources.number_node}"
+ )
+ script_header_dict["slurm_ntasks_per_node_line"] = (
+ f"#SBATCH --ntasks-per-node {resources.cpu_per_node}"
  )
- script_header_dict[
- "slurm_ntasks_per_node_line"
- ] = f"#SBATCH --ntasks-per-node {resources.cpu_per_node}"
  custom_gpu_line = resources.kwargs.get("custom_gpu_line", None)
  if not custom_gpu_line:
- script_header_dict[
- "slurm_number_gpu_line"
- ] = f"#SBATCH --gres=gpu:{resources.gpu_per_node}"
+ script_header_dict["slurm_number_gpu_line"] = (
+ f"#SBATCH --gres=gpu:{resources.gpu_per_node}"
+ )
  else:
  script_header_dict["slurm_number_gpu_line"] = custom_gpu_line
  if resources.queue_name != "":
- script_header_dict[
- "slurm_partition_line"
- ] = f"#SBATCH --partition {resources.queue_name}"
+ script_header_dict["slurm_partition_line"] = (
+ f"#SBATCH --partition {resources.queue_name}"
+ )
  else:
  script_header_dict["slurm_partition_line"] = ""
  if (
@@ -83,13 +83,12 @@ class Slurm(Machine):
  script_run_file_name = f"{job.script_file_name}.run"
  self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
  # self.context.write_file(fname=os.path.join(self.context.submission.work_base, script_file_name), write_str=script_str)
- ret, stdin, stdout, stderr = self.context.block_call(
- "cd {} && {} {}".format(
- shlex.quote(self.context.remote_root),
- "sbatch",
- shlex.quote(script_file_name),
- )
+ command = "cd {} && {} {}".format(
+ shlex.quote(self.context.remote_root),
+ "sbatch --parsable",
+ shlex.quote(script_file_name),
  )
+ ret, stdin, stdout, stderr = self.context.block_call(command)
  if ret != 0:
  err_str = stderr.read().decode("utf-8")
  if (
@@ -98,8 +97,7 @@ class Slurm(Machine):
  ):
  # server network error, retry 3 times
  raise RetrySignal(
- "Get error code %d in submitting through ssh with job: %s . message: %s"
- % (ret, job.job_hash, err_str)
+ f"Get error code {ret} in submitting with job: {job.job_hash} . message: {err_str}"
  )
  elif (
  "Job violates accounting/QOS policy" in err_str
@@ -110,8 +108,7 @@ class Slurm(Machine):
  # job number exceeds, skip the submitting
  return ""
  raise RuntimeError(
- "status command squeue fails to execute\nerror message:%s\nreturn code %d\n"
- % (err_str, ret)
+ f"command {command} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
  )
  subret = stdout.readlines()
  # --parsable
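Note on the `--parsable` flag added above: `sbatch --parsable` prints only the job ID (optionally followed by `;cluster_name`) instead of the usual "Submitted batch job <id>" sentence, so the submitter can take the ID straight from stdout. A minimal sketch of that parsing, with an invented output string rather than the package's exact code:

# Illustrative only (not the package's exact parsing code):
line = "12345;cluster0"               # what `sbatch --parsable` may print
job_id = line.strip().split(";")[0]   # -> "12345"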
@@ -121,17 +118,13 @@ class Slurm(Machine):
  self.context.write_file(job_id_name, job_id)
  return job_id

- def default_resources(self, resources):
- pass
-
  @retry()
  def check_status(self, job):
  job_id = job.job_id
  if job_id == "":
  return JobStatus.unsubmitted
- ret, stdin, stdout, stderr = self.context.block_call(
- 'squeue -o "%.18i %.2t" -j ' + job_id
- )
+ command = 'squeue -o "%.18i %.2t" -j ' + job_id
+ ret, stdin, stdout, stderr = self.context.block_call(command)
  if ret != 0:
  err_str = stderr.read().decode("utf-8")
  if "Invalid job id specified" in err_str:
@@ -147,13 +140,11 @@ class Slurm(Machine):
  ):
  # retry 3 times
  raise RetrySignal(
- "Get error code %d in checking status through ssh with job: %s . message: %s"
- % (ret, job.job_hash, err_str)
+ f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
  )
  raise RuntimeError(
- "status command squeue fails to execute."
- "job_id:%s \n error message:%s\n return code %d\n"
- % (job_id, err_str, ret)
+ f"status command {command} fails to execute."
+ f"job_id:{job_id} \n error message:{err_str}\n return code {ret}\n"
  )
  status_line = stdout.read().decode("utf-8").split("\n")[-2]
  status_word = status_line.split()[-1]
@@ -254,10 +245,10 @@ class SlurmJobArray(Slurm):
  ).as_posix()
  if not self.context.check_file_exists(task_tag_finished):
  job_array.add(ii // slurm_job_size)
- return super().gen_script_header(job) + "\n#SBATCH --array=%s" % (
+ return super().gen_script_header(job) + "\n#SBATCH --array={}".format(
  ",".join(map(str, job_array))
  )
- return super().gen_script_header(job) + "\n#SBATCH --array=0-%d" % (
+ return super().gen_script_header(job) + "\n#SBATCH --array=0-%s" % (
  math.ceil(len(job.job_task_list) / slurm_job_size) - 1
  )

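A rough worked example of the two array headers generated above (all numbers assumed, for illustration only): with 10 tasks and slurm_job_size = 2, a fresh submission gets `#SBATCH --array=0-4`; on recovery, if only array groups 1 and 3 still lack their task-tag files, the header becomes `#SBATCH --array=1,3`. The arithmetic mirrors the code above:

# Illustrative arithmetic only, assumed task counts:
import math
n_tasks, slurm_job_size = 10, 2
print(f"#SBATCH --array=0-{math.ceil(n_tasks / slurm_job_size) - 1}")        # #SBATCH --array=0-4
unfinished_groups = {1, 3}
print("#SBATCH --array=" + ",".join(map(str, sorted(unfinished_groups))))    # #SBATCH --array=1,3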
@@ -319,9 +310,8 @@ class SlurmJobArray(Slurm):
  job_id = job.job_id
  if job_id == "":
  return JobStatus.unsubmitted
- ret, stdin, stdout, stderr = self.context.block_call(
- 'squeue -h -o "%.18i %.2t" -j ' + job_id
- )
+ command = 'squeue -h -o "%.18i %.2t" -j ' + job_id
+ ret, stdin, stdout, stderr = self.context.block_call(command)
  if ret != 0:
  err_str = stderr.read().decode("utf-8")
  if "Invalid job id specified" in err_str:
@@ -336,13 +326,11 @@ class SlurmJobArray(Slurm):
  ):
  # retry 3 times
  raise RetrySignal(
- "Get error code %d in checking status through ssh with job: %s . message: %s"
- % (ret, job.job_hash, err_str)
+ f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
  )
  raise RuntimeError(
- "status command squeue fails to execute."
- "job_id:%s \n error message:%s\n return code %d\n"
- % (job_id, err_str, ret)
+ f"status command {command} fails to execute."
+ f"job_id:{job_id} \n error message:{err_str}\n return code {ret}\n"
  )
  status_lines = stdout.read().decode("utf-8").split("\n")[:-1]
  status = []
dpdispatcher/run.py ADDED
@@ -0,0 +1,172 @@
+ import os
+ import re
+ import sys
+ from glob import glob
+ from hashlib import sha1
+
+ from dpdispatcher.machine import Machine
+ from dpdispatcher.submission import Resources, Submission, Task
+
+ if sys.version_info >= (3, 11):
+ import tomllib
+ else:
+ import tomli as tomllib
+ from typing import List, Optional
+
+ from dargs import Argument
+
+ from dpdispatcher.arginfo import machine_dargs, resources_dargs, task_dargs
+
+ REGEX = r"(?m)^# /// (?P<type>[a-zA-Z0-9-]+)$\s(?P<content>(^#(| .*)$\s)+)^# ///$"
+
+
+ def read_pep723(script: str) -> Optional[dict]:
+ """Read a PEP 723 script metadata from a script string.
+
+ Parameters
+ ----------
+ script : str
+ Script content.
+
+ Returns
+ -------
+ dict
+ PEP 723 metadata.
+ """
+ name = "script"
+ matches = list(
+ filter(lambda m: m.group("type") == name, re.finditer(REGEX, script))
+ )
+ if len(matches) > 1:
+ # TODO: Add tests for scenarios where multiple script blocks are found
+ raise ValueError(f"Multiple {name} blocks found")
+ elif len(matches) == 1:
+ content = "".join(
+ line[2:] if line.startswith("# ") else line[1:]
+ for line in matches[0].group("content").splitlines(keepends=True)
+ )
+ return tomllib.loads(content)
+ else:
+ # TODO: Add tests for scenarios where no metadata is found
+ return None
+
+
+ def pep723_args() -> Argument:
+ """Return the argument parser for PEP 723 metadata."""
+ machine_args = machine_dargs()
+ machine_args.fold_subdoc = True
+ machine_args.doc = "Machine configuration. See related documentation for details."
+ resources_args = resources_dargs(detail_kwargs=False)
+ resources_args.fold_subdoc = True
+ resources_args.doc = (
+ "Resources configuration. See related documentation for details."
+ )
+ task_args = task_dargs()
+ command_arg = task_args["command"]
+ command_arg.doc = (
+ "Python interpreter or launcher. No need to contain the Python script filename."
+ )
+ command_arg.default = "python"
+ command_arg.optional = True
+ task_args["task_work_path"].doc += " Can be a glob pattern."
+ task_args.name = "task_list"
+ task_args.doc = "List of tasks to execute."
+ task_args.repeat = True
+ task_args.dtype = (list,)
+ return Argument(
+ "pep723",
+ dtype=dict,
+ doc="PEP 723 metadata",
+ sub_fields=[
+ Argument(
+ "work_base",
+ dtype=str,
+ optional=True,
+ default="./",
+ doc="Base directory for the work",
+ ),
+ Argument(
+ "forward_common_files",
+ dtype=List[str],
+ optional=True,
+ default=[],
+ doc="Common files to forward to the remote machine",
+ ),
+ Argument(
+ "backward_common_files",
+ dtype=List[str],
+ optional=True,
+ default=[],
+ doc="Common files to backward from the remote machine",
+ ),
+ machine_args,
+ resources_args,
+ task_args,
+ ],
+ )
+
+
+ def create_submission(metadata: dict, hash: str) -> Submission:
+ """Create a Submission instance from a PEP 723 metadata.
+
+ Parameters
+ ----------
+ metadata : dict
+ PEP 723 metadata.
+ hash : str
+ Submission hash.
+
+ Returns
+ -------
+ Submission
+ Submission instance.
+ """
+ base = pep723_args()
+ metadata = base.normalize_value(metadata, trim_pattern="_*")
+ base.check_value(metadata, strict=False)
+
+ tasks = []
+ for task in metadata["task_list"]:
+ task = task.copy()
+ task["command"] += f" $REMOTE_ROOT/script_{hash}.py"
+ task_work_path = os.path.join(
+ metadata["machine"]["local_root"],
+ metadata["work_base"],
+ task["task_work_path"],
+ )
+ if os.path.isdir(task_work_path):
+ tasks.append(Task.load_from_dict(task))
+ elif glob(task_work_path):
+ for file in glob(task_work_path):
+ tasks.append(Task.load_from_dict({**task, "task_work_path": file}))
+ # TODO: Add tests for scenarios where the task work path is a glob pattern
+ else:
+ # TODO: Add tests for scenarios where the task work path is not found
+ raise FileNotFoundError(f"Task work path {task_work_path} not found.")
+ return Submission(
+ work_base=metadata["work_base"],
+ forward_common_files=metadata["forward_common_files"],
+ backward_common_files=metadata["backward_common_files"],
+ machine=Machine.load_from_dict(metadata["machine"]),
+ resources=Resources.load_from_dict(metadata["resources"]),
+ task_list=tasks,
+ )
+
+
+ def run_pep723(script: str):
+ """Run a PEP 723 script.
+
+ Parameters
+ ----------
+ script : str
+ Script content.
+ """
+ metadata = read_pep723(script)
+ if metadata is None:
+ raise ValueError("No PEP 723 metadata found.")
+ dpdispatcher_metadata = metadata["tool"]["dpdispatcher"]
+ script_hash = sha1(script.encode("utf-8")).hexdigest()
+ submission = create_submission(dpdispatcher_metadata, script_hash)
+ submission.machine.context.write_file(f"script_{script_hash}.py", script)
+ # write script
+ submission.run_submission()
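For context, the new run.py consumes PEP 723 inline metadata embedded in a `# /// script` comment block, and its `[tool.dpdispatcher]` table supplies the `work_base`, `machine`, `resources`, and `task_list` fields parsed by `pep723_args()` above. A hypothetical script illustrating the expected shape (every value below, including host names and paths, is an assumption made up for illustration, not a default shipped with the package):

# /// script
# [tool.dpdispatcher]
# work_base = "./"
# forward_common_files = []
# backward_common_files = []
#
# [tool.dpdispatcher.machine]
# batch_type = "Slurm"
# context_type = "SSHContext"
# local_root = "./"
# remote_root = "/scratch/work"            # assumed path
# [tool.dpdispatcher.machine.remote_profile]
# hostname = "hpc.example.org"             # assumed host
# username = "user"
#
# [tool.dpdispatcher.resources]
# number_node = 1
# cpu_per_node = 4
# gpu_per_node = 0
# queue_name = "cpu"
# group_size = 1
#
# [[tool.dpdispatcher.task_list]]
# task_work_path = "task_*"                # glob pattern, as documented above
# forward_files = ["input.txt"]
# backward_files = ["output.txt"]
# ///
print("Hello from a PEP 723 script dispatched by dpdispatcher")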
@@ -55,7 +55,6 @@ class Submission:
  *,
  task_list=[],
  ):
- # self.submission_list = submission_list
  self.local_root = None
  self.work_base = work_base
  self._abs_work_base = os.path.abspath(work_base)
@@ -324,8 +323,7 @@ class Submission:
  kwargs = {**{"clean": False}, **kwargs}
  if kwargs["clean"]:
  dlog.warning(
- "Using async submission with `clean=True`, "
- "job may fail in queue system"
+ "Using async submission with `clean=True`, job may fail in queue system"
  )
  loop = asyncio.get_event_loop()
  wrapped_submission = functools.partial(self.run_submission, **kwargs)
@@ -515,12 +513,9 @@ class Submission:
  def submission_from_json(cls, json_file_name="submission.json"):
  with open(json_file_name) as f:
  submission_dict = json.load(f)
- # submission_dict = machine.context.read_file(json_file_name)
  submission = cls.deserialize(submission_dict=submission_dict, machine=None)
  return submission

- # def check_if_recover()
-
  def try_recover_from_json(self):
  submission_file_name = f"{self.submission_hash}.json"
  if_recover = self.machine.context.check_file_exists(submission_file_name)
@@ -545,7 +540,6 @@ class Submission:
  f"machine.context.remote_root:{self.machine.context.remote_root}; "
  f"submission.work_base:{submission.work_base};"
  )
- # self = submission.bind_machine(machine=self.machine)
  else:
  print(self.serialize())
  print(submission.serialize())
@@ -759,7 +753,6 @@ class Job:
  self.fail_count = 0
  self.job_uuid = uuid.uuid4()

- # self.job_hash = self.get_hash()
  self.job_hash = self.get_hash()
  self.script_file_name = self.job_hash + ".sub"

@@ -863,9 +856,7 @@ class Job:
  self.submit_job()
  if self.job_state != JobStatus.unsubmitted:
  dlog.info(
- "job:{job_hash} re-submit after terminated; new job_id is {job_id}".format(
- job_hash=self.job_hash, job_id=self.job_id
- )
+ f"job:{self.job_hash} re-submit after terminated; new job_id is {self.job_id}"
  )
  time.sleep(0.2)
  self.get_job_state()
@@ -1124,9 +1115,9 @@ class Resources:

  @staticmethod
  def arginfo(detail_kwargs=True):
- doc_number_node = "The number of node need for each `job`"
- doc_cpu_per_node = "cpu numbers of each node assigned to each job."
- doc_gpu_per_node = "gpu numbers of each node assigned to each job."
+ doc_number_node = "The number of nodes required for each `job`."
+ doc_cpu_per_node = "CPU numbers of each node assigned to each job."
+ doc_gpu_per_node = "GPU numbers of each node assigned to each job."
  doc_queue_name = "The queue name of batch job scheduler system."
  doc_group_size = "The number of `tasks` in a `job`. 0 means infinity."
  doc_custom_flags = "The extra lines pass to job submitting script header"
@@ -142,10 +142,10 @@ class Client:
  res = self.get("/data/get_sts_token", {})
  # print('debug>>>>>>>>>>>>>', res)
  dlog.debug(f"debug: _get_oss_bucket: res:{res}")
- auth = oss2.StsAuth(
+ auth = oss2.StsAuth( # type: ignore[reportPossiblyUnboundVariable]
  res["AccessKeyId"], res["AccessKeySecret"], res["SecurityToken"]
  )
- return oss2.Bucket(auth, endpoint, bucket_name)
+ return oss2.Bucket(auth, endpoint, bucket_name) # type: ignore[reportPossiblyUnboundVariable]

  def download(self, oss_file, save_file, endpoint, bucket_name):
  bucket = self._get_oss_bucket(endpoint, bucket_name)
@@ -184,7 +184,7 @@ class Client:
  )
  bucket = self._get_oss_bucket(endpoint, bucket_name)
  total_size = os.path.getsize(zip_task_file)
- part_size = determine_part_size(total_size, preferred_size=1000 * 1024)
+ part_size = determine_part_size(total_size, preferred_size=1000 * 1024) # type: ignore[reportPossiblyUnboundVariable]
  upload_id = bucket.init_multipart_upload(oss_task_zip).upload_id
  parts = []
  with open(zip_task_file, "rb") as fileobj:
@@ -196,9 +196,9 @@ class Client:
  oss_task_zip,
  upload_id,
  part_number,
- SizedFileAdapter(fileobj, num_to_upload),
+ SizedFileAdapter(fileobj, num_to_upload), # type: ignore[reportPossiblyUnboundVariable]
  )
- parts.append(PartInfo(part_number, result.etag))
+ parts.append(PartInfo(part_number, result.etag)) # type: ignore[reportPossiblyUnboundVariable]
  offset += num_to_upload
  part_number += 1
  # result = bucket.complete_multipart_upload(oss_task_zip, upload_id, parts)
@@ -278,7 +278,11 @@ class Client:
  return ""
  resp = requests.get(url, headers={"Range": f"bytes={self.last_log_offset}-"})
  self.last_log_offset += len(resp.content)
- return resp.content.decode("utf-8")
+ try:
+ return resp.content.decode("utf-8")
+ except Exception as e:
+ dlog.error(f"Error decoding job log: {e}", stack_info=ENABLE_STACK)
+ return ""

  def _get_job_log(self, job_id):
  ret = self.get(
@@ -28,7 +28,7 @@ class HDFS:
  )
  except Exception as e:
  raise RuntimeError(
- f"Cannot check existence of hdfs uri[{uri}] " f"with cmd[{cmd}]"
+ f"Cannot check existence of hdfs uri[{uri}] with cmd[{cmd}]"
  ) from e

  @staticmethod
@@ -48,9 +48,7 @@ class HDFS:
  f"with cmd[{cmd}]; ret[{ret}] output[{out}] stderr[{err}]"
  )
  except Exception as e:
- raise RuntimeError(
- f"Cannot remove hdfs uri[{uri}] " f"with cmd[{cmd}]"
- ) from e
+ raise RuntimeError(f"Cannot remove hdfs uri[{uri}] with cmd[{cmd}]") from e

  @staticmethod
  def mkdir(uri):
@@ -70,7 +68,7 @@ class HDFS:
  )
  except Exception as e:
  raise RuntimeError(
- f"Cannot mkdir of hdfs uri[{uri}] " f"with cmd[{cmd}]"
+ f"Cannot mkdir of hdfs uri[{uri}] with cmd[{cmd}]"
  ) from e

  @staticmethod
@@ -80,7 +78,7 @@ class HDFS:
  """
  # Make sure local_path is accessible
  if not os.path.exists(local_path) or not os.access(local_path, os.R_OK):
- raise RuntimeError(f"try to access local_path[{local_path}] " "but failed")
+ raise RuntimeError(f"try to access local_path[{local_path}] but failed")
  cmd = f"hadoop fs -copyFromLocal -f {local_path} {to_uri}"
  try:
  ret, out, err = run_cmd_with_all_output(cmd)
@@ -88,10 +86,8 @@ class HDFS:
  return True, out
  else:
  raise RuntimeError(
- "Cannot copy local[{}] to remote[{}] with cmd[{}]; "
- "ret[{}] output[{}] stderr[{}]".format(
- local_path, to_uri, cmd, ret, out, err
- )
+ f"Cannot copy local[{local_path}] to remote[{to_uri}] with cmd[{cmd}]; "
+ f"ret[{ret}] output[{out}] stderr[{err}]"
  )
  except Exception as e:
  raise RuntimeError(
@@ -113,10 +109,8 @@ class HDFS:
  return True
  else:
  raise RuntimeError(
- "Cannot copy remote[{}] to local[{}] with cmd[{}]; "
- "ret[{}] output[{}] stderr[{}]".format(
- from_uri, local_path, cmd, ret, out, err
- )
+ f"Cannot copy remote[{from_uri}] to local[{local_path}] with cmd[{cmd}]; "
+ f"ret[{ret}] output[{out}] stderr[{err}]"
  )
  except Exception as e:
  raise RuntimeError(
@@ -136,9 +130,7 @@ class HDFS:
  f"cmd [{cmd}] ret[{ret}] output[{out}] stderr[{err}]"
  )
  except Exception as e:
- raise RuntimeError(
- f"Cannot read text from uri[{uri}]" f"cmd [{cmd}]"
- ) from e
+ raise RuntimeError(f"Cannot read text from uri[{uri}]cmd [{cmd}]") from e

  @staticmethod
  def move(from_uri, to_uri):
@@ -155,6 +147,5 @@ class HDFS:
  )
  except Exception as e:
  raise RuntimeError(
- f"Cannot move from_uri[{from_uri}] to "
- f"to_uri[{to_uri}] with cmd[{cmd}]"
+ f"Cannot move from_uri[{from_uri}] to to_uri[{to_uri}] with cmd[{cmd}]"
  ) from e
@@ -2,6 +2,7 @@ import base64
  import hashlib
  import hmac
  import os
+ import shlex
  import struct
  import subprocess
  import time
@@ -89,6 +90,7 @@ def rsync(
  port: int = 22,
  key_filename: Optional[str] = None,
  timeout: Union[int, float] = 10,
+ proxy_command: Optional[str] = None,
  ):
  """Call rsync to transfer files.

@@ -104,6 +106,8 @@ def rsync(
  identity file name
  timeout : int, default=10
  timeout for ssh
+ proxy_command : str, optional
+ ProxyCommand to use for SSH connection
 
  Raises
  ------
@@ -124,20 +128,30 @@ def rsync(
  ]
  if key_filename is not None:
  ssh_cmd.extend(["-i", key_filename])
+
+ # Use proxy_command if provided
+ if proxy_command is not None:
+ ssh_cmd.extend(["-o", f"ProxyCommand={proxy_command}"])
+
+ # Properly escape the SSH command for rsync's -e option
+ ssh_cmd_str = " ".join(shlex.quote(part) for part in ssh_cmd)
+
  cmd = [
  "rsync",
- # -a: archieve
- # -z: compress
- "-az",
+ # -r: recursive, -l: links, -p: perms, -t: times, -D: devices/specials
+ # -z: compress (exclude -o: owner, -g: group to avoid permission issues)
+ "-rlptDz",
  "-e",
- " ".join(ssh_cmd),
+ ssh_cmd_str,
  "-q",
  from_file,
  to_file,
  ]
- ret, out, err = run_cmd_with_all_output(cmd, shell=False)
+ # Convert to string for shell=True
+ cmd_str = " ".join(shlex.quote(arg) for arg in cmd)
+ ret, out, err = run_cmd_with_all_output(cmd_str, shell=True)
  if ret != 0:
- raise RuntimeError(f"Failed to run {cmd}: {err}")
+ raise RuntimeError(f"Failed to run {cmd_str}: {err}")


  class RetrySignal(Exception):
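A hypothetical usage sketch of the updated helper (host names, paths, and the jump-host ProxyCommand below are made-up values; the positional from/to arguments follow the docstring above). The proxy_command string is forwarded to ssh as an `-o ProxyCommand=...` option inside the quoted `-e` argument:

# Invented example values, for illustration only:
from dpdispatcher.utils.utils import rsync

rsync(
    "./local_dir/",                                 # source
    "user@login.example.org:/scratch/jobs/",        # destination
    port=22,
    key_filename="~/.ssh/id_rsa",
    timeout=10,
    proxy_command="ssh -W %h:%p jump.example.org",  # routed through a jump host
)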
@@ -191,7 +205,7 @@ def retry(
  else:
  # raise all exceptions
  raise RuntimeError(
- "Failed to run %s for %d times" % (func.__name__, current_retry)
+ f"Failed to run {func.__name__} for {current_retry} times"
  ) from errors[-1]

  return wrapper
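As used on the Slurm status checks above, the `retry` decorator re-runs the wrapped callable whenever it raises `RetrySignal` and only raises this RuntimeError once the attempts are exhausted. A minimal usage sketch (the decorated function and its failure condition are invented for illustration):

# Invented example: mark transient failures with RetrySignal so retry() re-runs them.
from random import random

from dpdispatcher.utils.utils import RetrySignal, retry

@retry()  # default settings, as in the Slurm machine above
def flaky_call() -> str:
    if random() < 0.5:  # stand-in for a transient SSH or queue error
        raise RetrySignal("transient failure, try again")
    return "ok"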