dpdispatcher 0.6.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dpdispatcher/_version.py +22 -4
- dpdispatcher/base_context.py +60 -1
- dpdispatcher/contexts/__init__.py +1 -0
- dpdispatcher/contexts/dp_cloud_server_context.py +8 -1
- dpdispatcher/contexts/hdfs_context.py +16 -11
- dpdispatcher/contexts/lazy_local_context.py +2 -19
- dpdispatcher/contexts/local_context.py +77 -43
- dpdispatcher/contexts/openapi_context.py +78 -14
- dpdispatcher/contexts/ssh_context.py +117 -98
- dpdispatcher/dlog.py +9 -5
- dpdispatcher/dpcloudserver/__init__.py +0 -0
- dpdispatcher/dpcloudserver/client.py +7 -0
- dpdispatcher/dpdisp.py +21 -0
- dpdispatcher/entrypoints/run.py +9 -0
- dpdispatcher/entrypoints/submission.py +21 -1
- dpdispatcher/machine.py +15 -4
- dpdispatcher/machines/JH_UniScheduler.py +171 -0
- dpdispatcher/machines/__init__.py +1 -0
- dpdispatcher/machines/distributed_shell.py +6 -10
- dpdispatcher/machines/fugaku.py +9 -12
- dpdispatcher/machines/lsf.py +3 -9
- dpdispatcher/machines/openapi.py +48 -15
- dpdispatcher/machines/pbs.py +183 -20
- dpdispatcher/machines/shell.py +7 -16
- dpdispatcher/machines/slurm.py +30 -42
- dpdispatcher/run.py +172 -0
- dpdispatcher/submission.py +5 -14
- dpdispatcher/utils/dpcloudserver/client.py +10 -6
- dpdispatcher/utils/hdfs_cli.py +10 -19
- dpdispatcher/utils/utils.py +21 -7
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/METADATA +35 -29
- dpdispatcher-1.0.0.dist-info/RECORD +49 -0
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/WHEEL +1 -1
- dpdispatcher-0.6.1.dist-info/RECORD +0 -44
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/entry_points.txt +0 -0
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info/licenses}/LICENSE +0 -0
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/top_level.txt +0 -0
dpdispatcher/machines/shell.py
CHANGED
@@ -38,19 +38,12 @@ class Shell(Machine):
         script_run_str = self.gen_script_command(job)
         script_run_file_name = f"{job.script_file_name}.run"
         self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
-
-
-                shlex.quote(self.context.remote_root),
-                script_file_name,
-                output_name,
-                output_name,
-            )
-        )
+        cmd = f"cd {shlex.quote(self.context.remote_root)} && {{ nohup bash {script_file_name} 1>>{output_name} 2>>{output_name} & }} && echo $!"
+        ret, stdin, stdout, stderr = self.context.block_call(cmd)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             raise RuntimeError(
-                "status command
-                % (err_str, ret)
+                f"status command {cmd} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
             )
         job_id = int(stdout.read().decode("utf-8").strip())
         self.context.write_file(job_id_name, str(job_id))
@@ -67,9 +60,6 @@ class Shell(Machine):
         # self.context.write_file(job_id_name, job_id)
         # return job_id
 
-    def default_resources(self, resources):
-        pass
-
     def check_status(self, job):
         job_id = job.job_id
         # print('shell.check_status.job_id', job_id)
@@ -78,14 +68,15 @@ class Shell(Machine):
             return JobStatus.unsubmitted
 
         # mark defunct process as terminated
-
+        cmd = (
+            r"""command -v ps >/dev/null 2>&1 || { echo >&2 "I require ps but it's not installed. Aborting."; exit 1; };"""
             f"if ps -p {job_id} > /dev/null && ! (ps -o command -p {job_id} | grep defunct >/dev/null) ; then echo 1; fi"
         )
+        ret, stdin, stdout, stderr = self.context.block_call(cmd)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             raise RuntimeError(
-                "status command
-                % (err_str, ret)
+                f"status command {cmd} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
             )
 
         if_job_exists = bool(stdout.read().decode("utf-8").strip())
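The rewritten Shell.do_submit folds submission into one command: change into the remote root, background the job script with nohup, and print the PID of the background process with `echo $!`; that PID is stored as the shell "job id" and later probed with `ps -p` in check_status. Below is a minimal local sketch of the same pattern; the temporary directory and demo script are stand-ins, not dpdispatcher code:

    import os
    import shlex
    import subprocess
    import tempfile

    workdir = tempfile.mkdtemp()      # stand-in for context.remote_root
    script = "demo.sub"               # stand-in for the generated job script
    out = "demo.out"
    with open(os.path.join(workdir, script), "w") as f:
        f.write("sleep 1\n")

    cmd = (
        f"cd {shlex.quote(workdir)} && "
        f"{{ nohup bash {script} 1>>{out} 2>>{out} & }} && echo $!"
    )
    # `echo $!` prints the PID of the backgrounded bash process; dpdispatcher
    # keeps this PID as the job id and later checks it with `ps -p <pid>`.
    pid = int(subprocess.run(cmd, shell=True, capture_output=True, text=True).stdout.strip())
    print("job pid:", pid)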
dpdispatcher/machines/slurm.py
CHANGED
@@ -39,23 +39,23 @@ class Slurm(Machine):
     def gen_script_header(self, job):
         resources = job.resources
         script_header_dict = {}
-        script_header_dict["slurm_nodes_line"] =
-
+        script_header_dict["slurm_nodes_line"] = (
+            f"#SBATCH --nodes {resources.number_node}"
+        )
+        script_header_dict["slurm_ntasks_per_node_line"] = (
+            f"#SBATCH --ntasks-per-node {resources.cpu_per_node}"
         )
-        script_header_dict[
-            "slurm_ntasks_per_node_line"
-        ] = f"#SBATCH --ntasks-per-node {resources.cpu_per_node}"
         custom_gpu_line = resources.kwargs.get("custom_gpu_line", None)
         if not custom_gpu_line:
-            script_header_dict[
-                "
-
+            script_header_dict["slurm_number_gpu_line"] = (
+                f"#SBATCH --gres=gpu:{resources.gpu_per_node}"
+            )
         else:
             script_header_dict["slurm_number_gpu_line"] = custom_gpu_line
         if resources.queue_name != "":
-            script_header_dict[
-                "
-
+            script_header_dict["slurm_partition_line"] = (
+                f"#SBATCH --partition {resources.queue_name}"
+            )
         else:
             script_header_dict["slurm_partition_line"] = ""
         if (
@@ -83,13 +83,12 @@ class Slurm(Machine):
         script_run_file_name = f"{job.script_file_name}.run"
         self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
         # self.context.write_file(fname=os.path.join(self.context.submission.work_base, script_file_name), write_str=script_str)
-
-
-
-
-            shlex.quote(script_file_name),
-        )
+        command = "cd {} && {} {}".format(
+            shlex.quote(self.context.remote_root),
+            "sbatch --parsable",
+            shlex.quote(script_file_name),
         )
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             if (
@@ -98,8 +97,7 @@ class Slurm(Machine):
             ):
                 # server network error, retry 3 times
                 raise RetrySignal(
-                    "Get error code
-                    % (ret, job.job_hash, err_str)
+                    f"Get error code {ret} in submitting with job: {job.job_hash} . message: {err_str}"
                 )
             elif (
                 "Job violates accounting/QOS policy" in err_str
@@ -110,8 +108,7 @@ class Slurm(Machine):
                 # job number exceeds, skip the submitting
                 return ""
             raise RuntimeError(
-                "
-                % (err_str, ret)
+                f"command {command} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
             )
         subret = stdout.readlines()
         # --parsable
@@ -121,17 +118,13 @@ class Slurm(Machine):
         self.context.write_file(job_id_name, job_id)
         return job_id
 
-    def default_resources(self, resources):
-        pass
-
     @retry()
     def check_status(self, job):
         job_id = job.job_id
         if job_id == "":
             return JobStatus.unsubmitted
-
-
-        )
+        command = 'squeue -o "%.18i %.2t" -j ' + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             if "Invalid job id specified" in err_str:
@@ -147,13 +140,11 @@ class Slurm(Machine):
             ):
                 # retry 3 times
                 raise RetrySignal(
-                    "Get error code
-                    % (ret, job.job_hash, err_str)
+                    f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
                 )
             raise RuntimeError(
-                "status command
-                "job_id
-                % (job_id, err_str, ret)
+                f"status command {command} fails to execute."
+                f"job_id:{job_id} \n error message:{err_str}\n return code {ret}\n"
             )
         status_line = stdout.read().decode("utf-8").split("\n")[-2]
         status_word = status_line.split()[-1]
@@ -254,10 +245,10 @@ class SlurmJobArray(Slurm):
                 ).as_posix()
                 if not self.context.check_file_exists(task_tag_finished):
                     job_array.add(ii // slurm_job_size)
-            return super().gen_script_header(job) + "\n#SBATCH --array
+            return super().gen_script_header(job) + "\n#SBATCH --array={}".format(
                 ",".join(map(str, job_array))
             )
-        return super().gen_script_header(job) + "\n#SBATCH --array=0-%
+        return super().gen_script_header(job) + "\n#SBATCH --array=0-%s" % (
            math.ceil(len(job.job_task_list) / slurm_job_size) - 1
        )
 
@@ -319,9 +310,8 @@ class SlurmJobArray(Slurm):
        job_id = job.job_id
        if job_id == "":
            return JobStatus.unsubmitted
-
-
-        )
+        command = 'squeue -h -o "%.18i %.2t" -j ' + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
        if ret != 0:
            err_str = stderr.read().decode("utf-8")
            if "Invalid job id specified" in err_str:
@@ -336,13 +326,11 @@ class SlurmJobArray(Slurm):
            ):
                # retry 3 times
                raise RetrySignal(
-                    "Get error code
-                    % (ret, job.job_hash, err_str)
+                    f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
                )
            raise RuntimeError(
-                "status command
-                "job_id
-                % (job_id, err_str, ret)
+                f"status command {command} fails to execute."
+                f"job_id:{job_id} \n error message:{err_str}\n return code {ret}\n"
            )
        status_lines = stdout.read().decode("utf-8").split("\n")[:-1]
        status = []
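The Slurm changes keep the same two shell interactions, now built as plain f-strings: submission goes through `sbatch --parsable`, whose stdout is just the job id (optionally suffixed with `;cluster`), and status checks go through `squeue -o "%.18i %.2t" -j <job_id>`, whose last column is the short state code. A small sketch of that parsing, using invented sample output rather than real scheduler output:

    def parse_sbatch_parsable(stdout_text: str) -> str:
        # `sbatch --parsable` prints only "<jobid>" or "<jobid>;<cluster>".
        return stdout_text.strip().split(";")[0]

    def parse_squeue_state(stdout_text: str) -> str:
        # `squeue -o "%.18i %.2t" -j <id>` prints a header plus one line per job;
        # the last field of the last data line is the short state code (PD, R, CG, ...),
        # which check_status then maps onto JobStatus values.
        status_line = stdout_text.split("\n")[-2]  # same slicing as check_status above
        return status_line.split()[-1]

    print(parse_sbatch_parsable("12345;cluster0\n"))       # -> 12345
    print(parse_squeue_state("  JOBID ST\n  12345  R\n"))  # -> R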
dpdispatcher/run.py
ADDED
@@ -0,0 +1,172 @@
+import os
+import re
+import sys
+from glob import glob
+from hashlib import sha1
+
+from dpdispatcher.machine import Machine
+from dpdispatcher.submission import Resources, Submission, Task
+
+if sys.version_info >= (3, 11):
+    import tomllib
+else:
+    import tomli as tomllib
+from typing import List, Optional
+
+from dargs import Argument
+
+from dpdispatcher.arginfo import machine_dargs, resources_dargs, task_dargs
+
+REGEX = r"(?m)^# /// (?P<type>[a-zA-Z0-9-]+)$\s(?P<content>(^#(| .*)$\s)+)^# ///$"
+
+
+def read_pep723(script: str) -> Optional[dict]:
+    """Read a PEP 723 script metadata from a script string.
+
+    Parameters
+    ----------
+    script : str
+        Script content.
+
+    Returns
+    -------
+    dict
+        PEP 723 metadata.
+    """
+    name = "script"
+    matches = list(
+        filter(lambda m: m.group("type") == name, re.finditer(REGEX, script))
+    )
+    if len(matches) > 1:
+        # TODO: Add tests for scenarios where multiple script blocks are found
+        raise ValueError(f"Multiple {name} blocks found")
+    elif len(matches) == 1:
+        content = "".join(
+            line[2:] if line.startswith("# ") else line[1:]
+            for line in matches[0].group("content").splitlines(keepends=True)
+        )
+        return tomllib.loads(content)
+    else:
+        # TODO: Add tests for scenarios where no metadata is found
+        return None
+
+
+def pep723_args() -> Argument:
+    """Return the argument parser for PEP 723 metadata."""
+    machine_args = machine_dargs()
+    machine_args.fold_subdoc = True
+    machine_args.doc = "Machine configuration. See related documentation for details."
+    resources_args = resources_dargs(detail_kwargs=False)
+    resources_args.fold_subdoc = True
+    resources_args.doc = (
+        "Resources configuration. See related documentation for details."
+    )
+    task_args = task_dargs()
+    command_arg = task_args["command"]
+    command_arg.doc = (
+        "Python interpreter or launcher. No need to contain the Python script filename."
+    )
+    command_arg.default = "python"
+    command_arg.optional = True
+    task_args["task_work_path"].doc += " Can be a glob pattern."
+    task_args.name = "task_list"
+    task_args.doc = "List of tasks to execute."
+    task_args.repeat = True
+    task_args.dtype = (list,)
+    return Argument(
+        "pep723",
+        dtype=dict,
+        doc="PEP 723 metadata",
+        sub_fields=[
+            Argument(
+                "work_base",
+                dtype=str,
+                optional=True,
+                default="./",
+                doc="Base directory for the work",
+            ),
+            Argument(
+                "forward_common_files",
+                dtype=List[str],
+                optional=True,
+                default=[],
+                doc="Common files to forward to the remote machine",
+            ),
+            Argument(
+                "backward_common_files",
+                dtype=List[str],
+                optional=True,
+                default=[],
+                doc="Common files to backward from the remote machine",
+            ),
+            machine_args,
+            resources_args,
+            task_args,
+        ],
+    )
+
+
+def create_submission(metadata: dict, hash: str) -> Submission:
+    """Create a Submission instance from a PEP 723 metadata.
+
+    Parameters
+    ----------
+    metadata : dict
+        PEP 723 metadata.
+    hash : str
+        Submission hash.
+
+    Returns
+    -------
+    Submission
+        Submission instance.
+    """
+    base = pep723_args()
+    metadata = base.normalize_value(metadata, trim_pattern="_*")
+    base.check_value(metadata, strict=False)
+
+    tasks = []
+    for task in metadata["task_list"]:
+        task = task.copy()
+        task["command"] += f" $REMOTE_ROOT/script_{hash}.py"
+        task_work_path = os.path.join(
+            metadata["machine"]["local_root"],
+            metadata["work_base"],
+            task["task_work_path"],
+        )
+        if os.path.isdir(task_work_path):
+            tasks.append(Task.load_from_dict(task))
+        elif glob(task_work_path):
+            for file in glob(task_work_path):
+                tasks.append(Task.load_from_dict({**task, "task_work_path": file}))
+            # TODO: Add tests for scenarios where the task work path is a glob pattern
+        else:
+            # TODO: Add tests for scenarios where the task work path is not found
+            raise FileNotFoundError(f"Task work path {task_work_path} not found.")
+    return Submission(
+        work_base=metadata["work_base"],
+        forward_common_files=metadata["forward_common_files"],
+        backward_common_files=metadata["backward_common_files"],
+        machine=Machine.load_from_dict(metadata["machine"]),
+        resources=Resources.load_from_dict(metadata["resources"]),
+        task_list=tasks,
+    )
+
+
+def run_pep723(script: str):
+    """Run a PEP 723 script.
+
+    Parameters
+    ----------
+    script : str
+        Script content.
+    """
+    metadata = read_pep723(script)
+    if metadata is None:
+        raise ValueError("No PEP 723 metadata found.")
+    dpdispatcher_metadata = metadata["tool"]["dpdispatcher"]
+    script_hash = sha1(script.encode("utf-8")).hexdigest()
+    submission = create_submission(dpdispatcher_metadata, script_hash)
+    submission.machine.context.write_file(f"script_{script_hash}.py", script)
+    # write script
+    submission.run_submission()
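The new run.py pulls the standard PEP 723 `# /// script` block out of a Python file, reads a `[tool.dpdispatcher]` table from it, uploads the script itself as `script_<hash>.py`, and appends that remote path to each task's command; the accompanying entrypoints/run.py and dpdisp.py changes suggest this backs a new `run` subcommand. A hedged example of what such a script header could look like; the machine, queue, and path values below are placeholders, not defaults shipped with the package:

    # /// script
    # [tool.dpdispatcher]
    # work_base = "./"
    # forward_common_files = []
    # backward_common_files = []
    #
    # [tool.dpdispatcher.machine]
    # batch_type = "Slurm"
    # context_type = "SSHContext"
    # local_root = "./"
    # remote_root = "/scratch/jobs"              # placeholder path
    # [tool.dpdispatcher.machine.remote_profile]
    # hostname = "login.example.org"             # placeholder host
    # username = "user"
    #
    # [tool.dpdispatcher.resources]
    # number_node = 1
    # cpu_per_node = 4
    # gpu_per_node = 0
    # queue_name = "cpu"                         # placeholder queue
    # group_size = 1
    #
    # [[tool.dpdispatcher.task_list]]
    # command = "python"
    # task_work_path = "task_*"                  # glob patterns are accepted
    # forward_files = []
    # backward_files = []
    # ///
    print("payload executed on the remote machine")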
dpdispatcher/submission.py
CHANGED
@@ -55,7 +55,6 @@ class Submission:
         *,
         task_list=[],
     ):
-        # self.submission_list = submission_list
         self.local_root = None
         self.work_base = work_base
         self._abs_work_base = os.path.abspath(work_base)
@@ -324,8 +323,7 @@ class Submission:
         kwargs = {**{"clean": False}, **kwargs}
         if kwargs["clean"]:
             dlog.warning(
-                "Using async submission with `clean=True`, "
-                "job may fail in queue system"
+                "Using async submission with `clean=True`, job may fail in queue system"
             )
         loop = asyncio.get_event_loop()
         wrapped_submission = functools.partial(self.run_submission, **kwargs)
@@ -515,12 +513,9 @@ class Submission:
     def submission_from_json(cls, json_file_name="submission.json"):
         with open(json_file_name) as f:
             submission_dict = json.load(f)
-        # submission_dict = machine.context.read_file(json_file_name)
         submission = cls.deserialize(submission_dict=submission_dict, machine=None)
         return submission
 
-    # def check_if_recover()
-
     def try_recover_from_json(self):
         submission_file_name = f"{self.submission_hash}.json"
         if_recover = self.machine.context.check_file_exists(submission_file_name)
@@ -545,7 +540,6 @@ class Submission:
                    f"machine.context.remote_root:{self.machine.context.remote_root}; "
                    f"submission.work_base:{submission.work_base};"
                )
-            # self = submission.bind_machine(machine=self.machine)
        else:
            print(self.serialize())
            print(submission.serialize())
@@ -759,7 +753,6 @@ class Job:
        self.fail_count = 0
        self.job_uuid = uuid.uuid4()
 
-        # self.job_hash = self.get_hash()
        self.job_hash = self.get_hash()
        self.script_file_name = self.job_hash + ".sub"
 
@@ -863,9 +856,7 @@ class Job:
            self.submit_job()
            if self.job_state != JobStatus.unsubmitted:
                dlog.info(
-                    "job:{job_hash} re-submit after terminated; new job_id is {job_id}"
-                    job_hash=self.job_hash, job_id=self.job_id
-                )
+                    f"job:{self.job_hash} re-submit after terminated; new job_id is {self.job_id}"
                )
            time.sleep(0.2)
            self.get_job_state()
@@ -1124,9 +1115,9 @@ class Resources:
 
    @staticmethod
    def arginfo(detail_kwargs=True):
-        doc_number_node = "The number of
-        doc_cpu_per_node = "
-        doc_gpu_per_node = "
+        doc_number_node = "The number of nodes required for each `job`."
+        doc_cpu_per_node = "CPU numbers of each node assigned to each job."
+        doc_gpu_per_node = "GPU numbers of each node assigned to each job."
        doc_queue_name = "The queue name of batch job scheduler system."
        doc_group_size = "The number of `tasks` in a `job`. 0 means infinity."
        doc_custom_flags = "The extra lines pass to job submitting script header"
dpdispatcher/utils/dpcloudserver/client.py
CHANGED

@@ -142,10 +142,10 @@ class Client:
         res = self.get("/data/get_sts_token", {})
         # print('debug>>>>>>>>>>>>>', res)
         dlog.debug(f"debug: _get_oss_bucket: res:{res}")
-        auth = oss2.StsAuth(
+        auth = oss2.StsAuth(  # type: ignore[reportPossiblyUnboundVariable]
             res["AccessKeyId"], res["AccessKeySecret"], res["SecurityToken"]
         )
-        return oss2.Bucket(auth, endpoint, bucket_name)
+        return oss2.Bucket(auth, endpoint, bucket_name)  # type: ignore[reportPossiblyUnboundVariable]
 
     def download(self, oss_file, save_file, endpoint, bucket_name):
         bucket = self._get_oss_bucket(endpoint, bucket_name)
@@ -184,7 +184,7 @@ class Client:
         )
         bucket = self._get_oss_bucket(endpoint, bucket_name)
         total_size = os.path.getsize(zip_task_file)
-        part_size = determine_part_size(total_size, preferred_size=1000 * 1024)
+        part_size = determine_part_size(total_size, preferred_size=1000 * 1024)  # type: ignore[reportPossiblyUnboundVariable]
         upload_id = bucket.init_multipart_upload(oss_task_zip).upload_id
         parts = []
         with open(zip_task_file, "rb") as fileobj:
@@ -196,9 +196,9 @@ class Client:
                     oss_task_zip,
                     upload_id,
                     part_number,
-                    SizedFileAdapter(fileobj, num_to_upload),
+                    SizedFileAdapter(fileobj, num_to_upload),  # type: ignore[reportPossiblyUnboundVariable]
                 )
-                parts.append(PartInfo(part_number, result.etag))
+                parts.append(PartInfo(part_number, result.etag))  # type: ignore[reportPossiblyUnboundVariable]
                 offset += num_to_upload
                 part_number += 1
             # result = bucket.complete_multipart_upload(oss_task_zip, upload_id, parts)
@@ -278,7 +278,11 @@ class Client:
             return ""
         resp = requests.get(url, headers={"Range": f"bytes={self.last_log_offset}-"})
         self.last_log_offset += len(resp.content)
-
+        try:
+            return resp.content.decode("utf-8")
+        except Exception as e:
+            dlog.error(f"Error decoding job log: {e}", stack_info=ENABLE_STACK)
+            return ""
 
     def _get_job_log(self, job_id):
         ret = self.get(
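The `# type: ignore[reportPossiblyUnboundVariable]` markers added above are the usual workaround when a name comes from an optional import guarded by try/except: if the import fails, the name is unbound and a static checker flags every later use. A minimal sketch of that pattern, assuming this is why the suppressions were needed; the import block below is illustrative, not copied from client.py:

    try:
        import oss2  # optional dependency; may be missing at runtime
    except ImportError:
        pass

    def make_sts_auth(key_id: str, secret: str, token: str):
        # If the import above failed, `oss2` is unbound here, so the checker
        # reports it as possibly unbound; the diff silences this per call site.
        return oss2.StsAuth(key_id, secret, token)  # type: ignore[reportPossiblyUnboundVariable]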
dpdispatcher/utils/hdfs_cli.py
CHANGED
@@ -28,7 +28,7 @@ class HDFS:
             )
         except Exception as e:
             raise RuntimeError(
-                f"Cannot check existence of hdfs uri[{uri}]
+                f"Cannot check existence of hdfs uri[{uri}] with cmd[{cmd}]"
             ) from e
 
     @staticmethod
@@ -48,9 +48,7 @@ class HDFS:
                    f"with cmd[{cmd}]; ret[{ret}] output[{out}] stderr[{err}]"
                )
        except Exception as e:
-            raise RuntimeError(
-                f"Cannot remove hdfs uri[{uri}] " f"with cmd[{cmd}]"
-            ) from e
+            raise RuntimeError(f"Cannot remove hdfs uri[{uri}] with cmd[{cmd}]") from e
 
     @staticmethod
     def mkdir(uri):
@@ -70,7 +68,7 @@ class HDFS:
             )
         except Exception as e:
             raise RuntimeError(
-                f"Cannot mkdir of hdfs uri[{uri}]
+                f"Cannot mkdir of hdfs uri[{uri}] with cmd[{cmd}]"
             ) from e
 
     @staticmethod
@@ -80,7 +78,7 @@ class HDFS:
        """
        # Make sure local_path is accessible
        if not os.path.exists(local_path) or not os.access(local_path, os.R_OK):
-            raise RuntimeError(f"try to access local_path[{local_path}]
+            raise RuntimeError(f"try to access local_path[{local_path}] but failed")
        cmd = f"hadoop fs -copyFromLocal -f {local_path} {to_uri}"
        try:
            ret, out, err = run_cmd_with_all_output(cmd)
@@ -88,10 +86,8 @@ class HDFS:
                return True, out
            else:
                raise RuntimeError(
-                    "Cannot copy local[{}] to remote[{}] with cmd[{}]; "
-                    "ret[{}] output[{}] stderr[{}]"
-                    local_path, to_uri, cmd, ret, out, err
-                )
+                    f"Cannot copy local[{local_path}] to remote[{to_uri}] with cmd[{cmd}]; "
+                    f"ret[{ret}] output[{out}] stderr[{err}]"
                )
        except Exception as e:
            raise RuntimeError(
@@ -113,10 +109,8 @@ class HDFS:
                return True
            else:
                raise RuntimeError(
-                    "Cannot copy remote[{}] to local[{}] with cmd[{}]; "
-                    "ret[{}] output[{}] stderr[{}]"
-                    from_uri, local_path, cmd, ret, out, err
-                )
+                    f"Cannot copy remote[{from_uri}] to local[{local_path}] with cmd[{cmd}]; "
+                    f"ret[{ret}] output[{out}] stderr[{err}]"
                )
        except Exception as e:
            raise RuntimeError(
@@ -136,9 +130,7 @@ class HDFS:
                    f"cmd [{cmd}] ret[{ret}] output[{out}] stderr[{err}]"
                )
        except Exception as e:
-            raise RuntimeError(
-                f"Cannot read text from uri[{uri}]" f"cmd [{cmd}]"
-            ) from e
+            raise RuntimeError(f"Cannot read text from uri[{uri}]cmd [{cmd}]") from e
 
     @staticmethod
     def move(from_uri, to_uri):
@@ -155,6 +147,5 @@ class HDFS:
            )
        except Exception as e:
            raise RuntimeError(
-                f"Cannot move from_uri[{from_uri}] to "
-                f"to_uri[{to_uri}] with cmd[{cmd}]"
+                f"Cannot move from_uri[{from_uri}] to to_uri[{to_uri}] with cmd[{cmd}]"
            ) from e
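All of the HDFS helpers above share one shape: build a `hadoop fs ...` command line, run it, and wrap any failure in a RuntimeError that carries the command (and, where available, the return code and output), chained to the original exception with `from e`. A compact sketch of that shape; the run helper and the exact hadoop subcommand are illustrative stand-ins for run_cmd_with_all_output and the real methods:

    import subprocess

    def run_cmd(cmd: str):
        # Stand-in for dpdispatcher's run_cmd_with_all_output.
        proc = subprocess.run(cmd, shell=True, capture_output=True, text=True)
        return proc.returncode, proc.stdout, proc.stderr

    def exists(uri: str) -> bool:
        cmd = f"hadoop fs -test -e {uri}"  # illustrative subcommand
        try:
            ret, out, err = run_cmd(cmd)
            return ret == 0
        except Exception as e:
            # Same wrapping style as hdfs_cli: keep the uri and cmd in the
            # message and chain the original exception.
            raise RuntimeError(f"Cannot check existence of hdfs uri[{uri}] with cmd[{cmd}]") from e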
dpdispatcher/utils/utils.py
CHANGED
@@ -2,6 +2,7 @@ import base64
 import hashlib
 import hmac
 import os
+import shlex
 import struct
 import subprocess
 import time
@@ -89,6 +90,7 @@ def rsync(
     port: int = 22,
     key_filename: Optional[str] = None,
     timeout: Union[int, float] = 10,
+    proxy_command: Optional[str] = None,
 ):
     """Call rsync to transfer files.
 
@@ -104,6 +106,8 @@ def rsync(
         identity file name
     timeout : int, default=10
         timeout for ssh
+    proxy_command : str, optional
+        ProxyCommand to use for SSH connection
 
     Raises
     ------
@@ -124,20 +128,30 @@ def rsync(
     ]
     if key_filename is not None:
         ssh_cmd.extend(["-i", key_filename])
+
+    # Use proxy_command if provided
+    if proxy_command is not None:
+        ssh_cmd.extend(["-o", f"ProxyCommand={proxy_command}"])
+
+    # Properly escape the SSH command for rsync's -e option
+    ssh_cmd_str = " ".join(shlex.quote(part) for part in ssh_cmd)
+
     cmd = [
         "rsync",
-        # -
-        # -z: compress
-        "-
+        # -r: recursive, -l: links, -p: perms, -t: times, -D: devices/specials
+        # -z: compress (exclude -o: owner, -g: group to avoid permission issues)
+        "-rlptDz",
         "-e",
-
+        ssh_cmd_str,
         "-q",
         from_file,
         to_file,
     ]
-
+    # Convert to string for shell=True
+    cmd_str = " ".join(shlex.quote(arg) for arg in cmd)
+    ret, out, err = run_cmd_with_all_output(cmd_str, shell=True)
     if ret != 0:
-        raise RuntimeError(f"Failed to run {
+        raise RuntimeError(f"Failed to run {cmd_str}: {err}")
 
 
 class RetrySignal(Exception):
@@ -191,7 +205,7 @@ def retry(
             else:
                 # raise all exceptions
                 raise RuntimeError(
-                    "Failed to run
+                    f"Failed to run {func.__name__} for {current_retry} times"
                 ) from errors[-1]
 
     return wrapper
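With the new proxy_command parameter, the ssh invocation is assembled first, each part shell-quoted, and passed to rsync via -e; the rsync argument list is then itself quoted and joined because it is executed with shell=True. A hedged sketch of the resulting command line: hosts, key path, and file paths are placeholders, the base ssh options are illustrative, and only the -i and ProxyCommand handling mirror the diff:

    import shlex

    ssh_cmd = [
        "ssh",
        "-p", "22",
        "-i", "/home/user/.ssh/id_rsa",                       # placeholder key file
        "-o", "ProxyCommand=ssh -W %h:%p jump.example.org",   # placeholder jump host
    ]
    ssh_cmd_str = " ".join(shlex.quote(part) for part in ssh_cmd)

    cmd = [
        "rsync",
        "-rlptDz",          # recursive, preserve links/perms/times/devices, compress
        "-e", ssh_cmd_str,  # the quoted ssh command becomes rsync's transport
        "-q",
        "local_dir/",
        "user@remote.example.org:/scratch/dir/",
    ]
    # This joined string is what would be handed to the shell=True runner.
    print(" ".join(shlex.quote(arg) for arg in cmd))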