PyPI - dpdispatcher - Versions diffs - 0.6.6__py3-none-any.whl → 1.0.0__py3-none-any.whl - Mend

dpdispatcher 0.6.6__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

dpdispatcher/_version.py +22 -4
dpdispatcher/base_context.py +60 -1
dpdispatcher/contexts/dp_cloud_server_context.py +8 -1
dpdispatcher/contexts/hdfs_context.py +5 -0
dpdispatcher/contexts/lazy_local_context.py +2 -19
dpdispatcher/contexts/local_context.py +57 -31
dpdispatcher/contexts/openapi_context.py +78 -14
dpdispatcher/contexts/ssh_context.py +54 -47
dpdispatcher/machine.py +13 -1
dpdispatcher/machines/JH_UniScheduler.py +2 -6
dpdispatcher/machines/distributed_shell.py +2 -4
dpdispatcher/machines/fugaku.py +0 -3
dpdispatcher/machines/lsf.py +1 -5
dpdispatcher/machines/openapi.py +48 -15
dpdispatcher/machines/pbs.py +14 -16
dpdispatcher/machines/shell.py +7 -11
dpdispatcher/machines/slurm.py +18 -30
dpdispatcher/submission.py +4 -11
dpdispatcher/utils/dpcloudserver/client.py +10 -6
dpdispatcher/utils/hdfs_cli.py +6 -11
dpdispatcher/utils/utils.py +21 -7
{dpdispatcher-0.6.6.dist-info → dpdispatcher-1.0.0.dist-info}/METADATA +34 -29
dpdispatcher-1.0.0.dist-info/RECORD +49 -0
{dpdispatcher-0.6.6.dist-info → dpdispatcher-1.0.0.dist-info}/WHEEL +1 -1
dpdispatcher-0.6.6.dist-info/RECORD +0 -49
{dpdispatcher-0.6.6.dist-info → dpdispatcher-1.0.0.dist-info}/entry_points.txt +0 -0
{dpdispatcher-0.6.6.dist-info → dpdispatcher-1.0.0.dist-info/licenses}/LICENSE +0 -0
{dpdispatcher-0.6.6.dist-info → dpdispatcher-1.0.0.dist-info}/top_level.txt +0 -0

dpdispatcher/machine.py CHANGED Viewed

@@ -161,6 +161,9 @@ class Machine(metaclass=ABCMeta):
             machine_dict["remote_profile"] = self.context.remote_profile
         else:
             machine_dict["remote_profile"] = {}
+        # normalize the dict
+        base = self.arginfo()
+        machine_dict = base.normalize_value(machine_dict, trim_pattern="_*")
         return machine_dict
     def __eq__(self, other):
@@ -224,7 +227,7 @@ class Machine(metaclass=ABCMeta):
         return if_recover
     @abstractmethod
-    def check_finish_tag(self, **kwargs):
+    def check_finish_tag(self, job):
         raise NotImplementedError(
             "abstract method check_finish_tag should be implemented by derived class"
         )
@@ -265,6 +268,15 @@ class Machine(metaclass=ABCMeta):
         export_envs_part = ""
         envs = job.resources.envs
+        envs = {
+            # export resources information to the environment variables
+            "DPDISPATCHER_NUMBER_NODE": job.resources.number_node,
+            "DPDISPATCHER_CPU_PER_NODE": job.resources.cpu_per_node,
+            "DPDISPATCHER_GPU_PER_NODE": job.resources.gpu_per_node,
+            "DPDISPATCHER_QUEUE_NAME": job.resources.queue_name,
+            "DPDISPATCHER_GROUP_SIZE": job.resources.group_size,
+            **envs,
+        }
         for k, v in envs.items():
             if isinstance(v, list):
                 for each_value in v:

dpdispatcher/machines/JH_UniScheduler.py CHANGED Viewed

@@ -39,7 +39,7 @@ class JH_UniScheduler(Machine):
         custom_gpu_line = resources.kwargs.get("custom_gpu_line", None)
         if not custom_gpu_line:
             script_header_dict["JH_UniScheduler_number_gpu_line"] = (
-                "" f"#JSUB -gpgpu {resources.gpu_per_node}"
+                f"#JSUB -gpgpu {resources.gpu_per_node}"
             )
         else:
             script_header_dict["JH_UniScheduler_number_gpu_line"] = custom_gpu_line
@@ -84,9 +84,6 @@ class JH_UniScheduler(Machine):
         self.context.write_file(job_id_name, job_id)
         return job_id
-    def default_resources(self, resources):
-        pass
     @retry()
     def check_status(self, job):
         try:
@@ -105,8 +102,7 @@ class JH_UniScheduler(Machine):
         elif ret != 0:
             # just retry when any unknown error raised.
             raise RetrySignal(
-                "Get error code %d in checking status through ssh with job: %s . message: %s"
-                % (ret, job.job_hash, err_str)
+                f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
             )
         status_out = stdout.read().decode("utf-8").split("\n")
         if len(status_out) < 2:

dpdispatcher/machines/distributed_shell.py CHANGED Viewed

@@ -181,8 +181,7 @@ class DistributedShell(Machine):
         if ret != 0:
             err_str = stderr.decode("utf-8")
             raise RuntimeError(
-                "Command squeue fails to execute, error message:%s\nreturn code %d\n"
-                % (err_str, ret)
+                f"Command {cmd} fails to execute, error message:{err_str}\nreturn code {ret}\n"
             )
         job_id = int(stdout.decode("utf-8").strip())
@@ -200,8 +199,7 @@ class DistributedShell(Machine):
         if ret != 0:
             err_str = stderr.decode("utf-8")
             raise RuntimeError(
-                "Command fails to execute, error message:%s\nreturn code %d\n"
-                % (err_str, ret)
+                f"Command fails to execute, error message:{err_str}\nreturn code {ret}\n"
             )
         if_job_exists = bool(stdout.decode("utf-8").strip())

dpdispatcher/machines/fugaku.py CHANGED Viewed

@@ -67,9 +67,6 @@ class Fugaku(Machine):
         self.context.write_file(job_id_name, job_id)
         return job_id
-    def default_resources(self, resources):
-        pass
     def check_status(self, job):
         job_id = job.job_id
         if job_id == "":

dpdispatcher/machines/lsf.py CHANGED Viewed

@@ -102,9 +102,6 @@ class LSF(Machine):
         return job_id
     # TODO: derive abstract methods
-    def default_resources(self, resources):
-        pass
     def sub_script_cmd(self, res):
         pass
@@ -129,8 +126,7 @@ class LSF(Machine):
         elif ret != 0:
             # just retry when any unknown error raised.
             raise RetrySignal(
-                "Get error code %d in checking status through ssh with job: %s . message: %s"
-                % (ret, job.job_hash, err_str)
+                f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
             )
         status_out = stdout.read().decode("utf-8").split("\n")
         if len(status_out) < 2:

dpdispatcher/machines/openapi.py CHANGED Viewed

@@ -1,14 +1,13 @@
 import os
 import shutil
 import time
+from zipfile import ZipFile
 from dpdispatcher.utils.utils import customized_script_header_template
 try:
-    from bohriumsdk.client import Client
-    from bohriumsdk.job import Job
-    from bohriumsdk.storage import Storage
-    from bohriumsdk.util import Util
+    from bohrium import Bohrium
+    from bohrium.resources import Job, Tiefblue
 except ModuleNotFoundError:
     found_bohriumsdk = False
 else:
@@ -23,6 +22,12 @@ shell_script_header_template = """
 """
+def unzip_file(zip_file, out_dir="./"):
+    obj = ZipFile(zip_file, "r")
+    for item in obj.namelist():
+        obj.extract(item, out_dir)
 class OpenAPI(Machine):
     def __init__(self, context):
         if not found_bohriumsdk:
@@ -35,9 +40,35 @@ class OpenAPI(Machine):
         self.grouped = self.remote_profile.get("grouped", True)
         self.retry_count = self.remote_profile.get("retry_count", 3)
         self.ignore_exit_code = context.remote_profile.get("ignore_exit_code", True)
-        self.client = Client()
-        self.job = Job(client=self.client)
-        self.storage = Storage(client=self.client)
+        access_key = (
+            self.remote_profile.get("access_key", None)
+            or os.getenv("BOHRIUM_ACCESS_KEY", None)
+            or os.getenv("ACCESS_KEY", None)
+        )
+        project_id = (
+            self.remote_profile.get("project_id", None)
+            or os.getenv("BOHRIUM_PROJECT_ID", None)
+            or os.getenv("PROJECT_ID", None)
+        )
+        app_key = (
+            self.remote_profile.get("app_key", None)
+            or os.getenv("BOHRIUM_APP_KEY", None)
+            or os.getenv("APP_KEY", None)
+        )
+        if access_key is None:
+            raise ValueError(
+                "remote_profile must contain 'access_key' or set environment variable 'BOHRIUM_ACCESS_KEY'"
+            )
+        if project_id is None:
+            raise ValueError(
+                "remote_profile must contain 'project_id' or set environment variable 'BOHRIUM_PROJECT_ID'"
+            )
+        self.client = Bohrium(  # type: ignore[reportPossiblyUnboundVariable]
+            access_key=access_key, project_id=project_id, app_key=app_key
+        )
+        self.storage = Tiefblue()  # type: ignore[reportPossiblyUnboundVariable]
+        self.job = Job(client=self.client)  # type: ignore[reportPossiblyUnboundVariable]
         self.group_id = None
     def gen_script(self, job):
@@ -98,11 +129,13 @@ class OpenAPI(Machine):
             ),
             "out_files": self._gen_backward_files_list(job),
             "platform": self.remote_profile.get("platform", "ali"),
-            "image_address": self.remote_profile.get("image_address", ""),
+            "image_name": self.remote_profile.get("image_address", ""),
         }
-        if job.job_state == JobStatus.unsubmitted:
-            openapi_params["job_id"] = job.job_id
+        if "real_user_id" in self.remote_profile:
+            openapi_params["real_user_id"] = self.remote_profile["real_user_id"]
+        if "session_id" in self.remote_profile:
+            openapi_params["session_id"] = self.remote_profile["session_id"]
+        openapi_params["job_id"] = job.job_id
         data = self.job.insert(**openapi_params)
         job.job_id = data.get("jobId", 0)  # type: ignore
@@ -152,8 +185,8 @@ class OpenAPI(Machine):
             self.ignore_exit_code,
         )
         if job_state == JobStatus.finished:
-            job_log = self.job.log(job_id)
             if self.remote_profile.get("output_log"):
+                job_log = self.job.log(job_id)
                 print(job_log, end="")
             self._download_job(job)
         elif self.remote_profile.get("output_log") and job_state == JobStatus.running:
@@ -163,14 +196,14 @@ class OpenAPI(Machine):
     def _download_job(self, job):
         data = self.job.detail(job.job_id)
-        job_url = data["jobFiles"]["outFiles"][0]["url"]  # type: ignore
+        job_url = data["resultUrl"]  # type: ignore
         if not job_url:
             return
         job_hash = job.job_hash
         result_filename = job_hash + "_back.zip"
         target_result_zip = os.path.join(self.context.local_root, result_filename)
         self.storage.download_from_url(job_url, target_result_zip)
-        Util.unzip_file(target_result_zip, out_dir=self.context.local_root)
+        unzip_file(target_result_zip, out_dir=self.context.local_root)
         try:
             os.makedirs(os.path.join(self.context.local_root, "backup"), exist_ok=True)
             shutil.move(
@@ -213,7 +246,7 @@ class OpenAPI(Machine):
         if status not in map_dict:
             dlog.error(f"unknown job status {status}")
             return JobStatus.unknown
-        if status == -1 and exit_code != 0 and ignore_exit_code:
+        if status == -1 and ignore_exit_code:
             return JobStatus.finished
         return map_dict[status]

dpdispatcher/machines/pbs.py CHANGED Viewed

@@ -69,14 +69,12 @@ class PBS(Machine):
         self.context.write_file(job_id_name, job_id)
         return job_id
-    def default_resources(self, resources):
-        pass
     def check_status(self, job):
         job_id = job.job_id
         if job_id == "":
             return JobStatus.unsubmitted
-        ret, stdin, stdout, stderr = self.context.block_call("qstat -x " + job_id)
+        command = "qstat -x " + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         err_str = stderr.read().decode("utf-8")
         if ret != 0:
             if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
@@ -86,8 +84,7 @@ class PBS(Machine):
                     return JobStatus.terminated
             else:
                 raise RuntimeError(
-                    "status command qstat fails to execute. erro info: %s return code %d"
-                    % (err_str, ret)
+                    f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
                 )
         status_line = stdout.read().decode("utf-8").split("\n")[-2]
         status_word = status_line.split()[-2]
@@ -126,7 +123,8 @@ class Torque(PBS):
         job_id = job.job_id
         if job_id == "":
             return JobStatus.unsubmitted
-        ret, stdin, stdout, stderr = self.context.block_call("qstat -l " + job_id)
+        command = "qstat -l " + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         err_str = stderr.read().decode("utf-8")
         if ret != 0:
             if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
@@ -136,8 +134,7 @@ class Torque(PBS):
                     return JobStatus.terminated
             else:
                 raise RuntimeError(
-                    "status command qstat fails to execute. erro info: %s return code %d"
-                    % (err_str, ret)
+                    f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
                 )
         status_line = stdout.read().decode("utf-8").split("\n")[-2]
         status_word = status_line.split()[-2]
@@ -255,19 +252,18 @@ class SGE(PBS):
         self.context.write_file(job_id_name, job_id)
         return job_id
-    def default_resources(self, resources):
-        pass
     def check_status(self, job):
+        ### https://softpanorama.org/HPC/Grid_engine/Queues/queue_states.shtml
         job_id = job.job_id
         status_line = None
         if job_id == "":
             return JobStatus.unsubmitted
-        ret, stdin, stdout, stderr = self.context.block_call("qstat")
+        command = "qstat"
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         err_str = stderr.read().decode("utf-8")
         if ret != 0:
             raise RuntimeError(
-                f"status command qstat fails to execute. erro info: {err_str} return code {ret}"
+                f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
             )
         status_text_list = stdout.read().decode("utf-8").split("\n")
         for txt in status_text_list:
@@ -291,10 +287,12 @@ class SGE(PBS):
         else:
             status_word = status_line.split()[4]
             # dlog.info (status_word)
-            if status_word in ["qw"]:
+            if status_word in ["qw", "hqw", "t"]:
                 return JobStatus.waiting
-            elif status_word in ["r"]:
+            elif status_word in ["r", "Rr"]:
                 return JobStatus.running
+            elif status_word in ["Eqw", "dr", "dt"]:
+                return JobStatus.terminated
             else:
                 return JobStatus.unknown

dpdispatcher/machines/shell.py CHANGED Viewed

@@ -38,14 +38,12 @@ class Shell(Machine):
         script_run_str = self.gen_script_command(job)
         script_run_file_name = f"{job.script_file_name}.run"
         self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
-        ret, stdin, stdout, stderr = self.context.block_call(
-            f"cd {shlex.quote(self.context.remote_root)} && {{ nohup bash {script_file_name} 1>>{output_name} 2>>{output_name} & }} && echo $!"
-        )
+        cmd = f"cd {shlex.quote(self.context.remote_root)} && {{ nohup bash {script_file_name} 1>>{output_name} 2>>{output_name} & }} && echo $!"
+        ret, stdin, stdout, stderr = self.context.block_call(cmd)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             raise RuntimeError(
-                "status command squeue fails to execute\nerror message:%s\nreturn code %d\n"
-                % (err_str, ret)
+                f"status command {cmd} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
             )
         job_id = int(stdout.read().decode("utf-8").strip())
         self.context.write_file(job_id_name, str(job_id))
@@ -62,9 +60,6 @@ class Shell(Machine):
         # self.context.write_file(job_id_name, job_id)
         # return job_id
-    def default_resources(self, resources):
-        pass
     def check_status(self, job):
         job_id = job.job_id
         # print('shell.check_status.job_id', job_id)
@@ -73,14 +68,15 @@ class Shell(Machine):
             return JobStatus.unsubmitted
         # mark defunct process as terminated
-        ret, stdin, stdout, stderr = self.context.block_call(
+        cmd = (
+            r"""command -v ps >/dev/null 2>&1 || { echo >&2 "I require ps but it's not installed. Aborting."; exit 1; };"""
             f"if ps -p {job_id} > /dev/null && ! (ps -o command -p {job_id} | grep defunct >/dev/null) ; then echo 1; fi"
         )
+        ret, stdin, stdout, stderr = self.context.block_call(cmd)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             raise RuntimeError(
-                "status command squeue fails to execute\nerror message:%s\nreturn code %d\n"
-                % (err_str, ret)
+                f"status command {cmd} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
             )
         if_job_exists = bool(stdout.read().decode("utf-8").strip())

dpdispatcher/machines/slurm.py CHANGED Viewed

@@ -83,13 +83,12 @@ class Slurm(Machine):
         script_run_file_name = f"{job.script_file_name}.run"
         self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
         # self.context.write_file(fname=os.path.join(self.context.submission.work_base, script_file_name), write_str=script_str)
-        ret, stdin, stdout, stderr = self.context.block_call(
-            "cd {} && {} {}".format(
-                shlex.quote(self.context.remote_root),
-                "sbatch",
-                shlex.quote(script_file_name),
-            )
+        command = "cd {} && {} {}".format(
+            shlex.quote(self.context.remote_root),
+            "sbatch --parsable",
+            shlex.quote(script_file_name),
         )
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             if (
@@ -98,8 +97,7 @@ class Slurm(Machine):
             ):
                 # server network error, retry 3 times
                 raise RetrySignal(
-                    "Get error code %d in submitting through ssh with job: %s . message: %s"
-                    % (ret, job.job_hash, err_str)
+                    f"Get error code {ret} in submitting with job: {job.job_hash} . message: {err_str}"
                 )
             elif (
                 "Job violates accounting/QOS policy" in err_str
@@ -110,8 +108,7 @@ class Slurm(Machine):
                 # job number exceeds, skip the submitting
                 return ""
             raise RuntimeError(
-                "status command squeue fails to execute\nerror message:%s\nreturn code %d\n"
-                % (err_str, ret)
+                f"command {command} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
             )
         subret = stdout.readlines()
         # --parsable
@@ -121,17 +118,13 @@ class Slurm(Machine):
         self.context.write_file(job_id_name, job_id)
         return job_id
-    def default_resources(self, resources):
-        pass
     @retry()
     def check_status(self, job):
         job_id = job.job_id
         if job_id == "":
             return JobStatus.unsubmitted
-        ret, stdin, stdout, stderr = self.context.block_call(
-            'squeue -o "%.18i %.2t" -j ' + job_id
-        )
+        command = 'squeue -o "%.18i %.2t" -j ' + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             if "Invalid job id specified" in err_str:
@@ -147,13 +140,11 @@ class Slurm(Machine):
             ):
                 # retry 3 times
                 raise RetrySignal(
-                    "Get error code %d in checking status through ssh with job: %s . message: %s"
-                    % (ret, job.job_hash, err_str)
+                    f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
                 )
             raise RuntimeError(
-                "status command squeue fails to execute."
-                "job_id:%s \n error message:%s\n return code %d\n"
-                % (job_id, err_str, ret)
+                f"status command {command} fails to execute."
+                f"job_id:{job_id} \n error message:{err_str}\n return code {ret}\n"
             )
         status_line = stdout.read().decode("utf-8").split("\n")[-2]
         status_word = status_line.split()[-1]
@@ -257,7 +248,7 @@ class SlurmJobArray(Slurm):
             return super().gen_script_header(job) + "\n#SBATCH --array={}".format(
                 ",".join(map(str, job_array))
             )
-        return super().gen_script_header(job) + "\n#SBATCH --array=0-%d" % (
+        return super().gen_script_header(job) + "\n#SBATCH --array=0-%s" % (
             math.ceil(len(job.job_task_list) / slurm_job_size) - 1
         )
@@ -319,9 +310,8 @@ class SlurmJobArray(Slurm):
         job_id = job.job_id
         if job_id == "":
             return JobStatus.unsubmitted
-        ret, stdin, stdout, stderr = self.context.block_call(
-            'squeue -h -o "%.18i %.2t" -j ' + job_id
-        )
+        command = 'squeue -h -o "%.18i %.2t" -j ' + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
             if "Invalid job id specified" in err_str:
@@ -336,13 +326,11 @@ class SlurmJobArray(Slurm):
             ):
                 # retry 3 times
                 raise RetrySignal(
-                    "Get error code %d in checking status through ssh with job: %s . message: %s"
-                    % (ret, job.job_hash, err_str)
+                    f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
                 )
             raise RuntimeError(
-                "status command squeue fails to execute."
-                "job_id:%s \n error message:%s\n return code %d\n"
-                % (job_id, err_str, ret)
+                f"status command {command} fails to execute."
+                f"job_id:{job_id} \n error message:{err_str}\n return code {ret}\n"
             )
         status_lines = stdout.read().decode("utf-8").split("\n")[:-1]
         status = []

dpdispatcher/submission.py CHANGED Viewed

@@ -55,7 +55,6 @@ class Submission:
         *,
         task_list=[],
     ):
-        # self.submission_list = submission_list
         self.local_root = None
         self.work_base = work_base
         self._abs_work_base = os.path.abspath(work_base)
@@ -324,8 +323,7 @@ class Submission:
         kwargs = {**{"clean": False}, **kwargs}
         if kwargs["clean"]:
             dlog.warning(
-                "Using async submission with `clean=True`, "
-                "job may fail in queue system"
+                "Using async submission with `clean=True`, job may fail in queue system"
             )
         loop = asyncio.get_event_loop()
         wrapped_submission = functools.partial(self.run_submission, **kwargs)
@@ -515,12 +513,9 @@ class Submission:
     def submission_from_json(cls, json_file_name="submission.json"):
         with open(json_file_name) as f:
             submission_dict = json.load(f)
-        # submission_dict = machine.context.read_file(json_file_name)
         submission = cls.deserialize(submission_dict=submission_dict, machine=None)
         return submission
-    # def check_if_recover()
     def try_recover_from_json(self):
         submission_file_name = f"{self.submission_hash}.json"
         if_recover = self.machine.context.check_file_exists(submission_file_name)
@@ -545,7 +540,6 @@ class Submission:
                     f"machine.context.remote_root:{self.machine.context.remote_root}; "
                     f"submission.work_base:{submission.work_base};"
                 )
-                # self = submission.bind_machine(machine=self.machine)
             else:
                 print(self.serialize())
                 print(submission.serialize())
@@ -759,7 +753,6 @@ class Job:
         self.fail_count = 0
         self.job_uuid = uuid.uuid4()
-        # self.job_hash = self.get_hash()
         self.job_hash = self.get_hash()
         self.script_file_name = self.job_hash + ".sub"
@@ -1122,9 +1115,9 @@ class Resources:
     @staticmethod
     def arginfo(detail_kwargs=True):
-        doc_number_node = "The number of node need for each `job`"
-        doc_cpu_per_node = "cpu numbers of each node assigned to each job."
-        doc_gpu_per_node = "gpu numbers of each node assigned to each job."
+        doc_number_node = "The number of nodes required for each `job`."
+        doc_cpu_per_node = "CPU numbers of each node assigned to each job."
+        doc_gpu_per_node = "GPU numbers of each node assigned to each job."
         doc_queue_name = "The queue name of batch job scheduler system."
         doc_group_size = "The number of `tasks` in a `job`. 0 means infinity."
         doc_custom_flags = "The extra lines pass to job submitting script header"

dpdispatcher/utils/dpcloudserver/client.py CHANGED Viewed

@@ -142,10 +142,10 @@ class Client:
         res = self.get("/data/get_sts_token", {})
         # print('debug>>>>>>>>>>>>>', res)
         dlog.debug(f"debug: _get_oss_bucket: res:{res}")
-        auth = oss2.StsAuth(
+        auth = oss2.StsAuth(  # type: ignore[reportPossiblyUnboundVariable]
             res["AccessKeyId"], res["AccessKeySecret"], res["SecurityToken"]
         )
-        return oss2.Bucket(auth, endpoint, bucket_name)
+        return oss2.Bucket(auth, endpoint, bucket_name)  # type: ignore[reportPossiblyUnboundVariable]
     def download(self, oss_file, save_file, endpoint, bucket_name):
         bucket = self._get_oss_bucket(endpoint, bucket_name)
@@ -184,7 +184,7 @@ class Client:
         )
         bucket = self._get_oss_bucket(endpoint, bucket_name)
         total_size = os.path.getsize(zip_task_file)
-        part_size = determine_part_size(total_size, preferred_size=1000 * 1024)
+        part_size = determine_part_size(total_size, preferred_size=1000 * 1024)  # type: ignore[reportPossiblyUnboundVariable]
         upload_id = bucket.init_multipart_upload(oss_task_zip).upload_id
         parts = []
         with open(zip_task_file, "rb") as fileobj:
@@ -196,9 +196,9 @@ class Client:
                     oss_task_zip,
                     upload_id,
                     part_number,
-                    SizedFileAdapter(fileobj, num_to_upload),
+                    SizedFileAdapter(fileobj, num_to_upload),  # type: ignore[reportPossiblyUnboundVariable]
                 )
-                parts.append(PartInfo(part_number, result.etag))
+                parts.append(PartInfo(part_number, result.etag))  # type: ignore[reportPossiblyUnboundVariable]
                 offset += num_to_upload
                 part_number += 1
         # result = bucket.complete_multipart_upload(oss_task_zip, upload_id, parts)
@@ -278,7 +278,11 @@ class Client:
             return ""
         resp = requests.get(url, headers={"Range": f"bytes={self.last_log_offset}-"})
         self.last_log_offset += len(resp.content)
-        return resp.content.decode("utf-8")
+        try:
+            return resp.content.decode("utf-8")
+        except Exception as e:
+            dlog.error(f"Error decoding job log: {e}", stack_info=ENABLE_STACK)
+            return ""
     def _get_job_log(self, job_id):
         ret = self.get(

dpdispatcher/utils/hdfs_cli.py CHANGED Viewed

@@ -28,7 +28,7 @@ class HDFS:
                 )
         except Exception as e:
             raise RuntimeError(
-                f"Cannot check existence of hdfs uri[{uri}] " f"with cmd[{cmd}]"
+                f"Cannot check existence of hdfs uri[{uri}] with cmd[{cmd}]"
             ) from e
     @staticmethod
@@ -48,9 +48,7 @@ class HDFS:
                     f"with cmd[{cmd}]; ret[{ret}] output[{out}] stderr[{err}]"
                 )
         except Exception as e:
-            raise RuntimeError(
-                f"Cannot remove hdfs uri[{uri}] " f"with cmd[{cmd}]"
-            ) from e
+            raise RuntimeError(f"Cannot remove hdfs uri[{uri}] with cmd[{cmd}]") from e
     @staticmethod
     def mkdir(uri):
@@ -70,7 +68,7 @@ class HDFS:
                 )
         except Exception as e:
             raise RuntimeError(
-                f"Cannot mkdir of hdfs uri[{uri}] " f"with cmd[{cmd}]"
+                f"Cannot mkdir of hdfs uri[{uri}] with cmd[{cmd}]"
             ) from e
     @staticmethod
@@ -80,7 +78,7 @@ class HDFS:
         """
         # Make sure local_path is accessible
         if not os.path.exists(local_path) or not os.access(local_path, os.R_OK):
-            raise RuntimeError(f"try to access local_path[{local_path}] " "but failed")
+            raise RuntimeError(f"try to access local_path[{local_path}] but failed")
         cmd = f"hadoop fs -copyFromLocal -f {local_path} {to_uri}"
         try:
             ret, out, err = run_cmd_with_all_output(cmd)
@@ -132,9 +130,7 @@ class HDFS:
                     f"cmd [{cmd}] ret[{ret}] output[{out}] stderr[{err}]"
                 )
         except Exception as e:
-            raise RuntimeError(
-                f"Cannot read text from uri[{uri}]" f"cmd [{cmd}]"
-            ) from e
+            raise RuntimeError(f"Cannot read text from uri[{uri}]cmd [{cmd}]") from e
     @staticmethod
     def move(from_uri, to_uri):
@@ -151,6 +147,5 @@ class HDFS:
                 )
         except Exception as e:
             raise RuntimeError(
-                f"Cannot move from_uri[{from_uri}] to "
-                f"to_uri[{to_uri}] with cmd[{cmd}]"
+                f"Cannot move from_uri[{from_uri}] to to_uri[{to_uri}] with cmd[{cmd}]"
             ) from e

dpdispatcher 0.6.6__py3-none-any.whl → 1.0.0__py3-none-any.whl

dpdispatcher 0.6.6__py3-none-any.whl → 1.0.0__py3-none-any.whl