dpdispatcher 0.6.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dpdispatcher/_version.py +22 -4
- dpdispatcher/base_context.py +60 -1
- dpdispatcher/contexts/__init__.py +1 -0
- dpdispatcher/contexts/dp_cloud_server_context.py +8 -1
- dpdispatcher/contexts/hdfs_context.py +16 -11
- dpdispatcher/contexts/lazy_local_context.py +2 -19
- dpdispatcher/contexts/local_context.py +77 -43
- dpdispatcher/contexts/openapi_context.py +78 -14
- dpdispatcher/contexts/ssh_context.py +117 -98
- dpdispatcher/dlog.py +9 -5
- dpdispatcher/dpcloudserver/__init__.py +0 -0
- dpdispatcher/dpcloudserver/client.py +7 -0
- dpdispatcher/dpdisp.py +21 -0
- dpdispatcher/entrypoints/run.py +9 -0
- dpdispatcher/entrypoints/submission.py +21 -1
- dpdispatcher/machine.py +15 -4
- dpdispatcher/machines/JH_UniScheduler.py +171 -0
- dpdispatcher/machines/__init__.py +1 -0
- dpdispatcher/machines/distributed_shell.py +6 -10
- dpdispatcher/machines/fugaku.py +9 -12
- dpdispatcher/machines/lsf.py +3 -9
- dpdispatcher/machines/openapi.py +48 -15
- dpdispatcher/machines/pbs.py +183 -20
- dpdispatcher/machines/shell.py +7 -16
- dpdispatcher/machines/slurm.py +30 -42
- dpdispatcher/run.py +172 -0
- dpdispatcher/submission.py +5 -14
- dpdispatcher/utils/dpcloudserver/client.py +10 -6
- dpdispatcher/utils/hdfs_cli.py +10 -19
- dpdispatcher/utils/utils.py +21 -7
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/METADATA +35 -29
- dpdispatcher-1.0.0.dist-info/RECORD +49 -0
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/WHEEL +1 -1
- dpdispatcher-0.6.1.dist-info/RECORD +0 -44
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/entry_points.txt +0 -0
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info/licenses}/LICENSE +0 -0
- {dpdispatcher-0.6.1.dist-info → dpdispatcher-1.0.0.dist-info}/top_level.txt +0 -0
dpdispatcher/machines/JH_UniScheduler.py
ADDED
@@ -0,0 +1,171 @@
+import shlex
+from typing import List
+
+from dargs import Argument
+
+from dpdispatcher.dlog import dlog
+from dpdispatcher.machine import Machine
+from dpdispatcher.utils.job_status import JobStatus
+from dpdispatcher.utils.utils import (
+    RetrySignal,
+    customized_script_header_template,
+    retry,
+)
+
+JH_UniScheduler_script_header_template = """\
+#!/bin/bash -l
+#JSUB -e %J.err
+#JSUB -o %J.out
+{JH_UniScheduler_nodes_line}
+{JH_UniScheduler_ptile_line}
+{JH_UniScheduler_partition_line}
+{JH_UniScheduler_number_gpu_line}"""
+
+
+class JH_UniScheduler(Machine):
+    """JH_UniScheduler batch."""
+
+    def gen_script(self, job):
+        JH_UniScheduler_script = super().gen_script(job)
+        return JH_UniScheduler_script
+
+    def gen_script_header(self, job):
+        resources = job.resources
+        script_header_dict = {
+            "JH_UniScheduler_nodes_line": f"#JSUB -n {resources.number_node * resources.cpu_per_node}",
+            "JH_UniScheduler_ptile_line": f"#JSUB -R 'span[ptile={resources.cpu_per_node}]'",
+            "JH_UniScheduler_partition_line": f"#JSUB -q {resources.queue_name}",
+        }
+        custom_gpu_line = resources.kwargs.get("custom_gpu_line", None)
+        if not custom_gpu_line:
+            script_header_dict["JH_UniScheduler_number_gpu_line"] = (
+                f"#JSUB -gpgpu {resources.gpu_per_node}"
+            )
+        else:
+            script_header_dict["JH_UniScheduler_number_gpu_line"] = custom_gpu_line
+        if (
+            resources["strategy"].get("customized_script_header_template_file")
+            is not None
+        ):
+            JH_UniScheduler_script_header = customized_script_header_template(
+                resources["strategy"]["customized_script_header_template_file"],
+                resources,
+            )
+        else:
+            JH_UniScheduler_script_header = (
+                JH_UniScheduler_script_header_template.format(**script_header_dict)
+            )
+
+        return JH_UniScheduler_script_header
+
+    @retry()
+    def do_submit(self, job):
+        script_file_name = job.script_file_name
+        script_str = self.gen_script(job)
+        job_id_name = job.job_hash + "_job_id"
+        self.context.write_file(fname=script_file_name, write_str=script_str)
+        script_run_str = self.gen_script_command(job)
+        script_run_file_name = f"{job.script_file_name}.run"
+        self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
+
+        try:
+            stdin, stdout, stderr = self.context.block_checkcall(
+                "cd {} && {} {}".format(
+                    shlex.quote(self.context.remote_root),
+                    "jsub < ",
+                    shlex.quote(script_file_name),
+                )
+            )
+        except RuntimeError as err:
+            raise RetrySignal(err) from err
+
+        subret = stdout.readlines()
+        job_id = subret[0].split()[1][1:-1]
+        self.context.write_file(job_id_name, job_id)
+        return job_id
+
+    @retry()
+    def check_status(self, job):
+        try:
+            job_id = job.job_id
+        except AttributeError:
+            return JobStatus.terminated
+        if job_id == "":
+            return JobStatus.unsubmitted
+        ret, stdin, stdout, stderr = self.context.block_call("jjobs " + job_id)
+        err_str = stderr.read().decode("utf-8")
+        if (f"Job <{job_id}> is not found") in err_str:
+            if self.check_finish_tag(job):
+                return JobStatus.finished
+            else:
+                return JobStatus.terminated
+        elif ret != 0:
+            # just retry when any unknown error raised.
+            raise RetrySignal(
+                f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
+            )
+        status_out = stdout.read().decode("utf-8").split("\n")
+        if len(status_out) < 2:
+            return JobStatus.unknown
+        else:
+            status_line = status_out[1]
+            status_word = status_line.split()[2]
+
+        if status_word in ["PEND"]:
+            return JobStatus.waiting
+        elif status_word in ["RUN", "PSUSP", "SSUSP", "USUSP"]:
+            return JobStatus.running
+        elif status_word in ["DONE", "EXIT"]:
+            if self.check_finish_tag(job):
+                dlog.info(f"job: {job.job_hash} {job.job_id} finished")
+                return JobStatus.finished
+            else:
+                return JobStatus.terminated
+        else:
+            return JobStatus.unknown
+
+    def check_finish_tag(self, job):
+        job_tag_finished = job.job_hash + "_job_tag_finished"
+        return self.context.check_file_exists(job_tag_finished)
+
+    @classmethod
+    def resources_subfields(cls) -> List[Argument]:
+        """Generate the resources subfields.
+
+        Returns
+        -------
+        list[Argument]
+            resources subfields
+        """
+        doc_custom_gpu_line = "Custom GPU configuration, starting with #JSUB"
+
+        return [
+            Argument(
+                "kwargs",
+                dict,
+                [
+                    Argument(
+                        "custom_gpu_line",
+                        str,
+                        optional=True,
+                        default=None,
+                        doc=doc_custom_gpu_line,
+                    ),
+                ],
+                optional=False,
+                doc="Extra arguments.",
+            )
+        ]
+
+    def kill(self, job):
+        """Kill the job.
+
+        Parameters
+        ----------
+        job : Job
+            job
+        """
+        job_id = job.job_id
+        ret, stdin, stdout, stderr = self.context.block_call(
+            "jctrl kill " + str(job_id)
+        )
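For orientation, here is a minimal sketch of how the new batch type might be driven through dpdispatcher's documented Python API. The host, paths, queue name, and custom_gpu_line value below are illustrative placeholders, not values taken from this release.

# Hypothetical dispatch script for the new JH_UniScheduler batch type.
# All host names, paths, and queue names below are placeholders.
from dpdispatcher import Machine, Resources, Submission, Task

machine = Machine.load_from_dict(
    {
        "batch_type": "JH_UniScheduler",  # registered by this new module
        "context_type": "SSHContext",
        "local_root": "./work",
        "remote_root": "/data/jobs",
        "remote_profile": {"hostname": "login-node", "username": "user"},
    }
)
resources = Resources.load_from_dict(
    {
        "number_node": 1,
        "cpu_per_node": 8,
        "gpu_per_node": 1,
        "queue_name": "gpu",
        "group_size": 1,
        # Optional: overrides the default "#JSUB -gpgpu ..." header line
        "kwargs": {"custom_gpu_line": "#JSUB -gpgpu 2"},
    }
)
task = Task(
    command="echo hello",
    task_work_path="./",
    forward_files=[],
    backward_files=[],
)
submission = Submission(
    work_base="./",
    machine=machine,
    resources=resources,
    task_list=[task],
)
submission.run_submission()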
dpdispatcher/machines/distributed_shell.py
CHANGED
@@ -64,7 +64,7 @@ class DistributedShell(Machine):
 
         source_list = job.resources.source_list
         for ii in source_list:
-            line = "{ source %s; } \n" % ii
+            line = f"{{ source {ii}; }} \n"
            source_files_part += line
 
        export_envs_part = ""
@@ -96,7 +96,7 @@ class DistributedShell(Machine):
     def gen_script_end(self, job):
         all_task_dirs = ""
         for task in job.job_task_list:
-            all_task_dirs += "%s " % task.task_work_path
+            all_task_dirs += f"{task.task_work_path} "
         job_tag_finished = job.job_hash + "_job_tag_finished"
         flag_if_job_task_fail = job.job_hash + "_flag_if_job_task_fail"
 
@@ -173,18 +173,15 @@ class DistributedShell(Machine):
             )
         )
 
-        cmd = "{ nohup %s 1>%s 2>%s & } && echo $!" % (
-            submit_command,
-            output_name,
-            output_name,
+        cmd = (
+            f"{{ nohup {submit_command} 1>{output_name} 2>{output_name} & }} && echo $!"
         )
         ret, stdout, stderr = run_cmd_with_all_output(cmd)
 
        if ret != 0:
            err_str = stderr.decode("utf-8")
            raise RuntimeError(
-                "Command fails to execute, error message:%s\nreturn code %d\n"
-                % (err_str, ret)
+                f"Command {cmd} fails to execute, error message:{err_str}\nreturn code {ret}\n"
            )
        job_id = int(stdout.decode("utf-8").strip())
 
@@ -202,8 +199,7 @@ class DistributedShell(Machine):
         if ret != 0:
             err_str = stderr.decode("utf-8")
             raise RuntimeError(
-                "Command fails to execute, error message:%s\nreturn code %d\n"
-                % (err_str, ret)
+                f"Command fails to execute, error message:{err_str}\nreturn code {ret}\n"
             )
 
         if_job_exists = bool(stdout.decode("utf-8").strip())
dpdispatcher/machines/fugaku.py
CHANGED
@@ -20,15 +20,15 @@ class Fugaku(Machine):
     def gen_script_header(self, job):
         resources = job.resources
         fugaku_script_header_dict = {}
-        fugaku_script_header_dict[
-            "fugaku_node_number_line"
-        ] = f'#PJM -L "node={resources.number_node}" '
-        fugaku_script_header_dict[
-            "fugaku_ntasks_per_node_line"
-        ] = f'#PJM --mpi "max-proc-per-node={resources.cpu_per_node}"'
-        fugaku_script_header_dict[
-            "queue_name_line"
-        ] = f'#PJM -L "rscgrp={resources.queue_name}"'
+        fugaku_script_header_dict["fugaku_node_number_line"] = (
+            f'#PJM -L "node={resources.number_node}" '
+        )
+        fugaku_script_header_dict["fugaku_ntasks_per_node_line"] = (
+            f'#PJM --mpi "max-proc-per-node={resources.cpu_per_node}"'
+        )
+        fugaku_script_header_dict["queue_name_line"] = (
+            f'#PJM -L "rscgrp={resources.queue_name}"'
+        )
         if (
             resources["strategy"].get("customized_script_header_template_file")
             is not None
@@ -67,9 +67,6 @@ class Fugaku(Machine):
         self.context.write_file(job_id_name, job_id)
         return job_id
 
-    def default_resources(self, resources):
-        pass
-
     def check_status(self, job):
         job_id = job.job_id
         if job_id == "":
dpdispatcher/machines/lsf.py
CHANGED
@@ -32,9 +32,7 @@ class LSF(Machine):
     def gen_script_header(self, job):
         resources = job.resources
         script_header_dict = {
-            "lsf_nodes_line": "#BSUB -n {number_cores}".format(
-                number_cores=resources.number_node * resources.cpu_per_node
-            ),
+            "lsf_nodes_line": f"#BSUB -n {resources.number_node * resources.cpu_per_node}",
             "lsf_ptile_line": f"#BSUB -R 'span[ptile={resources.cpu_per_node}]'",
             "lsf_partition_line": f"#BSUB -q {resources.queue_name}",
         }
@@ -104,9 +102,6 @@ class LSF(Machine):
         return job_id
 
     # TODO: derive abstract methods
-    def default_resources(self, resources):
-        pass
-
     def sub_script_cmd(self, res):
         pass
 
@@ -123,7 +118,7 @@ class LSF(Machine):
             return JobStatus.unsubmitted
         ret, stdin, stdout, stderr = self.context.block_call("bjobs " + job_id)
         err_str = stderr.read().decode("utf-8")
-        if ("Job <%s> is not found" % job_id) in err_str:
+        if (f"Job <{job_id}> is not found") in err_str:
             if self.check_finish_tag(job):
                 return JobStatus.finished
             else:
@@ -131,8 +126,7 @@ class LSF(Machine):
         elif ret != 0:
             # just retry when any unknown error raised.
             raise RetrySignal(
-                "Get error code %d in checking status with job: %s . message: %s"
-                % (ret, job.job_hash, err_str)
+                f"Get error code {ret} in checking status with job: {job.job_hash} . message: {err_str}"
             )
         status_out = stdout.read().decode("utf-8").split("\n")
         if len(status_out) < 2:
dpdispatcher/machines/openapi.py
CHANGED
@@ -1,14 +1,13 @@
 import os
 import shutil
 import time
+from zipfile import ZipFile
 
 from dpdispatcher.utils.utils import customized_script_header_template
 
 try:
-    from bohriumsdk.client import Client
-    from bohriumsdk.job import Job
-    from bohriumsdk.storage import Storage
-    from bohriumsdk.util import Util
+    from bohrium import Bohrium
+    from bohrium.resources import Job, Tiefblue
 except ModuleNotFoundError:
     found_bohriumsdk = False
 else:
@@ -23,6 +22,12 @@ shell_script_header_template = """
 """
 
 
+def unzip_file(zip_file, out_dir="./"):
+    obj = ZipFile(zip_file, "r")
+    for item in obj.namelist():
+        obj.extract(item, out_dir)
+
+
 class OpenAPI(Machine):
     def __init__(self, context):
         if not found_bohriumsdk:
@@ -35,9 +40,35 @@ class OpenAPI(Machine):
         self.grouped = self.remote_profile.get("grouped", True)
         self.retry_count = self.remote_profile.get("retry_count", 3)
         self.ignore_exit_code = context.remote_profile.get("ignore_exit_code", True)
-        self.client = Client()
-        self.storage = Storage(client=self.client)
-        self.job = Job(client=self.client)
+
+        access_key = (
+            self.remote_profile.get("access_key", None)
+            or os.getenv("BOHRIUM_ACCESS_KEY", None)
+            or os.getenv("ACCESS_KEY", None)
+        )
+        project_id = (
+            self.remote_profile.get("project_id", None)
+            or os.getenv("BOHRIUM_PROJECT_ID", None)
+            or os.getenv("PROJECT_ID", None)
+        )
+        app_key = (
+            self.remote_profile.get("app_key", None)
+            or os.getenv("BOHRIUM_APP_KEY", None)
+            or os.getenv("APP_KEY", None)
+        )
+        if access_key is None:
+            raise ValueError(
+                "remote_profile must contain 'access_key' or set environment variable 'BOHRIUM_ACCESS_KEY'"
+            )
+        if project_id is None:
+            raise ValueError(
+                "remote_profile must contain 'project_id' or set environment variable 'BOHRIUM_PROJECT_ID'"
+            )
+        self.client = Bohrium(  # type: ignore[reportPossiblyUnboundVariable]
+            access_key=access_key, project_id=project_id, app_key=app_key
+        )
+        self.storage = Tiefblue()  # type: ignore[reportPossiblyUnboundVariable]
+        self.job = Job(client=self.client)  # type: ignore[reportPossiblyUnboundVariable]
         self.group_id = None
 
     def gen_script(self, job):
@@ -98,11 +129,13 @@ class OpenAPI(Machine):
             ),
             "out_files": self._gen_backward_files_list(job),
             "platform": self.remote_profile.get("platform", "ali"),
-            "image_name": self.remote_profile.get("image_name", ""),
+            "image_name": self.remote_profile.get("image_address", ""),
         }
-        if job.job_id:
-            openapi_params["job_id"] = job.job_id
-
+        if "real_user_id" in self.remote_profile:
+            openapi_params["real_user_id"] = self.remote_profile["real_user_id"]
+        if "session_id" in self.remote_profile:
+            openapi_params["session_id"] = self.remote_profile["session_id"]
+        openapi_params["job_id"] = job.job_id
         data = self.job.insert(**openapi_params)
 
         job.job_id = data.get("jobId", 0)  # type: ignore
@@ -152,8 +185,8 @@ class OpenAPI(Machine):
             self.ignore_exit_code,
         )
         if job_state == JobStatus.finished:
-            job_log = self.job.log(job_id)
             if self.remote_profile.get("output_log"):
+                job_log = self.job.log(job_id)
                 print(job_log, end="")
             self._download_job(job)
         elif self.remote_profile.get("output_log") and job_state == JobStatus.running:
@@ -163,14 +196,14 @@ class OpenAPI(Machine):
 
     def _download_job(self, job):
         data = self.job.detail(job.job_id)
-        job_url = data["resultUrl"]
+        job_url = data["resultUrl"]  # type: ignore
         if not job_url:
             return
         job_hash = job.job_hash
         result_filename = job_hash + "_back.zip"
         target_result_zip = os.path.join(self.context.local_root, result_filename)
         self.storage.download_from_url(job_url, target_result_zip)
-        Util.unzip_file(target_result_zip, out_dir=self.context.local_root)
+        unzip_file(target_result_zip, out_dir=self.context.local_root)
         try:
             os.makedirs(os.path.join(self.context.local_root, "backup"), exist_ok=True)
             shutil.move(
@@ -213,7 +246,7 @@ class OpenAPI(Machine):
         if status not in map_dict:
             dlog.error(f"unknown job status {status}")
             return JobStatus.unknown
-        if status == -1 and
+        if status == -1 and ignore_exit_code:
             return JobStatus.finished
         return map_dict[status]
 
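In practice, the rewritten __init__ means Bohrium credentials no longer have to be hard-coded in remote_profile; they can be resolved from the environment. A hypothetical configuration sketch follows (all values are placeholders, not taken from this release):

# Hypothetical OpenAPI machine configuration after this change; values are placeholders.
import os

os.environ["BOHRIUM_ACCESS_KEY"] = "<access-key>"
os.environ["BOHRIUM_PROJECT_ID"] = "<project-id>"

machine_dict = {
    "batch_type": "OpenAPI",
    "context_type": "OpenAPIContext",
    "local_root": "./work",
    "remote_profile": {
        # access_key/project_id omitted: resolved from the BOHRIUM_* variables above
        "image_address": "<registry/image:tag>",  # feeds the job's image_name parameter
        "output_log": True,  # the job log is now fetched only when this flag is set
    },
}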
dpdispatcher/machines/pbs.py
CHANGED
@@ -1,4 +1,7 @@
 import shlex
+from typing import List
+
+from dargs import Argument
 
 from dpdispatcher.dlog import dlog
 from dpdispatcher.machine import Machine
@@ -21,13 +24,13 @@ class PBS(Machine):
     def gen_script_header(self, job):
         resources = job.resources
         pbs_script_header_dict = {}
-        pbs_script_header_dict[
-            "select_node_line"
-        ] = f"#PBS -l select={resources.number_node}:ncpus={resources.cpu_per_node}"
+        pbs_script_header_dict["select_node_line"] = (
+            f"#PBS -l select={resources.number_node}:ncpus={resources.cpu_per_node}"
+        )
         if resources.gpu_per_node != 0:
-            pbs_script_header_dict[
-                "select_node_line"
-            ] += f":ngpus={resources.gpu_per_node}"
+            pbs_script_header_dict["select_node_line"] += (
+                f":ngpus={resources.gpu_per_node}"
+            )
         pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}"
         if (
             resources["strategy"].get("customized_script_header_template_file")
@@ -66,14 +69,12 @@ class PBS(Machine):
         self.context.write_file(job_id_name, job_id)
         return job_id
 
-    def default_resources(self, resources):
-        pass
-
     def check_status(self, job):
         job_id = job.job_id
         if job_id == "":
             return JobStatus.unsubmitted
-        ret, stdin, stdout, stderr = self.context.block_call("qstat -x " + job_id)
+        command = "qstat -x " + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         err_str = stderr.read().decode("utf-8")
         if ret != 0:
             if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
@@ -83,8 +84,7 @@ class PBS(Machine):
                 return JobStatus.terminated
             else:
                 raise RuntimeError(
-                    "status command qstat fails to execute. erro info: %s return code %d"
-                    % (err_str, ret)
+                    f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
                 )
         status_line = stdout.read().decode("utf-8").split("\n")[-2]
         status_word = status_line.split()[-2]
@@ -123,7 +123,8 @@ class Torque(PBS):
         job_id = job.job_id
         if job_id == "":
             return JobStatus.unsubmitted
-        ret, stdin, stdout, stderr = self.context.block_call("qstat -l " + job_id)
+        command = "qstat -l " + job_id
+        ret, stdin, stdout, stderr = self.context.block_call(command)
         err_str = stderr.read().decode("utf-8")
         if ret != 0:
             if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
@@ -133,8 +134,7 @@ class Torque(PBS):
                 return JobStatus.terminated
             else:
                 raise RuntimeError(
-                    "status command qstat fails to execute. erro info: %s return code %d"
-                    % (err_str, ret)
+                    f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
                 )
         status_line = stdout.read().decode("utf-8").split("\n")[-2]
         status_word = status_line.split()[-2]
@@ -156,12 +156,12 @@ class Torque(PBS):
         # ref: https://support.adaptivecomputing.com/wp-content/uploads/2021/02/torque/torque.htm#topics/torque/2-jobs/requestingRes.htm
         resources = job.resources
         pbs_script_header_dict = {}
-        pbs_script_header_dict[
-            "select_node_line"
-        ] = f"#PBS -l nodes={resources.number_node}:ppn={resources.cpu_per_node}"
+        pbs_script_header_dict["select_node_line"] = (
+            f"#PBS -l nodes={resources.number_node}:ppn={resources.cpu_per_node}"
+        )
         if resources.gpu_per_node != 0:
-            pbs_script_header_dict["select_node_line"] += ":gpus={gpu_per_node}".format(
-                gpu_per_node=resources.gpu_per_node
+            pbs_script_header_dict["select_node_line"] += (
+                f":gpus={resources.gpu_per_node}"
             )
         pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}"
         if (
@@ -177,3 +177,166 @@ class Torque(PBS):
             **pbs_script_header_dict
         )
         return pbs_script_header
+
+
+sge_script_header_template = """
+#!/bin/bash
+#$ -S /bin/bash
+#$ -cwd
+{select_node_line}
+"""
+
+
+class SGE(PBS):
+    def __init__(
+        self,
+        batch_type=None,
+        context_type=None,
+        local_root=None,
+        remote_root=None,
+        remote_profile={},
+        *,
+        context=None,
+    ):
+        super(PBS, self).__init__(
+            batch_type,
+            context_type,
+            local_root,
+            remote_root,
+            remote_profile,
+            context=context,
+        )
+
+    def gen_script_header(self, job):
+        ### Ref:https://softpanorama.org/HPC/PBS_and_derivatives/Reference/pbs_command_vs_sge_commands.shtml
+        # resources.number_node is not used in SGE
+        resources = job.resources
+        job_name = resources.kwargs.get("job_name", "wDPjob")
+        pe_name = resources.kwargs.get("pe_name", "mpi")
+        sge_script_header_dict = {}
+        sge_script_header_dict["select_node_line"] = f"#$ -N {job_name}\n"
+        sge_script_header_dict["select_node_line"] += (
+            f"#$ -pe {pe_name} {resources.cpu_per_node}\n"
+        )
+
+        if resources.queue_name != "":
+            sge_script_header_dict["select_node_line"] += (
+                f"#$ -q {resources.queue_name}"
+            )
+        if (
+            resources["strategy"].get("customized_script_header_template_file")
+            is not None
+        ):
+            file_name = resources["strategy"]["customized_script_header_template_file"]
+            sge_script_header = customized_script_header_template(file_name, resources)
+        else:
+            sge_script_header = sge_script_header_template.format(
+                **sge_script_header_dict
+            )
+        return sge_script_header
+
+    def do_submit(self, job):
+        script_file_name = job.script_file_name
+        script_str = self.gen_script(job)
+        job_id_name = job.job_hash + "_job_id"
+        self.context.write_file(fname=script_file_name, write_str=script_str)
+        script_run_str = self.gen_script_command(job)
+        script_run_file_name = f"{job.script_file_name}.run"
+        self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
+        script_file_dir = self.context.remote_root
+        stdin, stdout, stderr = self.context.block_checkcall(
+            "cd {} && {} {}".format(script_file_dir, "qsub", script_file_name)
+        )
+        subret = stdout.readlines()
+        job_id = subret[0].split()[2]
+        self.context.write_file(job_id_name, job_id)
+        return job_id
+
+    def check_status(self, job):
+        ### https://softpanorama.org/HPC/Grid_engine/Queues/queue_states.shtml
+        job_id = job.job_id
+        status_line = None
+        if job_id == "":
+            return JobStatus.unsubmitted
+        command = "qstat"
+        ret, stdin, stdout, stderr = self.context.block_call(command)
+        err_str = stderr.read().decode("utf-8")
+        if ret != 0:
+            raise RuntimeError(
+                f"status command {command} fails to execute. erro info: {err_str} return code {ret}"
+            )
+        status_text_list = stdout.read().decode("utf-8").split("\n")
+        for txt in status_text_list:
+            if job_id in txt:
+                status_line = txt
+
+        if status_line is None:
+            count = 0
+            while count <= 6:
+                if self.check_finish_tag(job=job):
+                    return JobStatus.finished
+                dlog.info(
+                    f"not tag_finished detected, execute sync command and wait. count {count}"
+                )
+                self.context.block_call("sync")
+                import time
+
+                time.sleep(10)
+                count += 1
+            return JobStatus.terminated
+        else:
+            status_word = status_line.split()[4]
+            # dlog.info (status_word)
+            if status_word in ["qw", "hqw", "t"]:
+                return JobStatus.waiting
+            elif status_word in ["r", "Rr"]:
+                return JobStatus.running
+            elif status_word in ["Eqw", "dr", "dt"]:
+                return JobStatus.terminated
+            else:
+                return JobStatus.unknown
+
+    def check_finish_tag(self, job):
+        job_tag_finished = job.job_hash + "_job_tag_finished"
+        return self.context.check_file_exists(job_tag_finished)
+
+    @classmethod
+    def resources_subfields(cls) -> List[Argument]:
+        """Generate the resources subfields.
+
+        pe_name : str
+            The parallel environment name of SGE.
+
+        Returns
+        -------
+        list[Argument]
+            resources subfields
+        """
+        doc_pe_name = "The parallel environment name of SGE system."
+        doc_job_name = "The name of SGE's job."
+
+        return [
+            Argument(
+                "kwargs",
+                dict,
+                [
+                    Argument(
+                        "pe_name",
+                        str,
+                        optional=True,
+                        default="mpi",
+                        doc=doc_pe_name,
+                        alias=["sge_pe_name"],
+                    ),
+                    Argument(
+                        "job_name",
+                        str,
+                        optional=True,
+                        default="wDPjob",
+                        doc=doc_job_name,
+                    ),
+                ],
+                optional=False,
+                doc="Extra arguments.",
+            )
+        ]
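Since the kwargs schema above routes SGE tuning through resources rather than the machine block, a hypothetical resources dictionary might look like this (values are placeholders, not from this release):

# Hypothetical resources for the new SGE batch type; values are placeholders.
from dpdispatcher import Resources

resources = Resources.load_from_dict(
    {
        "number_node": 1,       # noted above as unused by SGE's gen_script_header
        "cpu_per_node": 16,     # rendered as "#$ -pe mpi 16"
        "gpu_per_node": 0,
        "queue_name": "all.q",  # rendered as "#$ -q all.q"; "" omits the line
        "group_size": 1,
        "kwargs": {"pe_name": "mpi", "job_name": "wDPjob"},
    }
)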