dirac-cwl 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dirac_cwl/__init__.py +28 -0
- dirac_cwl/commands/__init__.py +5 -0
- dirac_cwl/commands/core.py +37 -0
- dirac_cwl/commands/download_config.py +22 -0
- dirac_cwl/commands/group_outputs.py +32 -0
- dirac_cwl/core/__init__.py +1 -0
- dirac_cwl/core/exceptions.py +5 -0
- dirac_cwl/core/utility.py +41 -0
- dirac_cwl/data_management_mocks/data_manager.py +99 -0
- dirac_cwl/data_management_mocks/file_catalog.py +132 -0
- dirac_cwl/data_management_mocks/sandbox.py +89 -0
- dirac_cwl/execution_hooks/__init__.py +40 -0
- dirac_cwl/execution_hooks/core.py +342 -0
- dirac_cwl/execution_hooks/plugins/__init__.py +16 -0
- dirac_cwl/execution_hooks/plugins/core.py +58 -0
- dirac_cwl/execution_hooks/registry.py +209 -0
- dirac_cwl/job/__init__.py +249 -0
- dirac_cwl/job/job_wrapper.py +375 -0
- dirac_cwl/job/job_wrapper_template.py +56 -0
- dirac_cwl/job/submission_clients.py +166 -0
- dirac_cwl/modules/crypto.py +96 -0
- dirac_cwl/modules/pi_gather.py +41 -0
- dirac_cwl/modules/pi_simulate.py +33 -0
- dirac_cwl/production/__init__.py +200 -0
- dirac_cwl/submission_models.py +157 -0
- dirac_cwl/transformation/__init__.py +203 -0
- dirac_cwl-1.0.2.dist-info/METADATA +285 -0
- dirac_cwl-1.0.2.dist-info/RECORD +32 -0
- dirac_cwl-1.0.2.dist-info/WHEEL +5 -0
- dirac_cwl-1.0.2.dist-info/entry_points.txt +8 -0
- dirac_cwl-1.0.2.dist-info/licenses/LICENSE +674 -0
- dirac_cwl-1.0.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
"""CLI interface to run a workflow as a job."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import random
|
|
6
|
+
import subprocess
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import typer
|
|
11
|
+
from cwl_utils.pack import pack
|
|
12
|
+
from cwl_utils.parser import load_document
|
|
13
|
+
from cwl_utils.parser.cwl_v1_2 import (
|
|
14
|
+
File,
|
|
15
|
+
)
|
|
16
|
+
from cwl_utils.parser.cwl_v1_2_utils import load_inputfile
|
|
17
|
+
from diracx.cli.utils import AsyncTyper
|
|
18
|
+
from rich import print_json
|
|
19
|
+
from rich.console import Console
|
|
20
|
+
from schema_salad.exceptions import ValidationException
|
|
21
|
+
|
|
22
|
+
from dirac_cwl.job.submission_clients import (
|
|
23
|
+
DIRACSubmissionClient,
|
|
24
|
+
PrototypeSubmissionClient,
|
|
25
|
+
SubmissionClient,
|
|
26
|
+
)
|
|
27
|
+
from dirac_cwl.submission_models import (
|
|
28
|
+
JobInputModel,
|
|
29
|
+
JobModel,
|
|
30
|
+
JobSubmissionModel,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
app = AsyncTyper()
|
|
34
|
+
console = Console()
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# -----------------------------------------------------------------------------
|
|
38
|
+
# dirac-cli commands
|
|
39
|
+
# -----------------------------------------------------------------------------
|
|
40
|
+
@app.async_command("submit")
async def submit_job_client(
    task_path: str = typer.Argument(..., help="Path to the CWL file"),
    parameter_path: list[str] | None = typer.Option(None, help="Path to the files containing the metadata"),
    # Specific parameter for the purpose of the prototype
    local: bool | None = typer.Option(True, help="Run the job locally instead of submitting it to the router"),
):
    """
    Correspond to the dirac-cli command to submit jobs.

    This command will:
    - Validate the workflow
    - Start the jobs

    :param task_path: Path to the CWL task description.
    :param parameter_path: Optional list of CWL input files, one job per file.
    :param local: If True, use the prototype (local) submission client.

    :raises typer.Exit: With code 1 when validation or submission fails.
    """
    # Select submission strategy based on local flag
    submission_client: SubmissionClient = PrototypeSubmissionClient() if local else DIRACSubmissionClient()

    os.environ["DIRAC_PROTO_LOCAL"] = "0"

    # Validate the workflow
    console.print("[blue]:information_source:[/blue] [bold]CLI:[/bold] Validating the job(s)...")
    try:
        task = load_document(pack(task_path), baseuri=".")
    except FileNotFoundError as ex:
        console.print(f"[red]:heavy_multiplication_x:[/red] [bold]CLI:[/bold] Failed to load the task:\n{ex}")
        # BUGFIX: typer.Exit must be *raised*, not returned, for the process
        # to actually terminate with the given exit code.
        raise typer.Exit(code=1) from ex
    except ValidationException as ex:
        console.print(f"[red]:heavy_multiplication_x:[/red] [bold]CLI:[/bold] Failed to validate the task:\n{ex}")
        raise typer.Exit(code=1) from ex

    console.print(f"\t[green]:heavy_check_mark:[/green] Task {task_path}")
    console.print("\t[green]:heavy_check_mark:[/green] Hints")

    # Extract parameters if any
    parameters = []
    if parameter_path:
        for parameter_p in parameter_path:
            try:
                parameter = load_inputfile(parameter_p)
            except Exception as ex:
                console.print(
                    f"[red]:heavy_multiplication_x:[/red] [bold]CLI:[/bold] Failed to validate the parameter:\n{ex}"
                )
                raise typer.Exit(code=1) from ex

            # Prepare files for the ISB
            isb_file_paths = prepare_input_sandbox(parameter)

            # Upload parameter sandbox
            sandbox_id = await submission_client.create_sandbox(isb_file_paths)

            parameters.append(
                JobInputModel(
                    sandbox=[sandbox_id] if sandbox_id else None,
                    cwl=parameter,
                )
            )
            console.print(f"\t[green]:heavy_check_mark:[/green] Parameter {parameter_p}")

    job = JobSubmissionModel(
        task=task,
        inputs=parameters,
    )
    console.print("[green]:heavy_check_mark:[/green] [bold]CLI:[/bold] Job(s) validated.")

    # Submit the job
    console.print("[blue]:information_source:[/blue] [bold]CLI:[/bold] Submitting the job(s)...")
    print_json(job.model_dump_json(indent=4))

    if not await submission_client.submit_job(job):
        console.print("[red]:heavy_multiplication_x:[/red] [bold]CLI:[/bold] Failed to submit job(s).")
        raise typer.Exit(code=1)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def validate_jobs(job: JobSubmissionModel) -> list[JobModel]:
    """
    Validate jobs.

    :param job: The task to execute

    :return: The list of jobs to execute
    """
    console.print("[blue]:information_source:[/blue] [bold]CLI:[/bold] Validating the job(s)...")
    # One JobModel per input parameter set; a single bare job when there are none.
    if job.inputs:
        jobs = [JobModel(task=job.task, input=parameter) for parameter in job.inputs]
    else:
        jobs = [JobModel(task=job.task)]
    console.print("[green]:information_source:[/green] [bold]CLI:[/bold] Job(s) validated!")
    return jobs
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def prepare_input_sandbox(input_data: dict[str, Any]) -> list[Path]:
    """
    Extract the files from the parameters.

    :param input_data: The CWL input values of the job; values may be ``File``
        objects or lists possibly containing ``File`` objects.

    :return: The list of local file paths to ship in the input sandbox.

    :raises NotImplementedError: If a ``File`` defines neither ``location`` nor ``path``.
    """
    # Get the File objects from the input data (keys are irrelevant here)
    files = []
    for input_value in input_data.values():
        if isinstance(input_value, list):
            files.extend(item for item in input_value if isinstance(item, File))
        elif isinstance(input_value, File):
            files.append(input_value)

    files_path = []
    for file in files:
        # TODO: path is not the only attribute to consider, but so far it is the only one used
        if not file.location and not file.path:
            raise NotImplementedError("File path is not defined.")

        if file.path:
            # BUGFIX: strip only a *leading* "file://" scheme; str.replace would
            # also mangle a "file://" substring occurring later in the path.
            files_path.append(Path(file.path.removeprefix("file://")))

    return files_path
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
# -----------------------------------------------------------------------------
|
|
175
|
+
# dirac-router commands
|
|
176
|
+
# -----------------------------------------------------------------------------
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def submit_job_router(job: JobSubmissionModel) -> bool:
    """
    Execute a job using the router.

    :param job: The task to execute

    :return: True if the job executed successfully, False otherwise
    """
    logger = logging.getLogger("JobRouter")

    os.environ["DIRAC_PROTO_LOCAL"] = "1"

    # Validate the jobs
    jobs = validate_jobs(job)

    # Execute the job locally
    logger.info("Executing jobs locally...")
    results = []

    # BUGFIX: the loop variable used to shadow the `job` parameter, which is
    # error-prone for any code added after the loop; use a distinct name.
    for single_job in jobs:
        job_id = random.randint(1000, 9999)
        results.append(run_job(job_id, single_job, logger.getChild(f"job-{job_id}")))

    return all(results)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
# -----------------------------------------------------------------------------
|
|
206
|
+
# Worker node execution
|
|
207
|
+
# -----------------------------------------------------------------------------
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def run_job(job_id: int, job: JobModel, logger: logging.Logger) -> bool:
    """
    Run a single job by dumping it to JSON and executing the job_wrapper_template.py script.

    :param job_id: Identifier used to name the temporary job file and tag the logs
    :param job: The job to execute
    :param logger: Logger instance for output

    :return: True if the job executed successfully, False otherwise
    """
    logger.info("Executing job locally:\n")
    print_json(job.model_dump_json(indent=4))

    # Dump job to a JSON file
    job_json_path = Path(f"job_{job_id}.json")
    job_json_path.write_text(job.model_dump_json())

    try:
        # Run the job_wrapper_template.py script via bash command
        result = subprocess.run(
            [
                "python",
                "-m",
                "dirac_cwl.job.job_wrapper_template",
                str(job_json_path),
            ],
            capture_output=True,
            text=True,
        )
    finally:
        # BUGFIX: always clean up the job JSON file, even if subprocess.run
        # raises (previously the file leaked on any exception).
        job_json_path.unlink(missing_ok=True)

    # Log output
    if result.stdout:
        logger.info("STDOUT %s:\n%s", job_id, result.stdout)
    if result.stderr:
        logger.error("STDERR %s:\n%s", job_id, result.stderr)

    logger.info("Job execution completed.")
    return result.returncode == 0
|
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
"""Job wrapper for executing CWL workflows with DIRAC."""
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
import random
|
|
8
|
+
import shutil
|
|
9
|
+
import subprocess
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any, List, Sequence, cast
|
|
12
|
+
|
|
13
|
+
from cwl_utils.parser import (
|
|
14
|
+
save,
|
|
15
|
+
)
|
|
16
|
+
from cwl_utils.parser.cwl_v1_2 import (
|
|
17
|
+
CommandLineTool,
|
|
18
|
+
ExpressionTool,
|
|
19
|
+
File,
|
|
20
|
+
Saveable,
|
|
21
|
+
Workflow,
|
|
22
|
+
)
|
|
23
|
+
from DIRACCommon.Core.Utilities.ReturnValues import ( # type: ignore[import-untyped]
|
|
24
|
+
returnValueOrRaise,
|
|
25
|
+
)
|
|
26
|
+
from rich.text import Text
|
|
27
|
+
from ruamel.yaml import YAML
|
|
28
|
+
|
|
29
|
+
from dirac_cwl.commands import PostProcessCommand, PreProcessCommand
|
|
30
|
+
from dirac_cwl.core.exceptions import WorkflowProcessingException
|
|
31
|
+
from dirac_cwl.core.utility import get_lfns
|
|
32
|
+
from dirac_cwl.execution_hooks import ExecutionHooksHint
|
|
33
|
+
from dirac_cwl.execution_hooks.core import ExecutionHooksBasePlugin
|
|
34
|
+
from dirac_cwl.submission_models import (
|
|
35
|
+
JobInputModel,
|
|
36
|
+
JobModel,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
if os.getenv("DIRAC_PROTO_LOCAL") == "1":
|
|
40
|
+
from dirac_cwl.data_management_mocks.sandbox import create_sandbox, download_sandbox # type: ignore[no-redef]
|
|
41
|
+
else:
|
|
42
|
+
from diracx.api.jobs import create_sandbox, download_sandbox # type: ignore[no-redef]
|
|
43
|
+
|
|
44
|
+
# -----------------------------------------------------------------------------
|
|
45
|
+
# JobWrapper
|
|
46
|
+
# -----------------------------------------------------------------------------
|
|
47
|
+
|
|
48
|
+
logger = logging.getLogger(__name__)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class JobWrapper:
    """Job Wrapper for the execution hook.

    Drives the full lifecycle of a single CWL job on the worker node:
    pre-processing (sandbox download, LFN download, input rewriting),
    execution through ``cwltool``, and post-processing (hook commands,
    output sandbox upload).
    """

    def __init__(self) -> None:
        """Initialize the job wrapper."""
        # Runtime execution-hooks plugin; resolved from the task hints in run_job().
        self.execution_hooks_plugin: ExecutionHooksBasePlugin | None = None
        # Working directory of the current job; set in run_job().
        self.job_path: Path = Path()

    def __download_input_sandbox(self, arguments: JobInputModel, job_path: Path) -> None:
        """Download the files from the sandbox store.

        :param arguments: Job input model containing sandbox information.
        :param job_path: Path to the job working directory.

        :raises RuntimeError: If no execution hooks plugin is defined.
        """
        assert arguments.sandbox is not None
        if not self.execution_hooks_plugin:
            raise RuntimeError("Could not download sandboxes")
        for sandbox in arguments.sandbox:
            download_sandbox(sandbox, job_path)

    def __upload_output_sandbox(
        self,
        outputs: dict[str, str | Path | Sequence[str | Path]],
    ):
        """Upload the outputs selected by the plugin to the sandbox store.

        Only outputs whose name appears in the plugin's ``output_sandbox``
        list are collected and uploaded.

        :param outputs: Mapping from output name to the produced file path(s).

        :raises RuntimeError: If no execution hooks plugin is defined.
        """
        if not self.execution_hooks_plugin:
            raise RuntimeError("Could not upload sandbox : Execution hook is not defined.")

        outputs_to_sandbox = []
        for output_name, src_path in outputs.items():
            if self.execution_hooks_plugin.output_sandbox and output_name in self.execution_hooks_plugin.output_sandbox:
                # Normalize a single path into a one-element list.
                if isinstance(src_path, Path) or isinstance(src_path, str):
                    src_path = [src_path]
                for path in src_path:
                    outputs_to_sandbox.append(path)

        # NOTE(review): a sandbox is created even when outputs_to_sandbox is
        # empty — confirm this is intended.
        sb_path = Path(create_sandbox(outputs_to_sandbox))
        logger.info("Successfully stored output %s in Sandbox %s", self.execution_hooks_plugin.output_sandbox, sb_path)

    def __download_input_data(self, inputs: JobInputModel, job_path: Path) -> dict[str, Path | list[Path]]:
        """Download LFNs into the job working directory.

        :param JobInputModel inputs:
            The job input model containing ``lfns_input``, a mapping from input names to one or more LFN paths.
        :param Path job_path:
            Path to the job working directory where files will be copied.

        :return dict[str, Path | list[Path]]:
            A dictionary mapping each input name to the corresponding downloaded
            file path(s) located in the working directory.
        """
        new_paths: dict[str, Path | list[Path]] = {}
        if not self.execution_hooks_plugin:
            # NOTE(review): raising RuntimeWarning as an error is unusual —
            # RuntimeError was probably intended; confirm before changing.
            raise RuntimeWarning("Could not download input data: Execution hook is not defined.")

        lfns_inputs = get_lfns(inputs.cwl)

        if lfns_inputs:
            for input_name, lfns in lfns_inputs.items():
                res = returnValueOrRaise(self.execution_hooks_plugin._datamanager.getFile(lfns, str(job_path)))
                if res["Failed"]:
                    raise RuntimeError(f"Could not get files : {res['Failed']}")
                paths = res["Successful"]
                # Downloaded paths are made relative to the job directory so
                # the CWL inputs can reference them from inside it.
                if paths and isinstance(lfns, list):
                    new_paths[input_name] = [Path(paths[lfn]).relative_to(job_path.resolve()) for lfn in paths]
                elif paths and isinstance(lfns, str):
                    new_paths[input_name] = Path(paths[lfns]).relative_to(job_path.resolve())
        return new_paths

    def __update_inputs(self, inputs: JobInputModel, updates: dict[str, Path | list[Path]]):
        """Update CWL job inputs with new file paths.

        This method updates the `inputs.cwl` object by replacing or adding
        file paths for each input specified in `updates`. It supports both
        single files and lists of files.

        :param inputs: The job input model whose `cwl` dictionary will be updated.
        :type inputs: JobInputModel
        :param updates: Dictionary mapping input names to their corresponding local file
            paths. Each value can be a single `Path` or a list of `Path` objects.
        :type updates: dict[str, Path | list[Path]]

        .. note::
            This method is typically called after downloading LFNs
            using `download_lfns` to ensure that the CWL job inputs reference
            the correct local files.
        """
        # First pass: strip directories from every existing File path so they
        # resolve inside the job working directory.
        for _, value in inputs.cwl.items():
            files = value if isinstance(value, list) else [value]
            for file in files:
                if isinstance(file, File) and file.path:
                    file.path = Path(file.path).name
        # Second pass: install the freshly downloaded paths.
        for input_name, path in updates.items():
            if isinstance(path, Path):
                inputs.cwl[input_name] = File(path=str(path))
            else:
                inputs.cwl[input_name] = []
                for p in path:
                    inputs.cwl[input_name].append(File(path=str(p)))

    def __parse_output_filepaths(self, stdout: str) -> dict[str, str | Path | Sequence[str | Path]]:
        """Get the outputted filepaths per output.

        Parses cwltool's JSON result (printed on stdout) into a mapping of
        output name to file path(s).

        :param str stdout:
            The console output of the the job

        :return dict[str, list[str]]:
            The dict of the list of filepaths for each output
        """
        outputted_files: dict[str, str | Path | Sequence[str | Path]] = {}
        outputs = json.loads(stdout)
        for output, files in outputs.items():
            if not files:
                continue
            # Normalize a single File entry to a one-element list.
            if not isinstance(files, list):
                files = [files]
            file_paths = []
            for file in files:
                if file:
                    file_paths.append(str(file["path"]))
            outputted_files[output] = file_paths
        return outputted_files

    def pre_process(
        self,
        executable: CommandLineTool | Workflow | ExpressionTool,
        arguments: JobInputModel | None,
    ) -> list[str]:
        """
        Pre-process the job before execution.

        Serializes the task (and optional parameters) to CWL files inside the
        job directory, downloads sandboxes and input data, and builds the
        cwltool command line.

        :param executable: The CWL tool, workflow, or expression to run.
        :param arguments: Optional job inputs (sandboxes, CWL parameter values).

        :return: The cwltool command (argv list) to execute.
        """
        logger = logging.getLogger("JobWrapper - Pre-process")

        # Prepare the task for cwltool
        logger.info("Preparing the task for cwltool...")
        command = ["cwltool", "--parallel"]

        task_dict = save(executable)
        task_path = self.job_path / "task.cwl"
        with open(task_path, "w") as task_file:
            YAML().dump(task_dict, task_file)
        command.append(str(task_path.name))

        if arguments:
            if arguments.sandbox:
                # Download the files from the sandbox store
                logger.info("Downloading the files from the sandbox store...")
                self.__download_input_sandbox(arguments, self.job_path)
                logger.info("Files downloaded successfully!")

            updates = self.__download_input_data(arguments, self.job_path)
            self.__update_inputs(arguments, updates)

            logger.info("Preparing the parameters for cwltool...")
            parameter_dict = save(cast(Saveable, arguments.cwl))
            parameter_path = self.job_path / "parameter.cwl"
            with open(parameter_path, "w") as parameter_file:
                YAML().dump(parameter_dict, parameter_file)
            command.append(str(parameter_path.name))

        if self.execution_hooks_plugin:
            return self.__pre_process_hooks(executable, arguments, self.job_path, command)

        return command

    def post_process(
        self,
        status: int,
        stdout: str,
        stderr: str,
    ):
        """
        Post-process the job after execution.

        :param status: Return code of the cwltool execution.
        :param stdout: cwltool standard output (JSON result).
        :param stderr: cwltool standard error.

        :return: True if the job is post-processed successfully, False otherwise

        :raises RuntimeError: If ``status`` is non-zero.
        """
        logger = logging.getLogger("JobWrapper - Post-process")
        if status != 0:
            raise RuntimeError(f"Error {status} during the task execution.")

        logger.info(stdout)
        logger.info(stderr)

        outputs = self.__parse_output_filepaths(stdout)

        success = True

        if self.execution_hooks_plugin:
            success = self.__post_process_hooks(self.job_path, outputs=outputs)

        self.__upload_output_sandbox(outputs=outputs)

        return success

    def __pre_process_hooks(
        self,
        executable: CommandLineTool | Workflow | ExpressionTool,
        arguments: Any | None,
        job_path: Path,
        command: List[str],
        **kwargs: Any,
    ) -> List[str]:
        """Pre-process job inputs and command before execution.

        :param CommandLineTool | Workflow | ExpressionTool executable:
            The CWL tool, workflow, or expression to be executed.
        :param JobInputModel arguments:
            The job inputs, including CWL and LFN data.
        :param Path job_path:
            Path to the job working directory.
        :param list[str] command:
            The command to be executed, which will be modified.
        :param Any **kwargs:
            Additional parameters, allowing extensions to pass extra context
            or configuration options.

        :return list[str]:
            The modified command, typically including the serialized CWL
            input file path.

        :raises TypeError: If a configured command is not a PreProcessCommand.
        :raises WorkflowProcessingException: If a command fails.
        """
        if not self.execution_hooks_plugin:
            # NOTE(review): RuntimeWarning raised as an error — confirm
            # RuntimeError was not intended.
            raise RuntimeWarning("Could not run pre_process_hooks: Execution hook is not defined.")

        for preprocess_command in self.execution_hooks_plugin.preprocess_commands:
            if not issubclass(preprocess_command, PreProcessCommand):
                msg = f"The command {preprocess_command} is not a {PreProcessCommand.__name__}"
                logger.error(msg)
                raise TypeError(msg)

            try:
                preprocess_command().execute(job_path, **kwargs)
            except Exception as e:
                msg = f"Command '{preprocess_command.__name__}' failed during the pre-process stage: {e}"
                logger.exception(msg)
                raise WorkflowProcessingException(msg) from e

        return command

    def __post_process_hooks(
        self,
        job_path: Path,
        # NOTE(review): mutable default argument; it is never mutated here,
        # but a None sentinel would be the safer idiom.
        outputs: dict[str, str | Path | Sequence[str | Path]] = {},
        **kwargs: Any,
    ) -> bool:
        """Post-process job outputs.

        Runs the plugin's post-process commands, then hands the outputs to
        the plugin's ``store_output``.

        :param Path job_path:
            Path to the job working directory.
        :param outputs:
            Mapping of output name to produced file path(s).
        :param Any **kwargs:
            Additional keyword arguments for extensibility.

        :raises TypeError: If a configured command is not a PostProcessCommand.
        :raises WorkflowProcessingException: If a command fails.
        """
        if not self.execution_hooks_plugin:
            # NOTE(review): RuntimeWarning raised as an error — confirm
            # RuntimeError was not intended.
            raise RuntimeWarning("Could not run post_process_hooks: Execution hook is not defined.")

        for postprocess_command in self.execution_hooks_plugin.postprocess_commands:
            if not issubclass(postprocess_command, PostProcessCommand):
                msg = f"The command {postprocess_command} is not a {PostProcessCommand.__name__}"
                logger.error(msg)
                raise TypeError(msg)

            try:
                postprocess_command().execute(job_path, **kwargs)
            except Exception as e:
                msg = f"Command '{postprocess_command.__name__}' failed during the post-process stage: {e}"
                logger.exception(msg)
                raise WorkflowProcessingException(msg) from e

        self.execution_hooks_plugin.store_output(outputs)
        return True

    def run_job(self, job: JobModel) -> bool:
        """Execute a given CWL workflow using cwltool.

        This is the equivalent of the DIRAC JobWrapper.

        :param job: The job model containing workflow and inputs.
        :return: True if the job is executed successfully, False otherwise.
        """
        logger = logging.getLogger("JobWrapper")
        # Instantiate runtime metadata from the serializable descriptor and
        # the job context so implementations can access task inputs/overrides.
        job_execution_hooks = ExecutionHooksHint.from_cwl(job.task)
        self.execution_hooks_plugin = job_execution_hooks.to_runtime(job) if job_execution_hooks else None

        # Isolate the job in a specific directory
        self.job_path = Path(".") / "workernode" / f"{random.randint(1000, 9999)}"
        self.job_path.mkdir(parents=True, exist_ok=True)

        try:
            # Pre-process the job
            logger.info("Pre-processing Task...")
            command = self.pre_process(job.task, job.input)
            logger.info("Task pre-processed successfully!")

            # Execute the task
            logger.info("Executing Task: %s", command)
            result = subprocess.run(command, capture_output=True, text=True, cwd=self.job_path)

            if result.returncode != 0:
                logger.error("Error in executing workflow:\n%s", Text.from_ansi(result.stderr))
                return False
            logger.info("Task executed successfully!")

            # Post-process the job
            logger.info("Post-processing Task...")
            if self.post_process(
                result.returncode,
                result.stdout,
                result.stderr,
            ):
                logger.info("Task post-processed successfully!")
                return True
            logger.error("Failed to post-process Task")
            return False

        except Exception:
            logger.exception("JobWrapper: Failed to execute workflow")
            return False
        finally:
            # Clean up
            if self.job_path.exists():
                shutil.rmtree(self.job_path)
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
"""Job wrapper template for executing CWL jobs."""
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
import sys
|
|
8
|
+
import tempfile
|
|
9
|
+
|
|
10
|
+
import DIRAC # type: ignore[import-untyped]
|
|
11
|
+
from cwl_utils.parser import load_document_by_uri
|
|
12
|
+
from cwl_utils.parser.cwl_v1_2_utils import load_inputfile
|
|
13
|
+
from ruamel.yaml import YAML
|
|
14
|
+
|
|
15
|
+
if os.getenv("DIRAC_PROTO_LOCAL") != "1":
|
|
16
|
+
DIRAC.initialize()
|
|
17
|
+
|
|
18
|
+
from dirac_cwl.job.job_wrapper import JobWrapper
|
|
19
|
+
from dirac_cwl.submission_models import JobModel
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def main():
    """Execute the job wrapper for a given job model.

    Expects exactly one CLI argument: the path to a JSON-serialized JobModel.
    Exits with status 1 on bad usage or job failure.
    """
    if len(sys.argv) != 2:
        logging.error("1 argument is required")
        sys.exit(1)

    job_json_file = sys.argv[1]
    job_wrapper = JobWrapper()
    with open(job_json_file, "r") as file:
        job_model_dict = json.load(file)

    task_dict = job_model_dict["task"]

    # The CWL parser loads documents by URI, so the task must be written to a
    # real file first. delete=False is required because the parser re-opens
    # the file by name; BUGFIX: the temporary file is now removed afterwards
    # instead of leaking on every run.
    tmp = tempfile.NamedTemporaryFile("w+", suffix=".cwl", delete=False)
    try:
        with tmp as f:
            YAML().dump(task_dict, f)
            f.flush()
            task_obj = load_document_by_uri(f.name)
    finally:
        os.unlink(tmp.name)

    if job_model_dict["input"]:
        cwl_inputs_obj = load_inputfile(job_model_dict["input"]["cwl"])
        job_model_dict["input"]["cwl"] = cwl_inputs_obj
    job_model_dict["task"] = task_obj

    job = JobModel.model_validate(job_model_dict)

    res = job_wrapper.run_job(job)
    if res:
        logging.info("Job done.")
    else:
        logging.info("Job failed.")
        sys.exit(1)


if __name__ == "__main__":
    main()
|