hpcflow-new2 0.2.0a190__py3-none-any.whl → 0.2.0a200__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hpcflow/__pyinstaller/hook-hpcflow.py +1 -0
- hpcflow/_version.py +1 -1
- hpcflow/data/scripts/bad_script.py +2 -0
- hpcflow/data/scripts/do_nothing.py +2 -0
- hpcflow/data/scripts/env_specifier_test/input_file_generator_pass_env_spec.py +4 -0
- hpcflow/data/scripts/env_specifier_test/main_script_test_pass_env_spec.py +8 -0
- hpcflow/data/scripts/env_specifier_test/output_file_parser_pass_env_spec.py +4 -0
- hpcflow/data/scripts/env_specifier_test/v1/input_file_generator_basic.py +4 -0
- hpcflow/data/scripts/env_specifier_test/v1/main_script_test_direct_in_direct_out.py +7 -0
- hpcflow/data/scripts/env_specifier_test/v1/output_file_parser_basic.py +4 -0
- hpcflow/data/scripts/env_specifier_test/v2/main_script_test_direct_in_direct_out.py +7 -0
- hpcflow/data/scripts/input_file_generator_basic.py +3 -0
- hpcflow/data/scripts/input_file_generator_basic_FAIL.py +3 -0
- hpcflow/data/scripts/input_file_generator_test_stdout_stderr.py +8 -0
- hpcflow/data/scripts/main_script_test_direct_in.py +3 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_2.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_2_fail_allowed.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_2_fail_allowed_group.py +7 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_3.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_group_direct_out_3.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_group_one_fail_direct_out_3.py +6 -0
- hpcflow/data/scripts/main_script_test_hdf5_in_obj_2.py +12 -0
- hpcflow/data/scripts/main_script_test_json_out_FAIL.py +3 -0
- hpcflow/data/scripts/main_script_test_shell_env_vars.py +12 -0
- hpcflow/data/scripts/main_script_test_std_out_std_err.py +6 -0
- hpcflow/data/scripts/output_file_parser_basic.py +3 -0
- hpcflow/data/scripts/output_file_parser_basic_FAIL.py +7 -0
- hpcflow/data/scripts/output_file_parser_test_stdout_stderr.py +8 -0
- hpcflow/data/scripts/script_exit_test.py +5 -0
- hpcflow/data/template_components/environments.yaml +1 -1
- hpcflow/sdk/__init__.py +5 -0
- hpcflow/sdk/app.py +166 -92
- hpcflow/sdk/cli.py +263 -84
- hpcflow/sdk/cli_common.py +99 -5
- hpcflow/sdk/config/callbacks.py +38 -1
- hpcflow/sdk/config/config.py +102 -13
- hpcflow/sdk/config/errors.py +19 -5
- hpcflow/sdk/config/types.py +3 -0
- hpcflow/sdk/core/__init__.py +25 -1
- hpcflow/sdk/core/actions.py +914 -262
- hpcflow/sdk/core/cache.py +76 -34
- hpcflow/sdk/core/command_files.py +14 -128
- hpcflow/sdk/core/commands.py +35 -6
- hpcflow/sdk/core/element.py +122 -50
- hpcflow/sdk/core/errors.py +58 -2
- hpcflow/sdk/core/execute.py +207 -0
- hpcflow/sdk/core/loop.py +408 -50
- hpcflow/sdk/core/loop_cache.py +4 -4
- hpcflow/sdk/core/parameters.py +382 -37
- hpcflow/sdk/core/run_dir_files.py +13 -40
- hpcflow/sdk/core/skip_reason.py +7 -0
- hpcflow/sdk/core/task.py +119 -30
- hpcflow/sdk/core/task_schema.py +68 -0
- hpcflow/sdk/core/test_utils.py +66 -27
- hpcflow/sdk/core/types.py +54 -1
- hpcflow/sdk/core/utils.py +136 -19
- hpcflow/sdk/core/workflow.py +1587 -356
- hpcflow/sdk/data/workflow_spec_schema.yaml +2 -0
- hpcflow/sdk/demo/cli.py +7 -0
- hpcflow/sdk/helper/cli.py +1 -0
- hpcflow/sdk/log.py +42 -15
- hpcflow/sdk/persistence/base.py +405 -53
- hpcflow/sdk/persistence/json.py +177 -52
- hpcflow/sdk/persistence/pending.py +237 -69
- hpcflow/sdk/persistence/store_resource.py +3 -2
- hpcflow/sdk/persistence/types.py +15 -4
- hpcflow/sdk/persistence/zarr.py +928 -81
- hpcflow/sdk/submission/jobscript.py +1408 -489
- hpcflow/sdk/submission/schedulers/__init__.py +40 -5
- hpcflow/sdk/submission/schedulers/direct.py +33 -19
- hpcflow/sdk/submission/schedulers/sge.py +51 -16
- hpcflow/sdk/submission/schedulers/slurm.py +44 -16
- hpcflow/sdk/submission/schedulers/utils.py +7 -2
- hpcflow/sdk/submission/shells/base.py +68 -20
- hpcflow/sdk/submission/shells/bash.py +222 -129
- hpcflow/sdk/submission/shells/powershell.py +200 -150
- hpcflow/sdk/submission/submission.py +852 -119
- hpcflow/sdk/submission/types.py +18 -21
- hpcflow/sdk/typing.py +24 -5
- hpcflow/sdk/utils/arrays.py +71 -0
- hpcflow/sdk/utils/deferred_file.py +55 -0
- hpcflow/sdk/utils/hashing.py +16 -0
- hpcflow/sdk/utils/patches.py +12 -0
- hpcflow/sdk/utils/strings.py +33 -0
- hpcflow/tests/api/test_api.py +32 -0
- hpcflow/tests/conftest.py +19 -0
- hpcflow/tests/data/benchmark_script_runner.yaml +26 -0
- hpcflow/tests/data/multi_path_sequences.yaml +29 -0
- hpcflow/tests/data/workflow_test_run_abort.yaml +34 -35
- hpcflow/tests/schedulers/sge/test_sge_submission.py +36 -0
- hpcflow/tests/scripts/test_input_file_generators.py +282 -0
- hpcflow/tests/scripts/test_main_scripts.py +821 -70
- hpcflow/tests/scripts/test_non_snippet_script.py +46 -0
- hpcflow/tests/scripts/test_ouput_file_parsers.py +353 -0
- hpcflow/tests/shells/wsl/test_wsl_submission.py +6 -0
- hpcflow/tests/unit/test_action.py +176 -0
- hpcflow/tests/unit/test_app.py +20 -0
- hpcflow/tests/unit/test_cache.py +46 -0
- hpcflow/tests/unit/test_cli.py +133 -0
- hpcflow/tests/unit/test_config.py +122 -1
- hpcflow/tests/unit/test_element_iteration.py +47 -0
- hpcflow/tests/unit/test_jobscript_unit.py +757 -0
- hpcflow/tests/unit/test_loop.py +1332 -27
- hpcflow/tests/unit/test_meta_task.py +325 -0
- hpcflow/tests/unit/test_multi_path_sequences.py +229 -0
- hpcflow/tests/unit/test_parameter.py +13 -0
- hpcflow/tests/unit/test_persistence.py +190 -8
- hpcflow/tests/unit/test_run.py +109 -3
- hpcflow/tests/unit/test_run_directories.py +29 -0
- hpcflow/tests/unit/test_shell.py +20 -0
- hpcflow/tests/unit/test_submission.py +5 -76
- hpcflow/tests/unit/test_workflow_template.py +31 -0
- hpcflow/tests/unit/utils/test_arrays.py +40 -0
- hpcflow/tests/unit/utils/test_deferred_file_writer.py +34 -0
- hpcflow/tests/unit/utils/test_hashing.py +65 -0
- hpcflow/tests/unit/utils/test_patches.py +5 -0
- hpcflow/tests/unit/utils/test_redirect_std.py +50 -0
- hpcflow/tests/workflows/__init__.py +0 -0
- hpcflow/tests/workflows/test_directory_structure.py +31 -0
- hpcflow/tests/workflows/test_jobscript.py +332 -0
- hpcflow/tests/workflows/test_run_status.py +198 -0
- hpcflow/tests/workflows/test_skip_downstream.py +696 -0
- hpcflow/tests/workflows/test_submission.py +140 -0
- hpcflow/tests/workflows/test_workflows.py +142 -2
- hpcflow/tests/workflows/test_zip.py +18 -0
- hpcflow/viz_demo.ipynb +6587 -3
- {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a200.dist-info}/METADATA +7 -4
- hpcflow_new2-0.2.0a200.dist-info/RECORD +222 -0
- hpcflow_new2-0.2.0a190.dist-info/RECORD +0 -165
- {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a200.dist-info}/LICENSE +0 -0
- {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a200.dist-info}/WHEEL +0 -0
- {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a200.dist-info}/entry_points.txt +0 -0
hpcflow/sdk/core/element.py
CHANGED
@@ -4,12 +4,23 @@ Elements are components of tasks.
 
 from __future__ import annotations
 import copy
-from dataclasses import dataclass, field
+from dataclasses import dataclass, field, fields
+from operator import attrgetter
 from itertools import chain
 import os
-from typing import
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    cast,
+    overload,
+    TYPE_CHECKING,
+)
 
 from hpcflow.sdk.core.enums import ParallelMode
+from hpcflow.sdk.core.skip_reason import SkipReason
 from hpcflow.sdk.core.errors import UnsupportedOSError, UnsupportedSchedulerError
 from hpcflow.sdk.core.json_like import ChildObjectSpec, JSONLike
 from hpcflow.sdk.core.loop_cache import LoopIndex
@@ -23,6 +34,7 @@ from hpcflow.sdk.core.utils import (
 )
 from hpcflow.sdk.log import TimeIt
 from hpcflow.sdk.submission.shells import get_shell
+from hpcflow.sdk.utils.hashing import get_hash
 
 if TYPE_CHECKING:
     from collections.abc import Iterable, Iterator, Mapping, Sequence
@@ -270,6 +282,12 @@ class ElementResources(JSONLike):
        Whether to use array jobs.
    max_array_items: int
        If using array jobs, up to how many items should be in the job array.
+    write_app_logs: bool
+        Whether an app log file should be written.
+    combine_jobscript_std: bool
+        Whether jobscript standard output and error streams should be combined.
+    combine_scripts: bool
+        Whether Python scripts should be combined.
    time_limit: str
        How long to run for.
    scheduler_args: dict[str, Any]
@@ -280,6 +298,13 @@ class ElementResources(JSONLike):
        Which OS to use.
    environments: dict
        Which execution environments to use.
+    resources_id: int
+        An arbitrary integer that can be used to force multiple jobscripts.
+    skip_downstream_on_failure: bool
+        Whether to skip downstream dependents on failure.
+    allow_failed_dependencies: int | float | bool | None
+        The failure tolerance with respect to dependencies, specified as a number or
+        proportion.
    SGE_parallel_env: str
        Which SGE parallel environment to request.
    SLURM_partition: str
@@ -317,6 +342,12 @@ class ElementResources(JSONLike):
     use_job_array: bool | None = None
     #: If using array jobs, up to how many items should be in the job array.
     max_array_items: int | None = None
+    #: Whether an app log file should be written.
+    write_app_logs: bool = False
+    #: Whether jobscript standard output and error streams should be combined.
+    combine_jobscript_std: bool = field(default_factory=lambda: os.name != "nt")
+    #: Whether Python scripts should be combined.
+    combine_scripts: bool | None = None
     #: How long to run for.
     time_limit: str | None = None
 
@@ -328,6 +359,13 @@ class ElementResources(JSONLike):
     os_name: str | None = None
     #: Which execution environments to use.
     environments: dict[str, dict[str, Any]] | None = None
+    #: An arbitrary integer that can be used to force multiple jobscripts.
+    resources_id: int | None = None
+    #: Whether to skip downstream dependents on failure.
+    skip_downstream_on_failure: bool = True
+    #: The failure tolerance with respect to dependencies, specified as a number or
+    #: proportion.
+    allow_failed_dependencies: int | float | bool | None = False
 
     # SGE scheduler specific:
     #: Which SGE parallel environment to request.
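
Taken together, the new `ElementResources` fields control app-log writing, jobscript stream combination, script combination, jobscript grouping, and failure propagation. A minimal sketch of setting them (illustrative only: direct construction is shown, whereas in practice these values are set via a task's `resources` specification; `num_cores` is the only other field assumed here):

    res = ElementResources(
        num_cores=2,
        write_app_logs=True,                # write an app log file for each run
        combine_scripts=True,               # combine Python snippet scripts
        resources_id=7,                     # arbitrary ID to force a separate jobscript
        skip_downstream_on_failure=False,   # do not skip dependents on failure
    )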
@@ -357,37 +395,34 @@ class ElementResources(JSONLike):
         if self.parallel_mode:
             self.parallel_mode = get_enum_by_name_or_val(ParallelMode, self.parallel_mode)
 
-
-
+        self.scheduler_args = self.scheduler_args or {}
+        self.shell_args = self.shell_args or {}
+
+    def __eq__(self, other) -> bool:
+        if type(self) != type(other):
+            return False
+        else:
+            return self.__dict__ == other.__dict__
 
+    @TimeIt.decorator
     def get_jobscript_hash(self) -> int:
         """Get hash from all arguments that distinguish jobscripts."""
 
-
-
-
-
-
+        exclude = ["time_limit", "skip_downstream_on_failure"]
+        if not self.combine_scripts:
+            # usually environment selection need not distinguish jobscripts because
+            # environments become effective/active within the command files, but if we
+            # are combining scripts, then the environments must be the same:
+            exclude.append("environments")
 
-        exclude = {"time_limit"}
         dct = {k: copy.deepcopy(v) for k, v in self.__dict__.items() if k not in exclude}
 
-
-
-
-
-        if "options" in scheduler_args:
-            dct["scheduler_args"]["options"] = _hash_dict(scheduler_args["options"])
-        dct["scheduler_args"] = _hash_dict(dct["scheduler_args"])
-
-        dct["shell_args"] = _hash_dict(shell_args)
-
-        if isinstance(envs, dict):
-            for k, v in envs.items():
-                dct["environments"][k] = _hash_dict(v)
-        dct["environments"] = _hash_dict(dct["environments"])
+        # `combine_scripts==False` and `combine_scripts==None` should have an equivalent
+        # contribution to the hash, so always set it to `False` if unset at this point:
+        if self.combine_scripts is None:
+            dct["combine_scripts"] = False
 
-        return
+        return get_hash(dct)
 
     @property
     def is_parallel(self) -> bool:
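
The rewritten `get_jobscript_hash` delegates to the new `get_hash` utility and excludes fields that should not force separate jobscripts. A sketch of the implied behaviour (inferred from the `exclude` list above, not from a test in this diff): two specifications differing only in an excluded field such as `time_limit` hash identically and can therefore share a jobscript:

    a = ElementResources(num_cores=2, time_limit="1:00:00")
    b = ElementResources(num_cores=2, time_limit="2:00:00")
    assert a.get_jobscript_hash() == b.get_jobscript_hash()  # time_limit is excluded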
@@ -416,6 +451,7 @@ class ElementResources(JSONLike):
         return ("num_cores",)  # TODO: filter on `parallel_mode` later
 
     @staticmethod
+    @TimeIt.decorator
     def get_default_os_name() -> str:
         """
         Get the default value for OS name.
@@ -423,6 +459,7 @@ class ElementResources(JSONLike):
         return os.name
 
     @classmethod
+    @TimeIt.decorator
     def get_default_shell(cls) -> str:
         """
         Get the default value for name.
@@ -430,6 +467,7 @@ class ElementResources(JSONLike):
         return cls._app.config.default_shell
 
     @classmethod
+    @TimeIt.decorator
     def get_default_scheduler(cls, os_name: str, shell_name: str) -> str:
         """
         Get the default value for scheduler.
@@ -439,6 +477,7 @@ class ElementResources(JSONLike):
             return "direct_posix"
         return cls._app.config.default_scheduler
 
+    @TimeIt.decorator
     def set_defaults(self):
         """
         Set defaults for unspecified values that need defaults.
@@ -464,9 +503,11 @@ class ElementResources(JSONLike):
             cfg_defs = cfg_sched.get("defaults", {})
             cfg_opts = cfg_defs.pop("options", {})
             opts = {**cfg_opts, **self.scheduler_args.get("options", {})}
-
+            if opts:
+                self.scheduler_args["options"] = opts
             self.scheduler_args = {**cfg_defs, **self.scheduler_args}
 
+    @TimeIt.decorator
     def validate_against_machine(self):
         """Validate the values for `os_name`, `shell` and `scheduler` against those
         supported on this machine (as specified by the app configuration)."""
@@ -477,6 +518,12 @@ class ElementResources(JSONLike):
                 scheduler=self.scheduler,
                 supported=self._app.config.schedulers,
             )
+
+        if self.os_name == "nt" and self.combine_jobscript_std:
+            raise NotImplementedError(
+                "`combine_jobscript_std` is not yet supported on Windows."
+            )
+
         # might raise `UnsupportedShellError`:
         get_shell(shell_name=self.shell, os_name=self.os_name)
 
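
Per the check added above, requesting combined streams on Windows now fails at validation time rather than at submission. A hedged sketch (`validate_against_machine` also validates the shell and scheduler against the app configuration, so it needs a configured app; the `shell` value is illustrative):

    res = ElementResources(os_name="nt", shell="powershell", combine_jobscript_std=True)
    res.validate_against_machine()  # raises NotImplementedError on Windows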
@@ -626,10 +673,21 @@ class ElementIteration(AppAware):
     @property
     def EAR_IDs(self) -> Mapping[int, Sequence[int]]:
         """
-        Mapping from
+        Mapping from action index to EAR ID, where known.
         """
         return self._EAR_IDs
 
+    @property
+    def loop_skipped(self) -> bool:
+        """True if the iteration was skipped entirely due to a loop termination."""
+        if not self.action_runs:
+            # this includes when runs are not initialised
+            return False
+        else:
+            return all(
+                i.skip_reason is SkipReason.LOOP_TERMINATION for i in self.action_runs
+            )
+
     @property
     def EAR_IDs_flat(self) -> Iterable[int]:
         """
@@ -1256,6 +1314,10 @@ class ElementIteration(AppAware):
             resources["os_name"], resources["shell"]
         )
 
+        # unset inapplicable items:
+        if "combine_scripts" in resources and not action.script_is_python_snippet:
+            del resources["combine_scripts"]
+
         return resources
 
     def get_resources_obj(
@@ -1446,47 +1508,60 @@ class Element(AppAware):
         """
         return self.iterations[-1]
 
+    @property
+    def latest_iteration_non_skipped(self):
+        """Get the latest iteration that is not loop-skipped."""
+        for iter_i in self.iterations[::-1]:
+            if not iter_i.loop_skipped:
+                return iter_i
+
     @property
     def inputs(self) -> ElementInputs:
         """
-        The inputs to this element (
+        The inputs to this element's most recent iteration (that was not skipped due to
+        loop termination).
         """
-        return self.
+        return self.latest_iteration_non_skipped.inputs
 
     @property
     def outputs(self) -> ElementOutputs:
         """
-        The outputs from this element (
+        The outputs from this element's most recent iteration (that was not skipped due to
+        loop termination).
         """
-        return self.
+        return self.latest_iteration_non_skipped.outputs
 
     @property
     def input_files(self) -> ElementInputFiles:
         """
-        The input files to this element (
+        The input files to this element's most recent iteration (that was not skipped due
+        to loop termination).
         """
-        return self.
+        return self.latest_iteration_non_skipped.input_files
 
     @property
     def output_files(self) -> ElementOutputFiles:
         """
-        The output files from this element
+        The output files from this element's most recent iteration (that was not skipped
+        due to loop termination).
         """
-        return self.
+        return self.latest_iteration_non_skipped.output_files
 
     @property
     def schema_parameters(self) -> Sequence[str]:
         """
-        The schema-defined parameters to this element
+        The schema-defined parameters to this element's most recent iteration (that was
+        not skipped due to loop termination).
         """
-        return self.
+        return self.latest_iteration_non_skipped.schema_parameters
 
     @property
     def actions(self) -> Mapping[int, ElementAction]:
         """
-        The actions of this element (
+        The actions of this element's most recent iteration (that was not skipped due to
+        loop termination).
         """
-        return self.
+        return self.latest_iteration_non_skipped.actions
 
     @property
     def action_runs(self) -> Sequence[ElementActionRun]:
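
With this hunk, the element-level convenience properties (`inputs`, `outputs`, `input_files`, `output_files`, `schema_parameters`, `actions`) resolve against the latest iteration that was not loop-skipped, rather than simply `iterations[-1]`. An assumed access pattern (the workflow wiring is not part of this diff):

    elem = workflow.tasks[0].elements[0]    # illustrative names
    it = elem.latest_iteration_non_skipped  # newest iteration with loop_skipped == False
    outs = elem.outputs                     # equivalent to it.outputs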
@@ -1494,13 +1569,7 @@ class Element(AppAware):
         A list of element action runs from the latest iteration, where only the
         final run is taken for each element action.
         """
-        return self.
-
-    def init_loop_index(self, loop_name: str) -> None:
-        """
-        Initialise the loop index if necessary.
-        """
-        pass
+        return self.latest_iteration_non_skipped.action_runs
 
     def to_element_set_data(self) -> tuple[list[InputValue], list[ResourceSpec]]:
         """Generate lists of workflow-bound InputValues and ResourceList."""
@@ -1550,14 +1619,15 @@ class Element(AppAware):
         action_idx: int | None = None,
         run_idx: int = -1,
     ) -> DataIndex:
-        """Get the data index of the most recent element iteration
+        """Get the data index of the most recent element iteration that
+        is not loop-skipped.
 
         Parameters
         ----------
         action_idx
             The index of the action within the schema.
         """
-        return self.
+        return self.latest_iteration_non_skipped.get_data_idx(
             path=path,
             action_idx=action_idx,
             run_idx=run_idx,
@@ -1633,8 +1703,9 @@ class Element(AppAware):
         raise_on_missing: bool = False,
         raise_on_unset: bool = False,
     ) -> Any:
-        """Get element data of the most recent iteration
-
+        """Get element data of the most recent iteration that is not
+        loop-skipped."""
+        return self.latest_iteration_non_skipped.get(
             path=path,
             action_idx=action_idx,
             run_idx=run_idx,
@@ -1651,6 +1722,7 @@ class Element(AppAware):
     def get_EAR_dependencies(self, as_objects: Literal[False] = False) -> set[int]:
         ...
 
+    @TimeIt.decorator
     def get_EAR_dependencies(
         self, as_objects: bool = False
     ) -> set[int] | list[ElementActionRun]:
hpcflow/sdk/core/errors.py
CHANGED
@@ -9,10 +9,12 @@ from textwrap import indent
 from typing import Any, TYPE_CHECKING
 
 if TYPE_CHECKING:
+    from logging import Logger
     from .enums import ParallelMode
     from .object_list import WorkflowLoopList
-    from .parameters import InputSource, ValueSequence
+    from .parameters import InputSource, ValueSequence, SchemaInput
     from .types import ScriptData
+    from .task import WorkflowTask
 
 
 class InputValueDuplicateSequenceAddress(ValueError):
@@ -410,7 +412,13 @@ class WorkflowLimitsError(ValueError):
     # FIXME: never used
 
 
-class
+class UnsetParameterDataErrorBase(Exception):
+    """
+    Exceptions related to attempts to retrieve unset parameters.
+    """
+
+
+class UnsetParameterDataError(UnsetParameterDataErrorBase):
     """
     Tried to read from an unset parameter.
     """
@@ -422,6 +430,50 @@ class UnsetParameterDataError(Exception):
         )
 
 
+class UnsetParameterFractionLimitExceededError(UnsetParameterDataErrorBase):
+    """
+    Given the specified `allow_failed_dependencies`, the fraction of failed dependencies
+    (unset parameter data) is too high."""
+
+    def __init__(
+        self,
+        schema_inp: SchemaInput,
+        task: WorkflowTask,
+        unset_fraction: float,
+        log: Logger | None = None,
+    ):
+        msg = (
+            f"Input {schema_inp.parameter.typ!r} of task {task.name!r}: higher "
+            f"proportion of dependencies failed ({unset_fraction!r}) than allowed "
+            f"({schema_inp.allow_failed_dependencies!r})."
+        )
+        if log:
+            log.info(msg)
+        super().__init__(msg)
+
+
+class UnsetParameterNumberLimitExceededError(UnsetParameterDataErrorBase):
+    """
+    Given the specified `allow_failed_dependencies`, the number of failed dependencies
+    (unset parameter data) is too high."""
+
+    def __init__(
+        self,
+        schema_inp: SchemaInput,
+        task: WorkflowTask,
+        unset_num: int,
+        log: Logger | None = None,
+    ):
+        msg = (
+            f"Input {schema_inp.parameter.typ!r} of task {task.name!r}: higher number of "
+            f"dependencies failed ({unset_num!r}) than allowed "
+            f"({schema_inp.allow_failed_dependencies!r})."
+        )
+        if log:
+            log.info(msg)
+        super().__init__(msg)
+
+
 class LoopAlreadyExistsError(Exception):
     """
     A particular loop (or its name) already exists.
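
Both new limit errors derive from `UnsetParameterDataErrorBase`, as does `UnsetParameterDataError` itself, so callers can catch the whole family at once. A hypothetical handler (where the SDK raises these internally is outside this hunk; `run_downstream_task` is a made-up placeholder):

    from hpcflow.sdk.core.errors import UnsetParameterDataErrorBase

    try:
        run_downstream_task()
    except UnsetParameterDataErrorBase as err:
        # covers UnsetParameterDataError and both limit-exceeded variants
        print(f"failed-dependency tolerance exceeded: {err}")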
@@ -730,6 +782,10 @@ class MissingParameterData(_MissingStoreItemError):
         super().__init__(id_lst, self._item_type)
 
 
+class ParametersMetadataReadOnlyError(RuntimeError):
+    pass
+
+
 class NotSubmitMachineError(RuntimeError):
     """
     The requested machine can't be submitted to.
hpcflow/sdk/core/execute.py
ADDED
@@ -0,0 +1,207 @@
+import asyncio
+import os
+import queue
+import struct
+import threading
+import time
+
+import zmq
+
+from hpcflow.sdk.core.app_aware import AppAware
+
+
+class Executor(AppAware):
+    def __init__(self, cmd, env, package_name):
+
+        # TODO: make zmq_server optional (but required if action is abortable, or if
+        # `script_data_in`/`out`` is "zeromq")
+
+        self.cmd = cmd
+        self.env = env
+        self.package_name = package_name
+
+        # initialise a global ZeroMQ context for use in all threads:
+        zmq.Context()
+
+        self._q = None  # queue for inter-thread communication
+
+        # assigned by `start_zmq_server`:
+        self.port_number = None
+        self.server_thread = None
+
+        # assigned on (non-aborted) completion of the subprocess via `_subprocess_runner`:
+        self.return_code = None
+
+    @property
+    def q(self):
+        if not self._q:
+            self._q = queue.Queue()
+        return self._q
+
+    @property
+    def zmq_context(self):
+        return zmq.Context.instance()
+
+    def _zmq_server(self):
+        """Start a ZeroMQ server on a random port.
+
+        This method is invoked in a separate thread via `start_zmq_server`.
+
+        """
+        socket = self.zmq_context.socket(zmq.REP)
+        port_number = socket.bind_to_random_port("tcp://*")
+        self._app.logger.info(f"zmq_server: started on port {port_number}")
+
+        # send port number back to main thread:
+        self.q.put(port_number)
+
+        self._app.logger.info(f"zmq_server: port number sent to main thread.")
+
+        # TODO: exception handling
+
+        while True:
+            message = socket.recv_string()
+            self._app.logger.info(f"zmq_server: received request: {message}")
+
+            # Check if the received message is a shutdown signal
+            if message in ("shutdown", "abort"):
+                self.q.put(message)
+                socket.send_string("shutting down the server")
+                break
+
+            else:
+                socket.send_string(f"received request: {message}")
+
+        socket.close()
+        self._app.logger.info("zmq_server: server stopped")
+
+    def start_zmq_server(self) -> int:
+
+        # start the server thread
+        server_thread = threading.Thread(target=self._zmq_server)
+        server_thread.start()
+
+        self._app.logger.info(f"server thread started")
+
+        if os.name == "nt":
+            # some sort of race condition seems to exist on Windows, where self.q.get()
+            # will occasionally hang on the Github Actions runners. This seems to resolve
+            # it.
+            time.sleep(0.1)
+
+        # block until port number received:
+        port_number = self.q.get(timeout=5)
+        self._app.logger.info(f"received port number from server thread: {port_number}")
+
+        self.port_number = port_number
+        self.server_thread = server_thread
+
+        return port_number
+
+    def stop_zmq_server(self):
+
+        # send a shutdown signal to the server:
+        socket = self.zmq_context.socket(zmq.REQ)
+        address = f"tcp://localhost:{self.port_number}"
+        socket.connect(address)
+        self._app.logger.info(
+            f"stop_zmq_server: about to send shutdown message to server: {address!r}"
+        )
+        socket.send_string("shutdown")
+        send_shutdown_out = socket.recv()
+        self._app.logger.info(f"stop_zmq_server: received reply: {send_shutdown_out!r}")
+        socket.close()
+
+        # wait for the server thread to finish:
+        self._app.logger.info(f"stop_zmq_server: joining server thread")
+        self.server_thread.join()
+
+        self._app.logger.info(f"stop_zmq_server: terminating ZMQ context")
+        self.zmq_context.term()
+        if self.server_thread.is_alive():
+            raise RuntimeError("Server thread is still alive!")
+
+    def run(self):
+        """Launch the subprocess to execute the commands, and once complete, stop the
+        ZMQ server. Kill the subprocess if a "shutdown" or "abort" message is sent to the
+        server."""
+        asyncio.run(self._run())
+        return self.return_code
+
+    def _receive_stop(self):
+        """Wait until the queue receives a shutdown message from the server"""
+        while True:
+            if self.q.get() in ("shutdown", "abort"):
+                return
+
+    async def _subprocess_runner(self):
+        app_caps = self.package_name.upper()
+        env = {**self.env, f"{app_caps}_RUN_PORT": str(self.port_number)}
+        try:
+            process = await asyncio.create_subprocess_exec(*self.cmd, env=env)
+            self._app.logger.info(
+                f"_subprocess_runner: started subprocess: {process=!r}."
+            )
+            ret_code = await process.wait()
+            self._app.logger.info(
+                f"_subprocess_runner: subprocess finished with return code: {ret_code!r}."
+            )
+            self.return_code = ret_code
+
+        except asyncio.CancelledError:
+            process.kill()
+
+    async def _run(self):
+
+        # create tasks for the subprocess and a synchronous Queue.get retrieval:
+        try:
+            wait_abort_thread = asyncio.to_thread(self._receive_stop)
+        except AttributeError:
+            # Python 3.8
+            from hpcflow.sdk.core.utils import to_thread
+
+            wait_abort_thread = to_thread(self._receive_stop)
+
+        wait_abort_task = asyncio.create_task(wait_abort_thread)
+        subprocess_task = asyncio.create_task(self._subprocess_runner())
+
+        # wait for either: subprocess to finish, or a stop signal from the server:
+        _, pending = await asyncio.wait(
+            [wait_abort_task, subprocess_task],
+            return_when=asyncio.FIRST_COMPLETED,
+        )
+
+        # TODO: test we can SIGTERM and SIGINT the subprocess successfully?
+        # - add an API for sending signals to the process via the server?
+
+        if pending == {wait_abort_task}:
+            # subprocess completed; need to shutdown the server
+            self._app.logger.info(f"_run: subprocess completed; stopping zmq server")
+            self.stop_zmq_server()
+
+        else:
+            # subprocess still running but got a stop request; need to kill subprocess:
+            self._app.logger.info(f"_run: stop request; killing subprocess")
+            subprocess_task.cancel()
+
+        if self.return_code and os.name == "nt":
+            # Windows return codes are defined as 32-bit unsigned integers, but
+            # some programs might still return negative numbers, so convert to a
+            # signed 32-bit integer:
+            self.return_code = struct.unpack("i", struct.pack("I", self.return_code))[0]
+
+    @classmethod
+    def send_abort(cls, hostname, port_number):
+        """Send an abort message to a running server."""
+        context = zmq.Context()
+        socket = context.socket(zmq.REQ)
+        address = f"tcp://{hostname}:{port_number}"
+        socket.connect(address)
+        cls._app.logger.info(
+            f"send_abort: about to send abort message to server: {address!r}"
+        )
+        socket.send_string("abort")
+        abort_rep = socket.recv()
+        cls._app.logger.info(f"send_abort: received reply: {abort_rep!r}")
+        socket.close()
+        context.term()