parsl 2024.6.3__py3-none-any.whl → 2024.6.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/app/app.py +0 -2
- parsl/config.py +27 -4
- parsl/dataflow/dflow.py +36 -10
- parsl/executors/high_throughput/executor.py +36 -30
- parsl/executors/high_throughput/interchange.py +26 -28
- parsl/providers/kubernetes/kube.py +22 -9
- parsl/providers/slurm/slurm.py +31 -22
- parsl/tests/configs/flux_local.py +11 -0
- parsl/tests/conftest.py +4 -0
- parsl/tests/test_bash_apps/test_stdout.py +20 -2
- parsl/tests/test_htex/test_htex.py +24 -7
- parsl/tests/test_htex/test_zmq_binding.py +22 -6
- parsl/tests/test_python_apps/test_context_manager.py +96 -1
- parsl/tests/test_python_apps/test_dependencies_deep.py +59 -0
- parsl/tests/test_radical/test_mpi_funcs.py +0 -1
- parsl/tests/unit/test_usage_tracking.py +45 -0
- parsl/usage_tracking/levels.py +6 -0
- parsl/usage_tracking/usage.py +54 -23
- parsl/version.py +1 -1
- parsl-2024.6.17.data/scripts/interchange.py +681 -0
- {parsl-2024.6.3.dist-info → parsl-2024.6.17.dist-info}/METADATA +2 -2
- {parsl-2024.6.3.dist-info → parsl-2024.6.17.dist-info}/RECORD +29 -24
- {parsl-2024.6.3.data → parsl-2024.6.17.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.6.3.data → parsl-2024.6.17.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.6.3.data → parsl-2024.6.17.data}/scripts/process_worker_pool.py +0 -0
- {parsl-2024.6.3.dist-info → parsl-2024.6.17.dist-info}/LICENSE +0 -0
- {parsl-2024.6.3.dist-info → parsl-2024.6.17.dist-info}/WHEEL +0 -0
- {parsl-2024.6.3.dist-info → parsl-2024.6.17.dist-info}/entry_points.txt +0 -0
- {parsl-2024.6.3.dist-info → parsl-2024.6.17.dist-info}/top_level.txt +0 -0
parsl/app/app.py
CHANGED
@@ -66,8 +66,6 @@ class AppBase(metaclass=ABCMeta):
             self.kwargs['walltime'] = params['walltime'].default
         if 'parsl_resource_specification' in params:
             self.kwargs['parsl_resource_specification'] = params['parsl_resource_specification'].default
-        self.outputs = params['outputs'].default if 'outputs' in params else []
-        self.inputs = params['inputs'].default if 'inputs' in params else []
 
     @abstractmethod
     def __call__(self, *args: Any, **kwargs: Any) -> AppFuture:
parsl/config.py
CHANGED
@@ -11,6 +11,8 @@ from parsl.executors.base import ParslExecutor
 from parsl.executors.threads import ThreadPoolExecutor
 from parsl.monitoring import MonitoringHub
 from parsl.usage_tracking.api import UsageInformation
+from parsl.usage_tracking.levels import DISABLED as USAGE_TRACKING_DISABLED
+from parsl.usage_tracking.levels import LEVEL_3 as USAGE_TRACKING_LEVEL_3
 from parsl.utils import RepresentationMixin
 
 logger = logging.getLogger(__name__)
@@ -38,6 +40,15 @@ class Config(RepresentationMixin, UsageInformation):
         ``checkpoint_mode='periodic'``.
     dependency_resolver: plugin point for custom dependency resolvers. Default: only resolve Futures,
         using the `SHALLOW_DEPENDENCY_RESOLVER`.
+    exit_mode: str, optional
+        When Parsl is used as a context manager (using ``with parsl.load`` syntax) then this parameter
+        controls what will happen to running tasks and exceptions at exit. The options are:
+
+        * ``cleanup``: cleanup the DFK on exit without waiting for any tasks
+        * ``skip``: skip all shutdown behaviour when exiting the context manager
+        * ``wait``: wait for all tasks to complete when exiting normally, but exit immediately when exiting due to an exception.
+
+        Default is ``cleanup``.
     garbage_collect : bool. optional.
         Delete task records from DFK when tasks have completed. Default: True
     internal_tasks_max_threads : int, optional
@@ -66,9 +77,12 @@ class Config(RepresentationMixin, UsageInformation):
         How often the scaling strategy should be executed. Default is 5 seconds.
     max_idletime : float, optional
         The maximum idle time allowed for an executor before strategy could shut down unused blocks. Default is 120.0 seconds.
-    usage_tracking :
-        Set this field to
-
+    usage_tracking : int, optional
+        Set this field to 1, 2, or 3 to opt-in to Parsl's usage tracking system.
+        The value represents the level of usage tracking detail to be collected.
+        Setting this field to 0 will disable usage tracking. Default (this field is not set): usage tracking is not enabled.
+        Parsl only collects minimal, non personally-identifiable,
+        information used for reporting to our funding agencies.
     initialize_logging : bool, optional
         Make DFK optionally not initialize any logging. Log messages
         will still be passed into the python logging system under the
@@ -92,6 +106,7 @@ class Config(RepresentationMixin, UsageInformation):
                  Literal['manual']] = None,
                  checkpoint_period: Optional[str] = None,
                  dependency_resolver: Optional[DependencyResolver] = None,
+                 exit_mode: Literal['cleanup', 'skip', 'wait'] = 'cleanup',
                  garbage_collect: bool = True,
                  internal_tasks_max_threads: int = 10,
                  retries: int = 0,
@@ -102,7 +117,7 @@ class Config(RepresentationMixin, UsageInformation):
                  strategy_period: Union[float, int] = 5,
                  max_idletime: float = 120.0,
                  monitoring: Optional[MonitoringHub] = None,
-                 usage_tracking:
+                 usage_tracking: int = 0,
                  initialize_logging: bool = True) -> None:
 
         executors = tuple(executors or [])
@@ -128,6 +143,7 @@ class Config(RepresentationMixin, UsageInformation):
             checkpoint_period = "00:30:00"
         self.checkpoint_period = checkpoint_period
         self.dependency_resolver = dependency_resolver
+        self.exit_mode = exit_mode
         self.garbage_collect = garbage_collect
         self.internal_tasks_max_threads = internal_tasks_max_threads
         self.retries = retries
@@ -136,6 +152,7 @@ class Config(RepresentationMixin, UsageInformation):
         self.strategy = strategy
         self.strategy_period = strategy_period
         self.max_idletime = max_idletime
+        self.validate_usage_tracking(usage_tracking)
         self.usage_tracking = usage_tracking
         self.initialize_logging = initialize_logging
         self.monitoring = monitoring
@@ -156,6 +173,12 @@ class Config(RepresentationMixin, UsageInformation):
             raise ConfigurationError('Executors must have unique labels ({})'.format(
                 ', '.join(['label={}'.format(repr(d)) for d in duplicates])))
 
+    def validate_usage_tracking(self, level: int) -> None:
+        if not USAGE_TRACKING_DISABLED <= level <= USAGE_TRACKING_LEVEL_3:
+            raise ConfigurationError(
+                f"Usage Tracking values must be 0, 1, 2, or 3 and not {level}"
+            )
+
     def get_usage_information(self):
         return {"executors_len": len(self.executors),
                 "dependency_resolver": self.dependency_resolver is not None}
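Note: the new ``exit_mode`` and ``usage_tracking`` options introduced above can be combined as in the following sketch. This is illustrative only; the ``ThreadPoolExecutor`` choice and the ``with parsl.load(...)`` pattern are assumptions based on the documented behaviour, not taken from this diff.

```python
# Minimal sketch of the new Config options (assumes a local ThreadPoolExecutor).
import parsl
from parsl.config import Config
from parsl.executors.threads import ThreadPoolExecutor

config = Config(
    executors=[ThreadPoolExecutor(label="local_threads")],
    exit_mode="wait",    # wait for outstanding tasks on a clean context-manager exit
    usage_tracking=3,    # opt in at the most detailed level; 0 disables tracking
)

@parsl.python_app
def double(x):
    return 2 * x

# Leaving this block triggers the exit_mode handling in DataFlowKernel.__exit__
with parsl.load(config):
    print(double(21).result())
```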
parsl/dataflow/dflow.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import atexit
+import concurrent.futures as cf
 import datetime
 import inspect
 import logging
@@ -209,6 +210,8 @@ class DataFlowKernel:
         self.tasks: Dict[int, TaskRecord] = {}
         self.submitter_lock = threading.Lock()
 
+        self.dependency_launch_pool = cf.ThreadPoolExecutor(max_workers=1, thread_name_prefix="Dependency-Launch")
+
         self.dependency_resolver = self.config.dependency_resolver if self.config.dependency_resolver is not None \
             else SHALLOW_DEPENDENCY_RESOLVER
 
@@ -217,9 +220,24 @@ class DataFlowKernel:
     def __enter__(self):
         return self
 
-    def __exit__(self, exc_type, exc_value, traceback):
-
-
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        mode = self.config.exit_mode
+        logger.debug("Exiting context manager, with exit mode '%s'", mode)
+        if mode == "cleanup":
+            logger.info("Calling cleanup for DFK")
+            self.cleanup()
+        elif mode == "skip":
+            logger.info("Skipping all cleanup handling")
+        elif mode == "wait":
+            if exc_type is None:
+                logger.info("Waiting for all tasks to complete")
+                self.wait_for_current_tasks()
+                self.cleanup()
+            else:
+                logger.info("There was an exception - cleaning up without waiting for task completion")
+                self.cleanup()
+        else:
+            raise InternalConsistencyError(f"Exit case for {mode} should be unreachable, validated by typeguard on Config()")
 
     def _send_task_log_info(self, task_record: TaskRecord) -> None:
         if self.monitoring:
@@ -611,9 +629,9 @@ class DataFlowKernel:
         return kwargs.get('_parsl_staging_inhibit', False)
 
     def launch_if_ready(self, task_record: TaskRecord) -> None:
-        """
-
-
+        """Schedules a task record for re-inspection to see if it is ready
+        for launch and for launch if it is ready. The call will return
+        immediately.
 
         This should be called by any piece of the DataFlowKernel that
         thinks a task may have become ready to run.
@@ -622,13 +640,17 @@ class DataFlowKernel:
         ready to run - launch_if_ready will not incorrectly launch that
         task.
 
-        It is also not an error to call launch_if_ready on a task that has
-        already been launched - launch_if_ready will not re-launch that
-        task.
-
         launch_if_ready is thread safe, so may be called from any thread
        or callback.
        """
+        self.dependency_launch_pool.submit(self._launch_if_ready_async, task_record)
+
+    @wrap_with_logs
+    def _launch_if_ready_async(self, task_record: TaskRecord) -> None:
+        """
+        _launch_if_ready will launch the specified task, if it is ready
+        to run (for example, without dependencies, and in pending state).
+        """
         exec_fu = None
 
         task_id = task_record['id']
@@ -1271,6 +1293,10 @@ class DataFlowKernel:
             self.monitoring.close()
             logger.info("Terminated monitoring")
 
+        logger.info("Terminating dependency launch pool")
+        self.dependency_launch_pool.shutdown()
+        logger.info("Terminated dependency launch pool")
+
         logger.info("Unregistering atexit hook")
         atexit.unregister(self.atexit_cleanup)
         logger.info("Unregistered atexit hook")
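Note: ``dependency_launch_pool`` is a single-worker ``concurrent.futures.ThreadPoolExecutor``, so ``launch_if_ready`` returns immediately while all ``_launch_if_ready_async`` bodies run serially on one background thread. A standalone sketch of that pattern (names and timings here are illustrative, not Parsl's):

```python
# Single-worker pool: submit() returns immediately, work is serialised.
import concurrent.futures as cf
import threading
import time

pool = cf.ThreadPoolExecutor(max_workers=1, thread_name_prefix="Dependency-Launch")

def launch_if_ready(task_id: int) -> None:
    """Returns immediately; the real work happens on the pool's single thread."""
    pool.submit(_launch_if_ready_async, task_id)

def _launch_if_ready_async(task_id: int) -> None:
    # With max_workers=1 these bodies never overlap, even when launch_if_ready
    # is called concurrently from many threads or callbacks.
    print(f"launching task {task_id} on {threading.current_thread().name}")
    time.sleep(0.1)

for i in range(3):
    launch_if_ready(i)

pool.shutdown()  # mirrors cleanup(): wait for queued launches, then stop the thread
```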
parsl/executors/high_throughput/executor.py
CHANGED
@@ -1,13 +1,13 @@
 import logging
 import math
 import pickle
+import subprocess
 import threading
 import typing
 import warnings
 from collections import defaultdict
 from concurrent.futures import Future
 from dataclasses import dataclass
-from multiprocessing import Process
 from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union
 
 import typeguard
@@ -18,7 +18,7 @@ from parsl.addresses import get_all_addresses
 from parsl.app.errors import RemoteExceptionWrapper
 from parsl.data_provider.staging import Staging
 from parsl.executors.errors import BadMessage, ScalingFailed
-from parsl.executors.high_throughput import
+from parsl.executors.high_throughput import zmq_pipes
 from parsl.executors.high_throughput.errors import CommandClientTimeoutError
 from parsl.executors.high_throughput.mpi_prefix_composer import (
     VALID_LAUNCHERS,
@@ -26,7 +26,6 @@ from parsl.executors.high_throughput.mpi_prefix_composer import (
 )
 from parsl.executors.status_handling import BlockProviderExecutor
 from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
-from parsl.multiprocessing import ForkProcess
 from parsl.process_loggers import wrap_with_logs
 from parsl.providers import LocalProvider
 from parsl.providers.base import ExecutionProvider
@@ -305,7 +304,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
         self._task_counter = 0
         self.worker_ports = worker_ports
         self.worker_port_range = worker_port_range
-        self.interchange_proc: Optional[
+        self.interchange_proc: Optional[subprocess.Popen] = None
         self.interchange_port_range = interchange_port_range
         self.heartbeat_threshold = heartbeat_threshold
         self.heartbeat_period = heartbeat_period
@@ -520,37 +519,45 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
 
         logger.info("Queue management worker finished")
 
-    def _start_local_interchange_process(self):
+    def _start_local_interchange_process(self) -> None:
         """ Starts the interchange process locally
 
-        Starts the interchange process locally and uses
+        Starts the interchange process locally and uses the command queue to
         get the worker task and result ports that the interchange has bound to.
         """
-        self.interchange_proc = ForkProcess(target=interchange.starter,
-                                            kwargs={"client_ports": (self.outgoing_q.port,
-                                                                     self.incoming_q.port,
-                                                                     self.command_client.port),
-                                                    "interchange_address": self.address,
-                                                    "worker_ports": self.worker_ports,
-                                                    "worker_port_range": self.worker_port_range,
-                                                    "hub_address": self.hub_address,
-                                                    "hub_zmq_port": self.hub_zmq_port,
-                                                    "logdir": self.logdir,
-                                                    "heartbeat_threshold": self.heartbeat_threshold,
-                                                    "poll_period": self.poll_period,
-                                                    "logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
-                                                    "cert_dir": self.cert_dir,
-                                                    },
-                                            daemon=True,
-                                            name="HTEX-Interchange"
-                                            )
-        self.interchange_proc.start()
 
+        interchange_config = {"client_address": "127.0.0.1",
+                              "client_ports": (self.outgoing_q.port,
+                                               self.incoming_q.port,
+                                               self.command_client.port),
+                              "interchange_address": self.address,
+                              "worker_ports": self.worker_ports,
+                              "worker_port_range": self.worker_port_range,
+                              "hub_address": self.hub_address,
+                              "hub_zmq_port": self.hub_zmq_port,
+                              "logdir": self.logdir,
+                              "heartbeat_threshold": self.heartbeat_threshold,
+                              "poll_period": self.poll_period,
+                              "logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
+                              "cert_dir": self.cert_dir,
+                              }
+
+        config_pickle = pickle.dumps(interchange_config)
+
+        self.interchange_proc = subprocess.Popen(b"interchange.py", stdin=subprocess.PIPE)
+        stdin = self.interchange_proc.stdin
+        assert stdin is not None, "Popen should have created an IO object (vs default None) because of PIPE mode"
+
+        logger.debug("Popened interchange process. Writing config object")
+        stdin.write(config_pickle)
+        stdin.flush()
+        logger.debug("Sent config object. Requesting worker ports")
         try:
             (self.worker_task_port, self.worker_result_port) = self.command_client.run("WORKER_PORTS", timeout_s=120)
         except CommandClientTimeoutError:
-            logger.error("Interchange has not completed initialization
+            logger.error("Interchange has not completed initialization. Aborting")
             raise Exception("Interchange failed to start")
+        logger.debug("Got worker ports")
 
     def _start_queue_management_thread(self):
         """Method to start the management thread as a daemon.
@@ -809,13 +816,12 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
         logger.info("Attempting HighThroughputExecutor shutdown")
 
         self.interchange_proc.terminate()
-
-
+        try:
+            self.interchange_proc.wait(timeout=timeout)
+        except subprocess.TimeoutExpired:
             logger.info("Unable to terminate Interchange process; sending SIGKILL")
             self.interchange_proc.kill()
 
-        self.interchange_proc.close()
-
         logger.info("Finished HighThroughputExecutor shutdown attempt")
 
     def get_usage_information(self):
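Note: the interchange is now started as a separate ``interchange.py`` script (shipped in the wheel's scripts directory) and receives its configuration as a pickled dict written to its stdin, instead of being forked in-process. A self-contained sketch of that handshake, using a throwaway inline child in place of Parsl's real interchange:

```python
# Parent pickles a config dict to the child's stdin; the child unpickles it
# from sys.stdin.buffer. The inline child is a stand-in, not Parsl code.
import pickle
import subprocess
import sys

child_code = (
    "import pickle, sys\n"
    "config = pickle.load(sys.stdin.buffer)\n"
    "print('child got config:', config)\n"
)

config = {"interchange_address": None, "poll_period": 10, "logging_level": 20}

proc = subprocess.Popen([sys.executable, "-c", child_code], stdin=subprocess.PIPE)
assert proc.stdin is not None  # PIPE mode guarantees a stream object

proc.stdin.write(pickle.dumps(config))
proc.stdin.flush()
proc.stdin.close()

try:
    proc.wait(timeout=10)       # mirrors the new shutdown path: wait, then kill
except subprocess.TimeoutExpired:
    proc.kill()
```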
parsl/executors/high_throughput/interchange.py
CHANGED
@@ -65,18 +65,19 @@ class Interchange:
     3. Detect workers that have failed using heartbeats
     """
     def __init__(self,
-
-
-
-
-
-
-
-
-
-
-
-
+                 *,
+                 client_address: str,
+                 interchange_address: Optional[str],
+                 client_ports: Tuple[int, int, int],
+                 worker_ports: Optional[Tuple[int, int]],
+                 worker_port_range: Tuple[int, int],
+                 hub_address: Optional[str],
+                 hub_zmq_port: Optional[int],
+                 heartbeat_threshold: int,
+                 logdir: str,
+                 logging_level: int,
+                 poll_period: int,
+                 cert_dir: Optional[str],
                 ) -> None:
        """
        Parameters
@@ -92,34 +93,34 @@ class Interchange:
            The ports at which the client can be reached
 
        worker_ports : tuple(int, int)
-            The specific two ports at which workers will connect to the Interchange.
+            The specific two ports at which workers will connect to the Interchange.
 
        worker_port_range : tuple(int, int)
            The interchange picks ports at random from the range which will be used by workers.
-            This is overridden when the worker_ports option is set.
+            This is overridden when the worker_ports option is set.
 
        hub_address : str
            The IP address at which the interchange can send info about managers to when monitoring is enabled.
-
+            When None, monitoring is disabled.
 
        hub_zmq_port : str
            The port at which the interchange can send info about managers to when monitoring is enabled.
-
+            When None, monitoring is disabled.
 
        heartbeat_threshold : int
            Number of seconds since the last heartbeat after which worker is considered lost.
 
        logdir : str
-            Parsl log directory paths. Logs and temp files go here.
+            Parsl log directory paths. Logs and temp files go here.
 
        logging_level : int
-            Logging level as defined in the logging module.
+            Logging level as defined in the logging module.
 
        poll_period : int
-            The main thread polling period, in milliseconds.
+            The main thread polling period, in milliseconds.
 
        cert_dir : str | None
-            Path to the certificate directory.
+            Path to the certificate directory.
        """
        self.cert_dir = cert_dir
        self.logdir = logdir
@@ -671,13 +672,10 @@ def start_file_logger(filename: str, level: int = logging.DEBUG, format_string: Optional[str] = None):
     logger.addHandler(handler)
 
 
-
-def starter(*args: Any, **kwargs: Any) -> None:
-    """Start the interchange process
-
-    The executor is expected to call this function. The args, kwargs match that of the Interchange.__init__
-    """
+if __name__ == "__main__":
     setproctitle("parsl: HTEX interchange")
-
-
+
+    config = pickle.load(sys.stdin.buffer)
+
+    ic = Interchange(**config)
     ic.start()
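Note: the constructor is now keyword-only (the leading ``*,``), which is what lets the ``__main__`` block apply the unpickled dict directly with ``Interchange(**config)`` while rejecting positional calls. A small illustration of that pattern with a hypothetical class (``Widget`` is not a Parsl name):

```python
# Keyword-only constructor pattern, as used by the reworked Interchange.__init__.
class Widget:
    def __init__(self, *, address: str, poll_period: int = 10) -> None:
        self.address = address
        self.poll_period = poll_period

config = {"address": "127.0.0.1", "poll_period": 5}
w = Widget(**config)           # fine: every argument is passed by name
print(w.address, w.poll_period)

try:
    Widget("127.0.0.1")        # rejected: positional arguments are not accepted
except TypeError as exc:
    print("TypeError:", exc)
```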
parsl/providers/kubernetes/kube.py
CHANGED
@@ -83,6 +83,10 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
     persistent_volumes: list[(str, str)]
         List of tuples describing persistent volumes to be mounted in the pod.
         The tuples consist of (PVC Name, Mount Directory).
+    service_account_name: str
+        Name of the service account to run the pod as.
+    annotations: Dict[str, str]
+        Annotations to set on the pod.
     """
     @typeguard.typechecked
     def __init__(self,
@@ -103,7 +107,9 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
                  group_id: Optional[str] = None,
                  run_as_non_root: bool = False,
                  secret: Optional[str] = None,
-                 persistent_volumes: List[Tuple[str, str]] = []
+                 persistent_volumes: List[Tuple[str, str]] = [],
+                 service_account_name: Optional[str] = None,
+                 annotations: Optional[Dict[str, str]] = None) -> None:
         if not _kubernetes_enabled:
             raise OptionalModuleMissing(['kubernetes'],
                                         "Kubernetes provider requires kubernetes module and config.")
@@ -146,6 +152,8 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
         self.group_id = group_id
         self.run_as_non_root = run_as_non_root
         self.persistent_volumes = persistent_volumes
+        self.service_account_name = service_account_name
+        self.annotations = annotations
 
         self.kube_client = client.CoreV1Api()
 
@@ -184,7 +192,9 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
                          pod_name=pod_name,
                          job_name=job_name,
                          cmd_string=formatted_cmd,
-                         volumes=self.persistent_volumes
+                         volumes=self.persistent_volumes,
+                         service_account_name=self.service_account_name,
+                         annotations=self.annotations)
         self.resources[pod_name] = {'status': JobStatus(JobState.RUNNING)}
 
         return pod_name
@@ -233,13 +243,13 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
         for jid in to_poll_job_ids:
             phase = None
             try:
-
+                pod = self.kube_client.read_namespaced_pod(name=jid, namespace=self.namespace)
             except Exception:
                 logger.exception("Failed to poll pod {} status, most likely because pod was terminated".format(jid))
                 if self.resources[jid]['status'] is JobStatus(JobState.RUNNING):
                     phase = 'Unknown'
             else:
-                phase =
+                phase = pod.status.phase
             if phase:
                 status = translate_table.get(phase, JobState.UNKNOWN)
                 logger.debug("Updating pod {} with status {} to parsl status {}".format(jid,
@@ -253,7 +263,9 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
                     job_name,
                     port=80,
                     cmd_string=None,
-                    volumes=[]
+                    volumes=[],
+                    service_account_name=None,
+                    annotations=None):
        """ Create a kubernetes pod for the job.
        Args:
            - image (string) : Docker image to launch
@@ -274,7 +286,7 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
        # Create the environment variables and command to initiate IPP
        environment_vars = client.V1EnvVar(name="TEST", value="SOME DATA")
 
-        launch_args = ["-c", "{0}
+        launch_args = ["-c", "{0}".format(cmd_string)]
 
        volume_mounts = []
        # Create mount paths for the volumes
@@ -311,11 +323,12 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
                                                           claim_name=volume[0])))
 
        metadata = client.V1ObjectMeta(name=pod_name,
-                                       labels={"app": job_name}
+                                       labels={"app": job_name},
+                                       annotations=annotations)
        spec = client.V1PodSpec(containers=[container],
                                image_pull_secrets=[secret],
-                                volumes=volume_defs
-                                )
+                                volumes=volume_defs,
+                                service_account_name=service_account_name)
 
        pod = client.V1Pod(spec=spec, metadata=metadata)
        api_response = self.kube_client.create_namespaced_pod(namespace=self.namespace,
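Note: the new ``service_account_name`` and ``annotations`` options flow from the provider into ``V1PodSpec`` and ``V1ObjectMeta`` as shown above. A hedged configuration sketch follows; the image, namespace and annotation values are placeholders, and running it requires the ``kubernetes`` package plus a reachable cluster config.

```python
# Sketch of a Config using the new KubernetesProvider options (placeholder values).
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.providers import KubernetesProvider

config = Config(
    executors=[
        HighThroughputExecutor(
            label="kube_htex",
            provider=KubernetesProvider(
                image="python:3.11",
                namespace="parsl-jobs",
                service_account_name="parsl-runner",   # pod runs under this service account
                annotations={"owner": "data-team"},    # applied to the pod's V1ObjectMeta
            ),
        )
    ]
)
```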
parsl/providers/slurm/slurm.py
CHANGED
@@ -19,25 +19,29 @@ from parsl.utils import RepresentationMixin, wtime_to_minutes
 
 logger = logging.getLogger(__name__)
 
+# From https://slurm.schedmd.com/sacct.html#SECTION_JOB-STATE-CODES
 translate_table = {
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
+    'PENDING': JobState.PENDING,
+    'RUNNING': JobState.RUNNING,
+    'CANCELLED': JobState.CANCELLED,
+    'COMPLETED': JobState.COMPLETED,
+    'FAILED': JobState.FAILED,
+    'NODE_FAIL': JobState.FAILED,
+    'BOOT_FAIL': JobState.FAILED,
+    'DEADLINE': JobState.TIMEOUT,
+    'TIMEOUT': JobState.TIMEOUT,
+    'REVOKED': JobState.FAILED,
+    'OUT_OF_MEMORY': JobState.FAILED,
+    'SUSPENDED': JobState.HELD,
+    'PREEMPTED': JobState.TIMEOUT,
+    'REQUEUED': JobState.PENDING
 }
 
 
 class SlurmProvider(ClusterProvider, RepresentationMixin):
     """Slurm Execution Provider
 
-    This provider uses sbatch to submit,
+    This provider uses sbatch to submit, sacct for status and scancel to cancel
     jobs. The sbatch script to be used is created from a template file in this
     same module.
 
@@ -168,14 +172,16 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
             logger.debug('No active jobs, skipping status update')
             return
 
-
+        # Using state%20 to get enough characters to not truncate output
+        # of the state. Without output can look like "<job_id> CANCELLED+"
+        cmd = "sacct -X --noheader --format=jobid,state%20 --job '{0}'".format(job_id_list)
         logger.debug("Executing %s", cmd)
         retcode, stdout, stderr = self.execute_wait(cmd)
-        logger.debug("
+        logger.debug("sacct returned %s %s", stdout, stderr)
 
         # Execute_wait failed. Do no update
         if retcode != 0:
-            logger.warning("
+            logger.warning("sacct failed with non-zero exit code {}".format(retcode))
             return
 
         jobs_missing = set(self.resources.keys())
@@ -183,7 +189,10 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
             if not line:
                 # Blank line
                 continue
-
+            # Sacct includes extra information in some outputs
+            # For example "<job_id> CANCELLED by <user_id>"
+            # This splits and ignores anything past the first two unpacked values
+            job_id, slurm_state, *ignore = line.split()
             if slurm_state not in translate_table:
                 logger.warning(f"Slurm status {slurm_state} is not recognized")
             status = translate_table.get(slurm_state, JobState.UNKNOWN)
@@ -193,13 +202,13 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
                                                          stderr_path=self.resources[job_id]['job_stderr_path'])
             jobs_missing.remove(job_id)
 
-        #
-        #
+        # sacct can get job info after jobs have completed so this path shouldn't be hit
+        # log a warning if there are missing jobs for some reason
         for missing_job in jobs_missing:
-            logger.
-            self.resources[missing_job]['status'] = JobStatus(
-
-
+            logger.warning("Updating missing job {} to completed status".format(missing_job))
+            self.resources[missing_job]['status'] = JobStatus(
+                JobState.COMPLETED, stdout_path=self.resources[missing_job]['job_stdout_path'],
+                stderr_path=self.resources[missing_job]['job_stderr_path'])
 
     def submit(self, command: str, tasks_per_node: int, job_name="parsl.slurm") -> str:
         """Submit the command as a slurm job.
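Note: with the switch from squeue to sacct, status lines can carry trailing detail such as ``123457 CANCELLED by 1000``, which is why the parser unpacks only the first two whitespace-separated fields. A standalone sketch of that parsing; the sample lines are invented, real output would come from ``sacct -X --noheader --format=jobid,state%20 --job <ids>``.

```python
# Parse sacct-style "<jobid> <state> [extra...]" lines, ignoring trailing fields.
sample_stdout = """\
123456      COMPLETED
123457      CANCELLED by 1000
123458      RUNNING
"""

for line in sample_stdout.split('\n'):
    if not line:
        continue
    job_id, slurm_state, *ignore = line.split()  # e.g. ignore == ['by', '1000']
    print(job_id, slurm_state)
```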
parsl/tests/conftest.py
CHANGED
@@ -151,6 +151,10 @@ def pytest_configure(config):
         'markers',
         'multiple_cores_required: Marks tests that require multiple cores, such as htex affinity'
     )
+    config.addinivalue_line(
+        'markers',
+        'unix_filesystem_permissions_required: Marks tests that require unix-level filesystem permission enforcement'
+    )
     config.addinivalue_line(
         'markers',
         'issue3328: Marks tests broken by issue #3328'
parsl/tests/test_bash_apps/test_stdout.py
CHANGED
@@ -16,7 +16,6 @@ def echo_to_streams(msg, stderr=None, stdout=None):
 whitelist = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'configs', '*threads*')
 
 speclist = (
-    '/bad/dir/t.out',
     ['t3.out', 'w'],
     ('t4.out', None),
     (42, 'w'),
@@ -26,7 +25,6 @@ speclist = (
 )
 
 testids = [
-    'nonexistent_dir',
     'list_not_tuple',
     'null_mode',
     'not_a_string',
@@ -55,6 +53,26 @@ def test_bad_stdout_specs(spec):
 
 
 @pytest.mark.issue3328
+@pytest.mark.unix_filesystem_permissions_required
+def test_bad_stdout_file():
+    """Testing bad stderr file"""
+
+    o = "/bad/dir/t2.out"
+
+    fn = echo_to_streams("Hello world", stdout=o, stderr='t.err')
+
+    try:
+        fn.result()
+    except perror.BadStdStreamFile:
+        pass
+    else:
+        assert False, "Did not raise expected exception BadStdStreamFile"
+
+    return
+
+
+@pytest.mark.issue3328
+@pytest.mark.unix_filesystem_permissions_required
 def test_bad_stderr_file():
     """Testing bad stderr file"""
 
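Note: the ``unix_filesystem_permissions_required`` marker registered in conftest.py lets environments that do not enforce POSIX permissions (for example, test runs as root) deselect these cases with ``pytest -m "not unix_filesystem_permissions_required"``. An illustrative test (not part of Parsl) using the marker:

```python
# Illustrative use of the new marker; requires real unix permission enforcement,
# so it is expected to fail (and should be deselected) when running as root.
import pytest

@pytest.mark.unix_filesystem_permissions_required
def test_cannot_write_into_readonly_dir(tmp_path):
    target = tmp_path / "readonly"
    target.mkdir()
    target.chmod(0o500)  # owner: read + execute only, no write
    with pytest.raises(PermissionError):
        (target / "out.txt").write_text("hello")
```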