parsl 2025.9.8__py3-none-any.whl → 2025.11.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/app/bash.py +1 -1
- parsl/benchmark/perf.py +73 -17
- parsl/concurrent/__init__.py +95 -14
- parsl/curvezmq.py +0 -16
- parsl/data_provider/globus.py +3 -1
- parsl/dataflow/dflow.py +106 -204
- parsl/dataflow/memoization.py +146 -19
- parsl/dataflow/states.py +5 -5
- parsl/executors/base.py +2 -2
- parsl/executors/execute_task.py +2 -8
- parsl/executors/flux/executor.py +4 -6
- parsl/executors/globus_compute.py +0 -4
- parsl/executors/high_throughput/executor.py +86 -24
- parsl/executors/high_throughput/interchange.py +39 -20
- parsl/executors/high_throughput/mpi_executor.py +1 -2
- parsl/executors/high_throughput/mpi_resource_management.py +7 -14
- parsl/executors/high_throughput/process_worker_pool.py +32 -7
- parsl/executors/high_throughput/zmq_pipes.py +36 -67
- parsl/executors/radical/executor.py +2 -6
- parsl/executors/radical/rpex_worker.py +2 -2
- parsl/executors/taskvine/executor.py +5 -1
- parsl/executors/threads.py +5 -2
- parsl/jobs/states.py +2 -2
- parsl/jobs/strategy.py +7 -6
- parsl/monitoring/monitoring.py +2 -2
- parsl/monitoring/radios/filesystem.py +2 -1
- parsl/monitoring/radios/htex.py +2 -1
- parsl/monitoring/radios/multiprocessing.py +2 -1
- parsl/monitoring/radios/udp.py +2 -1
- parsl/multiprocessing.py +0 -49
- parsl/providers/base.py +24 -37
- parsl/providers/pbspro/pbspro.py +1 -1
- parsl/serialize/__init__.py +6 -9
- parsl/serialize/facade.py +0 -32
- parsl/tests/configs/local_threads_globus.py +18 -14
- parsl/tests/configs/taskvine_ex.py +1 -1
- parsl/tests/sites/test_concurrent.py +51 -3
- parsl/tests/test_checkpointing/test_periodic.py +15 -9
- parsl/tests/test_checkpointing/test_regression_233.py +0 -1
- parsl/tests/test_curvezmq.py +0 -42
- parsl/tests/test_execute_task.py +2 -11
- parsl/tests/test_htex/test_command_concurrency_regression_1321.py +54 -0
- parsl/tests/test_htex/test_htex.py +36 -1
- parsl/tests/test_htex/test_interchange_exit_bad_registration.py +2 -2
- parsl/tests/test_htex/test_priority_queue.py +26 -3
- parsl/tests/test_htex/test_zmq_binding.py +2 -1
- parsl/tests/test_mpi_apps/test_mpi_scheduler.py +18 -43
- parsl/tests/test_python_apps/test_basic.py +0 -14
- parsl/tests/test_python_apps/test_depfail_propagation.py +11 -1
- parsl/tests/test_python_apps/test_exception.py +19 -0
- parsl/tests/test_python_apps/test_garbage_collect.py +1 -6
- parsl/tests/test_python_apps/test_memoize_2.py +11 -1
- parsl/tests/test_regression/test_3874.py +47 -0
- parsl/tests/test_scaling/test_regression_3696_oscillation.py +1 -0
- parsl/tests/test_staging/test_staging_globus.py +2 -2
- parsl/tests/unit/test_globus_compute_executor.py +11 -2
- parsl/utils.py +8 -3
- parsl/version.py +1 -1
- {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/interchange.py +39 -20
- {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/process_worker_pool.py +32 -7
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/METADATA +64 -50
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/RECORD +68 -74
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/WHEEL +1 -1
- parsl/tests/configs/local_threads_checkpoint_periodic.py +0 -11
- parsl/tests/configs/local_threads_no_cache.py +0 -11
- parsl/tests/site_tests/test_provider.py +0 -88
- parsl/tests/site_tests/test_site.py +0 -70
- parsl/tests/test_aalst_patterns.py +0 -474
- parsl/tests/test_docs/test_workflow2.py +0 -42
- parsl/tests/test_error_handling/test_rand_fail.py +0 -171
- parsl/tests/test_regression/test_854.py +0 -62
- parsl/tests/test_serialization/test_pack_resource_spec.py +0 -23
- {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/entry_points.txt +0 -0
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info/licenses}/LICENSE +0 -0
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/top_level.txt +0 -0
parsl/dataflow/memoization.py
CHANGED
@@ -4,19 +4,18 @@ import hashlib
 import logging
 import os
 import pickle
+import threading
+import types
+from concurrent.futures import Future
 from functools import lru_cache, singledispatch
-from typing import Any, Dict, List, Optional, Sequence
+from typing import Any, Dict, List, Literal, Optional, Sequence

 import typeguard

 from parsl.dataflow.errors import BadCheckpoint
 from parsl.dataflow.taskrecord import TaskRecord
-
-
-from parsl import DataFlowKernel  # import loop at runtime - needed for typechecking - TODO turn into "if typing:"
-
-import types
-from concurrent.futures import Future
+from parsl.errors import ConfigurationError, InternalConsistencyError
+from parsl.utils import Timer, get_all_checkpoints

 logger = logging.getLogger(__name__)

@@ -150,19 +149,41 @@ class Memoizer:

     """

-
-    """Initialize the memoizer.
+    run_dir: str

-
-
+    def __init__(self, *,
+                 memoize: bool = True,
+                 checkpoint_files: Sequence[str] | None,
+                 checkpoint_period: Optional[str],
+                 checkpoint_mode: Literal['task_exit', 'periodic', 'dfk_exit', 'manual'] | None):
+        """Initialize the memoizer.

         KWargs:
             - memoize (Bool): enable memoization or not.
             - checkpoint (Dict): A checkpoint loaded as a dict.
         """
-        self.dfk = dfk
         self.memoize = memoize

+        self.checkpointed_tasks = 0
+
+        self.checkpoint_lock = threading.Lock()
+
+        self.checkpoint_files = checkpoint_files
+        self.checkpoint_mode = checkpoint_mode
+        self.checkpoint_period = checkpoint_period
+
+        self.checkpointable_tasks: List[TaskRecord] = []
+
+        self._checkpoint_timer: Timer | None = None
+
+    def start(self) -> None:
+        if self.checkpoint_files is not None:
+            checkpoint_files = self.checkpoint_files
+        elif self.checkpoint_files is None and self.checkpoint_mode is not None:
+            checkpoint_files = get_all_checkpoints(self.run_dir)
+        else:
+            checkpoint_files = []
+
         checkpoint = self.load_checkpoints(checkpoint_files)

         if self.memoize:

@@ -172,6 +193,26 @@ class Memoizer:
             logger.info("App caching disabled for all apps")
             self.memo_lookup_table = {}

+        if self.checkpoint_mode == "periodic":
+            if self.checkpoint_period is None:
+                raise ConfigurationError("Checkpoint period must be specified with periodic checkpoint mode")
+            else:
+                try:
+                    h, m, s = map(int, self.checkpoint_period.split(':'))
+                except Exception:
+                    raise ConfigurationError("invalid checkpoint_period provided: {0} expected HH:MM:SS".format(self.checkpoint_period))
+                checkpoint_period = (h * 3600) + (m * 60) + s
+                self._checkpoint_timer = Timer(self.checkpoint_queue, interval=checkpoint_period, name="Checkpoint")
+
+    def close(self) -> None:
+        if self.checkpoint_mode is not None:
+            logger.info("Making final checkpoint")
+            self.checkpoint_queue()
+
+        if self._checkpoint_timer:
+            logger.info("Stopping checkpoint timer")
+            self._checkpoint_timer.close()
+
     def make_hash(self, task: TaskRecord) -> str:
         """Create a hash of the task inputs.

@@ -242,16 +283,20 @@ class Memoizer:
         assert isinstance(result, Future) or result is None
         return result

-    def update_memo(self, task: TaskRecord, r: Future[Any]) -> None:
+    def update_memo_result(self, task: TaskRecord, r: Any) -> None:
+        self._update_memo(task)
+
+    def update_memo_exception(self, task: TaskRecord, e: BaseException) -> None:
+        self._update_memo(task)
+
+    def _update_memo(self, task: TaskRecord) -> None:
         """Updates the memoization lookup table with the result from a task.
+        This doesn't move any values around but associates the memoization
+        hashsum with the completed (by success or failure) AppFuture.

         Args:
-             - task (task) : A task record from dfk.tasks
-             - r (Result future): Result future
+             - task (TaskRecord) : A task record from dfk.tasks
         """
-        # TODO: could use typeguard
-        assert isinstance(r, Future)
-
         task_id = task['id']

         if not self.memoize or not task['memoize'] or 'hashsum' not in task:

@@ -265,7 +310,7 @@ class Memoizer:
             logger.info(f"Replacing app cache entry {task['hashsum']} with result from task {task_id}")
         else:
             logger.debug(f"Storing app cache entry {task['hashsum']} with result from task {task_id}")
-        self.memo_lookup_table[task['hashsum']] = r
+        self.memo_lookup_table[task['hashsum']] = task['app_fu']

     def _load_checkpoints(self, checkpointDirs: Sequence[str]) -> Dict[str, Future[Any]]:
         """Load a checkpoint file into a lookup table.

@@ -334,3 +379,85 @@ class Memoizer:
             return self._load_checkpoints(checkpointDirs)
         else:
             return {}
+
+    def update_checkpoint(self, task_record: TaskRecord) -> None:
+        if self.checkpoint_mode == 'task_exit':
+            self.checkpoint_one(task=task_record)
+        elif self.checkpoint_mode in ('manual', 'periodic', 'dfk_exit'):
+            with self.checkpoint_lock:
+                self.checkpointable_tasks.append(task_record)
+        elif self.checkpoint_mode is None:
+            pass
+        else:
+            raise InternalConsistencyError(f"Invalid checkpoint mode {self.checkpoint_mode}")
+
+    def checkpoint_one(self, *, task: TaskRecord) -> None:
+        """Checkpoint a single task to a checkpoint file.
+
+        By default the checkpoints are written to the RUNDIR of the current
+        run under RUNDIR/checkpoints/tasks.pkl
+
+        Kwargs:
+            - task : A task to checkpoint.
+
+        .. note::
+            Checkpointing only works if memoization is enabled
+
+        """
+        with self.checkpoint_lock:
+            self._checkpoint_these_tasks([task])
+
+    def checkpoint_queue(self) -> None:
+        """Checkpoint all tasks registered in self.checkpointable_tasks.
+
+        By default the checkpoints are written to the RUNDIR of the current
+        run under RUNDIR/checkpoints/tasks.pkl
+
+        .. note::
+            Checkpointing only works if memoization is enabled
+
+        """
+        with self.checkpoint_lock:
+            self._checkpoint_these_tasks(self.checkpointable_tasks)
+            self.checkpointable_tasks = []
+
+    def _checkpoint_these_tasks(self, checkpoint_queue: List[TaskRecord]) -> None:
+        """Checkpoint a list of task records.
+
+        The checkpoint lock must be held when invoking this method.
+        """
+        checkpoint_dir = '{0}/checkpoint'.format(self.run_dir)
+        checkpoint_tasks = checkpoint_dir + '/tasks.pkl'
+
+        if not os.path.exists(checkpoint_dir):
+            os.makedirs(checkpoint_dir, exist_ok=True)
+
+        count = 0
+
+        with open(checkpoint_tasks, 'ab') as f:
+            for task_record in checkpoint_queue:
+                task_id = task_record['id']
+
+                app_fu = task_record['app_fu']
+
+                if app_fu.done() and app_fu.exception() is None:
+                    hashsum = task_record['hashsum']
+                    if not hashsum:
+                        continue
+                    t = {'hash': hashsum, 'exception': None, 'result': app_fu.result()}
+
+                    # We are using pickle here since pickle dumps to a file in 'ab'
+                    # mode behave like a incremental log.
+                    pickle.dump(t, f)
+                    count += 1
+                    logger.debug("Task {} checkpointed".format(task_id))
+
+        self.checkpointed_tasks += count
+
+        if count == 0:
+            if self.checkpointed_tasks == 0:
+                logger.warning("No tasks checkpointed so far in this run. Please ensure caching is enabled")
+            else:
+                logger.debug("No tasks checkpointed in this pass.")
+        else:
+            logger.info("Done checkpointing {} tasks".format(count))
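Taken together, the keyword-only constructor plus the new start()/close() methods move the checkpointing lifecycle into the Memoizer: start() resolves checkpoint files (falling back to get_all_checkpoints(run_dir)) and arms the periodic Timer, and close() takes a final checkpoint. A minimal lifecycle sketch against the signature shown above; standalone use like this is hypothetical (in the package the DataFlowKernel constructs and drives this object), and the run_dir path is invented:

    from parsl.dataflow.memoization import Memoizer

    memoizer = Memoizer(memoize=True,
                        checkpoint_files=None,         # start() then discovers prior checkpoints
                        checkpoint_period="00:30:00",  # HH:MM:SS, required for 'periodic' mode
                        checkpoint_mode="periodic")
    memoizer.run_dir = "/tmp/example_runinfo/000"  # run_dir is the class attribute declared above
    memoizer.start()       # loads checkpoints, starts the periodic Timer
    try:
        pass               # tasks would be hashed, memoized and checkpointed here
    finally:
        memoizer.close()   # final checkpoint_queue() pass, then stops the Timer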
parsl/dataflow/states.py
CHANGED
@@ -67,10 +67,10 @@ class States(IntEnum):
         return self.__class__.__name__ + "." + self.name


-FINAL_STATES = [States.exec_done, States.memo_done, States.failed, States.dep_fail]
-"""States from which we will never move to another state, because the job has
-either definitively completed or failed."""
-
-FINAL_FAILURE_STATES = [States.failed, States.dep_fail]
+FINAL_FAILURE_STATES = {States.failed, States.dep_fail}
 """States which are final and which indicate a failure. This must
 be a subset of FINAL_STATES"""
+
+FINAL_STATES = {States.exec_done, States.memo_done, *FINAL_FAILURE_STATES}
+"""States from which we will never move to another state, because the job has
+either definitively completed or failed."""
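The change swaps list literals for sets and builds FINAL_STATES from FINAL_FAILURE_STATES, so the documented subset invariant now holds by construction. A small illustration, using only the names shown in the diff:

    from parsl.dataflow.states import FINAL_FAILURE_STATES, FINAL_STATES, States

    # Membership tests read the same as before, but are now O(1) set lookups.
    assert States.dep_fail in FINAL_FAILURE_STATES
    assert States.exec_done in FINAL_STATES

    # "This must be a subset of FINAL_STATES" is now true by construction:
    assert FINAL_FAILURE_STATES <= FINAL_STATES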
parsl/executors/base.py
CHANGED
@@ -80,11 +80,11 @@ class ParslExecutor(metaclass=ABCMeta):
         self.shutdown()
         return False

-    @abstractmethod
     def start(self) -> None:
         """Start the executor.

-        Any spin-up operations (for example: starting thread pools) should be performed here.
+        By default, this does nothing, but this method should be overridden to
+        perform any spin-up operations (for example: starting thread pools).
         """
         pass
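With @abstractmethod dropped, start() becomes an optional hook: executors with no spin-up work can simply inherit the no-op, which is why the GlobusComputeExecutor override below goes away. A hypothetical subclass as a sketch, assuming submit() and shutdown() remain the required abstract surface:

    from concurrent.futures import Future

    from parsl.executors.base import ParslExecutor

    class SketchExecutor(ParslExecutor):
        """Hypothetical executor: no start() override needed after this change."""

        def submit(self, func, resource_specification, *args, **kwargs) -> Future:
            fut: Future = Future()
            fut.set_result(func(*args, **kwargs))  # run inline, purely for the sketch
            return fut

        def shutdown(self):
            pass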
parsl/executors/execute_task.py
CHANGED
@@ -1,17 +1,11 @@
-import os
-
-from parsl.serialize import unpack_res_spec_apply_message
+from parsl.serialize import unpack_apply_message


 def execute_task(bufs: bytes):
     """Deserialize the buffer and execute the task.
     Returns the result or throws exception.
     """
-    f, args, kwargs, resource_spec = unpack_res_spec_apply_message(bufs)
-
-    for varname in resource_spec:
-        envname = "PARSL_" + str(varname).upper()
-        os.environ[envname] = str(resource_spec[varname])
+    f, args, kwargs = unpack_apply_message(bufs)

     # We might need to look into callability of the function from itself
     # since we change it's name in the new namespace
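execute_task() no longer receives a resource specification, so the PARSL_* environment-variable export is gone from this path and the buffer round-trips through the plain pack/unpack pair. A minimal round-trip sketch, pairing it with the pack_apply_message import that appears elsewhere in this diff:

    from parsl.executors.execute_task import execute_task
    from parsl.serialize import pack_apply_message

    def add(x: int, y: int) -> int:
        return x + y

    # pack_apply_message is the serializing counterpart of the
    # unpack_apply_message call shown in the diff above.
    buf = pack_apply_message(add, (1, 2), {})
    assert execute_task(buf) == 3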
parsl/executors/flux/executor.py
CHANGED
@@ -24,7 +24,7 @@ from parsl.executors.flux.execute_parsl_task import __file__ as _WORKER_PATH
 from parsl.executors.flux.flux_instance_manager import __file__ as _MANAGER_PATH
 from parsl.providers import LocalProvider
 from parsl.providers.base import ExecutionProvider
-from parsl.serialize import deserialize, pack_res_spec_apply_message
+from parsl.serialize import deserialize, pack_apply_message
 from parsl.serialize.errors import SerializationError
 from parsl.utils import RepresentationMixin

@@ -224,7 +224,7 @@ class FluxExecutor(ParslExecutor, RepresentationMixin):
         # add a ``weakref.finalize()`` function for joining the executor thread
         weakref.finalize(
             self,
-            lambda x, y: x.set() or y.join(),
+            lambda x, y: x.set() or y.join(),  # type: ignore[func-returns-value]
             self._stop_event,
             self._submission_thread,
         )

@@ -284,10 +284,8 @@ class FluxExecutor(ParslExecutor, RepresentationMixin):
         infile = os.path.join(self.working_dir, f"{task_id}_in{os.extsep}pkl")
         outfile = os.path.join(self.working_dir, f"{task_id}_out{os.extsep}pkl")
         try:
-            fn_buf = pack_res_spec_apply_message(
-                func, args, kwargs,
-                resource_specification={},
-                buffer_threshold=1024 * 1024
+            fn_buf = pack_apply_message(
+                func, args, kwargs, buffer_threshold=1 << 20,
             )
         except TypeError:
             raise SerializationError(func.__name__)
parsl/executors/globus_compute.py
CHANGED

@@ -76,10 +76,6 @@ class GlobusComputeExecutor(ParslExecutor, RepresentationMixin):
         self.storage_access = storage_access
         self.working_dir = working_dir

-    def start(self) -> None:
-        """ Start the Globus Compute Executor """
-        super().start()
-
     def submit(self, func: Callable, resource_specification: Dict[str, Any], *args: Any, **kwargs: Any) -> Future:
         """ Submit func to globus-compute
parsl/executors/high_throughput/executor.py
CHANGED

@@ -35,7 +35,7 @@ from parsl.monitoring.radios.zmq_router import ZMQRadioReceiver, start_zmq_receiver
 from parsl.process_loggers import wrap_with_logs
 from parsl.providers import LocalProvider
 from parsl.providers.base import ExecutionProvider
-from parsl.serialize import deserialize, pack_res_spec_apply_message
+from parsl.serialize import deserialize, pack_apply_message
 from parsl.serialize.errors import DeserializationError, SerializationError
 from parsl.usage_tracking.api import UsageInformation
 from parsl.utils import RepresentationMixin

@@ -160,6 +160,12 @@ GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionProvider`
 """  # Documentation for params used by both HTEx and MPIEx


+class HTEXFuture(Future):
+    def __init__(self, task_id) -> None:
+        super().__init__()
+        self.parsl_executor_task_id = task_id
+
+
 class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
     __doc__ = f"""Executor designed for cluster-scale

@@ -237,7 +243,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
     @typeguard.typechecked
     def __init__(self,
                  label: str = 'HighThroughputExecutor',
-                 provider: ExecutionProvider = LocalProvider(),
+                 provider: Optional[ExecutionProvider] = None,
                  launch_cmd: Optional[str] = None,
                  interchange_launch_cmd: Optional[Sequence[str]] = None,
                  address: Optional[str] = None,

@@ -267,7 +273,9 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):

         logger.debug("Initializing HighThroughputExecutor")

-        BlockProviderExecutor.__init__(self, provider=provider, block_error_handler=block_error_handler)
+        BlockProviderExecutor.__init__(self,
+                                       provider=provider if provider else LocalProvider(),
+                                       block_error_handler=block_error_handler)
         self.label = label
         self.worker_debug = worker_debug
         self.storage_access = storage_access

@@ -332,6 +340,13 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
         self.encrypted = encrypted
         self.cert_dir = None

+        # This flag will enable/disable internal Python mismatch checks
+        # between the interchange and worker managers. This serves as a
+        # temporary workaround for Globus Compute to support different
+        # Python versions at the endpoint and worker layers. We can drop
+        # the flag once we implement modular internal message protocols.
+        self._check_python_mismatch: bool = True
+
         if not launch_cmd:
             launch_cmd = DEFAULT_LAUNCH_CMD
         self.launch_cmd = launch_cmd

@@ -494,10 +509,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
         else:

             for serialized_msg in msgs:
-                try:
-                    msg = pickle.loads(serialized_msg)
-                except pickle.UnpicklingError:
-                    raise BadMessage("Message received could not be unpickled")
+                msg = pickle.loads(serialized_msg)

                 if msg['type'] == 'result':
                     try:

@@ -568,6 +580,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
                 "cert_dir": self.cert_dir,
                 "manager_selector": self.manager_selector,
                 "run_id": self.run_id,
+                "_check_python_mismatch": self._check_python_mismatch,
         }

         config_pickle = pickle.dumps(interchange_config)

@@ -663,7 +676,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
             logger.debug("Sending hold to manager: {}".format(manager['manager']))
             self._hold_manager(manager['manager'])

-    def submit(self, func, resource_specification, *args, **kwargs):
+    def submit(self, func: Callable, resource_specification: dict, *args, **kwargs) -> HTEXFuture:
         """Submits work to the outgoing_q.

         The outgoing_q is an external process listens on this

@@ -684,34 +697,83 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):

         self.validate_resource_spec(resource_specification)

-        if self.bad_state_is_set:
-            raise self.executor_exception
-
-        self._task_counter += 1
-        task_id = self._task_counter
-
         # handle people sending blobs gracefully
         if logger.getEffectiveLevel() <= logging.DEBUG:
             args_to_print = tuple([ar if len(ar := repr(arg)) < 100 else (ar[:100] + '...') for arg in args])
             logger.debug("Pushing function {} to queue with args {}".format(func, args_to_print))

-        fut = Future()
-        fut.parsl_executor_task_id = task_id
-        self.tasks[task_id] = fut
-
         try:
-            fn_buf = pack_res_spec_apply_message(func, args, kwargs,
-                                                 resource_specification=resource_specification,
-                                                 buffer_threshold=1024 * 1024)
+            fn_buf = pack_apply_message(func, args, kwargs, buffer_threshold=1 << 20)
         except TypeError:
             raise SerializationError(func.__name__)

-        msg = {"task_id": task_id, "resource_spec": resource_specification, "buffer": fn_buf}
+        context = {}
+        if resource_specification:
+            context["resource_spec"] = resource_specification
+
+        return self.submit_payload(context, fn_buf)
+
+    def submit_payload(self, context: dict, buffer: bytes) -> HTEXFuture:
+        """
+        Submit specially crafted payloads.
+
+        For use-cases where the ``HighThroughputExecutor`` consumer needs the payload
+        handled by the worker in a special way.  For example, if the function is
+        serialized differently than Parsl's default approach, or if the task must
+        be setup more precisely than Parsl's default ``execute_task`` allows.
+
+        An example interaction:
+
+        .. code-block: python
+
+            >>> htex: HighThroughputExecutor  # setup prior to this example
+            >>> ctxt = {
+            ...     "task_executor": {
+            ...         "f": "full.import.path.of.custom_execute_task",
+            ...         "a": ("additional", "arguments"),
+            ...         "k": {"some": "keyword", "args": "here"}
+            ...     }
+            ... }
+            >>> fn_buf = custom_serialize(task_func, *task_args, **task_kwargs)
+            >>> fut = htex.submit_payload(ctxt, fn_buf)
+
+        The custom ``custom_execute_task`` would be dynamically imported, and
+        invoked as:
+
+        .. code-block: python
+
+            args = ("additional", "arguments")
+            kwargs = {"some": "keyword", "args": "here"}
+            result = custom_execute_task(fn_buf, *args, **kwargs)
+
+        Parameters
+        ----------
+        context:
+            A task-specific context associated with the function buffer.  Parsl
+            currently implements the keys ``task_executor`` and ``resource_spec``
+
+        buffer:
+            A serialized function, that will be deserialized and executed by
+            ``execute_task`` (or custom function, if ``task_executor`` is specified)
+
+        Returns
+        -------
+        An HTEXFuture (a normal Future, with the attribute ``.parsl_executor_task_id``
+        set).  The future will be set to done when the associated function buffer has
+        been invoked and completed.
+        """
+        if self.bad_state_is_set:
+            raise self.executor_exception
+
+        self._task_counter += 1
+        task_id = self._task_counter
+
+        fut = HTEXFuture(task_id)
+        self.tasks[task_id] = fut

-        # Post task to the outgoing queue
+        msg = {"task_id": task_id, "context": context, "buffer": buffer}
         self.outgoing_q.put(msg)

-        # Return the future
         return fut

     @property
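The ad-hoc parsl_executor_task_id attribute formerly bolted onto a plain Future is now formalized as the HTEXFuture subclass, which both submit() and submit_payload() return. A standalone sketch of the same pattern, runnable without an executor:

    from concurrent.futures import Future

    class HTEXFuture(Future):
        """Mirror of the subclass added above: a Future carrying its task id."""
        def __init__(self, task_id) -> None:
            super().__init__()
            self.parsl_executor_task_id = task_id

    fut = HTEXFuture(42)
    fut.set_result("done")
    assert fut.parsl_executor_task_id == 42
    assert fut.result() == "done"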
parsl/executors/high_throughput/interchange.py
CHANGED

@@ -23,7 +23,6 @@ from parsl.monitoring.radios.base import MonitoringRadioSender
 from parsl.monitoring.radios.zmq import ZMQRadioSender
 from parsl.process_loggers import wrap_with_logs
 from parsl.serialize import serialize as serialize_object
-from parsl.utils import setproctitle
 from parsl.version import VERSION as PARSL_VERSION

 PKL_HEARTBEAT_CODE = pickle.dumps((2 ** 32) - 1)

@@ -56,6 +55,7 @@ class Interchange:
         cert_dir: Optional[str],
         manager_selector: ManagerSelector,
         run_id: str,
+        _check_python_mismatch: bool,
     ) -> None:
         """
         Parameters

@@ -99,6 +99,11 @@ class Interchange:

         cert_dir : str | None
             Path to the certificate directory.
+
+        _check_python_mismatch : bool
+            If True, the interchange and worker managers must run the same version of
+            Python.  Running different versions can cause inter-process communication
+            errors, so proceed with caution.
         """
         self.cert_dir = cert_dir
         self.logdir = logdir

@@ -126,15 +131,13 @@ class Interchange:
         logger.info("Connected to client")

         self.run_id = run_id
+        self._check_python_mismatch = _check_python_mismatch

         self.hub_address = hub_address
         self.hub_zmq_port = hub_zmq_port

         self.pending_task_queue: SortedList[Any] = SortedList(key=lambda tup: (tup[0], tup[1]))

-        # count of tasks that have been received from the submit side
-        self.task_counter = 0
-
         # count of tasks that have been sent out to worker pools
         self.count = 0

@@ -157,6 +160,7 @@ class Interchange:
         logger.info(f"Bound to port {worker_port} for incoming worker connections")

         self._ready_managers: Dict[bytes, ManagerRecord] = {}
+        self._logged_manager_count_token: object = None
         self.connected_block_history: List[str] = []

         self.heartbeat_threshold = heartbeat_threshold

@@ -213,7 +217,7 @@ class Interchange:

         reply: Any  # the type of reply depends on the command_req received (aka this needs dependent types...)

-        if self.command_channel in self.socks and self.socks[self.command_channel] == zmq.POLLIN:
+        if self.socks.get(self.command_channel) == zmq.POLLIN:
             logger.debug("entering command_server section")

             command_req = self.command_channel.recv_pyobj()

@@ -310,6 +314,7 @@ class Interchange:
             self.process_manager_socket_message(interesting_managers, monitoring_radio, kill_event)
             self.expire_bad_managers(interesting_managers, monitoring_radio)
             self.expire_drained_managers(interesting_managers, monitoring_radio)
+            self.log_manager_counts(interesting_managers)
             self.process_tasks_to_send(interesting_managers, monitoring_radio)

         self.zmq_context.destroy()

@@ -321,20 +326,20 @@ class Interchange:
         """Process incoming task message(s).
         """

-        if self.task_incoming in self.socks and self.socks[self.task_incoming] == zmq.POLLIN:
+        if self.socks.get(self.task_incoming) == zmq.POLLIN:
             logger.debug("start task_incoming section")
             msg = self.task_incoming.recv_pyobj()

             # Process priority, higher number = lower priority
-            resource_spec = msg.get('resource_spec', {})
+            task_id = msg['task_id']
+            resource_spec = msg['context'].get('resource_spec', {})
             priority = resource_spec.get('priority', float('inf'))
-            queue_entry = (-priority, -
+            queue_entry = (-priority, -task_id, msg)

-            logger.debug("
+            logger.debug("Putting task %s onto pending_task_queue", task_id)

             self.pending_task_queue.add(queue_entry)
-            self.task_counter += 1
-            logger.debug(f"Fetched {self.task_counter} tasks so far")
+            logger.debug("Put task %s onto pending_task_queue", task_id)

     def process_manager_socket_message(
         self,

@@ -354,9 +359,10 @@ class Interchange:
             mtype = meta['type']
         except Exception as e:
             logger.warning(
-                f'Failed to read manager message ([{type(e).__name__}] {e})'
+                'Failed to read manager message; ignoring message'
+                f' (Exception: [{type(e).__name__}] {e})'
             )
-            logger.debug('
+            logger.debug('Raw message bytes:\n  %r\n', msg_parts, exc_info=e)
             return

         logger.debug(

@@ -396,7 +402,9 @@ class Interchange:
             logger.info(f'Registration info for manager {manager_id!r}: {meta}')
             self._send_monitoring_info(monitoring_radio, new_rec)

-            if (mgr_minor_py, mgr_parsl_v) != (ix_minor_py, ix_parsl_v):
+            python_mismatch: bool = ix_minor_py != mgr_minor_py
+            parsl_mismatch: bool = ix_parsl_v != mgr_parsl_v
+            if parsl_mismatch or (self._check_python_mismatch and python_mismatch):
                 kill_event.set()
                 vm_exc = VersionMismatch(
                     f"py.v={ix_minor_py} parsl.v={ix_parsl_v}",

@@ -517,15 +525,24 @@ class Interchange:
                 m['active'] = False
                 self._send_monitoring_info(monitoring_radio, m)

+    def log_manager_counts(self, interesting_managers: Set[bytes]) -> None:
+        count_interesting = len(interesting_managers)
+        count_ready = len(self._ready_managers)
+
+        new_logged_manager_count_token = (count_interesting, count_ready)
+
+        if self._logged_manager_count_token != new_logged_manager_count_token:
+
+            logger.debug(
+                "Managers count (interesting/total): %d/%d",
+                count_interesting,
+                count_ready
+            )
+            self._logged_manager_count_token = new_logged_manager_count_token
+
     def process_tasks_to_send(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
         # Check if there are tasks that could be sent to managers

-        logger.debug(
-            "Managers count (interesting/total): %d/%d",
-            len(interesting_managers),
-            len(self._ready_managers)
-        )
-
         if interesting_managers and self.pending_task_queue:
             shuffled_managers = self.manager_selector.sort_managers(self._ready_managers, interesting_managers)

@@ -618,6 +635,8 @@ def start_file_logger(filename: str, level: int = logging.DEBUG, format_string:


 if __name__ == "__main__":
+    from parsl.utils import setproctitle
+
     setproctitle("parsl: HTEX interchange")

     config = pickle.load(sys.stdin.buffer)
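log_manager_counts() replaces the unconditional per-loop debug line with change detection: the (interesting, ready) pair is kept as a token and a line is logged only when it changes, which keeps the interchange log quiet in steady state. The same pattern in isolation, as a runnable sketch:

    import logging

    logger = logging.getLogger("sketch")

    class CountLogger:
        """Standalone sketch of the token pattern used by log_manager_counts."""

        def __init__(self) -> None:
            self._token: object = None  # None never equals an (int, int) tuple

        def log(self, interesting: int, ready: int) -> None:
            token = (interesting, ready)
            if self._token != token:
                logger.debug("Managers count (interesting/total): %d/%d", interesting, ready)
                self._token = token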
parsl/executors/high_throughput/mpi_executor.py
CHANGED

@@ -16,7 +16,6 @@ from parsl.executors.status_handling import BlockProviderExecutor
 from parsl.jobs.states import JobStatus
 from parsl.launchers import SimpleLauncher
 from parsl.monitoring.radios.base import RadioConfig
-from parsl.providers import LocalProvider
 from parsl.providers.base import ExecutionProvider


@@ -47,7 +46,7 @@ class MPIExecutor(HighThroughputExecutor):
     @typeguard.typechecked
     def __init__(self,
                  label: str = 'MPIExecutor',
-                 provider: ExecutionProvider = LocalProvider(),
+                 provider: Optional[ExecutionProvider] = None,
                  launch_cmd: Optional[str] = None,
                  interchange_launch_cmd: Optional[str] = None,
                  address: Optional[str] = None,