parsl 2025.9.8__py3-none-any.whl → 2025.11.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. parsl/app/bash.py +1 -1
  2. parsl/benchmark/perf.py +73 -17
  3. parsl/concurrent/__init__.py +95 -14
  4. parsl/curvezmq.py +0 -16
  5. parsl/data_provider/globus.py +3 -1
  6. parsl/dataflow/dflow.py +106 -204
  7. parsl/dataflow/memoization.py +146 -19
  8. parsl/dataflow/states.py +5 -5
  9. parsl/executors/base.py +2 -2
  10. parsl/executors/execute_task.py +2 -8
  11. parsl/executors/flux/executor.py +4 -6
  12. parsl/executors/globus_compute.py +0 -4
  13. parsl/executors/high_throughput/executor.py +86 -24
  14. parsl/executors/high_throughput/interchange.py +39 -20
  15. parsl/executors/high_throughput/mpi_executor.py +1 -2
  16. parsl/executors/high_throughput/mpi_resource_management.py +7 -14
  17. parsl/executors/high_throughput/process_worker_pool.py +32 -7
  18. parsl/executors/high_throughput/zmq_pipes.py +36 -67
  19. parsl/executors/radical/executor.py +2 -6
  20. parsl/executors/radical/rpex_worker.py +2 -2
  21. parsl/executors/taskvine/executor.py +5 -1
  22. parsl/executors/threads.py +5 -2
  23. parsl/jobs/states.py +2 -2
  24. parsl/jobs/strategy.py +7 -6
  25. parsl/monitoring/monitoring.py +2 -2
  26. parsl/monitoring/radios/filesystem.py +2 -1
  27. parsl/monitoring/radios/htex.py +2 -1
  28. parsl/monitoring/radios/multiprocessing.py +2 -1
  29. parsl/monitoring/radios/udp.py +2 -1
  30. parsl/multiprocessing.py +0 -49
  31. parsl/providers/base.py +24 -37
  32. parsl/providers/pbspro/pbspro.py +1 -1
  33. parsl/serialize/__init__.py +6 -9
  34. parsl/serialize/facade.py +0 -32
  35. parsl/tests/configs/local_threads_globus.py +18 -14
  36. parsl/tests/configs/taskvine_ex.py +1 -1
  37. parsl/tests/sites/test_concurrent.py +51 -3
  38. parsl/tests/test_checkpointing/test_periodic.py +15 -9
  39. parsl/tests/test_checkpointing/test_regression_233.py +0 -1
  40. parsl/tests/test_curvezmq.py +0 -42
  41. parsl/tests/test_execute_task.py +2 -11
  42. parsl/tests/test_htex/test_command_concurrency_regression_1321.py +54 -0
  43. parsl/tests/test_htex/test_htex.py +36 -1
  44. parsl/tests/test_htex/test_interchange_exit_bad_registration.py +2 -2
  45. parsl/tests/test_htex/test_priority_queue.py +26 -3
  46. parsl/tests/test_htex/test_zmq_binding.py +2 -1
  47. parsl/tests/test_mpi_apps/test_mpi_scheduler.py +18 -43
  48. parsl/tests/test_python_apps/test_basic.py +0 -14
  49. parsl/tests/test_python_apps/test_depfail_propagation.py +11 -1
  50. parsl/tests/test_python_apps/test_exception.py +19 -0
  51. parsl/tests/test_python_apps/test_garbage_collect.py +1 -6
  52. parsl/tests/test_python_apps/test_memoize_2.py +11 -1
  53. parsl/tests/test_regression/test_3874.py +47 -0
  54. parsl/tests/test_scaling/test_regression_3696_oscillation.py +1 -0
  55. parsl/tests/test_staging/test_staging_globus.py +2 -2
  56. parsl/tests/unit/test_globus_compute_executor.py +11 -2
  57. parsl/utils.py +8 -3
  58. parsl/version.py +1 -1
  59. {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/interchange.py +39 -20
  60. {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/process_worker_pool.py +32 -7
  61. {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/METADATA +64 -50
  62. {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/RECORD +68 -74
  63. {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/WHEEL +1 -1
  64. parsl/tests/configs/local_threads_checkpoint_periodic.py +0 -11
  65. parsl/tests/configs/local_threads_no_cache.py +0 -11
  66. parsl/tests/site_tests/test_provider.py +0 -88
  67. parsl/tests/site_tests/test_site.py +0 -70
  68. parsl/tests/test_aalst_patterns.py +0 -474
  69. parsl/tests/test_docs/test_workflow2.py +0 -42
  70. parsl/tests/test_error_handling/test_rand_fail.py +0 -171
  71. parsl/tests/test_regression/test_854.py +0 -62
  72. parsl/tests/test_serialization/test_pack_resource_spec.py +0 -23
  73. {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/exec_parsl_function.py +0 -0
  74. {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/parsl_coprocess.py +0 -0
  75. {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/entry_points.txt +0 -0
  76. {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info/licenses}/LICENSE +0 -0
  77. {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/top_level.txt +0 -0
parsl/dataflow/memoization.py CHANGED
@@ -4,19 +4,18 @@ import hashlib
 import logging
 import os
 import pickle
+import threading
+import types
+from concurrent.futures import Future
 from functools import lru_cache, singledispatch
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence
+from typing import Any, Dict, List, Literal, Optional, Sequence
 
 import typeguard
 
 from parsl.dataflow.errors import BadCheckpoint
 from parsl.dataflow.taskrecord import TaskRecord
-
-if TYPE_CHECKING:
-    from parsl import DataFlowKernel  # import loop at runtime - needed for typechecking - TODO turn into "if typing:"
-
-import types
-from concurrent.futures import Future
+from parsl.errors import ConfigurationError, InternalConsistencyError
+from parsl.utils import Timer, get_all_checkpoints
 
 logger = logging.getLogger(__name__)
 
@@ -150,19 +149,41 @@ class Memoizer:
 
    """
 
-    def __init__(self, dfk: DataFlowKernel, *, memoize: bool = True, checkpoint_files: Sequence[str]):
-        """Initialize the memoizer.
+    run_dir: str
 
-        Args:
-            - dfk (DFK obj): The DFK object
+    def __init__(self, *,
+                 memoize: bool = True,
+                 checkpoint_files: Sequence[str] | None,
+                 checkpoint_period: Optional[str],
+                 checkpoint_mode: Literal['task_exit', 'periodic', 'dfk_exit', 'manual'] | None):
+        """Initialize the memoizer.
 
        KWargs:
            - memoize (Bool): enable memoization or not.
            - checkpoint (Dict): A checkpoint loaded as a dict.
        """
-        self.dfk = dfk
        self.memoize = memoize
 
+        self.checkpointed_tasks = 0
+
+        self.checkpoint_lock = threading.Lock()
+
+        self.checkpoint_files = checkpoint_files
+        self.checkpoint_mode = checkpoint_mode
+        self.checkpoint_period = checkpoint_period
+
+        self.checkpointable_tasks: List[TaskRecord] = []
+
+        self._checkpoint_timer: Timer | None = None
+
+    def start(self) -> None:
+        if self.checkpoint_files is not None:
+            checkpoint_files = self.checkpoint_files
+        elif self.checkpoint_files is None and self.checkpoint_mode is not None:
+            checkpoint_files = get_all_checkpoints(self.run_dir)
+        else:
+            checkpoint_files = []
+
        checkpoint = self.load_checkpoints(checkpoint_files)
 
        if self.memoize:
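Note on the new construction pattern: the constructor no longer takes a `dfk` handle; checkpoint configuration arrives as keyword arguments, and checkpoint loading is deferred to `start()`. A minimal sketch of the new lifecycle, assuming (per the `run_dir: str` class attribute above) that the caller assigns `run_dir` before calling `start()`:

    from parsl.dataflow.memoization import Memoizer

    memoizer = Memoizer(memoize=True,
                        checkpoint_files=None,        # None + a mode => discover files under run_dir
                        checkpoint_period="00:10:00",
                        checkpoint_mode="periodic")
    memoizer.run_dir = "/tmp/parsl-run"  # normally assigned by the DataFlowKernel
    memoizer.start()                     # loads checkpoints and arms the periodic Timer
    # ... tasks run, results accumulate ...
    memoizer.close()                     # final checkpoint, then the Timer is stopped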
@@ -172,6 +193,26 @@ class Memoizer:
            logger.info("App caching disabled for all apps")
            self.memo_lookup_table = {}
 
+        if self.checkpoint_mode == "periodic":
+            if self.checkpoint_period is None:
+                raise ConfigurationError("Checkpoint period must be specified with periodic checkpoint mode")
+            else:
+                try:
+                    h, m, s = map(int, self.checkpoint_period.split(':'))
+                except Exception:
+                    raise ConfigurationError("invalid checkpoint_period provided: {0} expected HH:MM:SS".format(self.checkpoint_period))
+                checkpoint_period = (h * 3600) + (m * 60) + s
+                self._checkpoint_timer = Timer(self.checkpoint_queue, interval=checkpoint_period, name="Checkpoint")
+
+    def close(self) -> None:
+        if self.checkpoint_mode is not None:
+            logger.info("Making final checkpoint")
+            self.checkpoint_queue()
+
+        if self._checkpoint_timer:
+            logger.info("Stopping checkpoint timer")
+            self._checkpoint_timer.close()
+
    def make_hash(self, task: TaskRecord) -> str:
        """Create a hash of the task inputs.
 
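For reference, the `HH:MM:SS` string is converted to whole seconds before being handed to the `Timer`. A standalone sketch of that conversion (illustrative only; the real logic lives in `Memoizer.start` above):

    def parse_period(period: str) -> int:
        # "HH:MM:SS" -> seconds, mirroring the parsing in Memoizer.start
        h, m, s = map(int, period.split(':'))
        return h * 3600 + m * 60 + s

    assert parse_period("00:30:00") == 1800  # checkpoint every 30 minutes
    assert parse_period("01:00:30") == 3630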
@@ -242,16 +283,20 @@ class Memoizer:
        assert isinstance(result, Future) or result is None
        return result
 
-    def update_memo(self, task: TaskRecord, r: Future[Any]) -> None:
+    def update_memo_result(self, task: TaskRecord, r: Any) -> None:
+        self._update_memo(task)
+
+    def update_memo_exception(self, task: TaskRecord, e: BaseException) -> None:
+        self._update_memo(task)
+
+    def _update_memo(self, task: TaskRecord) -> None:
        """Updates the memoization lookup table with the result from a task.
+        This doesn't move any values around but associates the memoization
+        hashsum with the completed (by success or failure) AppFuture.
 
        Args:
-            - task (dict) : A task dict from dfk.tasks
-            - r (Result future): Result future
+            - task (TaskRecord) : A task record from dfk.tasks
        """
-        # TODO: could use typeguard
-        assert isinstance(r, Future)
-
        task_id = task['id']
 
        if not self.memoize or not task['memoize'] or 'hashsum' not in task:
@@ -265,7 +310,7 @@ class Memoizer:
            logger.info(f"Replacing app cache entry {task['hashsum']} with result from task {task_id}")
        else:
            logger.debug(f"Storing app cache entry {task['hashsum']} with result from task {task_id}")
-        self.memo_lookup_table[task['hashsum']] = r
+        self.memo_lookup_table[task['hashsum']] = task['app_fu']
 
    def _load_checkpoints(self, checkpointDirs: Sequence[str]) -> Dict[str, Future[Any]]:
        """Load a checkpoint file into a lookup table.
@@ -334,3 +379,85 @@ class Memoizer:
            return self._load_checkpoints(checkpointDirs)
        else:
            return {}
+
+    def update_checkpoint(self, task_record: TaskRecord) -> None:
+        if self.checkpoint_mode == 'task_exit':
+            self.checkpoint_one(task=task_record)
+        elif self.checkpoint_mode in ('manual', 'periodic', 'dfk_exit'):
+            with self.checkpoint_lock:
+                self.checkpointable_tasks.append(task_record)
+        elif self.checkpoint_mode is None:
+            pass
+        else:
+            raise InternalConsistencyError(f"Invalid checkpoint mode {self.checkpoint_mode}")
+
+    def checkpoint_one(self, *, task: TaskRecord) -> None:
+        """Checkpoint a single task to a checkpoint file.
+
+        By default the checkpoints are written to the RUNDIR of the current
+        run under RUNDIR/checkpoints/tasks.pkl
+
+        Kwargs:
+            - task : A task to checkpoint.
+
+        .. note::
+            Checkpointing only works if memoization is enabled
+
+        """
+        with self.checkpoint_lock:
+            self._checkpoint_these_tasks([task])
+
+    def checkpoint_queue(self) -> None:
+        """Checkpoint all tasks registered in self.checkpointable_tasks.
+
+        By default the checkpoints are written to the RUNDIR of the current
+        run under RUNDIR/checkpoints/tasks.pkl
+
+        .. note::
+            Checkpointing only works if memoization is enabled
+
+        """
+        with self.checkpoint_lock:
+            self._checkpoint_these_tasks(self.checkpointable_tasks)
+            self.checkpointable_tasks = []
+
+    def _checkpoint_these_tasks(self, checkpoint_queue: List[TaskRecord]) -> None:
+        """Checkpoint a list of task records.
+
+        The checkpoint lock must be held when invoking this method.
+        """
+        checkpoint_dir = '{0}/checkpoint'.format(self.run_dir)
+        checkpoint_tasks = checkpoint_dir + '/tasks.pkl'
+
+        if not os.path.exists(checkpoint_dir):
+            os.makedirs(checkpoint_dir, exist_ok=True)
+
+        count = 0
+
+        with open(checkpoint_tasks, 'ab') as f:
+            for task_record in checkpoint_queue:
+                task_id = task_record['id']
+
+                app_fu = task_record['app_fu']
+
+                if app_fu.done() and app_fu.exception() is None:
+                    hashsum = task_record['hashsum']
+                    if not hashsum:
+                        continue
+                    t = {'hash': hashsum, 'exception': None, 'result': app_fu.result()}
+
+                    # We are using pickle here since pickle dumps to a file in 'ab'
+                    # mode behave like an incremental log.
+                    pickle.dump(t, f)
+                    count += 1
+                    logger.debug("Task {} checkpointed".format(task_id))
+
+        self.checkpointed_tasks += count
+
+        if count == 0:
+            if self.checkpointed_tasks == 0:
+                logger.warning("No tasks checkpointed so far in this run. Please ensure caching is enabled")
+            else:
+                logger.debug("No tasks checkpointed in this pass.")
+        else:
+            logger.info("Done checkpointing {} tasks".format(count))
parsl/dataflow/states.py CHANGED
@@ -67,10 +67,10 @@ class States(IntEnum):
        return self.__class__.__name__ + "." + self.name
 
 
-FINAL_STATES = [States.exec_done, States.memo_done, States.failed, States.dep_fail]
-"""States from which we will never move to another state, because the job has
-either definitively completed or failed."""
-
-FINAL_FAILURE_STATES = [States.failed, States.dep_fail]
+FINAL_FAILURE_STATES = {States.failed, States.dep_fail}
 """States which are final and which indicate a failure. This must
 be a subset of FINAL_STATES"""
+
+FINAL_STATES = {States.exec_done, States.memo_done, *FINAL_FAILURE_STATES}
+"""States from which we will never move to another state, because the job has
+either definitively completed or failed."""
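With `FINAL_FAILURE_STATES` now defined first and spliced into `FINAL_STATES` via set unpacking, the subset relationship promised by the docstring holds by construction, and both collections gain set semantics (fast membership tests, no duplicates). A quick check:

    from parsl.dataflow.states import FINAL_FAILURE_STATES, FINAL_STATES

    assert FINAL_FAILURE_STATES <= FINAL_STATES  # subset invariant, true by construction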
parsl/executors/base.py CHANGED
@@ -80,11 +80,11 @@ class ParslExecutor(metaclass=ABCMeta):
        self.shutdown()
        return False
 
-    @abstractmethod
    def start(self) -> None:
        """Start the executor.
 
-        Any spin-up operations (for example: starting thread pools) should be performed here.
+        By default, this does nothing, but this method should be overridden to
+        perform any spin-up operations (for example: starting thread pools).
        """
        pass
 
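Since `start()` is no longer abstract, an executor that needs no spin-up work can simply inherit the no-op. A hypothetical minimal subclass, assuming `submit` and `shutdown` remain the abstract methods a concrete executor must implement:

    from concurrent.futures import Future

    from parsl.executors.base import ParslExecutor

    class InlineExecutor(ParslExecutor):
        # No start() override needed any more: the base class no-op suffices.

        def submit(self, func, resource_specification, *args, **kwargs):
            fut = Future()
            fut.set_result(func(*args, **kwargs))  # run inline, purely for illustration
            return fut

        def shutdown(self):
            pass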
parsl/executors/execute_task.py CHANGED
@@ -1,17 +1,11 @@
-import os
-
-from parsl.serialize import unpack_res_spec_apply_message
+from parsl.serialize import unpack_apply_message
 
 
 def execute_task(bufs: bytes):
    """Deserialize the buffer and execute the task.
    Returns the result or throws exception.
    """
-    f, args, kwargs, resource_spec = unpack_res_spec_apply_message(bufs)
-
-    for varname in resource_spec:
-        envname = "PARSL_" + str(varname).upper()
-        os.environ[envname] = str(resource_spec[varname])
+    f, args, kwargs = unpack_apply_message(bufs)
 
    # We might need to look into callability of the function from itself
    # since we change it's name in the new namespace
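A hedged round-trip sketch of the simplified message format, using only the signatures visible in this diff (`pack_apply_message` on the submit side, `unpack_apply_message` here):

    from parsl.serialize import pack_apply_message, unpack_apply_message

    def double(x):
        return 2 * x

    buf = pack_apply_message(double, (21,), {}, buffer_threshold=1 << 20)
    f, args, kwargs = unpack_apply_message(buf)
    assert f(*args, **kwargs) == 42

The `resource_spec` element, and the `PARSL_*` environment variables derived from it, are gone: resource specifications now travel in the task context (see the `submit_payload` changes below) rather than inside the serialized buffer.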
parsl/executors/flux/executor.py CHANGED
@@ -24,7 +24,7 @@ from parsl.executors.flux.execute_parsl_task import __file__ as _WORKER_PATH
 from parsl.executors.flux.flux_instance_manager import __file__ as _MANAGER_PATH
 from parsl.providers import LocalProvider
 from parsl.providers.base import ExecutionProvider
-from parsl.serialize import deserialize, pack_res_spec_apply_message
+from parsl.serialize import deserialize, pack_apply_message
 from parsl.serialize.errors import SerializationError
 from parsl.utils import RepresentationMixin
@@ -224,7 +224,7 @@ class FluxExecutor(ParslExecutor, RepresentationMixin):
        # add a ``weakref.finalize()`` function for joining the executor thread
        weakref.finalize(
            self,
-            lambda x, y: x.set() or y.join(),
+            lambda x, y: x.set() or y.join(),  # type: ignore[func-returns-value]
            self._stop_event,
            self._submission_thread,
        )
@@ -284,10 +284,8 @@ class FluxExecutor(ParslExecutor, RepresentationMixin):
        infile = os.path.join(self.working_dir, f"{task_id}_in{os.extsep}pkl")
        outfile = os.path.join(self.working_dir, f"{task_id}_out{os.extsep}pkl")
        try:
-            fn_buf = pack_res_spec_apply_message(
-                func, args, kwargs,
-                resource_specification={},
-                buffer_threshold=1024 * 1024
+            fn_buf = pack_apply_message(
+                func, args, kwargs, buffer_threshold=1 << 20,
            )
        except TypeError:
            raise SerializationError(func.__name__)
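Note that `1 << 20` and the old `1024 * 1024` are the same value (1048576 bytes, i.e. 1 MiB), so only the spelling of the serialization threshold changed; the behavioural change in this hunk is dropping the empty `resource_specification` argument along with the `pack_res_spec_apply_message` call.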
parsl/executors/globus_compute.py CHANGED
@@ -76,10 +76,6 @@ class GlobusComputeExecutor(ParslExecutor, RepresentationMixin):
        self.storage_access = storage_access
        self.working_dir = working_dir
 
-    def start(self) -> None:
-        """ Start the Globus Compute Executor """
-        super().start()
-
    def submit(self, func: Callable, resource_specification: Dict[str, Any], *args: Any, **kwargs: Any) -> Future:
        """ Submit func to globus-compute
 
parsl/executors/high_throughput/executor.py CHANGED
@@ -35,7 +35,7 @@ from parsl.monitoring.radios.zmq_router import ZMQRadioReceiver, start_zmq_receiver
 from parsl.process_loggers import wrap_with_logs
 from parsl.providers import LocalProvider
 from parsl.providers.base import ExecutionProvider
-from parsl.serialize import deserialize, pack_res_spec_apply_message
+from parsl.serialize import deserialize, pack_apply_message
 from parsl.serialize.errors import DeserializationError, SerializationError
 from parsl.usage_tracking.api import UsageInformation
 from parsl.utils import RepresentationMixin
@@ -160,6 +160,12 @@ GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionP
 """ # Documentation for params used by both HTEx and MPIEx
 
 
+class HTEXFuture(Future):
+    def __init__(self, task_id) -> None:
+        super().__init__()
+        self.parsl_executor_task_id = task_id
+
+
 class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
    __doc__ = f"""Executor designed for cluster-scale
 
@@ -237,7 +243,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
    @typeguard.typechecked
    def __init__(self,
                 label: str = 'HighThroughputExecutor',
-                 provider: ExecutionProvider = LocalProvider(),
+                 provider: Optional[ExecutionProvider] = None,
                 launch_cmd: Optional[str] = None,
                 interchange_launch_cmd: Optional[Sequence[str]] = None,
                 address: Optional[str] = None,
@@ -267,7 +273,9 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
 
        logger.debug("Initializing HighThroughputExecutor")
 
-        BlockProviderExecutor.__init__(self, provider=provider, block_error_handler=block_error_handler)
+        BlockProviderExecutor.__init__(self,
+                                       provider=provider if provider else LocalProvider(),
+                                       block_error_handler=block_error_handler)
        self.label = label
        self.worker_debug = worker_debug
        self.storage_access = storage_access
@@ -332,6 +340,13 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
        self.encrypted = encrypted
        self.cert_dir = None
 
+        # This flag will enable/disable internal Python mismatch checks
+        # between the interchange and worker managers. This serves as a
+        # temporary workaround for Globus Compute to support different
+        # Python versions at the endpoint and worker layers. We can drop
+        # the flag once we implement modular internal message protocols.
+        self._check_python_mismatch: bool = True
+
        if not launch_cmd:
            launch_cmd = DEFAULT_LAUNCH_CMD
        self.launch_cmd = launch_cmd
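A hypothetical consumer-side sketch of the workaround this flag enables (the leading underscore marks it as internal, unsupported API; the `label` value here is arbitrary):

    from parsl.executors import HighThroughputExecutor

    htex = HighThroughputExecutor(label="gc-endpoint")
    htex._check_python_mismatch = False  # forwarded to the interchange config at start()
    # The interchange would then tolerate managers running a different Python minor version.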
@@ -494,10 +509,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
        else:
 
            for serialized_msg in msgs:
-                try:
-                    msg = pickle.loads(serialized_msg)
-                except pickle.UnpicklingError:
-                    raise BadMessage("Message received could not be unpickled")
+                msg = pickle.loads(serialized_msg)
 
                if msg['type'] == 'result':
                    try:
@@ -568,6 +580,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
            "cert_dir": self.cert_dir,
            "manager_selector": self.manager_selector,
            "run_id": self.run_id,
+            "_check_python_mismatch": self._check_python_mismatch,
        }
 
        config_pickle = pickle.dumps(interchange_config)
@@ -663,7 +676,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
            logger.debug("Sending hold to manager: {}".format(manager['manager']))
            self._hold_manager(manager['manager'])
 
-    def submit(self, func, resource_specification, *args, **kwargs):
+    def submit(self, func: Callable, resource_specification: dict, *args, **kwargs) -> HTEXFuture:
        """Submits work to the outgoing_q.
 
        The outgoing_q is an external process listens on this
@@ -684,34 +697,83 @@
 
        self.validate_resource_spec(resource_specification)
 
-        if self.bad_state_is_set:
-            raise self.executor_exception
-
-        self._task_counter += 1
-        task_id = self._task_counter
-
        # handle people sending blobs gracefully
        if logger.getEffectiveLevel() <= logging.DEBUG:
            args_to_print = tuple([ar if len(ar := repr(arg)) < 100 else (ar[:100] + '...') for arg in args])
            logger.debug("Pushing function {} to queue with args {}".format(func, args_to_print))
 
-        fut = Future()
-        fut.parsl_executor_task_id = task_id
-        self.tasks[task_id] = fut
-
        try:
-            fn_buf = pack_res_spec_apply_message(func, args, kwargs,
-                                                 resource_specification=resource_specification,
-                                                 buffer_threshold=1024 * 1024)
+            fn_buf = pack_apply_message(func, args, kwargs, buffer_threshold=1 << 20)
        except TypeError:
            raise SerializationError(func.__name__)
 
-        msg = {"task_id": task_id, "resource_spec": resource_specification, "buffer": fn_buf}
+        context = {}
+        if resource_specification:
+            context["resource_spec"] = resource_specification
+
+        return self.submit_payload(context, fn_buf)
+
+    def submit_payload(self, context: dict, buffer: bytes) -> HTEXFuture:
+        """
+        Submit specially crafted payloads.
+
+        For use-cases where the ``HighThroughputExecutor`` consumer needs the payload
+        handled by the worker in a special way. For example, if the function is
+        serialized differently than Parsl's default approach, or if the task must
+        be set up more precisely than Parsl's default ``execute_task`` allows.
+
+        An example interaction:
+
+        .. code-block:: python
+
+            >>> htex: HighThroughputExecutor  # setup prior to this example
+            >>> ctxt = {
+            ...     "task_executor": {
+            ...         "f": "full.import.path.of.custom_execute_task",
+            ...         "a": ("additional", "arguments"),
+            ...         "k": {"some": "keyword", "args": "here"}
+            ...     }
+            ... }
+            >>> fn_buf = custom_serialize(task_func, *task_args, **task_kwargs)
+            >>> fut = htex.submit_payload(ctxt, fn_buf)
+
+        The custom ``custom_execute_task`` would be dynamically imported, and
+        invoked as:
+
+        .. code-block:: python
+
+            args = ("additional", "arguments")
+            kwargs = {"some": "keyword", "args": "here"}
+            result = custom_execute_task(fn_buf, *args, **kwargs)
+
+        Parameters
+        ----------
+        context:
+            A task-specific context associated with the function buffer. Parsl
+            currently implements the keys ``task_executor`` and ``resource_spec``
+
+        buffer:
+            A serialized function that will be deserialized and executed by
+            ``execute_task`` (or a custom function, if ``task_executor`` is specified)
+
+        Returns
+        -------
+        An HTEXFuture (a normal Future, with the attribute ``.parsl_executor_task_id``
+        set). The future will be set to done when the associated function buffer has
+        been invoked and completed.
+        """
+        if self.bad_state_is_set:
+            raise self.executor_exception
+
+        self._task_counter += 1
+        task_id = self._task_counter
+
+        fut = HTEXFuture(task_id)
+        self.tasks[task_id] = fut
 
-        # Post task to the outgoing queue
+        msg = {"task_id": task_id, "context": context, "buffer": buffer}
        self.outgoing_q.put(msg)
 
-        # Return the future
        return fut
 
    @property
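A hedged sketch of what a matching task-executor function might look like, given the invocation shape shown in the docstring above (`custom_execute_task(fn_buf, *args, **kwargs)`); the JSON payload format here is invented purely for illustration:

    import json

    def custom_execute_task(buffer: bytes, *args, **kwargs) -> object:
        # Decode a hypothetical non-Parsl serialization of (function name, arguments)...
        payload = json.loads(buffer)
        dispatch = {"sum": sum, "max": max}  # toy dispatch table
        # ...and run it, returning the task's result to the worker machinery.
        return dispatch[payload["fn"]](payload["args"])

Submitted via ``submit_payload`` with a context such as ``{"task_executor": {"f": "mymodule.custom_execute_task", "a": (), "k": {}}}``, the worker would import the function by its dotted path and hand it the raw buffer, as the docstring describes.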
parsl/executors/high_throughput/interchange.py CHANGED
@@ -23,7 +23,6 @@ from parsl.monitoring.radios.base import MonitoringRadioSender
 from parsl.monitoring.radios.zmq import ZMQRadioSender
 from parsl.process_loggers import wrap_with_logs
 from parsl.serialize import serialize as serialize_object
-from parsl.utils import setproctitle
 from parsl.version import VERSION as PARSL_VERSION
 
 PKL_HEARTBEAT_CODE = pickle.dumps((2 ** 32) - 1)
@@ -56,6 +55,7 @@ class Interchange:
                 cert_dir: Optional[str],
                 manager_selector: ManagerSelector,
                 run_id: str,
+                 _check_python_mismatch: bool,
                 ) -> None:
        """
        Parameters
@@ -99,6 +99,11 @@ class Interchange:
 
        cert_dir : str | None
            Path to the certificate directory.
+
+        _check_python_mismatch : bool
+            If True, the interchange and worker managers must run the same version of
+            Python. Running different versions can cause inter-process communication
+            errors, so proceed with caution.
        """
        self.cert_dir = cert_dir
        self.logdir = logdir
@@ -126,15 +131,13 @@ class Interchange:
        logger.info("Connected to client")
 
        self.run_id = run_id
+        self._check_python_mismatch = _check_python_mismatch
 
        self.hub_address = hub_address
        self.hub_zmq_port = hub_zmq_port
 
        self.pending_task_queue: SortedList[Any] = SortedList(key=lambda tup: (tup[0], tup[1]))
 
-        # count of tasks that have been received from the submit side
-        self.task_counter = 0
-
        # count of tasks that have been sent out to worker pools
        self.count = 0
 
@@ -157,6 +160,7 @@ class Interchange:
        logger.info(f"Bound to port {worker_port} for incoming worker connections")
 
        self._ready_managers: Dict[bytes, ManagerRecord] = {}
+        self._logged_manager_count_token: object = None
        self.connected_block_history: List[str] = []
 
        self.heartbeat_threshold = heartbeat_threshold
@@ -213,7 +217,7 @@ class Interchange:
 
        reply: Any  # the type of reply depends on the command_req received (aka this needs dependent types...)
 
-        if self.command_channel in self.socks and self.socks[self.command_channel] == zmq.POLLIN:
+        if self.socks.get(self.command_channel) == zmq.POLLIN:
            logger.debug("entering command_server section")
 
            command_req = self.command_channel.recv_pyobj()
@@ -310,6 +314,7 @@ class Interchange:
            self.process_manager_socket_message(interesting_managers, monitoring_radio, kill_event)
            self.expire_bad_managers(interesting_managers, monitoring_radio)
            self.expire_drained_managers(interesting_managers, monitoring_radio)
+            self.log_manager_counts(interesting_managers)
            self.process_tasks_to_send(interesting_managers, monitoring_radio)
 
        self.zmq_context.destroy()
@@ -321,20 +326,20 @@ class Interchange:
        """Process incoming task message(s).
        """
 
-        if self.task_incoming in self.socks and self.socks[self.task_incoming] == zmq.POLLIN:
+        if self.socks.get(self.task_incoming) == zmq.POLLIN:
            logger.debug("start task_incoming section")
            msg = self.task_incoming.recv_pyobj()
 
            # Process priority, higher number = lower priority
-            resource_spec = msg.get('resource_spec', {})
+            task_id = msg['task_id']
+            resource_spec = msg['context'].get('resource_spec', {})
            priority = resource_spec.get('priority', float('inf'))
-            queue_entry = (-priority, -self.task_counter, msg)
+            queue_entry = (-priority, -task_id, msg)
 
-            logger.debug("putting message onto pending_task_queue")
+            logger.debug("Putting task %s onto pending_task_queue", task_id)
 
            self.pending_task_queue.add(queue_entry)
-            self.task_counter += 1
-            logger.debug(f"Fetched {self.task_counter} tasks so far")
+            logger.debug("Put task %s onto pending_task_queue", task_id)
 
    def process_manager_socket_message(
            self,
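An ordering sketch for these queue entries, using the same key as the `SortedList` above. Taking items from the tail of the sorted list yields numerically lower `priority` values first (per the comment above, a higher number means lower priority) and, within one priority level, lower task ids (FIFO); this assumes the interchange dispatches from the tail, which this hunk does not itself show:

    from sortedcontainers import SortedList

    q = SortedList(key=lambda tup: (tup[0], tup[1]))
    q.add((-5, -1, "task 1, priority 5"))
    q.add((-1, -2, "task 2, priority 1"))
    q.add((-1, -3, "task 3, priority 1"))

    assert q.pop()[2] == "task 2, priority 1"  # best priority, oldest task first
    assert q.pop()[2] == "task 3, priority 1"
    assert q.pop()[2] == "task 1, priority 5"  # worst priority drains last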
@@ -354,9 +359,10 @@ class Interchange:
            mtype = meta['type']
        except Exception as e:
            logger.warning(
-                f'Failed to read manager message ([{type(e).__name__}] {e})'
+                'Failed to read manager message; ignoring message'
+                f' (Exception: [{type(e).__name__}] {e})'
            )
-            logger.debug('Message:\n %r\n', msg_parts, exc_info=e)
+            logger.debug('Raw message bytes:\n %r\n', msg_parts, exc_info=e)
            return
 
        logger.debug(
@@ -396,7 +402,9 @@ class Interchange:
            logger.info(f'Registration info for manager {manager_id!r}: {meta}')
            self._send_monitoring_info(monitoring_radio, new_rec)
 
-            if (mgr_minor_py, mgr_parsl_v) != (ix_minor_py, ix_parsl_v):
+            python_mismatch: bool = ix_minor_py != mgr_minor_py
+            parsl_mismatch: bool = ix_parsl_v != mgr_parsl_v
+            if parsl_mismatch or (self._check_python_mismatch and python_mismatch):
                kill_event.set()
                vm_exc = VersionMismatch(
                    f"py.v={ix_minor_py} parsl.v={ix_parsl_v}",
@@ -517,15 +525,24 @@ class Interchange:
            m['active'] = False
            self._send_monitoring_info(monitoring_radio, m)
 
+    def log_manager_counts(self, interesting_managers: Set[bytes]) -> None:
+        count_interesting = len(interesting_managers)
+        count_ready = len(self._ready_managers)
+
+        new_logged_manager_count_token = (count_interesting, count_ready)
+
+        if self._logged_manager_count_token != new_logged_manager_count_token:
+
+            logger.debug(
+                "Managers count (interesting/total): %d/%d",
+                count_interesting,
+                count_ready
+            )
+            self._logged_manager_count_token = new_logged_manager_count_token
+
    def process_tasks_to_send(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
        # Check if there are tasks that could be sent to managers
 
-        logger.debug(
-            "Managers count (interesting/total): %d/%d",
-            len(interesting_managers),
-            len(self._ready_managers)
-        )
-
        if interesting_managers and self.pending_task_queue:
            shuffled_managers = self.manager_selector.sort_managers(self._ready_managers, interesting_managers)
 
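The pattern above in miniature, hedged as a generic restatement rather than the interchange's exact code: remember the last-logged token and emit only when it changes, so a busy poll loop no longer writes one debug line per iteration:

    import logging

    logger = logging.getLogger(__name__)
    _last_token = None

    def log_on_change(count_interesting: int, count_ready: int) -> None:
        global _last_token
        token = (count_interesting, count_ready)
        if token != _last_token:
            logger.debug("Managers count (interesting/total): %d/%d", *token)
            _last_token = token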
@@ -618,6 +635,8 @@ def start_file_logger(filename: str, level: int = logging.DEBUG, format_string:
 
 
 if __name__ == "__main__":
+    from parsl.utils import setproctitle
+
    setproctitle("parsl: HTEX interchange")
 
    config = pickle.load(sys.stdin.buffer)
parsl/executors/high_throughput/mpi_executor.py CHANGED
@@ -16,7 +16,6 @@ from parsl.executors.status_handling import BlockProviderExecutor
 from parsl.jobs.states import JobStatus
 from parsl.launchers import SimpleLauncher
 from parsl.monitoring.radios.base import RadioConfig
-from parsl.providers import LocalProvider
 from parsl.providers.base import ExecutionProvider
 
 
@@ -47,7 +46,7 @@ class MPIExecutor(HighThroughputExecutor):
    @typeguard.typechecked
    def __init__(self,
                 label: str = 'MPIExecutor',
-                 provider: ExecutionProvider = LocalProvider(),
+                 provider: Optional[ExecutionProvider] = None,
                 launch_cmd: Optional[str] = None,
                 interchange_launch_cmd: Optional[str] = None,
                 address: Optional[str] = None,