parsl 2025.8.4__py3-none-any.whl → 2025.11.10__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (85)
  1. parsl/__init__.py +0 -4
  2. parsl/app/bash.py +1 -1
  3. parsl/benchmark/perf.py +73 -17
  4. parsl/concurrent/__init__.py +95 -14
  5. parsl/curvezmq.py +0 -16
  6. parsl/data_provider/globus.py +3 -1
  7. parsl/dataflow/dflow.py +107 -207
  8. parsl/dataflow/memoization.py +144 -31
  9. parsl/dataflow/states.py +5 -5
  10. parsl/executors/base.py +2 -2
  11. parsl/executors/execute_task.py +2 -8
  12. parsl/executors/flux/executor.py +4 -6
  13. parsl/executors/globus_compute.py +0 -4
  14. parsl/executors/high_throughput/executor.py +86 -25
  15. parsl/executors/high_throughput/interchange.py +55 -42
  16. parsl/executors/high_throughput/mpi_executor.py +1 -2
  17. parsl/executors/high_throughput/mpi_resource_management.py +7 -14
  18. parsl/executors/high_throughput/process_worker_pool.py +32 -7
  19. parsl/executors/high_throughput/zmq_pipes.py +36 -67
  20. parsl/executors/radical/executor.py +2 -6
  21. parsl/executors/radical/rpex_worker.py +2 -2
  22. parsl/executors/taskvine/executor.py +5 -1
  23. parsl/executors/threads.py +5 -2
  24. parsl/jobs/states.py +2 -2
  25. parsl/jobs/strategy.py +7 -6
  26. parsl/monitoring/db_manager.py +21 -23
  27. parsl/monitoring/monitoring.py +2 -2
  28. parsl/monitoring/radios/filesystem.py +2 -1
  29. parsl/monitoring/radios/htex.py +2 -1
  30. parsl/monitoring/radios/multiprocessing.py +2 -1
  31. parsl/monitoring/radios/udp.py +2 -1
  32. parsl/monitoring/radios/udp_router.py +2 -2
  33. parsl/monitoring/radios/zmq_router.py +2 -2
  34. parsl/multiprocessing.py +0 -49
  35. parsl/providers/base.py +24 -37
  36. parsl/providers/pbspro/pbspro.py +1 -1
  37. parsl/serialize/__init__.py +6 -9
  38. parsl/serialize/facade.py +0 -32
  39. parsl/tests/configs/local_threads_globus.py +18 -14
  40. parsl/tests/configs/taskvine_ex.py +1 -1
  41. parsl/tests/manual_tests/test_memory_limits.py +1 -1
  42. parsl/tests/sites/test_concurrent.py +51 -3
  43. parsl/tests/test_checkpointing/test_periodic.py +15 -9
  44. parsl/tests/test_checkpointing/test_python_checkpoint_1.py +6 -3
  45. parsl/tests/test_checkpointing/test_regression_233.py +0 -1
  46. parsl/tests/test_curvezmq.py +0 -42
  47. parsl/tests/test_execute_task.py +2 -11
  48. parsl/tests/test_htex/test_command_concurrency_regression_1321.py +54 -0
  49. parsl/tests/test_htex/test_htex.py +36 -1
  50. parsl/tests/test_htex/test_interchange_exit_bad_registration.py +2 -2
  51. parsl/tests/test_htex/test_priority_queue.py +26 -3
  52. parsl/tests/test_htex/test_zmq_binding.py +2 -1
  53. parsl/tests/test_mpi_apps/test_mpi_scheduler.py +18 -43
  54. parsl/tests/test_python_apps/test_basic.py +0 -14
  55. parsl/tests/test_python_apps/test_depfail_propagation.py +11 -1
  56. parsl/tests/test_python_apps/test_exception.py +19 -0
  57. parsl/tests/test_python_apps/test_garbage_collect.py +1 -6
  58. parsl/tests/test_python_apps/test_memoize_2.py +11 -1
  59. parsl/tests/test_python_apps/test_memoize_exception.py +41 -0
  60. parsl/tests/test_regression/test_3874.py +47 -0
  61. parsl/tests/test_scaling/test_regression_3696_oscillation.py +1 -0
  62. parsl/tests/test_staging/test_staging_globus.py +2 -2
  63. parsl/tests/test_utils/test_representation_mixin.py +53 -0
  64. parsl/tests/unit/test_globus_compute_executor.py +11 -2
  65. parsl/utils.py +11 -3
  66. parsl/version.py +1 -1
  67. {parsl-2025.8.4.data → parsl-2025.11.10.data}/scripts/interchange.py +55 -42
  68. {parsl-2025.8.4.data → parsl-2025.11.10.data}/scripts/process_worker_pool.py +32 -7
  69. {parsl-2025.8.4.dist-info → parsl-2025.11.10.dist-info}/METADATA +64 -50
  70. {parsl-2025.8.4.dist-info → parsl-2025.11.10.dist-info}/RECORD +76 -81
  71. {parsl-2025.8.4.dist-info → parsl-2025.11.10.dist-info}/WHEEL +1 -1
  72. parsl/tests/configs/local_threads_checkpoint_periodic.py +0 -11
  73. parsl/tests/configs/local_threads_no_cache.py +0 -11
  74. parsl/tests/site_tests/test_provider.py +0 -88
  75. parsl/tests/site_tests/test_site.py +0 -70
  76. parsl/tests/test_aalst_patterns.py +0 -474
  77. parsl/tests/test_docs/test_workflow2.py +0 -42
  78. parsl/tests/test_error_handling/test_rand_fail.py +0 -171
  79. parsl/tests/test_regression/test_854.py +0 -62
  80. parsl/tests/test_serialization/test_pack_resource_spec.py +0 -23
  81. {parsl-2025.8.4.data → parsl-2025.11.10.data}/scripts/exec_parsl_function.py +0 -0
  82. {parsl-2025.8.4.data → parsl-2025.11.10.data}/scripts/parsl_coprocess.py +0 -0
  83. {parsl-2025.8.4.dist-info → parsl-2025.11.10.dist-info}/entry_points.txt +0 -0
  84. {parsl-2025.8.4.dist-info → parsl-2025.11.10.dist-info/licenses}/LICENSE +0 -0
  85. {parsl-2025.8.4.dist-info → parsl-2025.11.10.dist-info}/top_level.txt +0 -0
parsl/executors/high_throughput/interchange.py CHANGED
@@ -23,7 +23,6 @@ from parsl.monitoring.radios.base import MonitoringRadioSender
 from parsl.monitoring.radios.zmq import ZMQRadioSender
 from parsl.process_loggers import wrap_with_logs
 from parsl.serialize import serialize as serialize_object
-from parsl.utils import setproctitle
 from parsl.version import VERSION as PARSL_VERSION

 PKL_HEARTBEAT_CODE = pickle.dumps((2 ** 32) - 1)
@@ -56,6 +55,7 @@ class Interchange:
                  cert_dir: Optional[str],
                  manager_selector: ManagerSelector,
                  run_id: str,
+                 _check_python_mismatch: bool,
                  ) -> None:
         """
         Parameters
@@ -99,6 +99,11 @@

         cert_dir : str | None
             Path to the certificate directory.
+
+        _check_python_mismatch : bool
+            If True, the interchange and worker managers must run the same version of
+            Python. Running different versions can cause inter-process communication
+            errors, so proceed with caution.
         """
         self.cert_dir = cert_dir
         self.logdir = logdir
@@ -126,15 +131,13 @@
         logger.info("Connected to client")

         self.run_id = run_id
+        self._check_python_mismatch = _check_python_mismatch

         self.hub_address = hub_address
         self.hub_zmq_port = hub_zmq_port

         self.pending_task_queue: SortedList[Any] = SortedList(key=lambda tup: (tup[0], tup[1]))

-        # count of tasks that have been received from the submit side
-        self.task_counter = 0
-
         # count of tasks that have been sent out to worker pools
         self.count = 0

@@ -157,6 +160,7 @@
         logger.info(f"Bound to port {worker_port} for incoming worker connections")

         self._ready_managers: Dict[bytes, ManagerRecord] = {}
+        self._logged_manager_count_token: object = None
         self.connected_block_history: List[str] = []

         self.heartbeat_threshold = heartbeat_threshold
@@ -213,7 +217,7 @@

         reply: Any  # the type of reply depends on the command_req received (aka this needs dependent types...)

-        if self.command_channel in self.socks and self.socks[self.command_channel] == zmq.POLLIN:
+        if self.socks.get(self.command_channel) == zmq.POLLIN:
             logger.debug("entering command_server section")

             command_req = self.command_channel.recv_pyobj()
@@ -222,35 +226,29 @@
                 reply = self.connected_block_history

             elif command_req == "WORKERS":
-                num_workers = 0
-                for manager in self._ready_managers.values():
-                    num_workers += manager['worker_count']
-                reply = num_workers
+                reply = sum(m['worker_count'] for m in self._ready_managers.values())

             elif command_req == "MANAGERS":
                 reply = []
-                for manager_id in self._ready_managers:
-                    m = self._ready_managers[manager_id]
-                    idle_since = m['idle_since']
-                    if idle_since is not None:
-                        idle_duration = time.time() - idle_since
-                    else:
-                        idle_duration = 0.0
-                    resp = {'manager': manager_id.decode('utf-8'),
-                            'block_id': m['block_id'],
-                            'worker_count': m['worker_count'],
-                            'tasks': len(m['tasks']),
-                            'idle_duration': idle_duration,
-                            'active': m['active'],
-                            'parsl_version': m['parsl_version'],
-                            'python_version': m['python_version'],
-                            'draining': m['draining']}
+                now = time.time()
+                for manager_id, m in self._ready_managers.items():
+                    idle_duration = now - (m['idle_since'] or now)
+                    resp = {
+                        'manager': manager_id.decode('utf-8'),
+                        'block_id': m['block_id'],
+                        'worker_count': m['worker_count'],
+                        'tasks': len(m['tasks']),
+                        'idle_duration': idle_duration,
+                        'active': m['active'],
+                        'parsl_version': m['parsl_version'],
+                        'python_version': m['python_version'],
+                        'draining': m['draining']
+                    }
                     reply.append(resp)

             elif command_req == "MANAGERS_PACKAGES":
                 reply = {}
-                for manager_id in self._ready_managers:
-                    m = self._ready_managers[manager_id]
+                for manager_id, m in self._ready_managers.items():
                     manager_id_str = manager_id.decode('utf-8')
                     reply[manager_id_str] = m["packages"]

@@ -316,6 +314,7 @@
             self.process_manager_socket_message(interesting_managers, monitoring_radio, kill_event)
             self.expire_bad_managers(interesting_managers, monitoring_radio)
             self.expire_drained_managers(interesting_managers, monitoring_radio)
+            self.log_manager_counts(interesting_managers)
             self.process_tasks_to_send(interesting_managers, monitoring_radio)

         self.zmq_context.destroy()
@@ -327,20 +326,20 @@
         """Process incoming task message(s).
         """

-        if self.task_incoming in self.socks and self.socks[self.task_incoming] == zmq.POLLIN:
+        if self.socks.get(self.task_incoming) == zmq.POLLIN:
             logger.debug("start task_incoming section")
             msg = self.task_incoming.recv_pyobj()

             # Process priority, higher number = lower priority
-            resource_spec = msg.get('resource_spec', {})
+            task_id = msg['task_id']
+            resource_spec = msg['context'].get('resource_spec', {})
             priority = resource_spec.get('priority', float('inf'))
-            queue_entry = (-priority, -self.task_counter, msg)
+            queue_entry = (-priority, -task_id, msg)

-            logger.debug("putting message onto pending_task_queue")
+            logger.debug("Putting task %s onto pending_task_queue", task_id)

             self.pending_task_queue.add(queue_entry)
-            self.task_counter += 1
-            logger.debug(f"Fetched {self.task_counter} tasks so far")
+            logger.debug("Put task %s onto pending_task_queue", task_id)

     def process_manager_socket_message(
             self,
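In the task_incoming hunk above, queue ordering now uses the submit-side task_id as the tie-breaker rather than an interchange-local task_counter. The sketch below shows how entries keyed as (-priority, -task_id, msg) behave in a sortedcontainers.SortedList. It is illustrative only (not parsl code) and assumes tasks are drained from the high end of the list, which is what the negated keys suggest.

    # Illustrative only: ordering of interchange-style queue entries, where a
    # smaller priority number means a more urgent task and float('inf') means
    # no explicit priority was given.
    from sortedcontainers import SortedList

    pending = SortedList(key=lambda tup: (tup[0], tup[1]))

    for task_id, priority in [(0, 10), (1, 1), (2, 1), (3, float('inf'))]:
        pending.add((-priority, -task_id, {'task_id': task_id}))

    while pending:
        neg_priority, neg_task_id, msg = pending.pop()   # pop() takes the largest key
        print(msg['task_id'], -neg_priority)
    # -> "1 1", "2 1", "0 10", "3 inf": urgent tasks first, FIFO within a priority.

With this keying, tasks that share a priority keep their submission order, and tasks with no explicit priority sort behind everything else.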
@@ -360,9 +359,10 @@
             mtype = meta['type']
         except Exception as e:
             logger.warning(
-                f'Failed to read manager message ([{type(e).__name__}] {e})'
+                'Failed to read manager message; ignoring message'
+                f' (Exception: [{type(e).__name__}] {e})'
             )
-            logger.debug('Message:\n %r\n', msg_parts, exc_info=e)
+            logger.debug('Raw message bytes:\n %r\n', msg_parts, exc_info=e)
             return

         logger.debug(
@@ -402,7 +402,9 @@
             logger.info(f'Registration info for manager {manager_id!r}: {meta}')
             self._send_monitoring_info(monitoring_radio, new_rec)

-            if (mgr_minor_py, mgr_parsl_v) != (ix_minor_py, ix_parsl_v):
+            python_mismatch: bool = ix_minor_py != mgr_minor_py
+            parsl_mismatch: bool = ix_parsl_v != mgr_parsl_v
+            if parsl_mismatch or (self._check_python_mismatch and python_mismatch):
                 kill_event.set()
                 vm_exc = VersionMismatch(
                     f"py.v={ix_minor_py} parsl.v={ix_parsl_v}",
@@ -523,15 +525,24 @@
                 m['active'] = False
                 self._send_monitoring_info(monitoring_radio, m)

+    def log_manager_counts(self, interesting_managers: Set[bytes]) -> None:
+        count_interesting = len(interesting_managers)
+        count_ready = len(self._ready_managers)
+
+        new_logged_manager_count_token = (count_interesting, count_ready)
+
+        if self._logged_manager_count_token != new_logged_manager_count_token:
+
+            logger.debug(
+                "Managers count (interesting/total): %d/%d",
+                count_interesting,
+                count_ready
+            )
+            self._logged_manager_count_token = new_logged_manager_count_token
+
     def process_tasks_to_send(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
         # Check if there are tasks that could be sent to managers

-        logger.debug(
-            "Managers count (interesting/total): %d/%d",
-            len(interesting_managers),
-            len(self._ready_managers)
-        )
-
         if interesting_managers and self.pending_task_queue:
             shuffled_managers = self.manager_selector.sort_managers(self._ready_managers, interesting_managers)

@@ -624,6 +635,8 @@ def start_file_logger(filename: str, level: int = logging.DEBUG, format_string:


 if __name__ == "__main__":
+    from parsl.utils import setproctitle
+
     setproctitle("parsl: HTEX interchange")

     config = pickle.load(sys.stdin.buffer)
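The new log_manager_counts method above replaces a debug line that previously ran on every pass through process_tasks_to_send; it only emits the message when the (interesting, total) pair actually changes. A generic sketch of that change-token pattern, using hypothetical names (not parsl code):

    import logging

    logger = logging.getLogger(__name__)

    class ChangeLogger:
        """Log a pair of counts only when they differ from the last pair logged."""

        def __init__(self) -> None:
            self._last_token: object = None

        def log_counts(self, interesting: int, total: int) -> None:
            token = (interesting, total)
            if token != self._last_token:
                logger.debug("Managers count (interesting/total): %d/%d", interesting, total)
                self._last_token = token

    # cl = ChangeLogger(); calling cl.log_counts(2, 5) repeatedly logs only once
    # until the counts change, which keeps a hot loop's log output quiet.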
parsl/executors/high_throughput/mpi_executor.py CHANGED
@@ -16,7 +16,6 @@ from parsl.executors.status_handling import BlockProviderExecutor
 from parsl.jobs.states import JobStatus
 from parsl.launchers import SimpleLauncher
 from parsl.monitoring.radios.base import RadioConfig
-from parsl.providers import LocalProvider
 from parsl.providers.base import ExecutionProvider


@@ -47,7 +46,7 @@ class MPIExecutor(HighThroughputExecutor):
     @typeguard.typechecked
     def __init__(self,
                  label: str = 'MPIExecutor',
-                 provider: ExecutionProvider = LocalProvider(),
+                 provider: Optional[ExecutionProvider] = None,
                  launch_cmd: Optional[str] = None,
                  interchange_launch_cmd: Optional[str] = None,
                  address: Optional[str] = None,
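Both this MPIExecutor change and the TaskVineExecutor change further down drop a LocalProvider() instance from the keyword defaults. A default argument such as provider=LocalProvider() is evaluated once, when the class body is defined, so every executor constructed without an explicit provider would share the same provider object. One common fix, sketched with hypothetical names (not parsl code), is to default to None and build the object per instance:

    from typing import Optional


    class Resource:
        """Stand-in for a provider-like object with per-instance state."""

        def __init__(self) -> None:
            self.blocks: list = []


    class Executor:
        # Anti-pattern: `resource: Resource = Resource()` as a keyword default
        # would create ONE shared Resource, mutated by every Executor instance.
        def __init__(self, resource: Optional[Resource] = None) -> None:
            # Defaulting to None and constructing inside __init__ gives each
            # instance its own Resource unless the caller supplies one.
            self.resource = resource if resource is not None else Resource()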
parsl/executors/high_throughput/mpi_resource_management.py CHANGED
@@ -9,7 +9,6 @@ from enum import Enum
 from typing import Dict, List, Optional

 from parsl.multiprocessing import SpawnContext
-from parsl.serialize import pack_res_spec_apply_message, unpack_res_spec_apply_message

 logger = logging.getLogger(__name__)

@@ -146,11 +145,11 @@ class MPITaskScheduler(TaskScheduler):
         )
         acquired_nodes = []
         with self._free_node_counter.get_lock():
-            if num_nodes <= self._free_node_counter.value:  # type: ignore[attr-defined]
-                self._free_node_counter.value -= num_nodes  # type: ignore[attr-defined]
+            if num_nodes <= self._free_node_counter.value:
+                self._free_node_counter.value -= num_nodes
             else:
                 raise MPINodesUnavailable(
-                    requested=num_nodes, available=self._free_node_counter.value  # type: ignore[attr-defined]
+                    requested=num_nodes, available=self._free_node_counter.value
                 )

         for i in range(num_nodes):
@@ -163,17 +162,14 @@ class MPITaskScheduler(TaskScheduler):
         for node in nodes:
             self.nodes_q.put(node)
         with self._free_node_counter.get_lock():
-            self._free_node_counter.value += len(nodes)  # type: ignore[attr-defined]
+            self._free_node_counter.value += len(nodes)

     def put_task(self, task_package: dict):
         """Schedule task if resources are available otherwise backlog the task"""
-        user_ns = locals()
-        user_ns.update({"__builtins__": __builtins__})
-        _f, _args, _kwargs, resource_spec = unpack_res_spec_apply_message(task_package["buffer"])
+        resource_spec = task_package.get("context", {}).get("resource_spec", {})

-        nodes_needed = resource_spec.get("num_nodes")
-        tid = task_package["task_id"]
-        if nodes_needed:
+        if nodes_needed := resource_spec.get("num_nodes"):
+            tid = task_package["task_id"]
             try:
                 allocated_nodes = self._get_nodes(nodes_needed)
             except MPINodesUnavailable:
@@ -183,9 +179,6 @@ class MPITaskScheduler(TaskScheduler):
             else:
                 resource_spec["MPI_NODELIST"] = ",".join(allocated_nodes)
                 self._map_tasks_to_nodes[tid] = allocated_nodes
-                buffer = pack_res_spec_apply_message(_f, _args, _kwargs, resource_spec)
-                task_package["buffer"] = buffer
-                task_package["resource_spec"] = resource_spec

         self.pending_task_q.put(task_package)
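The free-node accounting above is built on a multiprocessing.Value whose built-in lock (get_lock()) guards the check-and-decrement; the removed # type: ignore comments only silenced the type checker on the .value attribute. A self-contained sketch of that counter pattern, with hypothetical names (not parsl code):

    import multiprocessing


    class NodeCounter:
        """Track how many nodes are free, safely across processes."""

        def __init__(self, total_nodes: int) -> None:
            # 'i' = C int; Value carries its own lock, exposed via get_lock().
            self._free = multiprocessing.Value('i', total_nodes)

        def acquire(self, num_nodes: int) -> bool:
            with self._free.get_lock():
                if num_nodes <= self._free.value:
                    self._free.value -= num_nodes
                    return True
                return False

        def release(self, num_nodes: int) -> None:
            with self._free.get_lock():
                self._free.value += num_nodes


    if __name__ == "__main__":
        counter = NodeCounter(total_nodes=4)
        assert counter.acquire(3)
        assert not counter.acquire(2)   # only 1 node left
        counter.release(3)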
 
parsl/executors/high_throughput/process_worker_pool.py CHANGED
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3

 import argparse
+import importlib
 import logging
 import math
 import multiprocessing
@@ -17,7 +18,7 @@ from importlib.metadata import distributions
 from multiprocessing.context import SpawnProcess
 from multiprocessing.managers import DictProxy
 from multiprocessing.sharedctypes import Synchronized
-from typing import Dict, List, Optional, Sequence
+from typing import Callable, Dict, List, Optional, Sequence

 import psutil
 import zmq
@@ -348,7 +349,7 @@ class Manager:

         logger.debug(
             'ready workers: %d, pending tasks: %d',
-            self.ready_worker_count.value,  # type: ignore[attr-defined]
+            self.ready_worker_count.value,
             pending_task_count,
         )

@@ -373,10 +374,12 @@
             if socks.get(ix_sock) == zmq.POLLIN:
                 pkl_msg = ix_sock.recv()
                 tasks = pickle.loads(pkl_msg)
+                del pkl_msg
+
                 last_interchange_contact = time.time()

                 if tasks == HEARTBEAT_CODE:
-                    logger.debug("Got heartbeat from interchange")
+                    logger.debug("Got heartbeat response from interchange")
                 elif tasks == DRAINED_CODE:
                     logger.info("Got fully drained message from interchange - setting kill flag")
                     self._stop_event.set()
@@ -454,6 +457,7 @@
                                       'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
                     pkl_package = pickle.dumps(result_package)
                     self.pending_result_queue.put(pkl_package)
+                    del pkl_package
                 except KeyError:
                     logger.info("Worker {} was not busy when it died".format(worker_id))

@@ -603,6 +607,10 @@ def update_resource_spec_env_vars(mpi_launcher: str, resource_spec: Dict, node_i


 def _init_mpi_env(mpi_launcher: str, resource_spec: Dict):
+    for varname in resource_spec:
+        envname = "PARSL_" + str(varname).upper()
+        os.environ[envname] = str(resource_spec[varname])
+
     node_list = resource_spec.get("MPI_NODELIST")
     if node_list is None:
         return
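The new loop at the top of _init_mpi_env exports each resource_spec entry into the task's environment as a PARSL_-prefixed variable. A standalone sketch of that transformation (illustrative only; the example keys are hypothetical):

    import os
    from typing import Dict


    def export_resource_spec(resource_spec: Dict) -> None:
        """Mirror a resource specification into PARSL_* environment variables."""
        for varname, value in resource_spec.items():
            os.environ["PARSL_" + str(varname).upper()] = str(value)


    export_resource_spec({"num_nodes": 2, "ranks_per_node": 4})
    # The environment now contains PARSL_NUM_NODES=2 and PARSL_RANKS_PER_NODE=4.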
@@ -753,8 +761,8 @@ def worker(
             worker_enqueued = True

         try:
-            # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
             req = task_queue.get(timeout=task_queue_timeout)
+            # req is {'task_id':<tid>, 'buffer':<buf>, 'resource_spec':<dict>}
         except queue.Empty:
             continue

@@ -766,17 +774,33 @@
                 ready_worker_count.value -= 1
                 worker_enqueued = False

-        _init_mpi_env(mpi_launcher=mpi_launcher, resource_spec=req["resource_spec"])
+        ctxt = req["context"]
+        res_spec = ctxt.get("resource_spec", {})
+
+        _init_mpi_env(mpi_launcher=mpi_launcher, resource_spec=res_spec)
+
+        exec_func: Callable = execute_task
+        exec_args = ()
+        exec_kwargs = {}

         try:
-            result = execute_task(req['buffer'])
+            if task_executor := ctxt.get("task_executor", None):
+                mod_name, _, fn_name = task_executor["f"].rpartition(".")
+                exec_mod = importlib.import_module(mod_name)
+                exec_func = getattr(exec_mod, fn_name)
+
+                exec_args = task_executor.get("a", ())
+                exec_kwargs = task_executor.get("k", {})
+
+            result = exec_func(req['buffer'], *exec_args, **exec_kwargs)
             serialized_result = serialize(result, buffer_threshold=1000000)
         except Exception as e:
             logger.info('Caught an exception: {}'.format(e))
             result_package = {'type': 'result', 'task_id': tid, 'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
         else:
             result_package = {'type': 'result', 'task_id': tid, 'result': serialized_result}
-            # logger.debug("Result: {}".format(result))
+            del serialized_result
+        del req

         logger.info("Completed executor task {}".format(tid))
         try:
@@ -788,6 +812,7 @@
             })

             result_queue.put(pkl_package)
+            del pkl_package, result_package
             tasks_in_progress.pop(worker_id)
             logger.info("All processing finished for executor task {}".format(tid))
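The worker now honours an optional task_executor entry in the task context: a dotted callable path under "f", plus extra positional and keyword arguments under "a" and "k", falling back to the built-in execute_task otherwise. A minimal sketch of that dispatch, using a hypothetical payload and a stand-in default (not parsl code):

    import importlib
    from typing import Any, Callable, Dict


    def default_executor(buffer: bytes) -> Any:
        """Stand-in for the worker's built-in execute_task."""
        return len(buffer)


    def resolve_executor(context: Dict) -> tuple:
        """Return (callable, args, kwargs) described by context['task_executor']."""
        exec_func: Callable = default_executor
        exec_args: tuple = ()
        exec_kwargs: dict = {}

        if task_executor := context.get("task_executor"):
            mod_name, _, fn_name = task_executor["f"].rpartition(".")
            exec_func = getattr(importlib.import_module(mod_name), fn_name)
            exec_args = tuple(task_executor.get("a", ()))
            exec_kwargs = dict(task_executor.get("k", {}))

        return exec_func, exec_args, exec_kwargs


    # Example: dispatch to the standard-library callable "os.path.basename".
    func, args, kwargs = resolve_executor({"task_executor": {"f": "os.path.basename"}})
    print(func("/tmp/task.pkl", *args, **kwargs))   # -> task.pkl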
 
parsl/executors/high_throughput/zmq_pipes.py CHANGED
@@ -74,51 +74,37 @@ class CommandClient:

         reply = '__PARSL_ZMQ_PIPES_MAGIC__'
         with self._lock:
-            for _ in range(max_retries):
-                try:
-                    logger.debug("Sending command client command")
-
-                    if timeout_s is not None:
-                        remaining_time_s = start_time_s + timeout_s - time.monotonic()
-                        poll_result = self.zmq_socket.poll(timeout=remaining_time_s * 1000, flags=zmq.POLLOUT)
-                        if poll_result == zmq.POLLOUT:
-                            pass  # this is OK, so continue
-                        elif poll_result == 0:
-                            raise CommandClientTimeoutError("Waiting for command channel to be ready for a command")
-                        else:
-                            raise InternalConsistencyError(f"ZMQ poll returned unexpected value: {poll_result}")
-
-                    self.zmq_socket.send_pyobj(message, copy=True)
-
-                    if timeout_s is not None:
-                        logger.debug("Polling for command client response or timeout")
-                        remaining_time_s = start_time_s + timeout_s - time.monotonic()
-                        poll_result = self.zmq_socket.poll(timeout=remaining_time_s * 1000, flags=zmq.POLLIN)
-                        if poll_result == zmq.POLLIN:
-                            pass  # this is OK, so continue
-                        elif poll_result == 0:
-                            logger.error("Command timed-out - command client is now bad forever")
-                            self.ok = False
-                            raise CommandClientTimeoutError("Waiting for a reply from command channel")
-                        else:
-                            raise InternalConsistencyError(f"ZMQ poll returned unexpected value: {poll_result}")
-
-                    logger.debug("Receiving command client response")
-                    reply = self.zmq_socket.recv_pyobj()
-                    logger.debug("Received command client response")
-                except zmq.ZMQError:
-                    logger.exception("Potential ZMQ REQ-REP deadlock caught")
-                    logger.info("Trying to reestablish context")
-                    self.zmq_context.recreate()
-                    self.create_socket_and_bind()
+            logger.debug("Sending command client command")
+
+            if timeout_s is not None:
+                remaining_time_s = start_time_s + timeout_s - time.monotonic()
+                poll_result = self.zmq_socket.poll(timeout=remaining_time_s * 1000, flags=zmq.POLLOUT)
+                if poll_result == zmq.POLLOUT:
+                    pass  # this is OK, so continue
+                elif poll_result == 0:
+                    raise CommandClientTimeoutError("Waiting for command channel to be ready for a command")
                 else:
-                    break
-
-        if reply == '__PARSL_ZMQ_PIPES_MAGIC__':
-            logger.error("Command channel run retries exhausted. Unable to run command")
-            raise Exception("Command Channel retries exhausted")
+                    raise InternalConsistencyError(f"ZMQ poll returned unexpected value: {poll_result}")
+
+            self.zmq_socket.send_pyobj(message, copy=True)
+
+            if timeout_s is not None:
+                logger.debug("Polling for command client response or timeout")
+                remaining_time_s = start_time_s + timeout_s - time.monotonic()
+                poll_result = self.zmq_socket.poll(timeout=remaining_time_s * 1000, flags=zmq.POLLIN)
+                if poll_result == zmq.POLLIN:
+                    pass  # this is OK, so continue
+                elif poll_result == 0:
+                    logger.error("Command timed-out - command client is now bad forever")
+                    self.ok = False
+                    raise CommandClientTimeoutError("Waiting for a reply from command channel")
+                else:
+                    raise InternalConsistencyError(f"ZMQ poll returned unexpected value: {poll_result}")

-        return reply
+            logger.debug("Receiving command client response")
+            reply = self.zmq_socket.recv_pyobj()
+            logger.debug("Received command client response")
+            return reply

     def close(self):
         self.zmq_socket.close()
@@ -150,30 +136,18 @@ class TasksOutgoing:
         self.port = self.zmq_socket.bind_to_random_port(tcp_url(ip_address),
                                                         min_port=port_range[0],
                                                         max_port=port_range[1])
-        self.poller = zmq.Poller()
-        self.poller.register(self.zmq_socket, zmq.POLLOUT)

     def put(self, message):
         """ This function needs to be fast at the same time aware of the possibility of
         ZMQ pipes overflowing.

-        The timeout increases slowly if contention is detected on ZMQ pipes.
         We could set copy=False and get slightly better latency but this results
         in ZMQ sockets reaching a broken state once there are ~10k tasks in flight.
         This issue can be magnified if each the serialized buffer itself is larger.
         """
-        timeout_ms = 1
-        while True:
-            socks = dict(self.poller.poll(timeout=timeout_ms))
-            if self.zmq_socket in socks and socks[self.zmq_socket] == zmq.POLLOUT:
-                # The copy option adds latency but reduces the risk of ZMQ overflow
-                logger.debug("Sending TasksOutgoing message")
-                self.zmq_socket.send_pyobj(message, copy=True)
-                logger.debug("Sent TasksOutgoing message")
-                return
-            else:
-                timeout_ms *= 2
-                logger.debug("Not sending due to non-ready zmq pipe, timeout: {} ms".format(timeout_ms))
+        logger.debug("Sending TasksOutgoing message")
+        self.zmq_socket.send_pyobj(message)
+        logger.debug("Sent TasksOutgoing message")

     def close(self):
         self.zmq_socket.close()
@@ -206,20 +180,15 @@ class ResultsIncoming:
         self.port = self.results_receiver.bind_to_random_port(tcp_url(ip_address),
                                                               min_port=port_range[0],
                                                               max_port=port_range[1])
-        self.poller = zmq.Poller()
-        self.poller.register(self.results_receiver, zmq.POLLIN)

     def get(self, timeout_ms=None):
         """Get a message from the queue, returning None if timeout expires
         without a message. timeout is measured in milliseconds.
         """
-        socks = dict(self.poller.poll(timeout=timeout_ms))
-        if self.results_receiver in socks and socks[self.results_receiver] == zmq.POLLIN:
-            m = self.results_receiver.recv_multipart()
-            logger.debug("Received ResultsIncoming message")
-            return m
-        else:
-            return None
+        if zmq.POLLIN == self.results_receiver.poll(timeout_ms, zmq.POLLIN):
+            logger.debug("Receiving ResultsIncoming multipart message")
+            return self.results_receiver.recv_multipart()
+        return None

     def close(self):
         self.results_receiver.close()
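Both simplified classes above rely on pyzmq's per-socket poll() rather than a separately managed zmq.Poller: socket.poll(timeout, flags) returns the ready-event mask, or 0 if the timeout expires. A small PAIR-socket sketch of the receive-with-timeout pattern (illustrative only; parsl's actual socket types and endpoints differ):

    import zmq

    ctx = zmq.Context()
    sender = ctx.socket(zmq.PAIR)
    receiver = ctx.socket(zmq.PAIR)
    port = sender.bind_to_random_port("tcp://127.0.0.1")
    receiver.connect(f"tcp://127.0.0.1:{port}")


    def get(sock: zmq.Socket, timeout_ms: int):
        """Return the next multipart message, or None if nothing arrives in time."""
        if sock.poll(timeout_ms, zmq.POLLIN) == zmq.POLLIN:
            return sock.recv_multipart()
        return None


    print(get(receiver, timeout_ms=100))        # -> None, nothing sent yet
    sender.send_multipart([b"result", b"42"])
    print(get(receiver, timeout_ms=1000))       # -> [b'result', b'42']
    ctx.destroy()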
parsl/executors/radical/executor.py CHANGED
@@ -20,7 +20,7 @@ from parsl.app.errors import BashExitFailure, RemoteExceptionWrapper
 from parsl.app.python import timeout
 from parsl.data_provider.files import File
 from parsl.executors.base import ParslExecutor
-from parsl.serialize import deserialize, pack_res_spec_apply_message
+from parsl.serialize import deserialize, pack_apply_message
 from parsl.serialize.errors import DeserializationError, SerializationError
 from parsl.utils import RepresentationMixin

@@ -441,11 +441,7 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):

     def _pack_and_apply_message(self, func, args, kwargs):
         try:
-            buffer = pack_res_spec_apply_message(func,
-                                                 args,
-                                                 kwargs,
-                                                 resource_specification={},
-                                                 buffer_threshold=1024 * 1024)
+            buffer = pack_apply_message(func, args, kwargs, buffer_threshold=1 << 20)
             task_func = rp.utils.serialize_bson(buffer)
         except TypeError:
             raise SerializationError(func.__name__)
parsl/executors/radical/rpex_worker.py CHANGED
@@ -5,7 +5,7 @@ import radical.pilot as rp
 import parsl.app.errors as pe
 from parsl.app.bash import remote_side_bash_executor
 from parsl.executors.execute_task import execute_task
-from parsl.serialize import serialize, unpack_res_spec_apply_message
+from parsl.serialize import serialize, unpack_apply_message


 class ParslWorker:
@@ -33,7 +33,7 @@ class ParslWorker:

         try:
             buffer = rp.utils.deserialize_bson(task['description']['executable'])
-            func, args, kwargs, _resource_spec = unpack_res_spec_apply_message(buffer)
+            func, args, kwargs = unpack_apply_message(buffer)
             ret = remote_side_bash_executor(func, *args, **kwargs)
             exc = (None, None)
             val = None
parsl/executors/taskvine/executor.py CHANGED
@@ -107,13 +107,17 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
                  function_exec_mode: Union[Literal['regular'], Literal['serverless']] = 'regular',
                  manager_config: TaskVineManagerConfig = TaskVineManagerConfig(),
                  factory_config: TaskVineFactoryConfig = TaskVineFactoryConfig(),
-                 provider: Optional[ExecutionProvider] = LocalProvider(init_blocks=1),
+                 provider: Optional[ExecutionProvider] = None,
                  storage_access: Optional[List[Staging]] = None,
                  remote_monitoring_radio: Optional[RadioConfig] = None):

         # Set worker launch option for this executor
         if worker_launch_method == 'factory' or worker_launch_method == 'manual':
             provider = None
+        elif worker_launch_method == 'provider' and provider is None:
+            # provider method chosen, but no explicit provider supplied to __init__
+            # so default to LocalProvider
+            provider = LocalProvider(init_blocks=1)

         # Initialize the parent class with the execution provider and block error handling enabled.
         # If provider is None, then no worker is launched via the provider method.
parsl/executors/threads.py CHANGED
@@ -29,12 +29,15 @@ class ThreadPoolExecutor(ParslExecutor, RepresentationMixin):

     @typeguard.typechecked
     def __init__(self, label: str = 'threads', max_threads: Optional[int] = 2,
-                 thread_name_prefix: str = '', storage_access: Optional[List[Staging]] = None,
+                 thread_name_prefix: str | None = None, storage_access: Optional[List[Staging]] = None,
                  working_dir: Optional[str] = None, remote_monitoring_radio: Optional[RadioConfig] = None):
         ParslExecutor.__init__(self)
         self.label = label
         self.max_threads = max_threads
-        self.thread_name_prefix = thread_name_prefix
+        if thread_name_prefix is None:
+            self.thread_name_prefix = "ThreadPoolExecutor-" + label
+        else:
+            self.thread_name_prefix = thread_name_prefix

         # we allow storage_access to be None now, which means something else to [] now
         # None now means that a default storage access list will be used, while
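Deriving the default thread_name_prefix from the executor label makes worker threads identifiable in logs and stack dumps. The parameter mirrors thread_name_prefix on concurrent.futures.ThreadPoolExecutor, shown here standalone (not parsl code):

    import concurrent.futures
    import threading

    label = "threads"
    pool = concurrent.futures.ThreadPoolExecutor(
        max_workers=2,
        thread_name_prefix="ThreadPoolExecutor-" + label,
    )

    # Worker threads are named "ThreadPoolExecutor-threads_0", "..._1", etc.,
    # which shows up in %(threadName)s log fields and in thread dumps.
    print(pool.submit(lambda: threading.current_thread().name).result())
    pool.shutdown()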
parsl/jobs/states.py CHANGED
@@ -10,7 +10,7 @@ class JobState(IntEnum):
     """Defines a set of states that a job can be in"""

     UNKNOWN = 0
-    """The batch provider is unable to determinate a state for this job"""
+    """The batch provider is unable to determine a state for this job"""

     PENDING = 1
     """"This job is in the batch queue but has not started running"""
@@ -40,7 +40,7 @@
     """This job is held/suspended in the batch system"""

     MISSING = 8
-    """This job has reached a terminal state without the resources(managers/workers)
+    """This job has reached a terminal state without the resources (managers/workers)
     launched in the job connecting back to the Executor. This state is set by HTEX
     when it is able to infer that the block failed to start workers for eg due to
     bad worker environment or network connectivity issues.