parsl 2025.9.8__py3-none-any.whl → 2025.11.10__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.
Files changed (77)
  1. parsl/app/bash.py +1 -1
  2. parsl/benchmark/perf.py +73 -17
  3. parsl/concurrent/__init__.py +95 -14
  4. parsl/curvezmq.py +0 -16
  5. parsl/data_provider/globus.py +3 -1
  6. parsl/dataflow/dflow.py +106 -204
  7. parsl/dataflow/memoization.py +146 -19
  8. parsl/dataflow/states.py +5 -5
  9. parsl/executors/base.py +2 -2
  10. parsl/executors/execute_task.py +2 -8
  11. parsl/executors/flux/executor.py +4 -6
  12. parsl/executors/globus_compute.py +0 -4
  13. parsl/executors/high_throughput/executor.py +86 -24
  14. parsl/executors/high_throughput/interchange.py +39 -20
  15. parsl/executors/high_throughput/mpi_executor.py +1 -2
  16. parsl/executors/high_throughput/mpi_resource_management.py +7 -14
  17. parsl/executors/high_throughput/process_worker_pool.py +32 -7
  18. parsl/executors/high_throughput/zmq_pipes.py +36 -67
  19. parsl/executors/radical/executor.py +2 -6
  20. parsl/executors/radical/rpex_worker.py +2 -2
  21. parsl/executors/taskvine/executor.py +5 -1
  22. parsl/executors/threads.py +5 -2
  23. parsl/jobs/states.py +2 -2
  24. parsl/jobs/strategy.py +7 -6
  25. parsl/monitoring/monitoring.py +2 -2
  26. parsl/monitoring/radios/filesystem.py +2 -1
  27. parsl/monitoring/radios/htex.py +2 -1
  28. parsl/monitoring/radios/multiprocessing.py +2 -1
  29. parsl/monitoring/radios/udp.py +2 -1
  30. parsl/multiprocessing.py +0 -49
  31. parsl/providers/base.py +24 -37
  32. parsl/providers/pbspro/pbspro.py +1 -1
  33. parsl/serialize/__init__.py +6 -9
  34. parsl/serialize/facade.py +0 -32
  35. parsl/tests/configs/local_threads_globus.py +18 -14
  36. parsl/tests/configs/taskvine_ex.py +1 -1
  37. parsl/tests/sites/test_concurrent.py +51 -3
  38. parsl/tests/test_checkpointing/test_periodic.py +15 -9
  39. parsl/tests/test_checkpointing/test_regression_233.py +0 -1
  40. parsl/tests/test_curvezmq.py +0 -42
  41. parsl/tests/test_execute_task.py +2 -11
  42. parsl/tests/test_htex/test_command_concurrency_regression_1321.py +54 -0
  43. parsl/tests/test_htex/test_htex.py +36 -1
  44. parsl/tests/test_htex/test_interchange_exit_bad_registration.py +2 -2
  45. parsl/tests/test_htex/test_priority_queue.py +26 -3
  46. parsl/tests/test_htex/test_zmq_binding.py +2 -1
  47. parsl/tests/test_mpi_apps/test_mpi_scheduler.py +18 -43
  48. parsl/tests/test_python_apps/test_basic.py +0 -14
  49. parsl/tests/test_python_apps/test_depfail_propagation.py +11 -1
  50. parsl/tests/test_python_apps/test_exception.py +19 -0
  51. parsl/tests/test_python_apps/test_garbage_collect.py +1 -6
  52. parsl/tests/test_python_apps/test_memoize_2.py +11 -1
  53. parsl/tests/test_regression/test_3874.py +47 -0
  54. parsl/tests/test_scaling/test_regression_3696_oscillation.py +1 -0
  55. parsl/tests/test_staging/test_staging_globus.py +2 -2
  56. parsl/tests/unit/test_globus_compute_executor.py +11 -2
  57. parsl/utils.py +8 -3
  58. parsl/version.py +1 -1
  59. {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/interchange.py +39 -20
  60. {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/process_worker_pool.py +32 -7
  61. {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/METADATA +64 -50
  62. {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/RECORD +68 -74
  63. {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/WHEEL +1 -1
  64. parsl/tests/configs/local_threads_checkpoint_periodic.py +0 -11
  65. parsl/tests/configs/local_threads_no_cache.py +0 -11
  66. parsl/tests/site_tests/test_provider.py +0 -88
  67. parsl/tests/site_tests/test_site.py +0 -70
  68. parsl/tests/test_aalst_patterns.py +0 -474
  69. parsl/tests/test_docs/test_workflow2.py +0 -42
  70. parsl/tests/test_error_handling/test_rand_fail.py +0 -171
  71. parsl/tests/test_regression/test_854.py +0 -62
  72. parsl/tests/test_serialization/test_pack_resource_spec.py +0 -23
  73. {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/exec_parsl_function.py +0 -0
  74. {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/parsl_coprocess.py +0 -0
  75. {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/entry_points.txt +0 -0
  76. {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info/licenses}/LICENSE +0 -0
  77. {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/top_level.txt +0 -0
parsl/executors/high_throughput/mpi_resource_management.py CHANGED
@@ -9,7 +9,6 @@ from enum import Enum
 from typing import Dict, List, Optional
 
 from parsl.multiprocessing import SpawnContext
-from parsl.serialize import pack_res_spec_apply_message, unpack_res_spec_apply_message
 
 logger = logging.getLogger(__name__)
 
@@ -146,11 +145,11 @@ class MPITaskScheduler(TaskScheduler):
         )
         acquired_nodes = []
         with self._free_node_counter.get_lock():
-            if num_nodes <= self._free_node_counter.value:  # type: ignore[attr-defined]
-                self._free_node_counter.value -= num_nodes  # type: ignore[attr-defined]
+            if num_nodes <= self._free_node_counter.value:
+                self._free_node_counter.value -= num_nodes
             else:
                 raise MPINodesUnavailable(
-                    requested=num_nodes, available=self._free_node_counter.value  # type: ignore[attr-defined]
+                    requested=num_nodes, available=self._free_node_counter.value
                 )
 
         for i in range(num_nodes):
@@ -163,17 +162,14 @@ class MPITaskScheduler(TaskScheduler):
         for node in nodes:
             self.nodes_q.put(node)
         with self._free_node_counter.get_lock():
-            self._free_node_counter.value += len(nodes)  # type: ignore[attr-defined]
+            self._free_node_counter.value += len(nodes)
 
     def put_task(self, task_package: dict):
         """Schedule task if resources are available otherwise backlog the task"""
-        user_ns = locals()
-        user_ns.update({"__builtins__": __builtins__})
-        _f, _args, _kwargs, resource_spec = unpack_res_spec_apply_message(task_package["buffer"])
+        resource_spec = task_package.get("context", {}).get("resource_spec", {})
 
-        nodes_needed = resource_spec.get("num_nodes")
-        tid = task_package["task_id"]
-        if nodes_needed:
+        if nodes_needed := resource_spec.get("num_nodes"):
+            tid = task_package["task_id"]
             try:
                 allocated_nodes = self._get_nodes(nodes_needed)
             except MPINodesUnavailable:
@@ -183,9 +179,6 @@ class MPITaskScheduler(TaskScheduler):
             else:
                 resource_spec["MPI_NODELIST"] = ",".join(allocated_nodes)
                 self._map_tasks_to_nodes[tid] = allocated_nodes
-                buffer = pack_res_spec_apply_message(_f, _args, _kwargs, resource_spec)
-                task_package["buffer"] = buffer
-                task_package["resource_spec"] = resource_spec
 
         self.pending_task_q.put(task_package)
 
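The reworked put_task above no longer unpacks the serialized task buffer just to read the MPI requirements: the resource specification now travels next to the buffer, in task_package["context"]. A minimal sketch of that lookup (not parsl code; the example task packages are hypothetical):

def nodes_needed_for(task_package: dict) -> int:
    # A missing "context", "resource_spec" or "num_nodes" all mean
    # "not an MPI task", i.e. zero nodes to reserve.
    resource_spec = task_package.get("context", {}).get("resource_spec", {})
    return resource_spec.get("num_nodes") or 0

print(nodes_needed_for({"task_id": 1, "buffer": b"..."}))  # 0
print(nodes_needed_for({"task_id": 2, "buffer": b"...",
                        "context": {"resource_spec": {"num_nodes": 4}}}))  # 4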
parsl/executors/high_throughput/process_worker_pool.py CHANGED
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 
 import argparse
+import importlib
 import logging
 import math
 import multiprocessing
@@ -17,7 +18,7 @@ from importlib.metadata import distributions
 from multiprocessing.context import SpawnProcess
 from multiprocessing.managers import DictProxy
 from multiprocessing.sharedctypes import Synchronized
-from typing import Dict, List, Optional, Sequence
+from typing import Callable, Dict, List, Optional, Sequence
 
 import psutil
 import zmq
@@ -348,7 +349,7 @@ class Manager:
 
             logger.debug(
                 'ready workers: %d, pending tasks: %d',
-                self.ready_worker_count.value,  # type: ignore[attr-defined]
+                self.ready_worker_count.value,
                 pending_task_count,
             )
 
@@ -373,10 +374,12 @@ class Manager:
             if socks.get(ix_sock) == zmq.POLLIN:
                 pkl_msg = ix_sock.recv()
                 tasks = pickle.loads(pkl_msg)
+                del pkl_msg
+
                 last_interchange_contact = time.time()
 
                 if tasks == HEARTBEAT_CODE:
-                    logger.debug("Got heartbeat from interchange")
+                    logger.debug("Got heartbeat response from interchange")
                 elif tasks == DRAINED_CODE:
                     logger.info("Got fully drained message from interchange - setting kill flag")
                     self._stop_event.set()
@@ -454,6 +457,7 @@ class Manager:
                               'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
             pkl_package = pickle.dumps(result_package)
             self.pending_result_queue.put(pkl_package)
+            del pkl_package
         except KeyError:
             logger.info("Worker {} was not busy when it died".format(worker_id))
 
@@ -603,6 +607,10 @@ def update_resource_spec_env_vars(mpi_launcher: str, resource_spec: Dict, node_i
 
 
 def _init_mpi_env(mpi_launcher: str, resource_spec: Dict):
+    for varname in resource_spec:
+        envname = "PARSL_" + str(varname).upper()
+        os.environ[envname] = str(resource_spec[varname])
+
     node_list = resource_spec.get("MPI_NODELIST")
     if node_list is None:
         return
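_init_mpi_env now also exports every entry of the resource specification as a PARSL_-prefixed environment variable before handling the node list. A runnable sketch of just that naming rule (the example spec is hypothetical):

import os

resource_spec = {"num_nodes": 2, "ranks_per_node": 8}
# Each key K becomes the environment variable PARSL_<K upper-cased>.
for varname in resource_spec:
    os.environ["PARSL_" + str(varname).upper()] = str(resource_spec[varname])

print(os.environ["PARSL_NUM_NODES"])       # "2"
print(os.environ["PARSL_RANKS_PER_NODE"])  # "8"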
@@ -753,8 +761,8 @@ def worker(
             worker_enqueued = True
 
         try:
-            # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
             req = task_queue.get(timeout=task_queue_timeout)
+            # req is {'task_id':<tid>, 'buffer':<buf>, 'resource_spec':<dict>}
         except queue.Empty:
             continue
 
@@ -766,17 +774,33 @@ def worker(
                 ready_worker_count.value -= 1
             worker_enqueued = False
 
-        _init_mpi_env(mpi_launcher=mpi_launcher, resource_spec=req["resource_spec"])
+        ctxt = req["context"]
+        res_spec = ctxt.get("resource_spec", {})
+
+        _init_mpi_env(mpi_launcher=mpi_launcher, resource_spec=res_spec)
+
+        exec_func: Callable = execute_task
+        exec_args = ()
+        exec_kwargs = {}
 
         try:
-            result = execute_task(req['buffer'])
+            if task_executor := ctxt.get("task_executor", None):
+                mod_name, _, fn_name = task_executor["f"].rpartition(".")
+                exec_mod = importlib.import_module(mod_name)
+                exec_func = getattr(exec_mod, fn_name)
+
+                exec_args = task_executor.get("a", ())
+                exec_kwargs = task_executor.get("k", {})
+
+            result = exec_func(req['buffer'], *exec_args, **exec_kwargs)
             serialized_result = serialize(result, buffer_threshold=1000000)
         except Exception as e:
            logger.info('Caught an exception: {}'.format(e))
            result_package = {'type': 'result', 'task_id': tid, 'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
         else:
             result_package = {'type': 'result', 'task_id': tid, 'result': serialized_result}
-            # logger.debug("Result: {}".format(result))
+            del serialized_result
+            del req
 
         logger.info("Completed executor task {}".format(tid))
         try:
@@ -788,6 +812,7 @@ def worker(
         })
 
         result_queue.put(pkl_package)
+        del pkl_package, result_package
         tasks_in_progress.pop(worker_id)
         logger.info("All processing finished for executor task {}".format(tid))
 
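The worker can now delegate task execution to a function named by a dotted path in the task context (the "f" key), resolved at run time with rpartition plus importlib. A self-contained sketch of that resolution step; the stdlib target used here is only an example:

import importlib

def resolve_dotted(path: str):
    # "a.b.c" -> import module "a.b", then fetch attribute "c" from it
    mod_name, _, fn_name = path.rpartition(".")
    return getattr(importlib.import_module(mod_name), fn_name)

fn = resolve_dotted("os.path.basename")
print(fn("/tmp/example.txt"))  # example.txt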
parsl/executors/high_throughput/zmq_pipes.py CHANGED
@@ -74,51 +74,37 @@ class CommandClient:
 
         reply = '__PARSL_ZMQ_PIPES_MAGIC__'
         with self._lock:
-            for _ in range(max_retries):
-                try:
-                    logger.debug("Sending command client command")
-
-                    if timeout_s is not None:
-                        remaining_time_s = start_time_s + timeout_s - time.monotonic()
-                        poll_result = self.zmq_socket.poll(timeout=remaining_time_s * 1000, flags=zmq.POLLOUT)
-                        if poll_result == zmq.POLLOUT:
-                            pass  # this is OK, so continue
-                        elif poll_result == 0:
-                            raise CommandClientTimeoutError("Waiting for command channel to be ready for a command")
-                        else:
-                            raise InternalConsistencyError(f"ZMQ poll returned unexpected value: {poll_result}")
-
-                    self.zmq_socket.send_pyobj(message, copy=True)
-
-                    if timeout_s is not None:
-                        logger.debug("Polling for command client response or timeout")
-                        remaining_time_s = start_time_s + timeout_s - time.monotonic()
-                        poll_result = self.zmq_socket.poll(timeout=remaining_time_s * 1000, flags=zmq.POLLIN)
-                        if poll_result == zmq.POLLIN:
-                            pass  # this is OK, so continue
-                        elif poll_result == 0:
-                            logger.error("Command timed-out - command client is now bad forever")
-                            self.ok = False
-                            raise CommandClientTimeoutError("Waiting for a reply from command channel")
-                        else:
-                            raise InternalConsistencyError(f"ZMQ poll returned unexpected value: {poll_result}")
-
-                    logger.debug("Receiving command client response")
-                    reply = self.zmq_socket.recv_pyobj()
-                    logger.debug("Received command client response")
-                except zmq.ZMQError:
-                    logger.exception("Potential ZMQ REQ-REP deadlock caught")
-                    logger.info("Trying to reestablish context")
-                    self.zmq_context.recreate()
-                    self.create_socket_and_bind()
-                else:
-                    break
-
-            if reply == '__PARSL_ZMQ_PIPES_MAGIC__':
-                logger.error("Command channel run retries exhausted. Unable to run command")
-                raise Exception("Command Channel retries exhausted")
-
-        return reply
+            logger.debug("Sending command client command")
+
+            if timeout_s is not None:
+                remaining_time_s = start_time_s + timeout_s - time.monotonic()
+                poll_result = self.zmq_socket.poll(timeout=remaining_time_s * 1000, flags=zmq.POLLOUT)
+                if poll_result == zmq.POLLOUT:
+                    pass  # this is OK, so continue
+                elif poll_result == 0:
+                    raise CommandClientTimeoutError("Waiting for command channel to be ready for a command")
+                else:
+                    raise InternalConsistencyError(f"ZMQ poll returned unexpected value: {poll_result}")
+
+            self.zmq_socket.send_pyobj(message, copy=True)
+
+            if timeout_s is not None:
+                logger.debug("Polling for command client response or timeout")
+                remaining_time_s = start_time_s + timeout_s - time.monotonic()
+                poll_result = self.zmq_socket.poll(timeout=remaining_time_s * 1000, flags=zmq.POLLIN)
+                if poll_result == zmq.POLLIN:
+                    pass  # this is OK, so continue
+                elif poll_result == 0:
+                    logger.error("Command timed-out - command client is now bad forever")
+                    self.ok = False
+                    raise CommandClientTimeoutError("Waiting for a reply from command channel")
+                else:
+                    raise InternalConsistencyError(f"ZMQ poll returned unexpected value: {poll_result}")
+
+            logger.debug("Receiving command client response")
+            reply = self.zmq_socket.recv_pyobj()
+            logger.debug("Received command client response")
+            return reply
 
     def close(self):
         self.zmq_socket.close()
@@ -150,30 +136,18 @@ class TasksOutgoing:
         self.port = self.zmq_socket.bind_to_random_port(tcp_url(ip_address),
                                                         min_port=port_range[0],
                                                         max_port=port_range[1])
-        self.poller = zmq.Poller()
-        self.poller.register(self.zmq_socket, zmq.POLLOUT)
 
     def put(self, message):
         """ This function needs to be fast at the same time aware of the possibility of
         ZMQ pipes overflowing.
 
-        The timeout increases slowly if contention is detected on ZMQ pipes.
         We could set copy=False and get slightly better latency but this results
         in ZMQ sockets reaching a broken state once there are ~10k tasks in flight.
         This issue can be magnified if each the serialized buffer itself is larger.
         """
-        timeout_ms = 1
-        while True:
-            socks = dict(self.poller.poll(timeout=timeout_ms))
-            if self.zmq_socket in socks and socks[self.zmq_socket] == zmq.POLLOUT:
-                # The copy option adds latency but reduces the risk of ZMQ overflow
-                logger.debug("Sending TasksOutgoing message")
-                self.zmq_socket.send_pyobj(message, copy=True)
-                logger.debug("Sent TasksOutgoing message")
-                return
-            else:
-                timeout_ms *= 2
-                logger.debug("Not sending due to non-ready zmq pipe, timeout: {} ms".format(timeout_ms))
+        logger.debug("Sending TasksOutgoing message")
+        self.zmq_socket.send_pyobj(message)
+        logger.debug("Sent TasksOutgoing message")
 
     def close(self):
         self.zmq_socket.close()
@@ -206,20 +180,15 @@ class ResultsIncoming:
         self.port = self.results_receiver.bind_to_random_port(tcp_url(ip_address),
                                                               min_port=port_range[0],
                                                               max_port=port_range[1])
-        self.poller = zmq.Poller()
-        self.poller.register(self.results_receiver, zmq.POLLIN)
 
     def get(self, timeout_ms=None):
         """Get a message from the queue, returning None if timeout expires
         without a message. timeout is measured in milliseconds.
         """
-        socks = dict(self.poller.poll(timeout=timeout_ms))
-        if self.results_receiver in socks and socks[self.results_receiver] == zmq.POLLIN:
-            m = self.results_receiver.recv_multipart()
-            logger.debug("Received ResultsIncoming message")
-            return m
-        else:
-            return None
+        if zmq.POLLIN == self.results_receiver.poll(timeout_ms, zmq.POLLIN):
+            logger.debug("Receiving ResultsIncoming multipart message")
+            return self.results_receiver.recv_multipart()
+        return None
 
     def close(self):
         self.results_receiver.close()
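Both queue classes now drop their long-lived zmq.Poller and poll the socket directly: Socket.poll returns a bitmask of ready events, with 0 meaning the timeout expired. A minimal pyzmq sketch of that pattern using an in-process socket pair:

import zmq

ctx = zmq.Context.instance()
rx = ctx.socket(zmq.PAIR)
rx.bind("inproc://poll-demo")
tx = ctx.socket(zmq.PAIR)
tx.connect("inproc://poll-demo")

print(rx.poll(10, zmq.POLLIN))   # 0: nothing readable before the timeout
tx.send(b"hello")
if zmq.POLLIN == rx.poll(1000, zmq.POLLIN):
    print(rx.recv())             # b'hello'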
parsl/executors/radical/executor.py CHANGED
@@ -20,7 +20,7 @@ from parsl.app.errors import BashExitFailure, RemoteExceptionWrapper
 from parsl.app.python import timeout
 from parsl.data_provider.files import File
 from parsl.executors.base import ParslExecutor
-from parsl.serialize import deserialize, pack_res_spec_apply_message
+from parsl.serialize import deserialize, pack_apply_message
 from parsl.serialize.errors import DeserializationError, SerializationError
 from parsl.utils import RepresentationMixin
 
@@ -441,11 +441,7 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
 
     def _pack_and_apply_message(self, func, args, kwargs):
        try:
-            buffer = pack_res_spec_apply_message(func,
-                                                 args,
-                                                 kwargs,
-                                                 resource_specification={},
-                                                 buffer_threshold=1024 * 1024)
+            buffer = pack_apply_message(func, args, kwargs, buffer_threshold=1 << 20)
             task_func = rp.utils.serialize_bson(buffer)
         except TypeError:
             raise SerializationError(func.__name__)
parsl/executors/radical/rpex_worker.py CHANGED
@@ -5,7 +5,7 @@ import radical.pilot as rp
 import parsl.app.errors as pe
 from parsl.app.bash import remote_side_bash_executor
 from parsl.executors.execute_task import execute_task
-from parsl.serialize import serialize, unpack_res_spec_apply_message
+from parsl.serialize import serialize, unpack_apply_message
 
 
 class ParslWorker:
@@ -33,7 +33,7 @@ class ParslWorker:
 
         try:
             buffer = rp.utils.deserialize_bson(task['description']['executable'])
-            func, args, kwargs, _resource_spec = unpack_res_spec_apply_message(buffer)
+            func, args, kwargs = unpack_apply_message(buffer)
             ret = remote_side_bash_executor(func, *args, **kwargs)
             exc = (None, None)
             val = None
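With no per-task resource specification left to carry, the RADICAL executor and worker fall back to the plain pack/unpack pair (1 << 20 is 1024 * 1024 written as a bit shift). A sketch of the round trip, assuming parsl is importable:

from parsl.serialize import pack_apply_message, unpack_apply_message

def add(x, y=0):
    return x + y

# Serialize a (function, args, kwargs) triple, then recover and call it.
buf = pack_apply_message(add, (1,), {"y": 2}, buffer_threshold=1 << 20)
func, args, kwargs = unpack_apply_message(buf)
print(func(*args, **kwargs))  # 3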
parsl/executors/taskvine/executor.py CHANGED
@@ -107,13 +107,17 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
                  function_exec_mode: Union[Literal['regular'], Literal['serverless']] = 'regular',
                  manager_config: TaskVineManagerConfig = TaskVineManagerConfig(),
                  factory_config: TaskVineFactoryConfig = TaskVineFactoryConfig(),
-                 provider: Optional[ExecutionProvider] = LocalProvider(init_blocks=1),
+                 provider: Optional[ExecutionProvider] = None,
                  storage_access: Optional[List[Staging]] = None,
                  remote_monitoring_radio: Optional[RadioConfig] = None):
 
         # Set worker launch option for this executor
         if worker_launch_method == 'factory' or worker_launch_method == 'manual':
             provider = None
+        elif worker_launch_method == 'provider' and provider is None:
+            # provider method chosen, but no explicit provider supplied to __init__
+            # so default to LocalProvider
+            provider = LocalProvider(init_blocks=1)
 
         # Initialize the parent class with the execution provider and block error handling enabled.
         # If provider is None, then no worker is launched via the provider method.
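Moving the LocalProvider default out of the signature sidesteps a Python pitfall: a default argument is evaluated once, at definition time, so every TaskVineExecutor would otherwise share a single LocalProvider instance (the likely motivation, though the diff itself does not state it). The pitfall in miniature:

class Shared:
    def __init__(self, items=[]):       # anti-pattern: one list for all instances
        self.items = items

a, b = Shared(), Shared()
a.items.append("x")
print(b.items)  # ['x'] - b observes a's mutation

class Fixed:
    def __init__(self, items=None):     # the None-then-construct pattern used above
        self.items = [] if items is None else items

c, d = Fixed(), Fixed()
c.items.append("x")
print(d.items)  # []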
parsl/executors/threads.py CHANGED
@@ -29,12 +29,15 @@ class ThreadPoolExecutor(ParslExecutor, RepresentationMixin):
 
     @typeguard.typechecked
     def __init__(self, label: str = 'threads', max_threads: Optional[int] = 2,
-                 thread_name_prefix: str = '', storage_access: Optional[List[Staging]] = None,
+                 thread_name_prefix: str | None = None, storage_access: Optional[List[Staging]] = None,
                  working_dir: Optional[str] = None, remote_monitoring_radio: Optional[RadioConfig] = None):
         ParslExecutor.__init__(self)
         self.label = label
         self.max_threads = max_threads
-        self.thread_name_prefix = thread_name_prefix
+        if thread_name_prefix is None:
+            self.thread_name_prefix = "ThreadPoolExecutor-" + label
+        else:
+            self.thread_name_prefix = thread_name_prefix
 
         # we allow storage_access to be None now, which means something else to [] now
         # None now means that a default storage access list will be used, while
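Deriving the default thread_name_prefix from the executor label makes worker threads identifiable in logs and debuggers; the prefix is ultimately applied by the stdlib pool, which names threads "<prefix>_<n>". A stdlib-only sketch (the prefix string mirrors the new default for label='threads'):

import threading
from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=2,
                        thread_name_prefix="ThreadPoolExecutor-threads") as pool:
    name = pool.submit(lambda: threading.current_thread().name).result()
    print(name)  # e.g. ThreadPoolExecutor-threads_0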
parsl/jobs/states.py CHANGED
@@ -10,7 +10,7 @@ class JobState(IntEnum):
     """Defines a set of states that a job can be in"""
 
     UNKNOWN = 0
-    """The batch provider is unable to determinate a state for this job"""
+    """The batch provider is unable to determine a state for this job"""
 
     PENDING = 1
     """"This job is in the batch queue but has not started running"""
@@ -40,7 +40,7 @@ class JobState(IntEnum):
     """This job is held/suspended in the batch system"""
 
     MISSING = 8
-    """This job has reached a terminal state without the resources(managers/workers)
+    """This job has reached a terminal state without the resources (managers/workers)
     launched in the job connecting back to the Executor. This state is set by HTEX
     when it is able to infer that the block failed to start workers for eg due to
     bad worker environment or network connectivity issues.
parsl/jobs/strategy.py CHANGED
@@ -185,6 +185,11 @@ class Strategy:
 
         for executor in executors:
             label = executor.label
+
+            if executor.bad_state_is_set:
+                logger.info(f"Not strategizing for executor {label} because bad state is set")
+                continue
+
             logger.debug(f"Strategizing for executor {label}")
 
             if self.executors[label]['first']:
@@ -213,12 +218,8 @@ class Strategy:
 
             logger.debug(f"Slot ratio calculation: active_slots = {active_slots}, active_tasks = {active_tasks}")
 
-            if hasattr(executor, 'connected_workers'):
-                logger.debug('Executor {} has {} active tasks, {}/{} running/pending blocks, and {} connected workers'.format(
-                    label, active_tasks, running, pending, executor.connected_workers()))
-            else:
-                logger.debug('Executor {} has {} active tasks and {}/{} running/pending blocks'.format(
-                    label, active_tasks, running, pending))
+            logger.debug('Executor {} has {} active tasks and {}/{} running/pending blocks'.format(
+                label, active_tasks, running, pending))
 
             # reset idle timer if executor has active tasks
parsl/monitoring/monitoring.py CHANGED
@@ -11,9 +11,9 @@ import typeguard
 
 from parsl.monitoring.types import TaggedMonitoringMessage
 from parsl.multiprocessing import (
-    SizedQueue,
     SpawnEvent,
     SpawnProcess,
+    SpawnQueue,
     join_terminate_close_proc,
 )
 from parsl.utils import RepresentationMixin
@@ -126,7 +126,7 @@ class MonitoringHub(RepresentationMixin):
         self.monitoring_hub_active = True
 
         self.resource_msgs: Queue[TaggedMonitoringMessage]
-        self.resource_msgs = SizedQueue()
+        self.resource_msgs = SpawnQueue()
 
         self.dbm_exit_event: ms.Event
         self.dbm_exit_event = SpawnEvent()
parsl/monitoring/radios/filesystem.py CHANGED
@@ -10,11 +10,12 @@ from parsl.monitoring.radios.base import (
     RadioConfig,
 )
 from parsl.monitoring.radios.filesystem_router import FilesystemRadioReceiver
+from parsl.utils import RepresentationMixin
 
 logger = logging.getLogger(__name__)
 
 
-class FilesystemRadio(RadioConfig):
+class FilesystemRadio(RadioConfig, RepresentationMixin):
     """A MonitoringRadioSender that sends messages over a shared filesystem.
 
     The messsage directory structure is based on maildir,
parsl/monitoring/radios/htex.py CHANGED
@@ -7,11 +7,12 @@ from parsl.monitoring.radios.base import (
     MonitoringRadioSender,
     RadioConfig,
 )
+from parsl.utils import RepresentationMixin
 
 logger = logging.getLogger(__name__)
 
 
-class HTEXRadio(RadioConfig):
+class HTEXRadio(RadioConfig, RepresentationMixin):
     def create_sender(self) -> MonitoringRadioSender:
         return HTEXRadioSender()
 
parsl/monitoring/radios/multiprocessing.py CHANGED
@@ -5,9 +5,10 @@ from parsl.monitoring.radios.base import (
     MonitoringRadioSender,
     RadioConfig,
 )
+from parsl.utils import RepresentationMixin
 
 
-class MultiprocessingQueueRadioSender(MonitoringRadioSender):
+class MultiprocessingQueueRadioSender(MonitoringRadioSender, RepresentationMixin):
     """A monitoring radio which connects over a multiprocessing Queue.
     This radio is intended to be used on the submit side, where components
     in the submit process, or processes launched by multiprocessing, will have
parsl/monitoring/radios/udp.py CHANGED
@@ -13,11 +13,12 @@ from parsl.monitoring.radios.base import (
     RadioConfig,
 )
 from parsl.monitoring.radios.udp_router import start_udp_receiver
+from parsl.utils import RepresentationMixin
 
 logger = logging.getLogger(__name__)
 
 
-class UDPRadio(RadioConfig):
+class UDPRadio(RadioConfig, RepresentationMixin):
     def __init__(self, *, port: Optional[int] = None, atexit_timeout: int = 3, address: str, debug: bool = False, hmac_digest: str = 'sha512'):
         self.port = port
         self.atexit_timeout = atexit_timeout
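Each radio class gains RepresentationMixin from parsl.utils, which produces a constructor-style repr so configured radios print usefully in logs. A standalone illustration of the idea only, not parsl's implementation:

import inspect

class ReprFromInit:
    # Rebuild "ClassName(arg=value, ...)" from the __init__ signature,
    # assuming each parameter is stored under an attribute of the same name.
    def __repr__(self):
        sig = inspect.signature(type(self).__init__)
        args = ", ".join(f"{p}={getattr(self, p, None)!r}"
                         for p in sig.parameters if p != "self")
        return f"{type(self).__name__}({args})"

class DemoRadio(ReprFromInit):          # hypothetical stand-in class
    def __init__(self, address, port=None):
        self.address = address
        self.port = port

print(DemoRadio("10.0.0.1", port=6000))  # DemoRadio(address='10.0.0.1', port=6000)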
parsl/multiprocessing.py CHANGED
@@ -4,7 +4,6 @@
 import logging
 import multiprocessing
 import multiprocessing.queues
-import platform
 from multiprocessing.context import ForkProcess as ForkProcessType
 from multiprocessing.context import SpawnProcess as SpawnProcessType
 from typing import Callable
@@ -21,54 +20,6 @@ SpawnEvent = SpawnContext.Event
 SpawnQueue = SpawnContext.Queue
 
 
-class MacSafeQueue(multiprocessing.queues.Queue):
-    """ Multiprocessing queues do not have qsize attributes on MacOS.
-    This is slower but more portable version of the multiprocessing Queue
-    that adds a explicit counter
-
-    Reference : https://github.com/keras-team/autokeras/commit/4ddd568b06b4045ace777bc0fb7bc18573b85a75
-    """
-
-    def __init__(self, *args, **kwargs):
-        if 'ctx' not in kwargs:
-            kwargs['ctx'] = multiprocessing.get_context('spawn')
-        super().__init__(*args, **kwargs)
-        self._counter = multiprocessing.Value('i', 0)
-
-    def put(self, *args, **kwargs):
-        # logger.critical("Putting item {}".format(args))
-        x = super().put(*args, **kwargs)
-        with self._counter.get_lock():
-            self._counter.value += 1
-        return x
-
-    def get(self, *args, **kwargs):
-        x = super().get(*args, **kwargs)
-        with self._counter.get_lock():
-            self._counter.value -= 1
-        # logger.critical("Getting item {}".format(x))
-        return x
-
-    def qsize(self):
-        return self._counter.value
-
-    def empty(self):
-        return not self._counter.value
-
-
-# SizedQueue should be constructable using the same calling
-# convention as multiprocessing.Queue but that entire signature
-# isn't expressible in mypy 0.790
-SizedQueue: Callable[..., multiprocessing.Queue]
-
-
-if platform.system() != 'Darwin':
-    import multiprocessing
-    SizedQueue = SpawnQueue
-else:
-    SizedQueue = MacSafeQueue
-
-
 def join_terminate_close_proc(process: SpawnProcessType, *, timeout: int = 30) -> None:
     """Increasingly aggressively terminate a process.
 
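MacSafeQueue existed only because multiprocessing queues lack a working qsize on macOS; with its last user (SizedQueue, in monitoring) migrated to SpawnQueue, the whole Darwin branch goes away. A sketch of the context-bound alias pattern this module keeps:

import multiprocessing

# Bind queue construction to one explicit start method instead of the
# platform-dependent default.
SpawnContext = multiprocessing.get_context("spawn")
SpawnQueue = SpawnContext.Queue

if __name__ == "__main__":
    q = SpawnQueue()
    q.put("hello")
    print(q.get())  # hello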
parsl/providers/base.py CHANGED
@@ -33,7 +33,28 @@ class ExecutionProvider(metaclass=ABCMeta):
                                  [cancel] <--------|----+
                                                    |
                                  +-------------------
-    """
+
+    In addition to the listed methods, an ExecutionProvider instance must always
+    have these attributes, which both default to `None`:
+
+    mem_per_node: Real memory to provision per node in GB.
+
+        Providers which set this attribute should ask for mem_per_node of memory
+        when provisioning resources, and set the corresponding environment
+        variable PARSL_MEMORY_GB before executing submitted commands.
+
+        If this attribute is set, executors may use it to calculate how many tasks can
+        run concurrently per node.
+
+    cores_per_node: Number of cores to provision per node.
+
+        Providers which set this attribute should ask for cores_per_node cores
+        when provisioning resources, and set the corresponding environment
+        variable PARSL_CORES before executing submitted commands.
+
+        If this attribute is set, executors may use it to calculate how many tasks can
+        run concurrently per node.
+    """
 
     @abstractmethod
     def __init__(self) -> None:
@@ -44,8 +65,8 @@ class ExecutionProvider(metaclass=ABCMeta):
         self.script_dir: Optional[str]
         self.parallelism: float
         self.resources: Dict[object, Any]
-        self._cores_per_node: Optional[int] = None
-        self._mem_per_node: Optional[float] = None
+        self.cores_per_node: Optional[int] = None
+        self.mem_per_node: Optional[float] = None
         pass
 
     @abstractmethod
@@ -111,40 +132,6 @@ class ExecutionProvider(metaclass=ABCMeta):
         ''' Provides the label for this provider '''
         pass
 
-    @property
-    def mem_per_node(self) -> Optional[float]:
-        """Real memory to provision per node in GB.
-
-        Providers which set this property should ask for mem_per_node of memory
-        when provisioning resources, and set the corresponding environment
-        variable PARSL_MEMORY_GB before executing submitted commands.
-
-        If this property is set, executors may use it to calculate how many tasks can
-        run concurrently per node.
-        """
-        return self._mem_per_node
-
-    @mem_per_node.setter
-    def mem_per_node(self, value: float) -> None:
-        self._mem_per_node = value
-
-    @property
-    def cores_per_node(self) -> Optional[int]:
-        """Number of cores to provision per node.
-
-        Providers which set this property should ask for cores_per_node cores
-        when provisioning resources, and set the corresponding environment
-        variable PARSL_CORES before executing submitted commands.
-
-        If this property is set, executors may use it to calculate how many tasks can
-        run concurrently per node.
-        """
-        return self._cores_per_node
-
-    @cores_per_node.setter
-    def cores_per_node(self, value: int) -> None:
-        self._cores_per_node = value
-
     @property
     @abstractmethod
     def status_polling_interval(self) -> int:
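The deleted property/setter pairs added no behavior beyond plain attribute access, so they collapse into the two instance attributes now set in __init__ and documented in the class docstring. The equivalence in miniature:

from typing import Optional

class WithProperty:
    def __init__(self):
        self._cores_per_node: Optional[int] = None

    @property
    def cores_per_node(self) -> Optional[int]:
        return self._cores_per_node

    @cores_per_node.setter
    def cores_per_node(self, value: int) -> None:
        self._cores_per_node = value

class WithAttribute:
    def __init__(self):
        self.cores_per_node: Optional[int] = None

for cls in (WithProperty, WithAttribute):
    obj = cls()
    obj.cores_per_node = 8
    print(obj.cores_per_node)  # 8 both times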