parsl 2025.9.8__py3-none-any.whl → 2025.11.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/app/bash.py +1 -1
- parsl/benchmark/perf.py +73 -17
- parsl/concurrent/__init__.py +95 -14
- parsl/curvezmq.py +0 -16
- parsl/data_provider/globus.py +3 -1
- parsl/dataflow/dflow.py +106 -204
- parsl/dataflow/memoization.py +146 -19
- parsl/dataflow/states.py +5 -5
- parsl/executors/base.py +2 -2
- parsl/executors/execute_task.py +2 -8
- parsl/executors/flux/executor.py +4 -6
- parsl/executors/globus_compute.py +0 -4
- parsl/executors/high_throughput/executor.py +86 -24
- parsl/executors/high_throughput/interchange.py +39 -20
- parsl/executors/high_throughput/mpi_executor.py +1 -2
- parsl/executors/high_throughput/mpi_resource_management.py +7 -14
- parsl/executors/high_throughput/process_worker_pool.py +32 -7
- parsl/executors/high_throughput/zmq_pipes.py +36 -67
- parsl/executors/radical/executor.py +2 -6
- parsl/executors/radical/rpex_worker.py +2 -2
- parsl/executors/taskvine/executor.py +5 -1
- parsl/executors/threads.py +5 -2
- parsl/jobs/states.py +2 -2
- parsl/jobs/strategy.py +7 -6
- parsl/monitoring/monitoring.py +2 -2
- parsl/monitoring/radios/filesystem.py +2 -1
- parsl/monitoring/radios/htex.py +2 -1
- parsl/monitoring/radios/multiprocessing.py +2 -1
- parsl/monitoring/radios/udp.py +2 -1
- parsl/multiprocessing.py +0 -49
- parsl/providers/base.py +24 -37
- parsl/providers/pbspro/pbspro.py +1 -1
- parsl/serialize/__init__.py +6 -9
- parsl/serialize/facade.py +0 -32
- parsl/tests/configs/local_threads_globus.py +18 -14
- parsl/tests/configs/taskvine_ex.py +1 -1
- parsl/tests/sites/test_concurrent.py +51 -3
- parsl/tests/test_checkpointing/test_periodic.py +15 -9
- parsl/tests/test_checkpointing/test_regression_233.py +0 -1
- parsl/tests/test_curvezmq.py +0 -42
- parsl/tests/test_execute_task.py +2 -11
- parsl/tests/test_htex/test_command_concurrency_regression_1321.py +54 -0
- parsl/tests/test_htex/test_htex.py +36 -1
- parsl/tests/test_htex/test_interchange_exit_bad_registration.py +2 -2
- parsl/tests/test_htex/test_priority_queue.py +26 -3
- parsl/tests/test_htex/test_zmq_binding.py +2 -1
- parsl/tests/test_mpi_apps/test_mpi_scheduler.py +18 -43
- parsl/tests/test_python_apps/test_basic.py +0 -14
- parsl/tests/test_python_apps/test_depfail_propagation.py +11 -1
- parsl/tests/test_python_apps/test_exception.py +19 -0
- parsl/tests/test_python_apps/test_garbage_collect.py +1 -6
- parsl/tests/test_python_apps/test_memoize_2.py +11 -1
- parsl/tests/test_regression/test_3874.py +47 -0
- parsl/tests/test_scaling/test_regression_3696_oscillation.py +1 -0
- parsl/tests/test_staging/test_staging_globus.py +2 -2
- parsl/tests/unit/test_globus_compute_executor.py +11 -2
- parsl/utils.py +8 -3
- parsl/version.py +1 -1
- {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/interchange.py +39 -20
- {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/process_worker_pool.py +32 -7
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/METADATA +64 -50
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/RECORD +68 -74
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/WHEEL +1 -1
- parsl/tests/configs/local_threads_checkpoint_periodic.py +0 -11
- parsl/tests/configs/local_threads_no_cache.py +0 -11
- parsl/tests/site_tests/test_provider.py +0 -88
- parsl/tests/site_tests/test_site.py +0 -70
- parsl/tests/test_aalst_patterns.py +0 -474
- parsl/tests/test_docs/test_workflow2.py +0 -42
- parsl/tests/test_error_handling/test_rand_fail.py +0 -171
- parsl/tests/test_regression/test_854.py +0 -62
- parsl/tests/test_serialization/test_pack_resource_spec.py +0 -23
- {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/entry_points.txt +0 -0
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info/licenses}/LICENSE +0 -0
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/top_level.txt +0 -0
parsl/executors/high_throughput/mpi_resource_management.py
CHANGED
@@ -9,7 +9,6 @@ from enum import Enum
 from typing import Dict, List, Optional
 
 from parsl.multiprocessing import SpawnContext
-from parsl.serialize import pack_res_spec_apply_message, unpack_res_spec_apply_message
 
 logger = logging.getLogger(__name__)
 
@@ -146,11 +145,11 @@ class MPITaskScheduler(TaskScheduler):
         )
         acquired_nodes = []
         with self._free_node_counter.get_lock():
-            if num_nodes <= self._free_node_counter.value:
-                self._free_node_counter.value -= num_nodes
+            if num_nodes <= self._free_node_counter.value:
+                self._free_node_counter.value -= num_nodes
             else:
                 raise MPINodesUnavailable(
-                    requested=num_nodes, available=self._free_node_counter.value
+                    requested=num_nodes, available=self._free_node_counter.value
                 )
 
         for i in range(num_nodes):
@@ -163,17 +162,14 @@ class MPITaskScheduler(TaskScheduler):
         for node in nodes:
             self.nodes_q.put(node)
         with self._free_node_counter.get_lock():
-            self._free_node_counter.value += len(nodes)
+            self._free_node_counter.value += len(nodes)
 
     def put_task(self, task_package: dict):
         """Schedule task if resources are available otherwise backlog the task"""
-
-        user_ns.update({"__builtins__": __builtins__})
-        _f, _args, _kwargs, resource_spec = unpack_res_spec_apply_message(task_package["buffer"])
+        resource_spec = task_package.get("context", {}).get("resource_spec", {})
 
-        nodes_needed = resource_spec.get("num_nodes")
-
-        if nodes_needed:
+        if nodes_needed := resource_spec.get("num_nodes"):
+            tid = task_package["task_id"]
             try:
                 allocated_nodes = self._get_nodes(nodes_needed)
             except MPINodesUnavailable:
@@ -183,9 +179,6 @@ class MPITaskScheduler(TaskScheduler):
             else:
                 resource_spec["MPI_NODELIST"] = ",".join(allocated_nodes)
                 self._map_tasks_to_nodes[tid] = allocated_nodes
-                buffer = pack_res_spec_apply_message(_f, _args, _kwargs, resource_spec)
-                task_package["buffer"] = buffer
-                task_package["resource_spec"] = resource_spec
 
         self.pending_task_q.put(task_package)
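The scheduler now reads the resource specification from plain metadata travelling alongside the serialized buffer, instead of unpacking and repacking the buffer itself. A minimal sketch of the new lookup pattern; the task_package shape here is inferred from this diff, not taken from parsl documentation:

```python
# Hypothetical task package shaped as the diff suggests: the pickled apply
# message stays opaque in 'buffer', while scheduling metadata rides in 'context'.
task_package = {
    "task_id": 7,
    "buffer": b"<pickled apply message>",
    "context": {"resource_spec": {"num_nodes": 2}},
}

resource_spec = task_package.get("context", {}).get("resource_spec", {})

# The walrus operator fetches and tests num_nodes in one expression;
# tasks without an MPI node request skip node accounting entirely.
if nodes_needed := resource_spec.get("num_nodes"):
    print(f"task {task_package['task_id']} needs {nodes_needed} nodes")
else:
    print("non-MPI task: enqueue directly")
```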
parsl/executors/high_throughput/process_worker_pool.py
CHANGED
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 
 import argparse
+import importlib
 import logging
 import math
 import multiprocessing
@@ -17,7 +18,7 @@ from importlib.metadata import distributions
 from multiprocessing.context import SpawnProcess
 from multiprocessing.managers import DictProxy
 from multiprocessing.sharedctypes import Synchronized
-from typing import Dict, List, Optional, Sequence
+from typing import Callable, Dict, List, Optional, Sequence
 
 import psutil
 import zmq
@@ -348,7 +349,7 @@ class Manager:
 
         logger.debug(
             'ready workers: %d, pending tasks: %d',
-            self.ready_worker_count.value,
+            self.ready_worker_count.value,
             pending_task_count,
         )
 
@@ -373,10 +374,12 @@ class Manager:
             if socks.get(ix_sock) == zmq.POLLIN:
                 pkl_msg = ix_sock.recv()
                 tasks = pickle.loads(pkl_msg)
+                del pkl_msg
+
                 last_interchange_contact = time.time()
 
                 if tasks == HEARTBEAT_CODE:
-                    logger.debug("Got heartbeat from interchange")
+                    logger.debug("Got heartbeat response from interchange")
                 elif tasks == DRAINED_CODE:
                     logger.info("Got fully drained message from interchange - setting kill flag")
                     self._stop_event.set()
@@ -454,6 +457,7 @@ class Manager:
                                   'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
                 pkl_package = pickle.dumps(result_package)
                 self.pending_result_queue.put(pkl_package)
+                del pkl_package
             except KeyError:
                 logger.info("Worker {} was not busy when it died".format(worker_id))
 
@@ -603,6 +607,10 @@ def update_resource_spec_env_vars(mpi_launcher: str, resource_spec: Dict, node_i
 
 
 def _init_mpi_env(mpi_launcher: str, resource_spec: Dict):
+    for varname in resource_spec:
+        envname = "PARSL_" + str(varname).upper()
+        os.environ[envname] = str(resource_spec[varname])
+
     node_list = resource_spec.get("MPI_NODELIST")
     if node_list is None:
         return
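The loop added to _init_mpi_env uppercases each resource-specification key into a PARSL_-prefixed environment variable before the task runs. A standalone sketch of that mapping (the example keys are illustrative, not an exhaustive list of what parsl accepts):

```python
import os

resource_spec = {"num_nodes": 2, "ranks_per_node": 4}  # illustrative keys

# Each key becomes an uppercased PARSL_* environment variable, stringified.
for varname in resource_spec:
    envname = "PARSL_" + str(varname).upper()
    os.environ[envname] = str(resource_spec[varname])

print(os.environ["PARSL_NUM_NODES"])       # "2"
print(os.environ["PARSL_RANKS_PER_NODE"])  # "4"
```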
@@ -753,8 +761,8 @@ def worker(
             worker_enqueued = True
 
         try:
-            # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
             req = task_queue.get(timeout=task_queue_timeout)
+            # req is {'task_id':<tid>, 'buffer':<buf>, 'resource_spec':<dict>}
         except queue.Empty:
             continue
 
@@ -766,17 +774,33 @@ def worker(
             ready_worker_count.value -= 1
             worker_enqueued = False
 
-
+        ctxt = req["context"]
+        res_spec = ctxt.get("resource_spec", {})
+
+        _init_mpi_env(mpi_launcher=mpi_launcher, resource_spec=res_spec)
+
+        exec_func: Callable = execute_task
+        exec_args = ()
+        exec_kwargs = {}
 
         try:
-            result = execute_task(req['buffer'], mpi_launcher=mpi_launcher)
+            if task_executor := ctxt.get("task_executor", None):
+                mod_name, _, fn_name = task_executor["f"].rpartition(".")
+                exec_mod = importlib.import_module(mod_name)
+                exec_func = getattr(exec_mod, fn_name)
+
+                exec_args = task_executor.get("a", ())
+                exec_kwargs = task_executor.get("k", {})
+
+            result = exec_func(req['buffer'], *exec_args, **exec_kwargs)
             serialized_result = serialize(result, buffer_threshold=1000000)
         except Exception as e:
             logger.info('Caught an exception: {}'.format(e))
             result_package = {'type': 'result', 'task_id': tid, 'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
         else:
             result_package = {'type': 'result', 'task_id': tid, 'result': serialized_result}
-
+            del serialized_result
+        del req
 
         logger.info("Completed executor task {}".format(tid))
         try:
@@ -788,6 +812,7 @@ def worker(
         })
 
         result_queue.put(pkl_package)
+        del pkl_package, result_package
         tasks_in_progress.pop(worker_id)
         logger.info("All processing finished for executor task {}".format(tid))
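The new task_executor branch resolves a dotted "module.function" string to a callable at run time. The same rpartition / import_module / getattr sequence works standalone; resolve_dotted below is a hypothetical helper name, not a parsl function:

```python
import importlib

def resolve_dotted(path: str):
    # "package.module.func" -> the func object, mirroring the worker's steps
    mod_name, _, fn_name = path.rpartition(".")
    module = importlib.import_module(mod_name)
    return getattr(module, fn_name)

sqrt = resolve_dotted("math.sqrt")
print(sqrt(9.0))  # 3.0
```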
parsl/executors/high_throughput/zmq_pipes.py
CHANGED
@@ -74,51 +74,37 @@ class CommandClient:
 
         reply = '__PARSL_ZMQ_PIPES_MAGIC__'
         with self._lock:
-
-
-
-
-
-
-
-
-
-                elif poll_result == 0:
-                    raise CommandClientTimeoutError("Waiting for command channel to be ready for a command")
-                else:
-                    raise InternalConsistencyError(f"ZMQ poll returned unexpected value: {poll_result}")
-
-                self.zmq_socket.send_pyobj(message, copy=True)
-
-                if timeout_s is not None:
-                    logger.debug("Polling for command client response or timeout")
-                    remaining_time_s = start_time_s + timeout_s - time.monotonic()
-                    poll_result = self.zmq_socket.poll(timeout=remaining_time_s * 1000, flags=zmq.POLLIN)
-                    if poll_result == zmq.POLLIN:
-                        pass  # this is OK, so continue
-                    elif poll_result == 0:
-                        logger.error("Command timed-out - command client is now bad forever")
-                        self.ok = False
-                        raise CommandClientTimeoutError("Waiting for a reply from command channel")
-                    else:
-                        raise InternalConsistencyError(f"ZMQ poll returned unexpected value: {poll_result}")
-
-                logger.debug("Receiving command client response")
-                reply = self.zmq_socket.recv_pyobj()
-                logger.debug("Received command client response")
-            except zmq.ZMQError:
-                logger.exception("Potential ZMQ REQ-REP deadlock caught")
-                logger.info("Trying to reestablish context")
-                self.zmq_context.recreate()
-                self.create_socket_and_bind()
+            logger.debug("Sending command client command")
+
+            if timeout_s is not None:
+                remaining_time_s = start_time_s + timeout_s - time.monotonic()
+                poll_result = self.zmq_socket.poll(timeout=remaining_time_s * 1000, flags=zmq.POLLOUT)
+                if poll_result == zmq.POLLOUT:
+                    pass  # this is OK, so continue
+                elif poll_result == 0:
+                    raise CommandClientTimeoutError("Waiting for command channel to be ready for a command")
                 else:
-
-
-
-
-
+                    raise InternalConsistencyError(f"ZMQ poll returned unexpected value: {poll_result}")
+
+            self.zmq_socket.send_pyobj(message, copy=True)
+
+            if timeout_s is not None:
+                logger.debug("Polling for command client response or timeout")
+                remaining_time_s = start_time_s + timeout_s - time.monotonic()
+                poll_result = self.zmq_socket.poll(timeout=remaining_time_s * 1000, flags=zmq.POLLIN)
+                if poll_result == zmq.POLLIN:
+                    pass  # this is OK, so continue
+                elif poll_result == 0:
+                    logger.error("Command timed-out - command client is now bad forever")
+                    self.ok = False
+                    raise CommandClientTimeoutError("Waiting for a reply from command channel")
+                else:
+                    raise InternalConsistencyError(f"ZMQ poll returned unexpected value: {poll_result}")
 
-
+            logger.debug("Receiving command client response")
+            reply = self.zmq_socket.recv_pyobj()
+            logger.debug("Received command client response")
+            return reply
 
     def close(self):
         self.zmq_socket.close()
@@ -150,30 +136,18 @@ class TasksOutgoing:
         self.port = self.zmq_socket.bind_to_random_port(tcp_url(ip_address),
                                                         min_port=port_range[0],
                                                         max_port=port_range[1])
-        self.poller = zmq.Poller()
-        self.poller.register(self.zmq_socket, zmq.POLLOUT)
 
     def put(self, message):
         """ This function needs to be fast at the same time aware of the possibility of
         ZMQ pipes overflowing.
 
-        The timeout increases slowly if contention is detected on ZMQ pipes.
         We could set copy=False and get slightly better latency but this results
         in ZMQ sockets reaching a broken state once there are ~10k tasks in flight.
         This issue can be magnified if each the serialized buffer itself is larger.
         """
-
-
-
-            if self.zmq_socket in socks and socks[self.zmq_socket] == zmq.POLLOUT:
-                # The copy option adds latency but reduces the risk of ZMQ overflow
-                logger.debug("Sending TasksOutgoing message")
-                self.zmq_socket.send_pyobj(message, copy=True)
-                logger.debug("Sent TasksOutgoing message")
-                return
-            else:
-                timeout_ms *= 2
-                logger.debug("Not sending due to non-ready zmq pipe, timeout: {} ms".format(timeout_ms))
+        logger.debug("Sending TasksOutgoing message")
+        self.zmq_socket.send_pyobj(message)
+        logger.debug("Sent TasksOutgoing message")
 
     def close(self):
         self.zmq_socket.close()
@@ -206,20 +180,15 @@ class ResultsIncoming:
         self.port = self.results_receiver.bind_to_random_port(tcp_url(ip_address),
                                                               min_port=port_range[0],
                                                               max_port=port_range[1])
-        self.poller = zmq.Poller()
-        self.poller.register(self.results_receiver, zmq.POLLIN)
 
     def get(self, timeout_ms=None):
         """Get a message from the queue, returning None if timeout expires
         without a message. timeout is measured in milliseconds.
         """
-
-
-
-
-            return m
-        else:
-            return None
+        if zmq.POLLIN == self.results_receiver.poll(timeout_ms, zmq.POLLIN):
+            logger.debug("Receiving ResultsIncoming multipart message")
+            return self.results_receiver.recv_multipart()
+        return None
 
     def close(self):
         self.results_receiver.close()
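Both directions of the rewritten CommandClient use the same pyzmq idiom: poll the socket for the wanted event with a deadline, and only then send or receive. A minimal sketch of that pattern against a hypothetical REP endpoint (the address and the TimeoutError stand-ins are assumptions for illustration):

```python
import zmq

ctx = zmq.Context()
sock = ctx.socket(zmq.REQ)
sock.connect("tcp://127.0.0.1:5555")  # hypothetical command endpoint

timeout_ms = 1000
# poll() returns the ready-event mask, so comparing against the flag we
# asked for tells us whether the socket is usable before we block on it.
if sock.poll(timeout=timeout_ms, flags=zmq.POLLOUT) == zmq.POLLOUT:
    sock.send_pyobj("WORKERS")
    if sock.poll(timeout=timeout_ms, flags=zmq.POLLIN) == zmq.POLLIN:
        reply = sock.recv_pyobj()
    else:
        raise TimeoutError("no reply within timeout")   # stand-in for CommandClientTimeoutError
else:
    raise TimeoutError("channel not ready to send")     # stand-in for CommandClientTimeoutError
```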
parsl/executors/radical/executor.py
CHANGED
@@ -20,7 +20,7 @@ from parsl.app.errors import BashExitFailure, RemoteExceptionWrapper
 from parsl.app.python import timeout
 from parsl.data_provider.files import File
 from parsl.executors.base import ParslExecutor
-from parsl.serialize import deserialize, pack_res_spec_apply_message
+from parsl.serialize import deserialize, pack_apply_message
 from parsl.serialize.errors import DeserializationError, SerializationError
 from parsl.utils import RepresentationMixin
 
@@ -441,11 +441,7 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
 
     def _pack_and_apply_message(self, func, args, kwargs):
         try:
-            buffer = pack_res_spec_apply_message(func,
-                                                 args,
-                                                 kwargs,
-                                                 resource_specification={},
-                                                 buffer_threshold=1024 * 1024)
+            buffer = pack_apply_message(func, args, kwargs, buffer_threshold=1 << 20)
             task_func = rp.utils.serialize_bson(buffer)
         except TypeError:
             raise SerializationError(func.__name__)
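pack_apply_message and its unpack counterpart (used in rpex_worker.py below) round-trip a function invocation through bytes; the calls here follow the signatures visible in this diff. Note that 1 << 20 is just 1024 * 1024 (1 MiB) written as a bit shift:

```python
from parsl.serialize import pack_apply_message, unpack_apply_message

def add(x, y):
    return x + y

# 1 << 20 == 1024 * 1024: the same 1 MiB threshold, as a bit shift
buf = pack_apply_message(add, (2, 3), {}, buffer_threshold=1 << 20)

func, args, kwargs = unpack_apply_message(buf)
print(func(*args, **kwargs))  # 5
```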
parsl/executors/radical/rpex_worker.py
CHANGED
@@ -5,7 +5,7 @@ import radical.pilot as rp
 import parsl.app.errors as pe
 from parsl.app.bash import remote_side_bash_executor
 from parsl.executors.execute_task import execute_task
-from parsl.serialize import serialize, unpack_res_spec_apply_message
+from parsl.serialize import serialize, unpack_apply_message
 
 
 class ParslWorker:
@@ -33,7 +33,7 @@ class ParslWorker:
 
         try:
             buffer = rp.utils.deserialize_bson(task['description']['executable'])
-            func, args, kwargs, _resource_spec = unpack_res_spec_apply_message(buffer)
+            func, args, kwargs = unpack_apply_message(buffer)
             ret = remote_side_bash_executor(func, *args, **kwargs)
             exc = (None, None)
             val = None
parsl/executors/taskvine/executor.py
CHANGED
@@ -107,13 +107,17 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
                  function_exec_mode: Union[Literal['regular'], Literal['serverless']] = 'regular',
                  manager_config: TaskVineManagerConfig = TaskVineManagerConfig(),
                  factory_config: TaskVineFactoryConfig = TaskVineFactoryConfig(),
-                 provider: Optional[ExecutionProvider] = LocalProvider(init_blocks=1),
+                 provider: Optional[ExecutionProvider] = None,
                  storage_access: Optional[List[Staging]] = None,
                  remote_monitoring_radio: Optional[RadioConfig] = None):
 
         # Set worker launch option for this executor
         if worker_launch_method == 'factory' or worker_launch_method == 'manual':
             provider = None
+        elif worker_launch_method == 'provider' and provider is None:
+            # provider method chosen, but no explicit provider supplied to __init__
+            # so default to LocalProvider
+            provider = LocalProvider(init_blocks=1)
 
         # Initialize the parent class with the execution provider and block error handling enabled.
         # If provider is None, then no worker is launched via the provider method.
parsl/executors/threads.py
CHANGED
@@ -29,12 +29,15 @@ class ThreadPoolExecutor(ParslExecutor, RepresentationMixin):
 
     @typeguard.typechecked
     def __init__(self, label: str = 'threads', max_threads: Optional[int] = 2,
-                 thread_name_prefix: str = '', storage_access: Optional[List[Staging]] = None,
+                 thread_name_prefix: str | None = None, storage_access: Optional[List[Staging]] = None,
                  working_dir: Optional[str] = None, remote_monitoring_radio: Optional[RadioConfig] = None):
         ParslExecutor.__init__(self)
         self.label = label
         self.max_threads = max_threads
-        self.thread_name_prefix = thread_name_prefix
+        if thread_name_prefix is None:
+            self.thread_name_prefix = "ThreadPoolExecutor-" + label
+        else:
+            self.thread_name_prefix = thread_name_prefix
 
         # we allow storage_access to be None now, which means something else to [] now
         # None now means that a default storage access list will be used, while
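The effect of the new default is that worker threads are named after the executor's label rather than sharing an empty prefix, which makes thread dumps easier to read. A sketch using the stdlib pool directly, with the derived prefix from the diff:

```python
import threading
from concurrent.futures import ThreadPoolExecutor

label = "threads"
prefix = "ThreadPoolExecutor-" + label  # the derived default from the diff

with ThreadPoolExecutor(max_workers=1, thread_name_prefix=prefix) as pool:
    name = pool.submit(lambda: threading.current_thread().name).result()
    print(name)  # e.g. "ThreadPoolExecutor-threads_0"
```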
parsl/jobs/states.py
CHANGED
@@ -10,7 +10,7 @@ class JobState(IntEnum):
     """Defines a set of states that a job can be in"""
 
     UNKNOWN = 0
-    """The batch provider is unable to
+    """The batch provider is unable to determine a state for this job"""
 
     PENDING = 1
     """"This job is in the batch queue but has not started running"""
@@ -40,7 +40,7 @@ class JobState(IntEnum):
     """This job is held/suspended in the batch system"""
 
     MISSING = 8
-    """This job has reached a terminal state without the resources(managers/workers)
+    """This job has reached a terminal state without the resources (managers/workers)
     launched in the job connecting back to the Executor. This state is set by HTEX
     when it is able to infer that the block failed to start workers for eg due to
     bad worker environment or network connectivity issues.
parsl/jobs/strategy.py
CHANGED
@@ -185,6 +185,11 @@ class Strategy:
 
         for executor in executors:
             label = executor.label
+
+            if executor.bad_state_is_set:
+                logger.info(f"Not strategizing for executor {label} because bad state is set")
+                continue
+
             logger.debug(f"Strategizing for executor {label}")
 
             if self.executors[label]['first']:
@@ -213,12 +218,8 @@ class Strategy:
 
             logger.debug(f"Slot ratio calculation: active_slots = {active_slots}, active_tasks = {active_tasks}")
 
-
-
-                    label, active_tasks, running, pending, executor.connected_workers()))
-            else:
-                logger.debug('Executor {} has {} active tasks and {}/{} running/pending blocks'.format(
-                    label, active_tasks, running, pending))
+            logger.debug('Executor {} has {} active tasks and {}/{} running/pending blocks'.format(
+                label, active_tasks, running, pending))
 
             # reset idle timer if executor has active tasks
parsl/monitoring/monitoring.py
CHANGED
@@ -11,9 +11,9 @@ import typeguard
 
 from parsl.monitoring.types import TaggedMonitoringMessage
 from parsl.multiprocessing import (
-    SizedQueue,
     SpawnEvent,
     SpawnProcess,
+    SpawnQueue,
     join_terminate_close_proc,
 )
 from parsl.utils import RepresentationMixin
@@ -126,7 +126,7 @@ class MonitoringHub(RepresentationMixin):
         self.monitoring_hub_active = True
 
         self.resource_msgs: Queue[TaggedMonitoringMessage]
-        self.resource_msgs = SizedQueue()
+        self.resource_msgs = SpawnQueue()
 
         self.dbm_exit_event: ms.Event
         self.dbm_exit_event = SpawnEvent()
parsl/monitoring/radios/filesystem.py
CHANGED
@@ -10,11 +10,12 @@ from parsl.monitoring.radios.base import (
     RadioConfig,
 )
 from parsl.monitoring.radios.filesystem_router import FilesystemRadioReceiver
+from parsl.utils import RepresentationMixin
 
 logger = logging.getLogger(__name__)
 
 
-class FilesystemRadio(RadioConfig):
+class FilesystemRadio(RadioConfig, RepresentationMixin):
     """A MonitoringRadioSender that sends messages over a shared filesystem.
 
     The messsage directory structure is based on maildir,
parsl/monitoring/radios/htex.py
CHANGED
@@ -7,11 +7,12 @@ from parsl.monitoring.radios.base import (
     MonitoringRadioSender,
     RadioConfig,
 )
+from parsl.utils import RepresentationMixin
 
 logger = logging.getLogger(__name__)
 
 
-class HTEXRadio(RadioConfig):
+class HTEXRadio(RadioConfig, RepresentationMixin):
     def create_sender(self) -> MonitoringRadioSender:
         return HTEXRadioSender()
 
parsl/monitoring/radios/multiprocessing.py
CHANGED
@@ -5,9 +5,10 @@ from parsl.monitoring.radios.base import (
     MonitoringRadioSender,
     RadioConfig,
 )
+from parsl.utils import RepresentationMixin
 
 
-class MultiprocessingQueueRadioSender(MonitoringRadioSender):
+class MultiprocessingQueueRadioSender(MonitoringRadioSender, RepresentationMixin):
     """A monitoring radio which connects over a multiprocessing Queue.
     This radio is intended to be used on the submit side, where components
     in the submit process, or processes launched by multiprocessing, will have
parsl/monitoring/radios/udp.py
CHANGED
@@ -13,11 +13,12 @@ from parsl.monitoring.radios.base import (
     RadioConfig,
 )
 from parsl.monitoring.radios.udp_router import start_udp_receiver
+from parsl.utils import RepresentationMixin
 
 logger = logging.getLogger(__name__)
 
 
-class UDPRadio(RadioConfig):
+class UDPRadio(RadioConfig, RepresentationMixin):
     def __init__(self, *, port: Optional[int] = None, atexit_timeout: int = 3, address: str, debug: bool = False, hmac_digest: str = 'sha512'):
         self.port = port
         self.atexit_timeout = atexit_timeout
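All four radio classes gain parsl's RepresentationMixin, whose purpose is to give configuration objects a useful repr in logs. A rough stand-in showing the idea (this ReprMixin is a simplified sketch, not parsl's implementation):

```python
import inspect

class ReprMixin:
    def __repr__(self):
        # Build a repr from the names in the constructor signature.
        params = inspect.signature(type(self).__init__).parameters
        args = ", ".join(f"{n}={getattr(self, n, '?')!r}" for n in params if n != "self")
        return f"{type(self).__name__}({args})"

class UDPishRadio(ReprMixin):  # hypothetical radio-like config object
    def __init__(self, address: str, port: int = 0):
        self.address = address
        self.port = port

print(UDPishRadio("10.0.0.1", port=9000))  # UDPishRadio(address='10.0.0.1', port=9000)
```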
parsl/multiprocessing.py
CHANGED
@@ -4,7 +4,6 @@
 import logging
 import multiprocessing
 import multiprocessing.queues
-import platform
 from multiprocessing.context import ForkProcess as ForkProcessType
 from multiprocessing.context import SpawnProcess as SpawnProcessType
 from typing import Callable
@@ -21,54 +20,6 @@ SpawnEvent = SpawnContext.Event
 SpawnQueue = SpawnContext.Queue
 
 
-class MacSafeQueue(multiprocessing.queues.Queue):
-    """ Multiprocessing queues do not have qsize attributes on MacOS.
-    This is slower but more portable version of the multiprocessing Queue
-    that adds a explicit counter
-
-    Reference : https://github.com/keras-team/autokeras/commit/4ddd568b06b4045ace777bc0fb7bc18573b85a75
-    """
-
-    def __init__(self, *args, **kwargs):
-        if 'ctx' not in kwargs:
-            kwargs['ctx'] = multiprocessing.get_context('spawn')
-        super().__init__(*args, **kwargs)
-        self._counter = multiprocessing.Value('i', 0)
-
-    def put(self, *args, **kwargs):
-        # logger.critical("Putting item {}".format(args))
-        x = super().put(*args, **kwargs)
-        with self._counter.get_lock():
-            self._counter.value += 1
-        return x
-
-    def get(self, *args, **kwargs):
-        x = super().get(*args, **kwargs)
-        with self._counter.get_lock():
-            self._counter.value -= 1
-        # logger.critical("Getting item {}".format(x))
-        return x
-
-    def qsize(self):
-        return self._counter.value
-
-    def empty(self):
-        return not self._counter.value
-
-
-# SizedQueue should be constructable using the same calling
-# convention as multiprocessing.Queue but that entire signature
-# isn't expressible in mypy 0.790
-SizedQueue: Callable[..., multiprocessing.Queue]
-
-
-if platform.system() != 'Darwin':
-    import multiprocessing
-    SizedQueue = SpawnQueue
-else:
-    SizedQueue = MacSafeQueue
-
-
 def join_terminate_close_proc(process: SpawnProcessType, *, timeout: int = 30) -> None:
     """Increasingly aggressively terminate a process.
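For context on why MacSafeQueue existed at all: multiprocessing.Queue.qsize() relies on sem_getvalue(), which is not implemented on macOS, so it raises NotImplementedError there. After the monitoring hub's switch from SizedQueue to SpawnQueue above, nothing in this diff still needs qsize(), so the workaround could be deleted. A quick probe:

```python
import multiprocessing
import platform

q = multiprocessing.get_context("spawn").Queue()
try:
    print(q.qsize())  # 0 on Linux
except NotImplementedError:
    # Raised on macOS, where sem_getvalue() is unavailable
    print(f"qsize() unsupported on {platform.system()}")
```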
parsl/providers/base.py
CHANGED
@@ -33,7 +33,28 @@ class ExecutionProvider(metaclass=ABCMeta):
                           [cancel] <--------|----+
                                             |
                           +-------------------
-    """
+
+    In addition to the listed methods, an ExecutionProvider instance must always
+    have these attributes, which both default to `None`:
+
+    mem_per_node: Real memory to provision per node in GB.
+
+        Providers which set this attribute should ask for mem_per_node of memory
+        when provisioning resources, and set the corresponding environment
+        variable PARSL_MEMORY_GB before executing submitted commands.
+
+        If this attribute is set, executors may use it to calculate how many tasks can
+        run concurrently per node.
+
+    cores_per_node: Number of cores to provision per node.
+
+        Providers which set this attribute should ask for cores_per_node cores
+        when provisioning resources, and set the corresponding environment
+        variable PARSL_CORES before executing submitted commands.
+
+        If this attribute is set, executors may use it to calculate how many tasks can
+        run concurrently per node.
+    """
 
     @abstractmethod
     def __init__(self) -> None:
@@ -44,8 +65,8 @@ class ExecutionProvider(metaclass=ABCMeta):
         self.script_dir: Optional[str]
         self.parallelism: float
         self.resources: Dict[object, Any]
-        self._cores_per_node: Optional[int] = None
-        self._mem_per_node: Optional[float] = None
+        self.cores_per_node: Optional[int] = None
+        self.mem_per_node: Optional[float] = None
         pass
 
@@ -111,40 +132,6 @@ class ExecutionProvider(metaclass=ABCMeta):
         ''' Provides the label for this provider '''
         pass
 
-    @property
-    def mem_per_node(self) -> Optional[float]:
-        """Real memory to provision per node in GB.
-
-        Providers which set this property should ask for mem_per_node of memory
-        when provisioning resources, and set the corresponding environment
-        variable PARSL_MEMORY_GB before executing submitted commands.
-
-        If this property is set, executors may use it to calculate how many tasks can
-        run concurrently per node.
-        """
-        return self._mem_per_node
-
-    @mem_per_node.setter
-    def mem_per_node(self, value: float) -> None:
-        self._mem_per_node = value
-
-    @property
-    def cores_per_node(self) -> Optional[int]:
-        """Number of cores to provision per node.
-
-        Providers which set this property should ask for cores_per_node cores
-        when provisioning resources, and set the corresponding environment
-        variable PARSL_CORES before executing submitted commands.
-
-        If this property is set, executors may use it to calculate how many tasks can
-        run concurrently per node.
-        """
-        return self._cores_per_node
-
-    @cores_per_node.setter
-    def cores_per_node(self, value: int) -> None:
-        self._cores_per_node = value
-
     @property
     @abstractmethod
     def status_polling_interval(self) -> int: