parsl 2024.7.1__py3-none-any.whl → 2024.7.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/app/app.py +4 -0
- parsl/channels/ssh/ssh.py +12 -0
- parsl/dataflow/dflow.py +17 -0
- parsl/executors/flux/flux_instance_manager.py +23 -24
- parsl/executors/high_throughput/executor.py +49 -21
- parsl/executors/high_throughput/mpi_executor.py +2 -0
- parsl/executors/high_throughput/process_worker_pool.py +20 -1
- parsl/executors/radical/executor.py +105 -65
- parsl/executors/radical/rpex_resources.py +14 -7
- parsl/providers/kubernetes/kube.py +2 -3
- parsl/tests/test_bash_apps/test_inputs_default.py +25 -0
- parsl/tests/test_channels/test_dfk_close.py +26 -0
- parsl/tests/test_htex/test_htex.py +13 -0
- parsl/tests/test_providers/test_local_provider.py +12 -7
- parsl/tests/test_python_apps/test_inputs_default.py +22 -0
- parsl/version.py +1 -1
- {parsl-2024.7.1.data → parsl-2024.7.15.data}/scripts/process_worker_pool.py +20 -1
- {parsl-2024.7.1.dist-info → parsl-2024.7.15.dist-info}/METADATA +2 -2
- {parsl-2024.7.1.dist-info → parsl-2024.7.15.dist-info}/RECORD +26 -24
- parsl/executors/radical/rpex_master.py +0 -41
- {parsl-2024.7.1.data → parsl-2024.7.15.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.7.1.data → parsl-2024.7.15.data}/scripts/interchange.py +0 -0
- {parsl-2024.7.1.data → parsl-2024.7.15.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.7.1.dist-info → parsl-2024.7.15.dist-info}/LICENSE +0 -0
- {parsl-2024.7.1.dist-info → parsl-2024.7.15.dist-info}/WHEEL +0 -0
- {parsl-2024.7.1.dist-info → parsl-2024.7.15.dist-info}/entry_points.txt +0 -0
- {parsl-2024.7.1.dist-info → parsl-2024.7.15.dist-info}/top_level.txt +0 -0
parsl/app/app.py
CHANGED
@@ -66,6 +66,10 @@ class AppBase(metaclass=ABCMeta):
             self.kwargs['walltime'] = params['walltime'].default
         if 'parsl_resource_specification' in params:
             self.kwargs['parsl_resource_specification'] = params['parsl_resource_specification'].default
+        if 'outputs' in params:
+            self.kwargs['outputs'] = params['outputs'].default
+        if 'inputs' in params:
+            self.kwargs['inputs'] = params['inputs'].default
 
     @abstractmethod
     def __call__(self, *args: Any, **kwargs: Any) -> AppFuture:
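For illustration, a minimal sketch (not from the release itself, but mirroring the new tests below) of the behaviour this change fixes: a default value declared for an app's inputs keyword is now copied into the stored kwargs, so it is honored when the caller omits it:

    import parsl
    from parsl import python_app
    from parsl.executors.threads import ThreadPoolExecutor

    parsl.load(parsl.Config(executors=[ThreadPoolExecutor()]))

    @python_app
    def add_inputs(inputs=[1, 2, 3]):
        # the declared default above is now picked up when inputs= is omitted
        return sum(inputs)

    assert add_inputs().result() == 6                # declared default applies
    assert add_inputs(inputs=[4, 5]).result() == 9   # explicit value still wins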
parsl/channels/ssh/ssh.py
CHANGED
@@ -227,8 +227,20 @@ class SSHChannel(Channel, RepresentationMixin):
 
     def close(self) -> None:
         if self._is_connected():
+            transport = self.ssh_client.get_transport()
             self.ssh_client.close()
 
+            # ssh_client.close calls transport.close, but transport.close does
+            # not always wait for the transport thread to be stopped. See impl
+            # of Transport.close in paramiko and issue
+            # https://github.com/paramiko/paramiko/issues/520
+            logger.debug("Waiting for transport thread to stop")
+            transport.join(30)
+            if transport.is_alive():
+                logger.warning("SSH transport thread did not shut down")
+            else:
+                logger.debug("SSH transport thread stopped")
+
     def isdir(self, path):
         """Return true if the path refers to an existing directory.
 
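The close() change above is ordinary paramiko usage: Transport subclasses threading.Thread, so after SSHClient.close() the transport thread can be joined with a timeout and checked with is_alive(). A standalone sketch, with placeholder host and credentials:

    import logging
    import paramiko

    logger = logging.getLogger(__name__)

    client = paramiko.SSHClient()
    client.load_system_host_keys()
    client.connect("host.example.com", username="user")  # placeholders

    transport = client.get_transport()  # grab a reference before close()
    client.close()                      # asks the transport to stop...
    transport.join(30)                  # ...then wait for the thread to exit
    if transport.is_alive():
        logger.warning("SSH transport thread did not shut down")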
parsl/dataflow/dflow.py
CHANGED
@@ -1277,6 +1277,23 @@ class DataFlowKernel:
             executor.shutdown()
             logger.info(f"Shut down executor {executor.label}")
 
+            if hasattr(executor, 'provider'):
+                if hasattr(executor.provider, 'script_dir'):
+                    logger.info(f"Closing channel(s) for {executor.label}")
+
+                    if hasattr(executor.provider, 'channels'):
+                        for channel in executor.provider.channels:
+                            logger.info(f"Closing channel {channel}")
+                            channel.close()
+                            logger.info(f"Closed channel {channel}")
+                    else:
+                        assert hasattr(executor.provider, 'channel'), "If provider has no .channels, it must have .channel"
+                        logger.info(f"Closing channel {executor.provider.channel}")
+                        executor.provider.channel.close()
+                        logger.info(f"Closed channel {executor.provider.channel}")
+
+                    logger.info(f"Closed executor channel(s) for {executor.label}")
+
         logger.info("Terminated executors")
         self.time_completed = datetime.datetime.now()
 
parsl/executors/flux/flux_instance_manager.py
CHANGED
@@ -27,30 +27,29 @@ def main():
     parser.add_argument("hostname", help="hostname of the parent executor's socket")
     parser.add_argument("port", help="Port of the parent executor's socket")
     args = parser.parse_args()
-    context = zmq.Context()
-    socket = context.socket(zmq.REQ)
-    socket.connect(
-        args.protocol + "://" + gethostbyname(args.hostname) + ":" + args.port
-    )
-    # send the path to the ``flux.job`` package
-    socket.send(dirname(dirname(os.path.realpath(flux.__file__))).encode())
-    logging.debug("Flux package path sent.")
-    # collect the encapsulating Flux instance's URI
-    local_uri = flux.Flux().attr_get("local-uri")
-    hostname = gethostname()
-    if args.hostname == hostname:
-        flux_uri = local_uri
-    else:
-        flux_uri = "ssh://" + gethostname() + local_uri.replace("local://", "")
-    logging.debug("Flux URI is %s", flux_uri)
-    response = socket.recv()  # get acknowledgment
-    logging.debug("Received acknowledgment %s", response)
-    socket.send(flux_uri.encode())  # send URI
-    logging.debug("URI sent. Blocking for response...")
-    response = socket.recv()  # wait for shutdown message
-    logging.debug("Response %s received, draining flux jobs...", response)
-    flux.Flux().rpc("job-manager.drain").get()
-    logging.debug("Flux jobs drained, exiting.")
+    with zmq.Context() as context, context.socket(zmq.REQ) as socket:
+        socket.connect(
+            args.protocol + "://" + gethostbyname(args.hostname) + ":" + args.port
+        )
+        # send the path to the ``flux.job`` package
+        socket.send(dirname(dirname(os.path.realpath(flux.__file__))).encode())
+        logging.debug("Flux package path sent.")
+        # collect the encapsulating Flux instance's URI
+        local_uri = flux.Flux().attr_get("local-uri")
+        hostname = gethostname()
+        if args.hostname == hostname:
+            flux_uri = local_uri
+        else:
+            flux_uri = "ssh://" + gethostname() + local_uri.replace("local://", "")
+        logging.debug("Flux URI is %s", flux_uri)
+        response = socket.recv()  # get acknowledgment
+        logging.debug("Received acknowledgment %s", response)
+        socket.send(flux_uri.encode())  # send URI
+        logging.debug("URI sent. Blocking for response...")
+        response = socket.recv()  # wait for shutdown message
+        logging.debug("Response %s received, draining flux jobs...", response)
+        flux.Flux().rpc("job-manager.drain").get()
+        logging.debug("Flux jobs drained, exiting.")
 
 
 if __name__ == "__main__":
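The rewrite above leans on pyzmq's context-manager support: leaving the with-block closes the socket and terminates the context even if the protocol exchange raises. A minimal sketch of the same pattern (the endpoint is a placeholder, and recv() blocks until a REP peer answers):

    import zmq

    with zmq.Context() as context, context.socket(zmq.REQ) as socket:
        socket.connect("tcp://127.0.0.1:5555")  # placeholder endpoint
        socket.send(b"ping")
        reply = socket.recv()
    # on exit: socket closed and context terminated, no manual cleanup needed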
parsl/executors/high_throughput/executor.py
CHANGED
@@ -56,6 +56,8 @@ DEFAULT_LAUNCH_CMD = ("process_worker_pool.py {debug} {max_workers_per_node} "
                       "--mpi-launcher={mpi_launcher} "
                       "--available-accelerators {accelerators}")
 
+DEFAULT_INTERCHANGE_LAUNCH_CMD = "interchange.py"
+
 GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionProvider`
        Provider to access computation resources. Can be one of :class:`~parsl.providers.aws.aws.EC2Provider`,
        :class:`~parsl.providers.cobalt.cobalt.Cobalt`,
@@ -76,6 +78,10 @@ GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionP
        cores_per_worker, nodes_per_block, heartbeat_period ,heartbeat_threshold, logdir). For example:
        launch_cmd="process_worker_pool.py {debug} -c {cores_per_worker} --task_url={task_url} --result_url={result_url}"
 
+    interchange_launch_cmd : str
+        Custom command line string to launch the interchange process from the executor. If undefined,
+        the executor will use the default "interchange.py" command.
+
    address : string
        An address to connect to the main Parsl process which is reachable from the network in which
        workers will be running. This field expects an IPv4 address (xxx.xxx.xxx.xxx).
@@ -162,7 +168,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
              | | | | batching | | |
        Parsl<---Fut-| | | load-balancing| result exception
              ^ | | | watchdogs | | |
-             | | |
+             | | | Result | | | |
+             | | | Queue | | V V
              | | | Thread<--|-incoming_q<---|--- +---------+
              | | | | | |
              | | | | | |
@@ -231,6 +238,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
                 label: str = 'HighThroughputExecutor',
                 provider: ExecutionProvider = LocalProvider(),
                 launch_cmd: Optional[str] = None,
+                interchange_launch_cmd: Optional[str] = None,
                 address: Optional[str] = None,
                 worker_ports: Optional[Tuple[int, int]] = None,
                 worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
@@ -329,6 +337,10 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
             launch_cmd = DEFAULT_LAUNCH_CMD
         self.launch_cmd = launch_cmd
 
+        if not interchange_launch_cmd:
+            interchange_launch_cmd = DEFAULT_INTERCHANGE_LAUNCH_CMD
+        self.interchange_launch_cmd = interchange_launch_cmd
+
     radio_mode = "htex"
 
     def _warn_deprecated(self, old: str, new: str):
@@ -418,20 +430,19 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
             "127.0.0.1", self.interchange_port_range, self.cert_dir
         )
 
-        self.
-        self.
+        self._result_queue_thread = None
+        self._start_result_queue_thread()
         self._start_local_interchange_process()
 
-        logger.debug("Created
+        logger.debug("Created result queue thread: %s", self._result_queue_thread)
 
         self.initialize_scaling()
 
     @wrap_with_logs
-    def
-        """Listen to the queue for task
+    def _result_queue_worker(self):
+        """Listen to the queue for task result messages and handle them.
 
-        Depending on the message, tasks will be updated with results
-        or updates. It expects the following messages:
+        Depending on the message, tasks will be updated with results or exceptions.
 
        .. code:: python
 
@@ -448,7 +459,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
 
        The `None` message is a die request.
        """
-        logger.debug("
+        logger.debug("Result queue worker starting")
 
        while not self.bad_state_is_set:
            try:
@@ -517,7 +528,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
            else:
                raise BadMessage("Message received with unknown type {}".format(msg['type']))
 
-        logger.info("
+        logger.info("Result queue worker finished")
 
    def _start_local_interchange_process(self) -> None:
        """ Starts the interchange process locally
@@ -544,7 +555,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
 
        config_pickle = pickle.dumps(interchange_config)
 
-        self.interchange_proc = subprocess.Popen(
+        self.interchange_proc = subprocess.Popen(self.interchange_launch_cmd.encode("utf-8"), stdin=subprocess.PIPE)
        stdin = self.interchange_proc.stdin
        assert stdin is not None, "Popen should have created an IO object (vs default None) because of PIPE mode"
 
@@ -560,21 +571,21 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
            raise Exception("Interchange failed to start")
        logger.debug("Got worker ports")
 
-    def
-        """Method to start the
+    def _start_result_queue_thread(self):
+        """Method to start the result queue thread as a daemon.
 
        Checks if a thread already exists, then starts it.
-        Could be used later as a restart if the
+        Could be used later as a restart if the result queue thread dies.
        """
-        if self.
-        logger.debug("Starting queue
-        self.
-        self.
-        self.
-        logger.debug("Started queue
+        if self._result_queue_thread is None:
+            logger.debug("Starting result queue thread")
+            self._result_queue_thread = threading.Thread(target=self._result_queue_worker, name="HTEX-Result-Queue-Thread")
+            self._result_queue_thread.daemon = True
+            self._result_queue_thread.start()
+            logger.debug("Started result queue thread")
 
        else:
-            logger.error("
+            logger.error("Result queue thread already exists, returning")
 
    def hold_worker(self, worker_id: str) -> None:
        """Puts a worker on hold, preventing scheduling of additional tasks to it.
@@ -823,6 +834,23 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
            logger.info("Unable to terminate Interchange process; sending SIGKILL")
            self.interchange_proc.kill()
 
+        logger.info("Closing ZMQ pipes")
+
+        # These pipes are used in a thread unsafe manner. If you have traced a
+        # problem to this block of code, you might consider what is happening
+        # with other threads that access these.
+
+        # incoming_q is not closed here because it is used by the results queue
+        # worker which is not shut down at this point.
+
+        if hasattr(self, 'outgoing_q'):
+            logger.info("Closing outgoing_q")
+            self.outgoing_q.close()
+
+        if hasattr(self, 'command_client'):
+            logger.info("Closing command client")
+            self.command_client.close()
+
        logger.info("Finished HighThroughputExecutor shutdown attempt")
 
    def get_usage_information(self):
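A sketch of how the new interchange_launch_cmd parameter might be used; the wrapper path is illustrative, not part of the release. Note that, as wired above, the string is handed directly to subprocess.Popen, so it should name a single executable (for example a site-specific wrapper that eventually execs interchange.py) rather than a multi-word shell command line:

    from parsl.config import Config
    from parsl.executors import HighThroughputExecutor

    config = Config(
        executors=[
            HighThroughputExecutor(
                label="htex_custom_interchange",
                # hypothetical wrapper around the default interchange.py
                interchange_launch_cmd="/opt/site/bin/wrapped_interchange",
            )
        ]
    )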
parsl/executors/high_throughput/mpi_executor.py
CHANGED
@@ -38,6 +38,7 @@ class MPIExecutor(HighThroughputExecutor):
                 label: str = 'MPIExecutor',
                 provider: ExecutionProvider = LocalProvider(),
                 launch_cmd: Optional[str] = None,
+                interchange_launch_cmd: Optional[str] = None,
                 address: Optional[str] = None,
                 worker_ports: Optional[Tuple[int, int]] = None,
                 worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
@@ -66,6 +67,7 @@ class MPIExecutor(HighThroughputExecutor):
            label=label,
            provider=provider,
            launch_cmd=launch_cmd,
+            interchange_launch_cmd=interchange_launch_cmd,
            address=address,
            worker_ports=worker_ports,
            worker_port_range=worker_port_range,
parsl/executors/high_throughput/process_worker_pool.py
CHANGED
@@ -9,6 +9,7 @@ import os
 import pickle
 import platform
 import queue
+import subprocess
 import sys
 import threading
 import time
@@ -731,9 +732,27 @@ def worker(
        os.sched_setaffinity(0, my_cores)  # type: ignore[attr-defined, unused-ignore]
        logger.info("Set worker CPU affinity to {}".format(my_cores))
 
+    # If CUDA devices, find total number of devices to allow for MPS
+    # See: https://developer.nvidia.com/system-management-interface
+    nvidia_smi_cmd = "nvidia-smi -L > /dev/null && nvidia-smi -L | wc -l"
+    nvidia_smi_ret = subprocess.run(nvidia_smi_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    if nvidia_smi_ret.returncode == 0:
+        num_cuda_devices = int(nvidia_smi_ret.stdout.split()[0])
+    else:
+        num_cuda_devices = None
+
    # If desired, pin to accelerator
    if accelerator is not None:
-
+        try:
+            if num_cuda_devices is not None:
+                procs_per_cuda_device = pool_size // num_cuda_devices
+                partitioned_accelerator = str(int(accelerator) // procs_per_cuda_device)  # multiple workers will share a GPU
+                os.environ["CUDA_VISIBLE_DEVICES"] = partitioned_accelerator
+                logger.info(f'Pinned worker to partitioned cuda device: {partitioned_accelerator}')
+            else:
+                os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
+        except (TypeError, ValueError, ZeroDivisionError):
+            os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
        os.environ["ROCR_VISIBLE_DEVICES"] = accelerator
        os.environ["ZE_AFFINITY_MASK"] = accelerator
        os.environ["ZE_ENABLE_PCI_ID_DEVICE_ORDER"] = '1'
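A worked example of the partitioning arithmetic above, standalone rather than parsl code: with 8 workers in the pool and nvidia-smi -L reporting 2 devices, procs_per_cuda_device is 4, so accelerator ids 0-3 land on CUDA device 0 and ids 4-7 on device 1, letting several workers share a GPU (e.g. under MPS):

    pool_size = 8          # assumed workers per pool for this example
    num_cuda_devices = 2   # assumed `nvidia-smi -L | wc -l` result

    procs_per_cuda_device = pool_size // num_cuda_devices  # -> 4
    for accelerator in range(pool_size):
        device = accelerator // procs_per_cuda_device
        print(f"accelerator={accelerator} -> CUDA_VISIBLE_DEVICES={device}")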
parsl/executors/radical/executor.py
CHANGED
@@ -9,7 +9,7 @@ import threading as mt
 import time
 from concurrent.futures import Future
 from functools import partial
-from pathlib import
+from pathlib import PosixPath
 from typing import Dict, Optional
 
 import requests
@@ -24,7 +24,7 @@ from parsl.serialize import deserialize, pack_res_spec_apply_message
 from parsl.serialize.errors import DeserializationError, SerializationError
 from parsl.utils import RepresentationMixin
 
-from .rpex_resources import ResourceConfig
+from .rpex_resources import CLIENT, MPI, ResourceConfig
 
 try:
     import radical.pilot as rp
@@ -59,7 +59,7 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
        ``rp.PilotManager`` and ``rp.TaskManager``.
     2. "translate": Unwrap, identify, and parse Parsl ``apps`` into ``rp.TaskDescription``.
     3. "submit": Submit Parsl apps to ``rp.TaskManager``.
-    4. "
+    4. "shutdown": Shut down the RADICAL-Pilot runtime and all associated components.
 
     Here is a diagram
 
@@ -138,19 +138,26 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
         self.future_tasks: Dict[str, Future] = {}
 
         if rpex_cfg:
-            self.rpex_cfg = rpex_cfg
+            self.rpex_cfg = rpex_cfg.get_config()
         elif not rpex_cfg and 'local' in resource:
-            self.rpex_cfg = ResourceConfig()
+            self.rpex_cfg = ResourceConfig().get_config()
         else:
-            raise ValueError('Resource config
-                             'specified for a non-local
+            raise ValueError('Resource config must be '
+                             'specified for a non-local resources')
 
     def task_state_cb(self, task, state):
         """
         Update the state of Parsl Future apps
         Based on RP task state callbacks.
         """
-
+        # check the Master/Worker state
+        if task.mode in [rp.RAPTOR_MASTER, rp.RAPTOR_WORKER]:
+            if state == rp.FAILED:
+                exception = RuntimeError(f'{task.uid} failed with internal error: {task.stderr}')
+                self._fail_all_tasks(exception)
+
+        # check all other tasks state
+        else:
             parsl_task = self.future_tasks[task.uid]
 
             if state == rp.DONE:
@@ -186,6 +193,23 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
             else:
                 parsl_task.set_exception('Task failed for an unknown reason')
 
+    def _fail_all_tasks(self, exception):
+        """
+        Fail all outstanding tasks with the given exception.
+
+        This method iterates through all outstanding tasks in the
+        `_future_tasks` dictionary, which have not yet completed,
+        and sets the provided exception as their result, indicating
+        a failure.
+
+        Parameters:
+        - exception: The exception to be set as the result for all
+          outstanding tasks.
+        """
+        for fut_task in self.future_tasks.values():
+            if not fut_task.done():
+                fut_task.set_exception(exception)
+
     def start(self):
         """Create the Pilot component and pass it.
         """
@@ -202,63 +226,62 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
                    'resource': self.resource}
 
         if not self.resource or 'local' in self.resource:
-
-
-            # to include the agent sandbox with the ci artifacts.
-            if os.environ.get("LOCAL_SANDBOX"):
-                pd_init['sandbox'] = self.run_dir
-                os.environ["RADICAL_LOG_LVL"] = "DEBUG"
-
-            logger.info("RPEX will be running in the local mode")
+            os.environ["RADICAL_LOG_LVL"] = "DEBUG"
+            logger.info("RPEX will be running in local mode")
 
         pd = rp.PilotDescription(pd_init)
         pd.verify()
 
-
-
+        # start RP's main components TMGR, PMGR and Pilot
+        self.tmgr = rp.TaskManager(session=self.session)
+        self.pmgr = rp.PilotManager(session=self.session)
+        self.pilot = self.pmgr.submit_pilots(pd)
 
-        self.
-
+        if not self.pilot.description.get('cores') or not self.pilot.description.get('nodes'):
+            logger.warning('no "cores/nodes" per pilot were set, using default resources')
+
+        self.tmgr.add_pilots(self.pilot)
+        self.tmgr.register_callback(self.task_state_cb)
 
-        tds = list()
-        master_path = '{0}/rpex_master.py'.format(PWD)
         worker_path = '{0}/rpex_worker.py'.format(PWD)
 
-
-
-
-
+        self.masters = []
+
+        logger.info(f'Starting {self.rpex_cfg.n_masters} masters and {self.rpex_cfg.n_workers} workers for each master')
+
+        # create N masters
+        for _ in range(self.rpex_cfg.n_masters):
+            md = rp.TaskDescription(self.rpex_cfg.master_descr)
+            md.uid = ru.generate_id('rpex.master.%(item_counter)06d', ru.ID_CUSTOM,
                                     ns=self.session.uid)
-            td.ranks = 1
-            td.cores_per_rank = 1
-            td.arguments = [self.rpex_cfg, i]
-            td.input_staging = self._stage_files([File(master_path),
-                                                  File(worker_path),
-                                                  File(self.rpex_cfg)], mode='in')
-            tds.append(td)
 
-
-
+            # submit the master to the TMGR
+            master = self.tmgr.submit_raptors(md)[0]
+            self.masters.append(master)
 
-
-
-
-
+            workers = []
+            # create N workers for each master and submit them to the TMGR
+            for _ in range(self.rpex_cfg.n_workers):
+                wd = rp.TaskDescription(self.rpex_cfg.worker_descr)
+                wd.uid = ru.generate_id('rpex.worker.%(item_counter)06d', ru.ID_CUSTOM,
+                                        ns=self.session.uid)
+                wd.raptor_id = master.uid
+                wd.input_staging = self._stage_files([File(worker_path)], mode='in')
+                workers.append(wd)
 
-
+            self.tmgr.submit_workers(workers)
+
+        self.select_master = self._cyclic_master_selector()
 
         # prepare or use the current env for the agent/pilot side environment
-        if
-            logger.info("creating {0} environment for the executor".format(
-            pilot.prepare_env(env_name=
-
+        if self.rpex_cfg.pilot_env_mode != CLIENT:
+            logger.info("creating {0} environment for the executor".format(self.rpex_cfg.pilot_env.name))
+            self.pilot.prepare_env(env_name=self.rpex_cfg.pilot_env.name,
+                                   env_spec=self.rpex_cfg.pilot_env.as_dict())
         else:
            client_env = sys.prefix
            logger.info("reusing ({0}) environment for the executor".format(client_env))
 
-        self.tmgr.add_pilots(pilot)
-        self.tmgr.register_callback(self.task_state_cb)
-
        # create a bulking thread to run the actual task submission
        # to RP in bulks
        if self.bulk_mode:
@@ -272,8 +295,21 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
            self._bulk_thread.daemon = True
            self._bulk_thread.start()
 
+            logger.info('bulk mode is on, submitting tasks in bulks')
+
        return True
 
+    def _cyclic_master_selector(self):
+        """
+        Balance tasks submission across N masters and N workers
+        """
+        current_master = 0
+        masters_uids = [m.uid for m in self.masters]
+
+        while True:
+            yield masters_uids[current_master]
+            current_master = (current_master + 1) % len(self.masters)
+
    def unwrap(self, func, args):
        """
        Unwrap a Parsl app and its args for further processing.
@@ -364,22 +400,25 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
 
        # This is the default mode where the bash_app will be executed as
        # as a single core process by RP. For cores > 1 the user must use
-        # above or use MPI functions if their
+        # task.mode=rp.TASK_EXECUTABLE (above) or use MPI functions if their
+        # code is Python.
        else:
            task.mode = rp.TASK_PROC
-            task.raptor_id =
+            task.raptor_id = next(self.select_master)
            task.executable = self._pack_and_apply_message(func, args, kwargs)
 
        elif PYTHON in task_type or not task_type:
            task.mode = rp.TASK_FUNCTION
-            task.raptor_id =
+            task.raptor_id = next(self.select_master)
            if kwargs.get('walltime'):
                func = timeout(func, kwargs['walltime'])
 
-            #
-            if
+            # Check how to serialize the function object
+            if MPI in self.rpex_cfg.worker_type.lower():
+                task.use_mpi = True
                task.function = rp.PythonTask(func, *args, **kwargs)
            else:
+                task.use_mpi = False
                task.function = self._pack_and_apply_message(func, args, kwargs)
 
        task.input_staging = self._stage_files(kwargs.get("inputs", []),
@@ -394,7 +433,7 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
        try:
            task.verify()
        except ru.typeddict.TDKeyError as e:
-            raise Exception(f'{e}. Please check
+            raise Exception(f'{e}. Please check: https://radicalpilot.readthedocs.io/en/stable/ documentation')
 
        return task
 
@@ -413,7 +452,11 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
 
    def _unpack_and_set_parsl_exception(self, parsl_task, exception):
        try:
-
+            try:
+                s = rp.utils.deserialize_bson(exception)
+            except Exception:
+                s = exception
+
            if isinstance(s, RemoteExceptionWrapper):
                try:
                    s.reraise()
@@ -421,6 +464,8 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
                parsl_task.set_exception(e)
            elif isinstance(s, Exception):
                parsl_task.set_exception(s)
+            elif isinstance(s, str):
+                parsl_task.set_exception(eval(s))
            else:
                raise ValueError("Unknown exception-like type received: {}".format(type(s)))
        except Exception as e:
@@ -440,16 +485,10 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
            elif isinstance(k_val, PosixPath):
                k_val = k_val.__str__()
 
-            #
-            #
-            # we just set the path to the cwd
-            if '/' not in k_val:
-                k_val = CWD + '/' + k_val
-
-            # finally set the stderr/out to
-            # the desired name by the user
+            # set the stderr/out to the desired
+            # name by the user
            setattr(task, k, k_val)
-        task.sandbox =
+        task.sandbox = CWD
 
    def _stage_files(self, files, mode):
        """
@@ -477,7 +516,7 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
            # this indicates that the user
            # did not provided a specific
            # output file and RP will stage out
-            # the task.
+            # the task.stdout from pilot://task_folder
            # to the CWD or file.url
            if '/' not in file.url:
                f = {'source': file.filename,
@@ -548,7 +587,8 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
 
    def shutdown(self, hub=True, targets='all', block=False):
        """Shutdown the executor, including all RADICAL-Pilot components."""
-        logger.info("RadicalPilotExecutor
+        logger.info("RadicalPilotExecutor is terminating...")
        self.session.close(download=True)
+        logger.info("RadicalPilotExecutor is terminated.")
 
        return True
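_cyclic_master_selector above is a plain round-robin generator: each next(self.select_master) during task translation hands out the next master uid in turn. A self-contained sketch of the idiom, with placeholder uids:

    def cyclic_selector(uids):
        # yield uids forever in round-robin order
        current = 0
        while True:
            yield uids[current]
            current = (current + 1) % len(uids)

    select = cyclic_selector(["rpex.master.000000", "rpex.master.000001"])
    assert [next(select) for _ in range(4)] == [
        "rpex.master.000000", "rpex.master.000001",
        "rpex.master.000000", "rpex.master.000001",
    ]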
parsl/executors/radical/rpex_resources.py
CHANGED
@@ -5,6 +5,7 @@ from typing import List
 _setup_paths: List[str] = []
 try:
     import radical.pilot as rp
+    import radical.utils as ru
 except ImportError:
     pass
 
@@ -103,7 +104,7 @@ class ResourceConfig:
     python_v: str = f'{sys.version_info[0]}.{sys.version_info[1]}'
     worker_type: str = DEFAULT_WORKER
 
-    def
+    def get_config(cls, path=None):
 
        # Default ENV mode for RP is to reuse
        # the client side. If this is not the case,
@@ -121,6 +122,7 @@ class ResourceConfig:
        cfg = {
            'n_masters': cls.masters,
            'n_workers': cls.workers,
+            'worker_type': cls.worker_type,
            'gpus_per_node': cls.worker_gpus_per_node,
            'cores_per_node': cls.worker_cores_per_node,
            'cores_per_master': cls.cores_per_master,
@@ -138,9 +140,10 @@ class ResourceConfig:
            'pilot_env_mode': cls.pilot_env_mode,
 
            'master_descr': {
+                "ranks": 1,
+                "cores_per_rank": 1,
                "mode": rp.RAPTOR_MASTER,
                "named_env": cls.pilot_env_name,
-                "executable": "python3 rpex_master.py",
            },
 
            'worker_descr': {
@@ -149,12 +152,16 @@ class ResourceConfig:
                "raptor_file": "./rpex_worker.py",
                "raptor_class": cls.worker_type if
                cls.worker_type.lower() != MPI else MPI_WORKER,
+                "ranks": cls.nodes_per_worker * cls.worker_cores_per_node,
+                "gpus_per_rank": cls.nodes_per_worker * cls.worker_gpus_per_node,
            }}
 
-        # Convert the class instance to a
-        config_path = 'rpex.cfg'
+        # Convert the class instance to a Json file or a Config dict.
        if path:
+            config_path = 'rpex.cfg'
            config_path = path + '/' + config_path
-
-
-
+            with open(config_path, 'w') as f:
+                json.dump(cfg, f, indent=4)
+        else:
+            config_obj = ru.Config(from_dict=cfg)
+            return config_obj
parsl/providers/kubernetes/kube.py
CHANGED
@@ -168,10 +168,9 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
             - tasks_per_node (int) : command invocations to be launched per node
 
        Kwargs:
-            - job_name (String): Name for job
+            - job_name (String): Name for job
 
        Returns:
-            - None: At capacity, cannot provision more
             - job_id: (string) Identifier for the job
        """
 
@@ -187,7 +186,7 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
        formatted_cmd = template_string.format(command=cmd_string,
                                               worker_init=self.worker_init)
 
-        logger.debug("Pod name
+        logger.debug("Pod name: %s", pod_name)
        self._create_pod(image=self.image,
                         pod_name=pod_name,
                         job_name=job_name,
parsl/tests/test_bash_apps/test_inputs_default.py
ADDED
@@ -0,0 +1,25 @@
+import pytest
+
+from parsl import AUTO_LOGNAME, Config, bash_app, python_app
+from parsl.executors import ThreadPoolExecutor
+
+
+def local_config():
+    return Config(executors=[ThreadPoolExecutor()])
+
+
+@pytest.mark.local
+def test_default_inputs():
+    @python_app
+    def identity(inp):
+        return inp
+
+    @bash_app
+    def sum_inputs(inputs=[identity(1), identity(2)], stdout=AUTO_LOGNAME):
+        calc = sum(inputs)
+        return f"echo {calc}"
+
+    fut = sum_inputs()
+    fut.result()
+    with open(fut.stdout, 'r') as f:
+        assert int(f.read()) == 3
parsl/tests/test_channels/test_dfk_close.py
ADDED
@@ -0,0 +1,26 @@
+from unittest.mock import Mock
+
+import pytest
+
+import parsl
+from parsl.channels.base import Channel
+from parsl.executors import HighThroughputExecutor
+from parsl.providers import LocalProvider
+
+
+@pytest.mark.local
+def test_dfk_close():
+
+    mock_channel = Mock(spec=Channel)
+
+    # block settings all 0 because the mock channel won't be able to
+    # do anything to make a block exist
+    p = LocalProvider(channel=mock_channel, init_blocks=0, min_blocks=0, max_blocks=0)
+
+    e = HighThroughputExecutor(provider=p)
+
+    c = parsl.Config(executors=[e])
+    with parsl.load(c):
+        pass
+
+    assert mock_channel.close.called
parsl/tests/test_htex/test_htex.py
CHANGED
@@ -136,3 +136,16 @@ def test_max_workers_per_node():
 
     # Ensure max_workers_per_node takes precedence
     assert htex.max_workers_per_node == htex.max_workers == 1
+
+
+@pytest.mark.local
+def test_htex_launch_cmd():
+    htex = HighThroughputExecutor()
+    assert htex.launch_cmd.startswith("process_worker_pool.py")
+    assert htex.interchange_launch_cmd == "interchange.py"
+
+    launch_cmd = "custom-launch-cmd"
+    ix_launch_cmd = "custom-ix-launch-cmd"
+    htex = HighThroughputExecutor(launch_cmd=launch_cmd, interchange_launch_cmd=ix_launch_cmd)
+    assert htex.launch_cmd == launch_cmd
+    assert htex.interchange_launch_cmd == ix_launch_cmd
parsl/tests/test_providers/test_local_provider.py
CHANGED
@@ -92,19 +92,24 @@ def test_ssh_channel():
            # already exist, so create it here.
            pathlib.Path('{}/known.hosts'.format(config_dir)).touch(mode=0o600)
            script_dir = tempfile.mkdtemp()
-
-
-
-
-
-
-
+            channel = SSHChannel('127.0.0.1', port=server_port,
+                                 script_dir=remote_script_dir,
+                                 host_keys_filename='{}/known.hosts'.format(config_dir),
+                                 key_filename=priv_key)
+            try:
+                p = LocalProvider(channel=channel,
+                                  launcher=SingleNodeLauncher(debug=False))
+                p.script_dir = script_dir
+                _run_tests(p)
+            finally:
+                channel.close()
        finally:
            _stop_sshd(sshd_thread)
 
 
 def _stop_sshd(sshd_thread):
     sshd_thread.stop()
+    sshd_thread.join()
 
 
 class SSHDThread(threading.Thread):
parsl/tests/test_python_apps/test_inputs_default.py
ADDED
@@ -0,0 +1,22 @@
+import pytest
+
+import parsl
+from parsl import python_app
+from parsl.executors.threads import ThreadPoolExecutor
+
+
+def local_config():
+    return parsl.Config(executors=[ThreadPoolExecutor()])
+
+
+@pytest.mark.local
+def test_default_inputs():
+    @python_app
+    def identity(inp):
+        return inp
+
+    @python_app
+    def add_inputs(inputs=[identity(1), identity(2)]):
+        return sum(inputs)
+
+    assert add_inputs().result() == 3
parsl/version.py
CHANGED
{parsl-2024.7.1.data → parsl-2024.7.15.data}/scripts/process_worker_pool.py
CHANGED
@@ -9,6 +9,7 @@ import os
 import pickle
 import platform
 import queue
+import subprocess
 import sys
 import threading
 import time
@@ -731,9 +732,27 @@ def worker(
        os.sched_setaffinity(0, my_cores)  # type: ignore[attr-defined, unused-ignore]
        logger.info("Set worker CPU affinity to {}".format(my_cores))
 
+    # If CUDA devices, find total number of devices to allow for MPS
+    # See: https://developer.nvidia.com/system-management-interface
+    nvidia_smi_cmd = "nvidia-smi -L > /dev/null && nvidia-smi -L | wc -l"
+    nvidia_smi_ret = subprocess.run(nvidia_smi_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    if nvidia_smi_ret.returncode == 0:
+        num_cuda_devices = int(nvidia_smi_ret.stdout.split()[0])
+    else:
+        num_cuda_devices = None
+
    # If desired, pin to accelerator
    if accelerator is not None:
-
+        try:
+            if num_cuda_devices is not None:
+                procs_per_cuda_device = pool_size // num_cuda_devices
+                partitioned_accelerator = str(int(accelerator) // procs_per_cuda_device)  # multiple workers will share a GPU
+                os.environ["CUDA_VISIBLE_DEVICES"] = partitioned_accelerator
+                logger.info(f'Pinned worker to partitioned cuda device: {partitioned_accelerator}')
+            else:
+                os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
+        except (TypeError, ValueError, ZeroDivisionError):
+            os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
        os.environ["ROCR_VISIBLE_DEVICES"] = accelerator
        os.environ["ZE_AFFINITY_MASK"] = accelerator
        os.environ["ZE_ENABLE_PCI_ID_DEVICE_ORDER"] = '1'
{parsl-2024.7.1.dist-info → parsl-2024.7.15.dist-info}/METADATA
CHANGED
@@ -1,9 +1,9 @@
 Metadata-Version: 2.1
 Name: parsl
-Version: 2024.7.
+Version: 2024.7.15
 Summary: Simple data dependent workflows in Python
 Home-page: https://github.com/Parsl/parsl
-Download-URL: https://github.com/Parsl/parsl/archive/2024.07.
+Download-URL: https://github.com/Parsl/parsl/archive/2024.07.15.tar.gz
 Author: The Parsl Team
 Author-email: parsl@googlegroups.com
 License: Apache 2.0
{parsl-2024.7.1.dist-info → parsl-2024.7.15.dist-info}/RECORD
CHANGED
@@ -8,9 +8,9 @@ parsl/multiprocessing.py,sha256=MyaEcEq-Qf860u7V98u-PZrPNdtzOZL_NW6EhIJnmfQ,1937
 parsl/process_loggers.py,sha256=uQ7Gd0W72Jz7rrcYlOMfLsAEhkRltxXJL2MgdduJjEw,1136
 parsl/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/utils.py,sha256=91FjQiTUY383ueAjkBAgE21My9nba6SP2a2SrbB1r1Q,11250
-parsl/version.py,sha256=
+parsl/version.py,sha256=xBBY22CXKXmBYJqrmCPAgPlHvalhorEzfXaNGRSVeQU,131
 parsl/app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-parsl/app/app.py,sha256=
+parsl/app/app.py,sha256=0gbM4AH2OtFOLsv07I5nglpElcwMSOi-FzdZZfrk7So,8532
 parsl/app/bash.py,sha256=jm2AvePlCT9DZR7H_4ANDWxatp5dN_22FUlT_gWhZ-g,5528
 parsl/app/errors.py,sha256=nJmOEPglAISfD3R1UsTZH-avqiSOJgx_DkpdL9B591w,3917
 parsl/app/futures.py,sha256=XU1NwkoNVsxy3KF5y0Ihsla5hPbhhuSikZInfS7h7Uo,2910
@@ -25,7 +25,7 @@ parsl/channels/local/local.py,sha256=xqH4HnipUN95NgvyB1r33SiqgQKkARgRKmg0_HnumUk
 parsl/channels/oauth_ssh/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/channels/oauth_ssh/oauth_ssh.py,sha256=GrVOpJ6M6BwtGG4zOU4zakyphzuGY5M3suQ8PyjwyOA,3509
 parsl/channels/ssh/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-parsl/channels/ssh/ssh.py,sha256=
+parsl/channels/ssh/ssh.py,sha256=ga8LMZ9ryTZxbgiyljL5DwusYygbUEe-Frt3SBIMecM,10125
 parsl/channels/ssh_il/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/channels/ssh_il/ssh_il.py,sha256=5XjotlA83UM4zGfnVriC9pE2NzaCT5hqvXZ9v4GG3pg,2410
 parsl/concurrent/__init__.py,sha256=TvIVceJYaJAsxedNBF3Vdo9lEQNHH_j3uxJv0zUjP7w,3288
@@ -62,7 +62,7 @@ parsl/data_provider/staging.py,sha256=ZDZuuFg38pjUStegKPcvPsfGp3iMeReMzfU6DSwtJj
 parsl/data_provider/zip.py,sha256=S4kVuH9lxAegRURYbvIUR7EYYBOccyslaqyCrVWUBhw,4497
 parsl/dataflow/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/dataflow/dependency_resolvers.py,sha256=Om8Dgh7a0ZwgXAc6TlhxLSzvxXHDlNNV1aBNiD3JTNY,3325
-parsl/dataflow/dflow.py,sha256=
+parsl/dataflow/dflow.py,sha256=jgNOIk3xXz90RXwC38ujMz7092XRdLFv5BrMyALYhps,68513
 parsl/dataflow/errors.py,sha256=9SxVhIJY_53FQx8x4OU8UA8nd7lvUbDllH7KfMXpYaY,2177
 parsl/dataflow/futures.py,sha256=08LuP-HFiHBIZmeKCjlsazw_WpQ5fwevrU2_WbidkYw,6080
 parsl/dataflow/memoization.py,sha256=l9uw1Bu50GucBF70M5relpGKFkE4dIM9T3R1KrxW0v0,9583
@@ -77,23 +77,22 @@ parsl/executors/threads.py,sha256=hJt1LzxphqX4fe_9R9Cf1MU0lepWTU_eJe8O665B0Xo,33
 parsl/executors/flux/__init__.py,sha256=P9grTTeRPXfqXurFhlSS7XhmE6tTbnCnyQ1f9b-oYHE,136
 parsl/executors/flux/execute_parsl_task.py,sha256=gRN7F4HhdrKQ-bvn4wXrquBzFOp_9WF88hMIeUaRg5I,1553
 parsl/executors/flux/executor.py,sha256=gPq49CQwtSZYZggLZ0dCXdpUlllKHJbvR8WRKeGh9xE,16977
-parsl/executors/flux/flux_instance_manager.py,sha256=
+parsl/executors/flux/flux_instance_manager.py,sha256=5T3Rp7ZM-mlT0Pf0Gxgs5_YmnaPrSF9ec7zvRfLfYJw,2129
 parsl/executors/high_throughput/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/executors/high_throughput/errors.py,sha256=Sak8e8UpiEcXefUjMHbhyXc4Rn7kJtOoh7L8wreBQdk,1638
-parsl/executors/high_throughput/executor.py,sha256=
+parsl/executors/high_throughput/executor.py,sha256=bCtw_p2f1ztnqQiChKJBOiPyc6aKK39yRXSp5uFpRzk,38185
 parsl/executors/high_throughput/interchange.py,sha256=IRuiaBmks_R4cU-Sx0Q_Fjv4PdFtzU05GiPdeJstOoA,30578
 parsl/executors/high_throughput/manager_record.py,sha256=9XppKjDW0DJ7SMkPNxsiDs-HvXGPLrTg6Ceyh4b6gNs,433
 parsl/executors/high_throughput/monitoring_info.py,sha256=HC0drp6nlXQpAop5PTUKNjdXMgtZVvrBL0JzZJebPP4,298
-parsl/executors/high_throughput/mpi_executor.py,sha256=
+parsl/executors/high_throughput/mpi_executor.py,sha256=V07t1GOzFhcwdlZGuYUPqc1NarSr-vUbsNzbK4Cj0m8,3882
 parsl/executors/high_throughput/mpi_prefix_composer.py,sha256=hah_IznfFqk-rzuHWmg6aiF_saiDRrpW-aSo4kH9Nso,4854
 parsl/executors/high_throughput/mpi_resource_management.py,sha256=LFBbJ3BnzTcY_v-jNu30uoIB2Enk4cleN4ygY3dncjY,8194
 parsl/executors/high_throughput/probe.py,sha256=TNpGTXb4_DEeg_h-LHu4zEKi1-hffboxvKcZUl2OZGk,2751
-parsl/executors/high_throughput/process_worker_pool.py,sha256=
+parsl/executors/high_throughput/process_worker_pool.py,sha256=P1ZqQOyEpfvXxtfsevGpJvPH_PIxso3Mh0u8PyRbwD8,42958
 parsl/executors/high_throughput/zmq_pipes.py,sha256=tAjQB3aNVMuTXziN3dbJWre46YpXgliD55qMBbhYTLU,8581
 parsl/executors/radical/__init__.py,sha256=CKbtV2numw5QvgIBq1htMUrt9TqDCIC2zifyf2svTNU,186
-parsl/executors/radical/executor.py,sha256=
-parsl/executors/radical/
-parsl/executors/radical/rpex_resources.py,sha256=o-jNQ49e-gB7px2uiRkXcVjC8RebTrGH5eryjzhQwKM,4804
+parsl/executors/radical/executor.py,sha256=426cMt6d8uJFZ_7Ub1kCslaND4OKtBX5WZdz-0RXjMk,22554
+parsl/executors/radical/rpex_resources.py,sha256=Q7-0u3K447LBCe2y7mVcdw6jqWI7SdPXxCKhkr6FoRQ,5139
 parsl/executors/radical/rpex_worker.py,sha256=qli6i6ejKubTSv3lAE3YiW8RlkHrfl4Jhrq3jA45mOw,1869
 parsl/executors/taskvine/__init__.py,sha256=9rwp3M8B0YyEhZMLO0RHaNw7u1nc01WHbXLqnBTanu0,293
 parsl/executors/taskvine/errors.py,sha256=euIYkSslrNSI85kyi2s0xzOaO9ik4c1fYHstMIeiBJk,652
@@ -175,7 +174,7 @@ parsl/providers/grid_engine/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMp
 parsl/providers/grid_engine/grid_engine.py,sha256=jTQjKaJh4eEXGbhrrCcXFV4AVFo2t39iVpslDR8gF6o,8565
 parsl/providers/grid_engine/template.py,sha256=a7iViKr8LXcFTPmsf_qQeVK5o_RekOAIlUOF0X1q-2M,273
 parsl/providers/kubernetes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-parsl/providers/kubernetes/kube.py,sha256=
+parsl/providers/kubernetes/kube.py,sha256=umXdZqrpAIWh7OLsqZyF1L5AhmqmdpzKb3xTV3atKuk,14406
 parsl/providers/kubernetes/template.py,sha256=VsRz6cmNaII-y4OdMT6sCwzQy95SJX6NMB0hmmFBhX4,50
 parsl/providers/local/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/providers/local/local.py,sha256=pTEcl9NnjRcL8FHcMeMEtJj1IXiAOxZ2Cih97Q5jDPY,11388
@@ -298,6 +297,7 @@ parsl/tests/test_bash_apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJ
 parsl/tests/test_bash_apps/test_apptimeout.py,sha256=ha9O9WnVxeDrUFmqvEcqkk7ymPms1ju1henzkbOVyV8,570
 parsl/tests/test_bash_apps/test_basic.py,sha256=HGzJKtETnUxHQwPaTDuZTPMtIX3lSqtidqLxPn2IV8U,2460
 parsl/tests/test_bash_apps/test_error_codes.py,sha256=jJ3BwhFpvTGKElKyuiCMWFeBaVeIoWlJkiulWRA2nSE,3961
+parsl/tests/test_bash_apps/test_inputs_default.py,sha256=6UYdi8bqbTS-cx2WB2v5lVA9smTL55Sl3wgkQvlxJ0I,563
 parsl/tests/test_bash_apps/test_keyword_overlaps.py,sha256=8bfN2qw4uXJsYquppR1lZQrYW834AZc3zjYIIHTfDoE,209
 parsl/tests/test_bash_apps/test_kwarg_storage.py,sha256=OMMD3sKSngBSjVCHK9wju0hHzszOqbYuWtscyMuh5_8,720
 parsl/tests/test_bash_apps/test_memoize.py,sha256=gFhDNFxdRv8DNtErbwtdEvAph6SDFPaWY0tABZGS4I4,1383
@@ -308,6 +308,7 @@ parsl/tests/test_bash_apps/test_pipeline.py,sha256=1kQDD8-Dh5H9SKFcKHzN_mSrdxAV_
 parsl/tests/test_bash_apps/test_std_uri.py,sha256=CvAt8BUhNl2pA5chq9YyhkD6eo2IUH6PjWfe3SQ-YRU,3752
 parsl/tests/test_bash_apps/test_stdout.py,sha256=hrzHXLt308qH2Gg_r0-qy5nFBNXI56vCZQBXIIocCPY,3198
 parsl/tests/test_channels/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+parsl/tests/test_channels/test_dfk_close.py,sha256=n7IF3Ud_vejg0VNRnvEgxCLmwMvPVvLbXvJdw-Mz_lw,628
 parsl/tests/test_channels/test_large_output.py,sha256=PGeNSW_sN5mR7KF1hVL2CPfktydYxo4oNz1wVQ-ENN0,595
 parsl/tests/test_checkpointing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/tests/test_checkpointing/test_periodic.py,sha256=nfMgrG7sZ8rkMu6iOHS6lp_iTU4IsOyQLQ2Gur_FMmE,1509
@@ -343,7 +344,7 @@ parsl/tests/test_htex/test_connected_blocks.py,sha256=gaXZSr__pIaLvKY6rF-4r1p_4d
 parsl/tests/test_htex/test_cpu_affinity_explicit.py,sha256=DVHrRCskDbJIrfB5YSi3ZSbfR4WzijA46aZfZzjNcrU,1382
 parsl/tests/test_htex/test_disconnected_blocks.py,sha256=3V1Ol9gMS6knjLTgIjB5GrunRSp4ANsJ_2vAvpyMR6c,1858
 parsl/tests/test_htex/test_drain.py,sha256=Z2Z5-3NfLL9tMgJh4JkVKLZZDl3Z2gDAbEFHDSGdItw,2288
-parsl/tests/test_htex/test_htex.py,sha256
+parsl/tests/test_htex/test_htex.py,sha256=qnJ1LjCC2c75BOxZ4CRA7pEX2RrFKG-fWIuPiW6w9k4,5005
 parsl/tests/test_htex/test_manager_failure.py,sha256=N-obuSZ8f7XA_XcddoN2LWKSVtpKUZvTHb7BFelS3iQ,1143
 parsl/tests/test_htex/test_managers_command.py,sha256=Y-eUjtBzwW9erCYdph9bOesbkUvX8QUPqXt27DCgVS8,951
 parsl/tests/test_htex/test_missing_worker.py,sha256=gyp5i7_t-JHyJGtz_eXZKKBY5w8oqLOIxO6cJgGJMtQ,745
@@ -370,7 +371,7 @@ parsl/tests/test_mpi_apps/test_mpiex.py,sha256=DcvfDZT_WnwSzL5IF71JPbV_wEI_hZl_W
 parsl/tests/test_mpi_apps/test_resource_spec.py,sha256=A7NwNT4LalCSOiHws1ALrrWy8Mn1IItpv9olhnRVjs0,3987
 parsl/tests/test_providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/tests/test_providers/test_cobalt_deprecation_warning.py,sha256=UN2W6xJxuLx2euPqArORKFEU2VXez9_PYqq-0rZHanQ,391
-parsl/tests/test_providers/test_local_provider.py,sha256=
+parsl/tests/test_providers/test_local_provider.py,sha256=XCAy64oM3IZ6k0RYBIr6s-D2LL7gr6_xXZ_3Pv1D0gM,7076
 parsl/tests/test_providers/test_pbspro_template.py,sha256=-bi1vags9yyNfpBxtjTqFjzMIg1VVPyf2M958UcXWmA,855
 parsl/tests/test_providers/test_slurm_instantiate.py,sha256=eW3pEZRIzZO1-eKFrBc7N5uoN5otwghgbqut74Kyqoc,500
 parsl/tests/test_providers/test_slurm_template.py,sha256=pBEeimO-vGbMmC1QT7BP7s5BH6fFeqaWnI4f6tWPFEo,901
@@ -389,6 +390,7 @@ parsl/tests/test_python_apps/test_fibonacci_recursive.py,sha256=q7LMFcu_pJSNPdz8
 parsl/tests/test_python_apps/test_futures.py,sha256=EWnzmPn5sVCgeMxc0Uz2ieaaVYr98tFZ7g8YJFqYuC8,2355
 parsl/tests/test_python_apps/test_garbage_collect.py,sha256=RPntrLuzPkeNbhS7mmqEnHbyOcuV1YVppgZ8BaX-h84,1076
 parsl/tests/test_python_apps/test_import_fail.py,sha256=Vd8IMa_UsbHYkr3IGnS-rgGb6zKxB1tOTqMZY5lc_xY,691
+parsl/tests/test_python_apps/test_inputs_default.py,sha256=J2GR1NgdvEucNSJkfO6GC5OoMiuvSzO0tASCowT8HM0,436
 parsl/tests/test_python_apps/test_join.py,sha256=qnwdPYC_uIS5hQ2jmU2nIP_3P_TaMY8Av1ut10EZA_M,2678
 parsl/tests/test_python_apps/test_lifted.py,sha256=Na6qC_dZSeYJcZdkGn-dCjgYkQV267HmGFfaqFcRVcQ,3408
 parsl/tests/test_python_apps/test_mapred.py,sha256=C7nTl0NsP_2TCtcmZXWFMpvAG4pwGswrIJKr-5sRUNY,786
@@ -465,13 +467,13 @@ parsl/usage_tracking/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
 parsl/usage_tracking/api.py,sha256=iaCY58Dc5J4UM7_dJzEEs871P1p1HdxBMtNGyVdzc9g,1821
 parsl/usage_tracking/levels.py,sha256=xbfzYEsd55KiZJ-mzNgPebvOH4rRHum04hROzEf41tU,291
 parsl/usage_tracking/usage.py,sha256=qNEJ7nPimqd3Y7OWFLdYmNwJ6XDKlyfV_fTzasxsQw8,8690
-parsl-2024.7.
-parsl-2024.7.
-parsl-2024.7.
-parsl-2024.7.
-parsl-2024.7.
-parsl-2024.7.
-parsl-2024.7.
-parsl-2024.7.
-parsl-2024.7.
-parsl-2024.7.
+parsl-2024.7.15.data/scripts/exec_parsl_function.py,sha256=RUkJ4JSJAjr7YyRZ58zhMdg8cR5dVV9odUl3AuzNf3k,7802
+parsl-2024.7.15.data/scripts/interchange.py,sha256=n0aOHLX64DEWx-OA4vWrYRVZfmaz8Rc8haNtafbgh4k,30565
+parsl-2024.7.15.data/scripts/parsl_coprocess.py,sha256=zrVjEqQvFOHxsLufPi00xzMONagjVwLZbavPM7bbjK4,5722
+parsl-2024.7.15.data/scripts/process_worker_pool.py,sha256=pfIQ_JzqjviaiTfVI49qw4qy8FBS8AavN_12oL8DyzE,42944
+parsl-2024.7.15.dist-info/LICENSE,sha256=tAkwu8-AdEyGxGoSvJ2gVmQdcicWw3j1ZZueVV74M-E,11357
+parsl-2024.7.15.dist-info/METADATA,sha256=bagqkFFK8EeAICbm5afqQ4--DJWNZ_900VszWxbxsZk,4124
+parsl-2024.7.15.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+parsl-2024.7.15.dist-info/entry_points.txt,sha256=XqnsWDYoEcLbsMcpnYGKLEnSBmaIe1YoM5YsBdJG2tI,176
+parsl-2024.7.15.dist-info/top_level.txt,sha256=PIheYoUFQtF2icLsgOykgU-Cjuwr2Oi6On2jo5RYgRM,6
+parsl-2024.7.15.dist-info/RECORD,,
parsl/executors/radical/rpex_master.py
DELETED
@@ -1,41 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-
-import radical.pilot as rp
-import radical.utils as ru
-
-# ------------------------------------------------------------------------------
-#
-if __name__ == '__main__':
-
-    # The purpose of this master is to (a) spawn a set or workers
-    # within the same allocation, (b) to distribute work items to
-    # those workers, and (c) to collect the responses again.
-    cfg_fname = str(sys.argv[1])
-    cfg = ru.Config(cfg=ru.read_json(cfg_fname))
-    cfg.rank = int(sys.argv[2])
-
-    worker_descr = cfg.worker_descr
-    n_workers = cfg.n_workers
-    gpus_per_node = cfg.gpus_per_node
-    cores_per_node = cfg.cores_per_node
-    nodes_per_worker = cfg.nodes_per_worker
-
-    # create a master class instance - this will establish communication
-    # to the pilot agent
-    master = rp.raptor.Master(cfg)
-
-    # insert `n` worker into the agent. The agent will schedule (place)
-    # those workers and execute them.
-    worker_descr['ranks'] = nodes_per_worker * cores_per_node
-    worker_descr['gpus_per_rank'] = nodes_per_worker * gpus_per_node
-    worker_ids = master.submit_workers(
-        [rp.TaskDescription(worker_descr) for _ in range(n_workers)])
-
-    # wait for all workers
-    master.wait_workers()
-    master.start()
-    master.join()
-
-# ------------------------------------------------------------------------------
Files without changes:
- {parsl-2024.7.1.data → parsl-2024.7.15.data}/scripts/exec_parsl_function.py
- {parsl-2024.7.1.data → parsl-2024.7.15.data}/scripts/interchange.py
- {parsl-2024.7.1.data → parsl-2024.7.15.data}/scripts/parsl_coprocess.py
- {parsl-2024.7.1.dist-info → parsl-2024.7.15.dist-info}/LICENSE
- {parsl-2024.7.1.dist-info → parsl-2024.7.15.dist-info}/WHEEL
- {parsl-2024.7.1.dist-info → parsl-2024.7.15.dist-info}/entry_points.txt
- {parsl-2024.7.1.dist-info → parsl-2024.7.15.dist-info}/top_level.txt