parsl 2024.7.1__py3-none-any.whl → 2024.7.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. parsl/app/app.py +4 -0
  2. parsl/channels/ssh/ssh.py +12 -0
  3. parsl/dataflow/dflow.py +17 -0
  4. parsl/executors/flux/flux_instance_manager.py +23 -24
  5. parsl/executors/high_throughput/executor.py +49 -21
  6. parsl/executors/high_throughput/mpi_executor.py +2 -0
  7. parsl/executors/high_throughput/process_worker_pool.py +20 -1
  8. parsl/executors/radical/executor.py +105 -65
  9. parsl/executors/radical/rpex_resources.py +14 -7
  10. parsl/providers/kubernetes/kube.py +2 -3
  11. parsl/tests/test_bash_apps/test_inputs_default.py +25 -0
  12. parsl/tests/test_channels/test_dfk_close.py +26 -0
  13. parsl/tests/test_htex/test_htex.py +13 -0
  14. parsl/tests/test_providers/test_local_provider.py +12 -7
  15. parsl/tests/test_python_apps/test_inputs_default.py +22 -0
  16. parsl/version.py +1 -1
  17. {parsl-2024.7.1.data → parsl-2024.7.15.data}/scripts/process_worker_pool.py +20 -1
  18. {parsl-2024.7.1.dist-info → parsl-2024.7.15.dist-info}/METADATA +2 -2
  19. {parsl-2024.7.1.dist-info → parsl-2024.7.15.dist-info}/RECORD +26 -24
  20. parsl/executors/radical/rpex_master.py +0 -41
  21. {parsl-2024.7.1.data → parsl-2024.7.15.data}/scripts/exec_parsl_function.py +0 -0
  22. {parsl-2024.7.1.data → parsl-2024.7.15.data}/scripts/interchange.py +0 -0
  23. {parsl-2024.7.1.data → parsl-2024.7.15.data}/scripts/parsl_coprocess.py +0 -0
  24. {parsl-2024.7.1.dist-info → parsl-2024.7.15.dist-info}/LICENSE +0 -0
  25. {parsl-2024.7.1.dist-info → parsl-2024.7.15.dist-info}/WHEEL +0 -0
  26. {parsl-2024.7.1.dist-info → parsl-2024.7.15.dist-info}/entry_points.txt +0 -0
  27. {parsl-2024.7.1.dist-info → parsl-2024.7.15.dist-info}/top_level.txt +0 -0
parsl/app/app.py CHANGED
@@ -66,6 +66,10 @@ class AppBase(metaclass=ABCMeta):
             self.kwargs['walltime'] = params['walltime'].default
         if 'parsl_resource_specification' in params:
             self.kwargs['parsl_resource_specification'] = params['parsl_resource_specification'].default
+        if 'outputs' in params:
+            self.kwargs['outputs'] = params['outputs'].default
+        if 'inputs' in params:
+            self.kwargs['inputs'] = params['inputs'].default
 
     @abstractmethod
     def __call__(self, *args: Any, **kwargs: Any) -> AppFuture:
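
Context for the hunk above: `AppBase` records declared keyword defaults in `self.kwargs` so the DataFlowKernel can resolve them; before this change, default values for `inputs` and `outputs` (which may contain futures) were skipped. A minimal sketch of the pattern this enables, assuming a loaded local config (the new test files later in this diff exercise the same behaviour):

    import parsl
    from parsl import python_app
    from parsl.executors.threads import ThreadPoolExecutor

    parsl.load(parsl.Config(executors=[ThreadPoolExecutor()]))

    @python_app
    def identity(x):
        return x

    # The default value contains AppFutures; once the default is captured
    # into self.kwargs, the dependency resolver waits on these futures even
    # when the caller omits the `inputs` argument entirely.
    @python_app
    def add_inputs(inputs=[identity(1), identity(2)]):
        return sum(inputs)

    assert add_inputs().result() == 3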
parsl/channels/ssh/ssh.py CHANGED
@@ -227,8 +227,20 @@ class SSHChannel(Channel, RepresentationMixin):
 
     def close(self) -> None:
         if self._is_connected():
+            transport = self.ssh_client.get_transport()
             self.ssh_client.close()
 
+            # ssh_client.close calls transport.close, but transport.close does
+            # not always wait for the transport thread to be stopped. See impl
+            # of Transport.close in paramiko and issue
+            # https://github.com/paramiko/paramiko/issues/520
+            logger.debug("Waiting for transport thread to stop")
+            transport.join(30)
+            if transport.is_alive():
+                logger.warning("SSH transport thread did not shut down")
+            else:
+                logger.debug("SSH transport thread stopped")
+
     def isdir(self, path):
         """Return true if the path refers to an existing directory.
 
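
The fix works because paramiko's `Transport` is a `threading.Thread` subclass: `SSHClient.close()` asks the transport to stop but does not always wait for its thread to exit (paramiko issue #520), so the channel grabs the transport handle first and joins it with a timeout. A standalone sketch of the same pattern, with a hypothetical host and credentials:

    import paramiko

    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect("host.example.com", username="user")  # hypothetical host

    transport = client.get_transport()  # keep a handle before close() discards it
    client.close()  # stops the transport, but possibly not its thread

    # Transport subclasses threading.Thread, so join with a timeout and
    # check whether the reader thread actually terminated
    transport.join(30)
    if transport.is_alive():
        print("transport thread still alive after 30s")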
parsl/dataflow/dflow.py CHANGED
@@ -1277,6 +1277,23 @@ class DataFlowKernel:
             executor.shutdown()
             logger.info(f"Shut down executor {executor.label}")
 
+            if hasattr(executor, 'provider'):
+                if hasattr(executor.provider, 'script_dir'):
+                    logger.info(f"Closing channel(s) for {executor.label}")
+
+                    if hasattr(executor.provider, 'channels'):
+                        for channel in executor.provider.channels:
+                            logger.info(f"Closing channel {channel}")
+                            channel.close()
+                            logger.info(f"Closed channel {channel}")
+                    else:
+                        assert hasattr(executor.provider, 'channel'), "If provider has no .channels, it must have .channel"
+                        logger.info(f"Closing channel {executor.provider.channel}")
+                        executor.provider.channel.close()
+                        logger.info(f"Closed channel {executor.provider.channel}")
+
+                    logger.info(f"Closed executor channel(s) for {executor.label}")
+
         logger.info("Terminated executors")
         self.time_completed = datetime.datetime.now()
 
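The `hasattr` ladder above exists because providers are not uniform: some expose a `channels` list, others a single `channel`, and `script_dir` serves as a proxy for "this provider was actually set up with a channel". A reduced sketch of that dispatch, using hypothetical provider classes:

    class MultiChannelProvider:
        def __init__(self, channels):
            self.channels = channels   # e.g. one channel per target host

    class SingleChannelProvider:
        def __init__(self, channel):
            self.channel = channel

    def close_channels(provider):
        # mirrors the dflow.py logic: prefer .channels, fall back to .channel
        if hasattr(provider, 'channels'):
            for channel in provider.channels:
                channel.close()
        else:
            assert hasattr(provider, 'channel')
            provider.channel.close()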
parsl/executors/flux/flux_instance_manager.py CHANGED
@@ -27,30 +27,29 @@ def main():
     parser.add_argument("hostname", help="hostname of the parent executor's socket")
     parser.add_argument("port", help="Port of the parent executor's socket")
     args = parser.parse_args()
-    context = zmq.Context()
-    socket = context.socket(zmq.REQ)
-    socket.connect(
-        args.protocol + "://" + gethostbyname(args.hostname) + ":" + args.port
-    )
-    # send the path to the ``flux.job`` package
-    socket.send(dirname(dirname(os.path.realpath(flux.__file__))).encode())
-    logging.debug("Flux package path sent.")
-    # collect the encapsulating Flux instance's URI
-    local_uri = flux.Flux().attr_get("local-uri")
-    hostname = gethostname()
-    if args.hostname == hostname:
-        flux_uri = local_uri
-    else:
-        flux_uri = "ssh://" + gethostname() + local_uri.replace("local://", "")
-    logging.debug("Flux URI is %s", flux_uri)
-    response = socket.recv()  # get acknowledgment
-    logging.debug("Received acknowledgment %s", response)
-    socket.send(flux_uri.encode())  # send URI
-    logging.debug("URI sent. Blocking for response...")
-    response = socket.recv()  # wait for shutdown message
-    logging.debug("Response %s received, draining flux jobs...", response)
-    flux.Flux().rpc("job-manager.drain").get()
-    logging.debug("Flux jobs drained, exiting.")
+    with zmq.Context() as context, context.socket(zmq.REQ) as socket:
+        socket.connect(
+            args.protocol + "://" + gethostbyname(args.hostname) + ":" + args.port
+        )
+        # send the path to the ``flux.job`` package
+        socket.send(dirname(dirname(os.path.realpath(flux.__file__))).encode())
+        logging.debug("Flux package path sent.")
+        # collect the encapsulating Flux instance's URI
+        local_uri = flux.Flux().attr_get("local-uri")
+        hostname = gethostname()
+        if args.hostname == hostname:
+            flux_uri = local_uri
+        else:
+            flux_uri = "ssh://" + gethostname() + local_uri.replace("local://", "")
+        logging.debug("Flux URI is %s", flux_uri)
+        response = socket.recv()  # get acknowledgment
+        logging.debug("Received acknowledgment %s", response)
+        socket.send(flux_uri.encode())  # send URI
+        logging.debug("URI sent. Blocking for response...")
+        response = socket.recv()  # wait for shutdown message
+        logging.debug("Response %s received, draining flux jobs...", response)
+        flux.Flux().rpc("job-manager.drain").get()
+        logging.debug("Flux jobs drained, exiting.")
 
 
 if __name__ == "__main__":
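
The rewrite above replaces manual ZMQ setup with pyzmq's context-manager support: leaving the `with` block closes the socket and terminates the context on every exit path, including exceptions raised mid-protocol. A minimal standalone sketch, with a hypothetical endpoint:

    import zmq

    # Socket.__exit__ calls close() and Context.__exit__ calls term(),
    # so no explicit cleanup is needed even if send/recv raises.
    with zmq.Context() as context, context.socket(zmq.REQ) as socket:
        socket.connect("tcp://127.0.0.1:5555")  # hypothetical endpoint
        socket.send(b"ping")
        reply = socket.recv()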
parsl/executors/high_throughput/executor.py CHANGED
@@ -56,6 +56,8 @@ DEFAULT_LAUNCH_CMD = ("process_worker_pool.py {debug} {max_workers_per_node} "
                       "--mpi-launcher={mpi_launcher} "
                       "--available-accelerators {accelerators}")
 
+DEFAULT_INTERCHANGE_LAUNCH_CMD = "interchange.py"
+
 GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionProvider`
        Provider to access computation resources. Can be one of :class:`~parsl.providers.aws.aws.EC2Provider`,
        :class:`~parsl.providers.cobalt.cobalt.Cobalt`,
@@ -76,6 +78,10 @@ GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionP
        cores_per_worker, nodes_per_block, heartbeat_period ,heartbeat_threshold, logdir). For example:
        launch_cmd="process_worker_pool.py {debug} -c {cores_per_worker} --task_url={task_url} --result_url={result_url}"
 
+    interchange_launch_cmd : str
+        Custom command line string to launch the interchange process from the executor. If undefined,
+        the executor will use the default "interchange.py" command.
+
     address : string
        An address to connect to the main Parsl process which is reachable from the network in which
        workers will be running. This field expects an IPv4 address (xxx.xxx.xxx.xxx).
@@ -162,7 +168,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
              |  |  |              | batching      |     |         |
        Parsl<---Fut-|             | load-balancing|  result   exception
              ^  |  |              | watchdogs     |     |         |
-             |  |  | Q_mngmnt     |               |     V         V
+             |  |  | Result       |               |     |         |
+             |  |  | Queue        |               |     V         V
              |  |  | Thread<--|-incoming_q<---|--- +---------+
              |  |  |  |                       |    |         |
              |  |  |  |                       |    |         |
@@ -231,6 +238,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
                  label: str = 'HighThroughputExecutor',
                  provider: ExecutionProvider = LocalProvider(),
                  launch_cmd: Optional[str] = None,
+                 interchange_launch_cmd: Optional[str] = None,
                  address: Optional[str] = None,
                  worker_ports: Optional[Tuple[int, int]] = None,
                  worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
@@ -329,6 +337,10 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
             launch_cmd = DEFAULT_LAUNCH_CMD
         self.launch_cmd = launch_cmd
 
+        if not interchange_launch_cmd:
+            interchange_launch_cmd = DEFAULT_INTERCHANGE_LAUNCH_CMD
+        self.interchange_launch_cmd = interchange_launch_cmd
+
     radio_mode = "htex"
 
     def _warn_deprecated(self, old: str, new: str):
@@ -418,20 +430,19 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
             "127.0.0.1", self.interchange_port_range, self.cert_dir
         )
 
-        self._queue_management_thread = None
-        self._start_queue_management_thread()
+        self._result_queue_thread = None
+        self._start_result_queue_thread()
         self._start_local_interchange_process()
 
-        logger.debug("Created management thread: {}".format(self._queue_management_thread))
+        logger.debug("Created result queue thread: %s", self._result_queue_thread)
 
         self.initialize_scaling()
 
     @wrap_with_logs
-    def _queue_management_worker(self):
-        """Listen to the queue for task status messages and handle them.
+    def _result_queue_worker(self):
+        """Listen to the queue for task result messages and handle them.
 
-        Depending on the message, tasks will be updated with results, exceptions,
-        or updates. It expects the following messages:
+        Depending on the message, tasks will be updated with results or exceptions.
 
         .. code:: python
 
@@ -448,7 +459,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
 
         The `None` message is a die request.
         """
-        logger.debug("Queue management worker starting")
+        logger.debug("Result queue worker starting")
 
         while not self.bad_state_is_set:
             try:
@@ -517,7 +528,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
             else:
                 raise BadMessage("Message received with unknown type {}".format(msg['type']))
 
-        logger.info("Queue management worker finished")
+        logger.info("Result queue worker finished")
 
     def _start_local_interchange_process(self) -> None:
         """ Starts the interchange process locally
@@ -544,7 +555,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
 
         config_pickle = pickle.dumps(interchange_config)
 
-        self.interchange_proc = subprocess.Popen(b"interchange.py", stdin=subprocess.PIPE)
+        self.interchange_proc = subprocess.Popen(self.interchange_launch_cmd.encode("utf-8"), stdin=subprocess.PIPE)
         stdin = self.interchange_proc.stdin
         assert stdin is not None, "Popen should have created an IO object (vs default None) because of PIPE mode"
 
@@ -560,21 +571,21 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
             raise Exception("Interchange failed to start")
         logger.debug("Got worker ports")
 
-    def _start_queue_management_thread(self):
-        """Method to start the management thread as a daemon.
+    def _start_result_queue_thread(self):
+        """Method to start the result queue thread as a daemon.
 
         Checks if a thread already exists, then starts it.
-        Could be used later as a restart if the management thread dies.
+        Could be used later as a restart if the result queue thread dies.
        """
-        if self._queue_management_thread is None:
-            logger.debug("Starting queue management thread")
-            self._queue_management_thread = threading.Thread(target=self._queue_management_worker, name="HTEX-Queue-Management-Thread")
-            self._queue_management_thread.daemon = True
-            self._queue_management_thread.start()
-            logger.debug("Started queue management thread")
+        if self._result_queue_thread is None:
+            logger.debug("Starting result queue thread")
+            self._result_queue_thread = threading.Thread(target=self._result_queue_worker, name="HTEX-Result-Queue-Thread")
+            self._result_queue_thread.daemon = True
+            self._result_queue_thread.start()
+            logger.debug("Started result queue thread")
 
         else:
-            logger.error("Management thread already exists, returning")
+            logger.error("Result queue thread already exists, returning")
 
     def hold_worker(self, worker_id: str) -> None:
         """Puts a worker on hold, preventing scheduling of additional tasks to it.
@@ -823,6 +834,23 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
                 logger.info("Unable to terminate Interchange process; sending SIGKILL")
                 self.interchange_proc.kill()
 
+        logger.info("Closing ZMQ pipes")
+
+        # These pipes are used in a thread unsafe manner. If you have traced a
+        # problem to this block of code, you might consider what is happening
+        # with other threads that access these.
+
+        # incoming_q is not closed here because it is used by the results queue
+        # worker which is not shut down at this point.
+
+        if hasattr(self, 'outgoing_q'):
+            logger.info("Closing outgoing_q")
+            self.outgoing_q.close()
+
+        if hasattr(self, 'command_client'):
+            logger.info("Closing command client")
+            self.command_client.close()
+
         logger.info("Finished HighThroughputExecutor shutdown attempt")
 
     def get_usage_information(self):
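
Taken together, the `interchange_launch_cmd` hunks replace the hard-coded `Popen(b"interchange.py", ...)` call with a configurable command string. A sketch of how it could be used to wrap the interchange, where the wrapper script name is hypothetical:

    from parsl.config import Config
    from parsl.executors import HighThroughputExecutor

    config = Config(
        executors=[
            HighThroughputExecutor(
                label="htex_custom_ix",
                # hypothetical wrapper that could exec interchange.py under
                # a profiler or an alternative Python interpreter
                interchange_launch_cmd="my_interchange_wrapper.sh",
            )
        ]
    )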
parsl/executors/high_throughput/mpi_executor.py CHANGED
@@ -38,6 +38,7 @@ class MPIExecutor(HighThroughputExecutor):
                  label: str = 'MPIExecutor',
                  provider: ExecutionProvider = LocalProvider(),
                  launch_cmd: Optional[str] = None,
+                 interchange_launch_cmd: Optional[str] = None,
                  address: Optional[str] = None,
                  worker_ports: Optional[Tuple[int, int]] = None,
                  worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
@@ -66,6 +67,7 @@ class MPIExecutor(HighThroughputExecutor):
             label=label,
             provider=provider,
             launch_cmd=launch_cmd,
+            interchange_launch_cmd=interchange_launch_cmd,
             address=address,
             worker_ports=worker_ports,
             worker_port_range=worker_port_range,
parsl/executors/high_throughput/process_worker_pool.py CHANGED
@@ -9,6 +9,7 @@ import os
 import pickle
 import platform
 import queue
+import subprocess
 import sys
 import threading
 import time
@@ -731,9 +732,27 @@ def worker(
         os.sched_setaffinity(0, my_cores)  # type: ignore[attr-defined, unused-ignore]
         logger.info("Set worker CPU affinity to {}".format(my_cores))
 
+    # If CUDA devices, find total number of devices to allow for MPS
+    # See: https://developer.nvidia.com/system-management-interface
+    nvidia_smi_cmd = "nvidia-smi -L > /dev/null && nvidia-smi -L | wc -l"
+    nvidia_smi_ret = subprocess.run(nvidia_smi_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    if nvidia_smi_ret.returncode == 0:
+        num_cuda_devices = int(nvidia_smi_ret.stdout.split()[0])
+    else:
+        num_cuda_devices = None
+
     # If desired, pin to accelerator
     if accelerator is not None:
-        os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
+        try:
+            if num_cuda_devices is not None:
+                procs_per_cuda_device = pool_size // num_cuda_devices
+                partitioned_accelerator = str(int(accelerator) // procs_per_cuda_device)  # multiple workers will share a GPU
+                os.environ["CUDA_VISIBLE_DEVICES"] = partitioned_accelerator
+                logger.info(f'Pinned worker to partitioned cuda device: {partitioned_accelerator}')
+            else:
+                os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
+        except (TypeError, ValueError, ZeroDivisionError):
+            os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
         os.environ["ROCR_VISIBLE_DEVICES"] = accelerator
         os.environ["ZE_AFFINITY_MASK"] = accelerator
         os.environ["ZE_ENABLE_PCI_ID_DEVICE_ORDER"] = '1'
parsl/executors/radical/executor.py CHANGED
@@ -9,7 +9,7 @@ import threading as mt
 import time
 from concurrent.futures import Future
 from functools import partial
-from pathlib import Path, PosixPath
+from pathlib import PosixPath
 from typing import Dict, Optional
 
 import requests
@@ -24,7 +24,7 @@ from parsl.serialize import deserialize, pack_res_spec_apply_message
 from parsl.serialize.errors import DeserializationError, SerializationError
 from parsl.utils import RepresentationMixin
 
-from .rpex_resources import ResourceConfig
+from .rpex_resources import CLIENT, MPI, ResourceConfig
 
 try:
     import radical.pilot as rp
@@ -59,7 +59,7 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
        ``rp.PilotManager`` and ``rp.TaskManager``.
     2. "translate": Unwrap, identify, and parse Parsl ``apps`` into ``rp.TaskDescription``.
     3. "submit": Submit Parsl apps to ``rp.TaskManager``.
-    4. "shut_down": Shut down the RADICAL-Pilot runtime and all associated components.
+    4. "shutdown": Shut down the RADICAL-Pilot runtime and all associated components.
 
     Here is a diagram
 
@@ -138,19 +138,26 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
         self.future_tasks: Dict[str, Future] = {}
 
         if rpex_cfg:
-            self.rpex_cfg = rpex_cfg
+            self.rpex_cfg = rpex_cfg.get_config()
         elif not rpex_cfg and 'local' in resource:
-            self.rpex_cfg = ResourceConfig()
+            self.rpex_cfg = ResourceConfig().get_config()
         else:
-            raise ValueError('Resource config file must be '
-                             'specified for a non-local execution')
+            raise ValueError('Resource config must be '
+                             'specified for a non-local resources')
 
     def task_state_cb(self, task, state):
         """
         Update the state of Parsl Future apps
         Based on RP task state callbacks.
         """
-        if not task.uid.startswith('master'):
+        # check the Master/Worker state
+        if task.mode in [rp.RAPTOR_MASTER, rp.RAPTOR_WORKER]:
+            if state == rp.FAILED:
+                exception = RuntimeError(f'{task.uid} failed with internal error: {task.stderr}')
+                self._fail_all_tasks(exception)
+
+        # check all other tasks state
+        else:
             parsl_task = self.future_tasks[task.uid]
 
             if state == rp.DONE:
@@ -186,6 +193,23 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
             else:
                 parsl_task.set_exception('Task failed for an unknown reason')
 
+    def _fail_all_tasks(self, exception):
+        """
+        Fail all outstanding tasks with the given exception.
+
+        This method iterates through all outstanding tasks in the
+        `_future_tasks` dictionary, which have not yet completed,
+        and sets the provided exception as their result, indicating
+        a failure.
+
+        Parameters:
+        - exception: The exception to be set as the result for all
+          outstanding tasks.
+        """
+        for fut_task in self.future_tasks.values():
+            if not fut_task.done():
+                fut_task.set_exception(exception)
+
     def start(self):
         """Create the Pilot component and pass it.
         """
@@ -202,63 +226,62 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
                    'resource': self.resource}
 
         if not self.resource or 'local' in self.resource:
-            # move the agent sandbox to the working dir mainly
-            # for debugging purposes. This will allow parsl
-            # to include the agent sandbox with the ci artifacts.
-            if os.environ.get("LOCAL_SANDBOX"):
-                pd_init['sandbox'] = self.run_dir
-                os.environ["RADICAL_LOG_LVL"] = "DEBUG"
-
-            logger.info("RPEX will be running in the local mode")
+            os.environ["RADICAL_LOG_LVL"] = "DEBUG"
+            logger.info("RPEX will be running in local mode")
 
         pd = rp.PilotDescription(pd_init)
         pd.verify()
 
-        self.rpex_cfg = self.rpex_cfg._get_cfg_file(path=self.run_dir)
-        cfg = ru.Config(cfg=ru.read_json(self.rpex_cfg))
+        # start RP's main components TMGR, PMGR and Pilot
+        self.tmgr = rp.TaskManager(session=self.session)
+        self.pmgr = rp.PilotManager(session=self.session)
+        self.pilot = self.pmgr.submit_pilots(pd)
 
-        self.master = cfg.master_descr
-        self.n_masters = cfg.n_masters
+        if not self.pilot.description.get('cores') or not self.pilot.description.get('nodes'):
+            logger.warning('no "cores/nodes" per pilot were set, using default resources')
+
+        self.tmgr.add_pilots(self.pilot)
+        self.tmgr.register_callback(self.task_state_cb)
 
-        tds = list()
-        master_path = '{0}/rpex_master.py'.format(PWD)
         worker_path = '{0}/rpex_worker.py'.format(PWD)
 
-        for i in range(self.n_masters):
-            td = rp.TaskDescription(self.master)
-            td.mode = rp.RAPTOR_MASTER
-            td.uid = ru.generate_id('master.%(item_counter)06d', ru.ID_CUSTOM,
+        self.masters = []
+
+        logger.info(f'Starting {self.rpex_cfg.n_masters} masters and {self.rpex_cfg.n_workers} workers for each master')
+
+        # create N masters
+        for _ in range(self.rpex_cfg.n_masters):
+            md = rp.TaskDescription(self.rpex_cfg.master_descr)
+            md.uid = ru.generate_id('rpex.master.%(item_counter)06d', ru.ID_CUSTOM,
                                     ns=self.session.uid)
-            td.ranks = 1
-            td.cores_per_rank = 1
-            td.arguments = [self.rpex_cfg, i]
-            td.input_staging = self._stage_files([File(master_path),
-                                                  File(worker_path),
-                                                  File(self.rpex_cfg)], mode='in')
-            tds.append(td)
 
-        self.pmgr = rp.PilotManager(session=self.session)
-        self.tmgr = rp.TaskManager(session=self.session)
+            # submit the master to the TMGR
+            master = self.tmgr.submit_raptors(md)[0]
+            self.masters.append(master)
 
-        # submit pilot(s)
-        pilot = self.pmgr.submit_pilots(pd)
-        if not pilot.description.get('cores'):
-            logger.warning('no "cores" per pilot was set, using default resources {0}'.format(pilot.resources))
+            workers = []
+            # create N workers for each master and submit them to the TMGR
+            for _ in range(self.rpex_cfg.n_workers):
+                wd = rp.TaskDescription(self.rpex_cfg.worker_descr)
+                wd.uid = ru.generate_id('rpex.worker.%(item_counter)06d', ru.ID_CUSTOM,
+                                        ns=self.session.uid)
+                wd.raptor_id = master.uid
+                wd.input_staging = self._stage_files([File(worker_path)], mode='in')
+                workers.append(wd)
 
-        self.tmgr.submit_tasks(tds)
+            self.tmgr.submit_workers(workers)
+
+        self.select_master = self._cyclic_master_selector()
 
         # prepare or use the current env for the agent/pilot side environment
-        if cfg.pilot_env_mode != 'client':
-            logger.info("creating {0} environment for the executor".format(cfg.pilot_env.name))
-            pilot.prepare_env(env_name=cfg.pilot_env.name,
-                              env_spec=cfg.pilot_env.as_dict())
+        if self.rpex_cfg.pilot_env_mode != CLIENT:
+            logger.info("creating {0} environment for the executor".format(self.rpex_cfg.pilot_env.name))
+            self.pilot.prepare_env(env_name=self.rpex_cfg.pilot_env.name,
+                                   env_spec=self.rpex_cfg.pilot_env.as_dict())
        else:
             client_env = sys.prefix
             logger.info("reusing ({0}) environment for the executor".format(client_env))
 
-        self.tmgr.add_pilots(pilot)
-        self.tmgr.register_callback(self.task_state_cb)
-
         # create a bulking thread to run the actual task submission
         # to RP in bulks
         if self.bulk_mode:
@@ -272,8 +295,21 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
             self._bulk_thread.daemon = True
             self._bulk_thread.start()
 
+            logger.info('bulk mode is on, submitting tasks in bulks')
+
         return True
 
+    def _cyclic_master_selector(self):
+        """
+        Balance tasks submission across N masters and N workers
+        """
+        current_master = 0
+        masters_uids = [m.uid for m in self.masters]
+
+        while True:
+            yield masters_uids[current_master]
+            current_master = (current_master + 1) % len(self.masters)
+
     def unwrap(self, func, args):
         """
         Unwrap a Parsl app and its args for further processing.
@@ -364,22 +400,25 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
 
             # This is the default mode where the bash_app will be executed as
             # as a single core process by RP. For cores > 1 the user must use
-            # above or use MPI functions if their code is Python.
+            # task.mode=rp.TASK_EXECUTABLE (above) or use MPI functions if their
+            # code is Python.
             else:
                 task.mode = rp.TASK_PROC
-                task.raptor_id = 'master.%06d' % (tid % self.n_masters)
+                task.raptor_id = next(self.select_master)
                 task.executable = self._pack_and_apply_message(func, args, kwargs)
 
         elif PYTHON in task_type or not task_type:
             task.mode = rp.TASK_FUNCTION
-            task.raptor_id = 'master.%06d' % (tid % self.n_masters)
+            task.raptor_id = next(self.select_master)
             if kwargs.get('walltime'):
                 func = timeout(func, kwargs['walltime'])
 
-            # we process MPI function differently
-            if 'comm' in kwargs:
+            # Check how to serialize the function object
+            if MPI in self.rpex_cfg.worker_type.lower():
+                task.use_mpi = True
                 task.function = rp.PythonTask(func, *args, **kwargs)
             else:
+                task.use_mpi = False
                 task.function = self._pack_and_apply_message(func, args, kwargs)
 
         task.input_staging = self._stage_files(kwargs.get("inputs", []),
@@ -394,7 +433,7 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
         try:
             task.verify()
         except ru.typeddict.TDKeyError as e:
-            raise Exception(f'{e}. Please check Radical.Pilot TaskDescription documentation')
+            raise Exception(f'{e}. Please check: https://radicalpilot.readthedocs.io/en/stable/ documentation')
 
         return task
 
@@ -413,7 +452,11 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
 
     def _unpack_and_set_parsl_exception(self, parsl_task, exception):
         try:
-            s = rp.utils.deserialize_bson(exception)
+            try:
+                s = rp.utils.deserialize_bson(exception)
+            except Exception:
+                s = exception
+
             if isinstance(s, RemoteExceptionWrapper):
                 try:
                     s.reraise()
@@ -421,6 +464,8 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
                     parsl_task.set_exception(e)
             elif isinstance(s, Exception):
                 parsl_task.set_exception(s)
+            elif isinstance(s, str):
+                parsl_task.set_exception(eval(s))
             else:
                 raise ValueError("Unknown exception-like type received: {}".format(type(s)))
         except Exception as e:
@@ -440,16 +485,10 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
             elif isinstance(k_val, PosixPath):
                 k_val = k_val.__str__()
 
-            # if the stderr/out has no path
-            # then we consider it local and
-            # we just set the path to the cwd
-            if '/' not in k_val:
-                k_val = CWD + '/' + k_val
-
-            # finally set the stderr/out to
-            # the desired name by the user
+            # set the stderr/out to the desired
+            # name by the user
             setattr(task, k, k_val)
-            task.sandbox = Path(k_val).parent.__str__()
+            task.sandbox = CWD
 
     def _stage_files(self, files, mode):
         """
@@ -477,7 +516,7 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
             # this indicates that the user
             # did not provided a specific
             # output file and RP will stage out
-            # the task.output from pilot://task_folder
+            # the task.stdout from pilot://task_folder
             # to the CWD or file.url
             if '/' not in file.url:
                 f = {'source': file.filename,
@@ -548,7 +587,8 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
 
     def shutdown(self, hub=True, targets='all', block=False):
         """Shutdown the executor, including all RADICAL-Pilot components."""
-        logger.info("RadicalPilotExecutor shutdown")
+        logger.info("RadicalPilotExecutor is terminating...")
         self.session.close(download=True)
+        logger.info("RadicalPilotExecutor is terminated.")
 
         return True
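
The `_cyclic_master_selector` generator introduced above replaces the old `'master.%06d' % (tid % self.n_masters)` arithmetic with round-robin over the UIDs of the masters that were actually submitted. A standalone sketch of the same pattern, with hypothetical UIDs:

    def cyclic_selector(uids):
        # endless round-robin, mirroring _cyclic_master_selector
        i = 0
        while True:
            yield uids[i]
            i = (i + 1) % len(uids)

    select = cyclic_selector(['rpex.master.000000', 'rpex.master.000001'])
    assert [next(select) for _ in range(4)] == [
        'rpex.master.000000', 'rpex.master.000001',
        'rpex.master.000000', 'rpex.master.000001',
    ]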
parsl/executors/radical/rpex_resources.py CHANGED
@@ -5,6 +5,7 @@ from typing import List
 _setup_paths: List[str] = []
 try:
     import radical.pilot as rp
+    import radical.utils as ru
 except ImportError:
     pass
 
@@ -103,7 +104,7 @@ class ResourceConfig:
     python_v: str = f'{sys.version_info[0]}.{sys.version_info[1]}'
     worker_type: str = DEFAULT_WORKER
 
-    def _get_cfg_file(cls, path=None):
+    def get_config(cls, path=None):
 
         # Default ENV mode for RP is to reuse
         # the client side. If this is not the case,
@@ -121,6 +122,7 @@ class ResourceConfig:
         cfg = {
             'n_masters': cls.masters,
             'n_workers': cls.workers,
+            'worker_type': cls.worker_type,
             'gpus_per_node': cls.worker_gpus_per_node,
             'cores_per_node': cls.worker_cores_per_node,
             'cores_per_master': cls.cores_per_master,
@@ -138,9 +140,10 @@ class ResourceConfig:
             'pilot_env_mode': cls.pilot_env_mode,
 
             'master_descr': {
+                "ranks": 1,
+                "cores_per_rank": 1,
                 "mode": rp.RAPTOR_MASTER,
                 "named_env": cls.pilot_env_name,
-                "executable": "python3 rpex_master.py",
             },
 
             'worker_descr': {
@@ -149,12 +152,16 @@ class ResourceConfig:
                 "raptor_file": "./rpex_worker.py",
                 "raptor_class": cls.worker_type if
                 cls.worker_type.lower() != MPI else MPI_WORKER,
+                "ranks": cls.nodes_per_worker * cls.worker_cores_per_node,
+                "gpus_per_rank": cls.nodes_per_worker * cls.worker_gpus_per_node,
             }}
 
-        # Convert the class instance to a cfg file.
-        config_path = 'rpex.cfg'
+        # Convert the class instance to a Json file or a Config dict.
         if path:
+            config_path = 'rpex.cfg'
             config_path = path + '/' + config_path
-        with open(config_path, 'w') as f:
-            json.dump(cfg, f, indent=4)
-        return config_path
+            with open(config_path, 'w') as f:
+                json.dump(cfg, f, indent=4)
+        else:
+            config_obj = ru.Config(from_dict=cfg)
+            return config_obj
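
With `_get_cfg_file` renamed to `get_config`, the executor now consumes an in-memory `radical.utils.Config` rather than round-tripping through an `rpex.cfg` JSON file; passing `path=` preserves the old file-writing behaviour (and, per the hunk above, returns nothing in that branch). A sketch of both call styles, assuming `radical.pilot` and `radical.utils` are installed:

    from parsl.executors.radical.rpex_resources import ResourceConfig

    rcfg = ResourceConfig()

    # in-memory: returns a ru.Config built via ru.Config(from_dict=cfg)
    cfg = rcfg.get_config()
    print(cfg.n_masters, cfg.worker_type)

    # on-disk: writes <path>/rpex.cfg as JSON instead of returning a Config
    rcfg.get_config(path='/tmp')  # hypothetical path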
parsl/providers/kubernetes/kube.py CHANGED
@@ -168,10 +168,9 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
             - tasks_per_node (int) : command invocations to be launched per node
 
         Kwargs:
-            - job_name (String): Name for job, must be unique
+            - job_name (String): Name for job
 
         Returns:
-            - None: At capacity, cannot provision more
             - job_id: (string) Identifier for the job
         """
 
@@ -187,7 +186,7 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
         formatted_cmd = template_string.format(command=cmd_string,
                                                worker_init=self.worker_init)
 
-        logger.debug("Pod name :{}".format(pod_name))
+        logger.debug("Pod name: %s", pod_name)
         self._create_pod(image=self.image,
                          pod_name=pod_name,
                          job_name=job_name,
parsl/tests/test_bash_apps/test_inputs_default.py ADDED
@@ -0,0 +1,25 @@
+import pytest
+
+from parsl import AUTO_LOGNAME, Config, bash_app, python_app
+from parsl.executors import ThreadPoolExecutor
+
+
+def local_config():
+    return Config(executors=[ThreadPoolExecutor()])
+
+
+@pytest.mark.local
+def test_default_inputs():
+    @python_app
+    def identity(inp):
+        return inp
+
+    @bash_app
+    def sum_inputs(inputs=[identity(1), identity(2)], stdout=AUTO_LOGNAME):
+        calc = sum(inputs)
+        return f"echo {calc}"
+
+    fut = sum_inputs()
+    fut.result()
+    with open(fut.stdout, 'r') as f:
+        assert int(f.read()) == 3
parsl/tests/test_channels/test_dfk_close.py ADDED
@@ -0,0 +1,26 @@
+from unittest.mock import Mock
+
+import pytest
+
+import parsl
+from parsl.channels.base import Channel
+from parsl.executors import HighThroughputExecutor
+from parsl.providers import LocalProvider
+
+
+@pytest.mark.local
+def test_dfk_close():
+
+    mock_channel = Mock(spec=Channel)
+
+    # block settings all 0 because the mock channel won't be able to
+    # do anything to make a block exist
+    p = LocalProvider(channel=mock_channel, init_blocks=0, min_blocks=0, max_blocks=0)
+
+    e = HighThroughputExecutor(provider=p)
+
+    c = parsl.Config(executors=[e])
+    with parsl.load(c):
+        pass
+
+    assert mock_channel.close.called
parsl/tests/test_htex/test_htex.py CHANGED
@@ -136,3 +136,16 @@ def test_max_workers_per_node():
 
     # Ensure max_workers_per_node takes precedence
     assert htex.max_workers_per_node == htex.max_workers == 1
+
+
+@pytest.mark.local
+def test_htex_launch_cmd():
+    htex = HighThroughputExecutor()
+    assert htex.launch_cmd.startswith("process_worker_pool.py")
+    assert htex.interchange_launch_cmd == "interchange.py"
+
+    launch_cmd = "custom-launch-cmd"
+    ix_launch_cmd = "custom-ix-launch-cmd"
+    htex = HighThroughputExecutor(launch_cmd=launch_cmd, interchange_launch_cmd=ix_launch_cmd)
+    assert htex.launch_cmd == launch_cmd
+    assert htex.interchange_launch_cmd == ix_launch_cmd
parsl/tests/test_providers/test_local_provider.py CHANGED
@@ -92,19 +92,24 @@ def test_ssh_channel():
         # already exist, so create it here.
         pathlib.Path('{}/known.hosts'.format(config_dir)).touch(mode=0o600)
         script_dir = tempfile.mkdtemp()
-        p = LocalProvider(channel=SSHChannel('127.0.0.1', port=server_port,
-                                             script_dir=remote_script_dir,
-                                             host_keys_filename='{}/known.hosts'.format(config_dir),
-                                             key_filename=priv_key),
-                          launcher=SingleNodeLauncher(debug=False))
-        p.script_dir = script_dir
-        _run_tests(p)
+        channel = SSHChannel('127.0.0.1', port=server_port,
+                             script_dir=remote_script_dir,
+                             host_keys_filename='{}/known.hosts'.format(config_dir),
+                             key_filename=priv_key)
+        try:
+            p = LocalProvider(channel=channel,
+                              launcher=SingleNodeLauncher(debug=False))
+            p.script_dir = script_dir
+            _run_tests(p)
+        finally:
+            channel.close()
     finally:
         _stop_sshd(sshd_thread)
 
 
 def _stop_sshd(sshd_thread):
     sshd_thread.stop()
+    sshd_thread.join()
 
 
 class SSHDThread(threading.Thread):
parsl/tests/test_python_apps/test_inputs_default.py ADDED
@@ -0,0 +1,22 @@
+import pytest
+
+import parsl
+from parsl import python_app
+from parsl.executors.threads import ThreadPoolExecutor
+
+
+def local_config():
+    return parsl.Config(executors=[ThreadPoolExecutor()])
+
+
+@pytest.mark.local
+def test_default_inputs():
+    @python_app
+    def identity(inp):
+        return inp
+
+    @python_app
+    def add_inputs(inputs=[identity(1), identity(2)]):
+        return sum(inputs)
+
+    assert add_inputs().result() == 3
parsl/version.py CHANGED
@@ -3,4 +3,4 @@
 Year.Month.Day[alpha/beta/..]
 Alphas will be numbered like this -> 2024.12.10a0
 """
-VERSION = '2024.07.01'
+VERSION = '2024.07.15'
{parsl-2024.7.1.data → parsl-2024.7.15.data}/scripts/process_worker_pool.py CHANGED
@@ -9,6 +9,7 @@ import os
 import pickle
 import platform
 import queue
+import subprocess
 import sys
 import threading
 import time
@@ -731,9 +732,27 @@ def worker(
         os.sched_setaffinity(0, my_cores)  # type: ignore[attr-defined, unused-ignore]
         logger.info("Set worker CPU affinity to {}".format(my_cores))
 
+    # If CUDA devices, find total number of devices to allow for MPS
+    # See: https://developer.nvidia.com/system-management-interface
+    nvidia_smi_cmd = "nvidia-smi -L > /dev/null && nvidia-smi -L | wc -l"
+    nvidia_smi_ret = subprocess.run(nvidia_smi_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    if nvidia_smi_ret.returncode == 0:
+        num_cuda_devices = int(nvidia_smi_ret.stdout.split()[0])
+    else:
+        num_cuda_devices = None
+
     # If desired, pin to accelerator
     if accelerator is not None:
-        os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
+        try:
+            if num_cuda_devices is not None:
+                procs_per_cuda_device = pool_size // num_cuda_devices
+                partitioned_accelerator = str(int(accelerator) // procs_per_cuda_device)  # multiple workers will share a GPU
+                os.environ["CUDA_VISIBLE_DEVICES"] = partitioned_accelerator
+                logger.info(f'Pinned worker to partitioned cuda device: {partitioned_accelerator}')
+            else:
+                os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
+        except (TypeError, ValueError, ZeroDivisionError):
+            os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
         os.environ["ROCR_VISIBLE_DEVICES"] = accelerator
         os.environ["ZE_AFFINITY_MASK"] = accelerator
         os.environ["ZE_ENABLE_PCI_ID_DEVICE_ORDER"] = '1'
{parsl-2024.7.1.dist-info → parsl-2024.7.15.dist-info}/METADATA CHANGED
@@ -1,9 +1,9 @@
 Metadata-Version: 2.1
 Name: parsl
-Version: 2024.7.1
+Version: 2024.7.15
 Summary: Simple data dependent workflows in Python
 Home-page: https://github.com/Parsl/parsl
-Download-URL: https://github.com/Parsl/parsl/archive/2024.07.01.tar.gz
+Download-URL: https://github.com/Parsl/parsl/archive/2024.07.15.tar.gz
 Author: The Parsl Team
 Author-email: parsl@googlegroups.com
 License: Apache 2.0
{parsl-2024.7.1.dist-info → parsl-2024.7.15.dist-info}/RECORD CHANGED
@@ -8,9 +8,9 @@ parsl/multiprocessing.py,sha256=MyaEcEq-Qf860u7V98u-PZrPNdtzOZL_NW6EhIJnmfQ,1937
 parsl/process_loggers.py,sha256=uQ7Gd0W72Jz7rrcYlOMfLsAEhkRltxXJL2MgdduJjEw,1136
 parsl/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/utils.py,sha256=91FjQiTUY383ueAjkBAgE21My9nba6SP2a2SrbB1r1Q,11250
-parsl/version.py,sha256=rymKA_7RPC0NJoPK5DYnRc2K1WmH8LD0xsv3iQyTwDA,131
+parsl/version.py,sha256=xBBY22CXKXmBYJqrmCPAgPlHvalhorEzfXaNGRSVeQU,131
 parsl/app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-parsl/app/app.py,sha256=D5Ok_gt99mlclM_QfZbquHUBkibyG4tYdUN9ijRwUnQ,8345
+parsl/app/app.py,sha256=0gbM4AH2OtFOLsv07I5nglpElcwMSOi-FzdZZfrk7So,8532
 parsl/app/bash.py,sha256=jm2AvePlCT9DZR7H_4ANDWxatp5dN_22FUlT_gWhZ-g,5528
 parsl/app/errors.py,sha256=nJmOEPglAISfD3R1UsTZH-avqiSOJgx_DkpdL9B591w,3917
 parsl/app/futures.py,sha256=XU1NwkoNVsxy3KF5y0Ihsla5hPbhhuSikZInfS7h7Uo,2910
@@ -25,7 +25,7 @@ parsl/channels/local/local.py,sha256=xqH4HnipUN95NgvyB1r33SiqgQKkARgRKmg0_HnumUk
 parsl/channels/oauth_ssh/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/channels/oauth_ssh/oauth_ssh.py,sha256=GrVOpJ6M6BwtGG4zOU4zakyphzuGY5M3suQ8PyjwyOA,3509
 parsl/channels/ssh/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-parsl/channels/ssh/ssh.py,sha256=pwbekDM55dwQHrWwNk5wXcQUAf7cGmRahAwZQ89lxDw,9508
+parsl/channels/ssh/ssh.py,sha256=ga8LMZ9ryTZxbgiyljL5DwusYygbUEe-Frt3SBIMecM,10125
 parsl/channels/ssh_il/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/channels/ssh_il/ssh_il.py,sha256=5XjotlA83UM4zGfnVriC9pE2NzaCT5hqvXZ9v4GG3pg,2410
 parsl/concurrent/__init__.py,sha256=TvIVceJYaJAsxedNBF3Vdo9lEQNHH_j3uxJv0zUjP7w,3288
@@ -62,7 +62,7 @@ parsl/data_provider/staging.py,sha256=ZDZuuFg38pjUStegKPcvPsfGp3iMeReMzfU6DSwtJj
 parsl/data_provider/zip.py,sha256=S4kVuH9lxAegRURYbvIUR7EYYBOccyslaqyCrVWUBhw,4497
 parsl/dataflow/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/dataflow/dependency_resolvers.py,sha256=Om8Dgh7a0ZwgXAc6TlhxLSzvxXHDlNNV1aBNiD3JTNY,3325
-parsl/dataflow/dflow.py,sha256=j2FApaGbY45fL4fyqQEv2pyZ4m1CnHGrAAmPZxqs2kk,67555
+parsl/dataflow/dflow.py,sha256=jgNOIk3xXz90RXwC38ujMz7092XRdLFv5BrMyALYhps,68513
 parsl/dataflow/errors.py,sha256=9SxVhIJY_53FQx8x4OU8UA8nd7lvUbDllH7KfMXpYaY,2177
 parsl/dataflow/futures.py,sha256=08LuP-HFiHBIZmeKCjlsazw_WpQ5fwevrU2_WbidkYw,6080
 parsl/dataflow/memoization.py,sha256=l9uw1Bu50GucBF70M5relpGKFkE4dIM9T3R1KrxW0v0,9583
@@ -77,23 +77,22 @@ parsl/executors/threads.py,sha256=hJt1LzxphqX4fe_9R9Cf1MU0lepWTU_eJe8O665B0Xo,33
 parsl/executors/flux/__init__.py,sha256=P9grTTeRPXfqXurFhlSS7XhmE6tTbnCnyQ1f9b-oYHE,136
 parsl/executors/flux/execute_parsl_task.py,sha256=gRN7F4HhdrKQ-bvn4wXrquBzFOp_9WF88hMIeUaRg5I,1553
 parsl/executors/flux/executor.py,sha256=gPq49CQwtSZYZggLZ0dCXdpUlllKHJbvR8WRKeGh9xE,16977
-parsl/executors/flux/flux_instance_manager.py,sha256=2KVcphlybF-ALYD_3_YjMUi0f5LkjdoJOT_783CW4H0,2036
+parsl/executors/flux/flux_instance_manager.py,sha256=5T3Rp7ZM-mlT0Pf0Gxgs5_YmnaPrSF9ec7zvRfLfYJw,2129
 parsl/executors/high_throughput/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/executors/high_throughput/errors.py,sha256=Sak8e8UpiEcXefUjMHbhyXc4Rn7kJtOoh7L8wreBQdk,1638
-parsl/executors/high_throughput/executor.py,sha256=XO0QkRdQIXYUOdabTTIJ6HIlMai0Tvu78bYHMFT-tNc,37061
+parsl/executors/high_throughput/executor.py,sha256=bCtw_p2f1ztnqQiChKJBOiPyc6aKK39yRXSp5uFpRzk,38185
 parsl/executors/high_throughput/interchange.py,sha256=IRuiaBmks_R4cU-Sx0Q_Fjv4PdFtzU05GiPdeJstOoA,30578
 parsl/executors/high_throughput/manager_record.py,sha256=9XppKjDW0DJ7SMkPNxsiDs-HvXGPLrTg6Ceyh4b6gNs,433
 parsl/executors/high_throughput/monitoring_info.py,sha256=HC0drp6nlXQpAop5PTUKNjdXMgtZVvrBL0JzZJebPP4,298
-parsl/executors/high_throughput/mpi_executor.py,sha256=B2CR1pHaGQzIwTrQ-_i08NZG-NwS6yr8y7nxPaa_rkA,3760
+parsl/executors/high_throughput/mpi_executor.py,sha256=V07t1GOzFhcwdlZGuYUPqc1NarSr-vUbsNzbK4Cj0m8,3882
 parsl/executors/high_throughput/mpi_prefix_composer.py,sha256=hah_IznfFqk-rzuHWmg6aiF_saiDRrpW-aSo4kH9Nso,4854
 parsl/executors/high_throughput/mpi_resource_management.py,sha256=LFBbJ3BnzTcY_v-jNu30uoIB2Enk4cleN4ygY3dncjY,8194
 parsl/executors/high_throughput/probe.py,sha256=TNpGTXb4_DEeg_h-LHu4zEKi1-hffboxvKcZUl2OZGk,2751
-parsl/executors/high_throughput/process_worker_pool.py,sha256=ROTp8v1i_07OtrC1Qfcn0Qe2vXiGFuO38wcVQFnA8UM,41893
+parsl/executors/high_throughput/process_worker_pool.py,sha256=P1ZqQOyEpfvXxtfsevGpJvPH_PIxso3Mh0u8PyRbwD8,42958
 parsl/executors/high_throughput/zmq_pipes.py,sha256=tAjQB3aNVMuTXziN3dbJWre46YpXgliD55qMBbhYTLU,8581
 parsl/executors/radical/__init__.py,sha256=CKbtV2numw5QvgIBq1htMUrt9TqDCIC2zifyf2svTNU,186
-parsl/executors/radical/executor.py,sha256=hCNVz2rbbWqNsx9K8LhUYHWfT3pGPe_acqUiTA74AI8,20997
-parsl/executors/radical/rpex_master.py,sha256=XkduTQqt-o7aG8_uCbG56hUmL5UkIs-lY6As5hChmJo,1367
-parsl/executors/radical/rpex_resources.py,sha256=o-jNQ49e-gB7px2uiRkXcVjC8RebTrGH5eryjzhQwKM,4804
+parsl/executors/radical/executor.py,sha256=426cMt6d8uJFZ_7Ub1kCslaND4OKtBX5WZdz-0RXjMk,22554
+parsl/executors/radical/rpex_resources.py,sha256=Q7-0u3K447LBCe2y7mVcdw6jqWI7SdPXxCKhkr6FoRQ,5139
 parsl/executors/radical/rpex_worker.py,sha256=qli6i6ejKubTSv3lAE3YiW8RlkHrfl4Jhrq3jA45mOw,1869
 parsl/executors/taskvine/__init__.py,sha256=9rwp3M8B0YyEhZMLO0RHaNw7u1nc01WHbXLqnBTanu0,293
 parsl/executors/taskvine/errors.py,sha256=euIYkSslrNSI85kyi2s0xzOaO9ik4c1fYHstMIeiBJk,652
@@ -175,7 +174,7 @@ parsl/providers/grid_engine/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMp
 parsl/providers/grid_engine/grid_engine.py,sha256=jTQjKaJh4eEXGbhrrCcXFV4AVFo2t39iVpslDR8gF6o,8565
 parsl/providers/grid_engine/template.py,sha256=a7iViKr8LXcFTPmsf_qQeVK5o_RekOAIlUOF0X1q-2M,273
 parsl/providers/kubernetes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-parsl/providers/kubernetes/kube.py,sha256=K9-HsdjOJ_kKqUbqnSu4CjuZj9CjtJO8LDLIWIETiz0,14485
+parsl/providers/kubernetes/kube.py,sha256=umXdZqrpAIWh7OLsqZyF1L5AhmqmdpzKb3xTV3atKuk,14406
 parsl/providers/kubernetes/template.py,sha256=VsRz6cmNaII-y4OdMT6sCwzQy95SJX6NMB0hmmFBhX4,50
 parsl/providers/local/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/providers/local/local.py,sha256=pTEcl9NnjRcL8FHcMeMEtJj1IXiAOxZ2Cih97Q5jDPY,11388
@@ -298,6 +297,7 @@ parsl/tests/test_bash_apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJ
 parsl/tests/test_bash_apps/test_apptimeout.py,sha256=ha9O9WnVxeDrUFmqvEcqkk7ymPms1ju1henzkbOVyV8,570
 parsl/tests/test_bash_apps/test_basic.py,sha256=HGzJKtETnUxHQwPaTDuZTPMtIX3lSqtidqLxPn2IV8U,2460
 parsl/tests/test_bash_apps/test_error_codes.py,sha256=jJ3BwhFpvTGKElKyuiCMWFeBaVeIoWlJkiulWRA2nSE,3961
+parsl/tests/test_bash_apps/test_inputs_default.py,sha256=6UYdi8bqbTS-cx2WB2v5lVA9smTL55Sl3wgkQvlxJ0I,563
 parsl/tests/test_bash_apps/test_keyword_overlaps.py,sha256=8bfN2qw4uXJsYquppR1lZQrYW834AZc3zjYIIHTfDoE,209
 parsl/tests/test_bash_apps/test_kwarg_storage.py,sha256=OMMD3sKSngBSjVCHK9wju0hHzszOqbYuWtscyMuh5_8,720
 parsl/tests/test_bash_apps/test_memoize.py,sha256=gFhDNFxdRv8DNtErbwtdEvAph6SDFPaWY0tABZGS4I4,1383
@@ -308,6 +308,7 @@ parsl/tests/test_bash_apps/test_pipeline.py,sha256=1kQDD8-Dh5H9SKFcKHzN_mSrdxAV_
 parsl/tests/test_bash_apps/test_std_uri.py,sha256=CvAt8BUhNl2pA5chq9YyhkD6eo2IUH6PjWfe3SQ-YRU,3752
 parsl/tests/test_bash_apps/test_stdout.py,sha256=hrzHXLt308qH2Gg_r0-qy5nFBNXI56vCZQBXIIocCPY,3198
 parsl/tests/test_channels/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+parsl/tests/test_channels/test_dfk_close.py,sha256=n7IF3Ud_vejg0VNRnvEgxCLmwMvPVvLbXvJdw-Mz_lw,628
 parsl/tests/test_channels/test_large_output.py,sha256=PGeNSW_sN5mR7KF1hVL2CPfktydYxo4oNz1wVQ-ENN0,595
 parsl/tests/test_checkpointing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/tests/test_checkpointing/test_periodic.py,sha256=nfMgrG7sZ8rkMu6iOHS6lp_iTU4IsOyQLQ2Gur_FMmE,1509
@@ -343,7 +344,7 @@ parsl/tests/test_htex/test_connected_blocks.py,sha256=gaXZSr__pIaLvKY6rF-4r1p_4d
 parsl/tests/test_htex/test_cpu_affinity_explicit.py,sha256=DVHrRCskDbJIrfB5YSi3ZSbfR4WzijA46aZfZzjNcrU,1382
 parsl/tests/test_htex/test_disconnected_blocks.py,sha256=3V1Ol9gMS6knjLTgIjB5GrunRSp4ANsJ_2vAvpyMR6c,1858
 parsl/tests/test_htex/test_drain.py,sha256=Z2Z5-3NfLL9tMgJh4JkVKLZZDl3Z2gDAbEFHDSGdItw,2288
-parsl/tests/test_htex/test_htex.py,sha256=-gAD-c2h9EpgYG52IN4AUVBbsWnVD31-bIdaNffoGUY,4524
+parsl/tests/test_htex/test_htex.py,sha256=qnJ1LjCC2c75BOxZ4CRA7pEX2RrFKG-fWIuPiW6w9k4,5005
 parsl/tests/test_htex/test_manager_failure.py,sha256=N-obuSZ8f7XA_XcddoN2LWKSVtpKUZvTHb7BFelS3iQ,1143
 parsl/tests/test_htex/test_managers_command.py,sha256=Y-eUjtBzwW9erCYdph9bOesbkUvX8QUPqXt27DCgVS8,951
 parsl/tests/test_htex/test_missing_worker.py,sha256=gyp5i7_t-JHyJGtz_eXZKKBY5w8oqLOIxO6cJgGJMtQ,745
@@ -370,7 +371,7 @@ parsl/tests/test_mpi_apps/test_mpiex.py,sha256=DcvfDZT_WnwSzL5IF71JPbV_wEI_hZl_W
 parsl/tests/test_mpi_apps/test_resource_spec.py,sha256=A7NwNT4LalCSOiHws1ALrrWy8Mn1IItpv9olhnRVjs0,3987
 parsl/tests/test_providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/tests/test_providers/test_cobalt_deprecation_warning.py,sha256=UN2W6xJxuLx2euPqArORKFEU2VXez9_PYqq-0rZHanQ,391
-parsl/tests/test_providers/test_local_provider.py,sha256=LsqAr0_I2uJFQ1awYl0mGOFajFPsF46B9iyJg7KmP7s,6969
+parsl/tests/test_providers/test_local_provider.py,sha256=XCAy64oM3IZ6k0RYBIr6s-D2LL7gr6_xXZ_3Pv1D0gM,7076
 parsl/tests/test_providers/test_pbspro_template.py,sha256=-bi1vags9yyNfpBxtjTqFjzMIg1VVPyf2M958UcXWmA,855
 parsl/tests/test_providers/test_slurm_instantiate.py,sha256=eW3pEZRIzZO1-eKFrBc7N5uoN5otwghgbqut74Kyqoc,500
 parsl/tests/test_providers/test_slurm_template.py,sha256=pBEeimO-vGbMmC1QT7BP7s5BH6fFeqaWnI4f6tWPFEo,901
@@ -389,6 +390,7 @@ parsl/tests/test_python_apps/test_fibonacci_recursive.py,sha256=q7LMFcu_pJSNPdz8
 parsl/tests/test_python_apps/test_futures.py,sha256=EWnzmPn5sVCgeMxc0Uz2ieaaVYr98tFZ7g8YJFqYuC8,2355
 parsl/tests/test_python_apps/test_garbage_collect.py,sha256=RPntrLuzPkeNbhS7mmqEnHbyOcuV1YVppgZ8BaX-h84,1076
 parsl/tests/test_python_apps/test_import_fail.py,sha256=Vd8IMa_UsbHYkr3IGnS-rgGb6zKxB1tOTqMZY5lc_xY,691
+parsl/tests/test_python_apps/test_inputs_default.py,sha256=J2GR1NgdvEucNSJkfO6GC5OoMiuvSzO0tASCowT8HM0,436
 parsl/tests/test_python_apps/test_join.py,sha256=qnwdPYC_uIS5hQ2jmU2nIP_3P_TaMY8Av1ut10EZA_M,2678
 parsl/tests/test_python_apps/test_lifted.py,sha256=Na6qC_dZSeYJcZdkGn-dCjgYkQV267HmGFfaqFcRVcQ,3408
 parsl/tests/test_python_apps/test_mapred.py,sha256=C7nTl0NsP_2TCtcmZXWFMpvAG4pwGswrIJKr-5sRUNY,786
@@ -465,13 +467,13 @@ parsl/usage_tracking/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
 parsl/usage_tracking/api.py,sha256=iaCY58Dc5J4UM7_dJzEEs871P1p1HdxBMtNGyVdzc9g,1821
 parsl/usage_tracking/levels.py,sha256=xbfzYEsd55KiZJ-mzNgPebvOH4rRHum04hROzEf41tU,291
 parsl/usage_tracking/usage.py,sha256=qNEJ7nPimqd3Y7OWFLdYmNwJ6XDKlyfV_fTzasxsQw8,8690
-parsl-2024.7.1.data/scripts/exec_parsl_function.py,sha256=RUkJ4JSJAjr7YyRZ58zhMdg8cR5dVV9odUl3AuzNf3k,7802
-parsl-2024.7.1.data/scripts/interchange.py,sha256=n0aOHLX64DEWx-OA4vWrYRVZfmaz8Rc8haNtafbgh4k,30565
-parsl-2024.7.1.data/scripts/parsl_coprocess.py,sha256=zrVjEqQvFOHxsLufPi00xzMONagjVwLZbavPM7bbjK4,5722
-parsl-2024.7.1.data/scripts/process_worker_pool.py,sha256=weug6_LAMbqEKQhiI6ZMg8r3e-XBDw1-L5_COEt7caM,41879
-parsl-2024.7.1.dist-info/LICENSE,sha256=tAkwu8-AdEyGxGoSvJ2gVmQdcicWw3j1ZZueVV74M-E,11357
-parsl-2024.7.1.dist-info/METADATA,sha256=TQ_3YOcktX94s5XnASKWQNt2X2bp6pQHIG3ocx71qTY,4123
-parsl-2024.7.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-parsl-2024.7.1.dist-info/entry_points.txt,sha256=XqnsWDYoEcLbsMcpnYGKLEnSBmaIe1YoM5YsBdJG2tI,176
-parsl-2024.7.1.dist-info/top_level.txt,sha256=PIheYoUFQtF2icLsgOykgU-Cjuwr2Oi6On2jo5RYgRM,6
-parsl-2024.7.1.dist-info/RECORD,,
+parsl-2024.7.15.data/scripts/exec_parsl_function.py,sha256=RUkJ4JSJAjr7YyRZ58zhMdg8cR5dVV9odUl3AuzNf3k,7802
+parsl-2024.7.15.data/scripts/interchange.py,sha256=n0aOHLX64DEWx-OA4vWrYRVZfmaz8Rc8haNtafbgh4k,30565
+parsl-2024.7.15.data/scripts/parsl_coprocess.py,sha256=zrVjEqQvFOHxsLufPi00xzMONagjVwLZbavPM7bbjK4,5722
+parsl-2024.7.15.data/scripts/process_worker_pool.py,sha256=pfIQ_JzqjviaiTfVI49qw4qy8FBS8AavN_12oL8DyzE,42944
+parsl-2024.7.15.dist-info/LICENSE,sha256=tAkwu8-AdEyGxGoSvJ2gVmQdcicWw3j1ZZueVV74M-E,11357
+parsl-2024.7.15.dist-info/METADATA,sha256=bagqkFFK8EeAICbm5afqQ4--DJWNZ_900VszWxbxsZk,4124
+parsl-2024.7.15.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+parsl-2024.7.15.dist-info/entry_points.txt,sha256=XqnsWDYoEcLbsMcpnYGKLEnSBmaIe1YoM5YsBdJG2tI,176
+parsl-2024.7.15.dist-info/top_level.txt,sha256=PIheYoUFQtF2icLsgOykgU-Cjuwr2Oi6On2jo5RYgRM,6
+parsl-2024.7.15.dist-info/RECORD,,
parsl/executors/radical/rpex_master.py DELETED
@@ -1,41 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-
-import radical.pilot as rp
-import radical.utils as ru
-
-# ------------------------------------------------------------------------------
-#
-if __name__ == '__main__':
-
-    # The purpose of this master is to (a) spawn a set or workers
-    # within the same allocation, (b) to distribute work items to
-    # those workers, and (c) to collect the responses again.
-    cfg_fname = str(sys.argv[1])
-    cfg = ru.Config(cfg=ru.read_json(cfg_fname))
-    cfg.rank = int(sys.argv[2])
-
-    worker_descr = cfg.worker_descr
-    n_workers = cfg.n_workers
-    gpus_per_node = cfg.gpus_per_node
-    cores_per_node = cfg.cores_per_node
-    nodes_per_worker = cfg.nodes_per_worker
-
-    # create a master class instance - this will establish communication
-    # to the pilot agent
-    master = rp.raptor.Master(cfg)
-
-    # insert `n` worker into the agent. The agent will schedule (place)
-    # those workers and execute them.
-    worker_descr['ranks'] = nodes_per_worker * cores_per_node
-    worker_descr['gpus_per_rank'] = nodes_per_worker * gpus_per_node
-    worker_ids = master.submit_workers(
-        [rp.TaskDescription(worker_descr) for _ in range(n_workers)])
-
-    # wait for all workers
-    master.wait_workers()
-    master.start()
-    master.join()
-
-# ------------------------------------------------------------------------------