parsl 2024.8.12-py3-none-any.whl → 2024.8.26-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (33)
  1. parsl/channels/oauth_ssh/oauth_ssh.py +10 -2
  2. parsl/channels/ssh/ssh.py +16 -6
  3. parsl/channels/ssh_il/ssh_il.py +12 -2
  4. parsl/executors/high_throughput/executor.py +18 -27
  5. parsl/executors/high_throughput/interchange.py +31 -29
  6. parsl/executors/high_throughput/mpi_executor.py +23 -2
  7. parsl/executors/high_throughput/mpi_prefix_composer.py +5 -4
  8. parsl/executors/status_handling.py +5 -2
  9. parsl/jobs/states.py +6 -1
  10. parsl/monitoring/db_manager.py +21 -65
  11. parsl/monitoring/monitoring.py +10 -23
  12. parsl/monitoring/router.py +12 -39
  13. parsl/providers/slurm/slurm.py +40 -10
  14. parsl/tests/test_htex/test_multiple_disconnected_blocks.py +3 -5
  15. parsl/tests/test_htex/test_resource_spec_validation.py +40 -0
  16. parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +1 -1
  17. parsl/tests/test_mpi_apps/test_bad_mpi_config.py +29 -14
  18. parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +16 -8
  19. parsl/tests/test_mpi_apps/test_mpiex.py +2 -3
  20. parsl/tests/test_mpi_apps/test_resource_spec.py +39 -41
  21. parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py +85 -0
  22. parsl/version.py +1 -1
  23. {parsl-2024.8.12.data → parsl-2024.8.26.data}/scripts/interchange.py +31 -29
  24. {parsl-2024.8.12.dist-info → parsl-2024.8.26.dist-info}/METADATA +5 -3
  25. {parsl-2024.8.12.dist-info → parsl-2024.8.26.dist-info}/RECORD +32 -31
  26. parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +0 -47
  27. {parsl-2024.8.12.data → parsl-2024.8.26.data}/scripts/exec_parsl_function.py +0 -0
  28. {parsl-2024.8.12.data → parsl-2024.8.26.data}/scripts/parsl_coprocess.py +0 -0
  29. {parsl-2024.8.12.data → parsl-2024.8.26.data}/scripts/process_worker_pool.py +0 -0
  30. {parsl-2024.8.12.dist-info → parsl-2024.8.26.dist-info}/LICENSE +0 -0
  31. {parsl-2024.8.12.dist-info → parsl-2024.8.26.dist-info}/WHEEL +0 -0
  32. {parsl-2024.8.12.dist-info → parsl-2024.8.26.dist-info}/entry_points.txt +0 -0
  33. {parsl-2024.8.12.dist-info → parsl-2024.8.26.dist-info}/top_level.txt +0 -0
parsl/monitoring/db_manager.py
@@ -1,11 +1,14 @@
  import datetime
  import logging
+ import multiprocessing.queues as mpq
  import os
  import queue
  import threading
  import time
  from typing import Any, Dict, List, Optional, Set, Tuple, TypeVar, cast

+ import typeguard
+
  from parsl.dataflow.states import States
  from parsl.errors import OptionalModuleMissing
  from parsl.log_utils import set_file_logger
@@ -305,39 +308,13 @@ class DatabaseManager:
  self.pending_resource_queue: queue.Queue[MonitoringMessage] = queue.Queue()

  def start(self,
- priority_queue: "queue.Queue[TaggedMonitoringMessage]",
- node_queue: "queue.Queue[MonitoringMessage]",
- block_queue: "queue.Queue[MonitoringMessage]",
- resource_queue: "queue.Queue[MonitoringMessage]") -> None:
+ resource_queue: mpq.Queue) -> None:

  self._kill_event = threading.Event()
- self._priority_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal,
- args=(
- priority_queue, 'priority', self._kill_event,),
- name="Monitoring-migrate-priority",
- daemon=True,
- )
- self._priority_queue_pull_thread.start()
-
- self._node_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal,
- args=(
- node_queue, 'node', self._kill_event,),
- name="Monitoring-migrate-node",
- daemon=True,
- )
- self._node_queue_pull_thread.start()
-
- self._block_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal,
- args=(
- block_queue, 'block', self._kill_event,),
- name="Monitoring-migrate-block",
- daemon=True,
- )
- self._block_queue_pull_thread.start()

  self._resource_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal,
  args=(
- resource_queue, 'resource', self._kill_event,),
+ resource_queue, self._kill_event,),
  name="Monitoring-migrate-resource",
  daemon=True,
  )
@@ -369,20 +346,18 @@ class DatabaseManager:
  while (not self._kill_event.is_set() or
  self.pending_priority_queue.qsize() != 0 or self.pending_resource_queue.qsize() != 0 or
  self.pending_node_queue.qsize() != 0 or self.pending_block_queue.qsize() != 0 or
- priority_queue.qsize() != 0 or resource_queue.qsize() != 0 or
- node_queue.qsize() != 0 or block_queue.qsize() != 0):
+ resource_queue.qsize() != 0):

  """
  WORKFLOW_INFO and TASK_INFO messages (i.e. priority messages)

  """
  try:
- logger.debug("""Checking STOP conditions: {}, {}, {}, {}, {}, {}, {}, {}, {}""".format(
+ logger.debug("""Checking STOP conditions: {}, {}, {}, {}, {}, {}""".format(
  self._kill_event.is_set(),
  self.pending_priority_queue.qsize() != 0, self.pending_resource_queue.qsize() != 0,
  self.pending_node_queue.qsize() != 0, self.pending_block_queue.qsize() != 0,
- priority_queue.qsize() != 0, resource_queue.qsize() != 0,
- node_queue.qsize() != 0, block_queue.qsize() != 0))
+ resource_queue.qsize() != 0))

  # This is the list of resource messages which can be reprocessed as if they
  # had just arrived because the corresponding first task message has been
@@ -574,43 +549,26 @@ class DatabaseManager:
  raise RuntimeError("An exception happened sometime during database processing and should have been logged in database_manager.log")

  @wrap_with_logs(target="database_manager")
- def _migrate_logs_to_internal(self, logs_queue: queue.Queue, queue_tag: str, kill_event: threading.Event) -> None:
- logger.info("Starting processing for queue {}".format(queue_tag))
+ def _migrate_logs_to_internal(self, logs_queue: queue.Queue, kill_event: threading.Event) -> None:
+ logger.info("Starting _migrate_logs_to_internal")

  while not kill_event.is_set() or logs_queue.qsize() != 0:
- logger.debug("""Checking STOP conditions for {} threads: {}, {}"""
- .format(queue_tag, kill_event.is_set(), logs_queue.qsize() != 0))
+ logger.debug("Checking STOP conditions: kill event: %s, queue has entries: %s",
+ kill_event.is_set(), logs_queue.qsize() != 0)
  try:
  x, addr = logs_queue.get(timeout=0.1)
  except queue.Empty:
  continue
  else:
- if queue_tag == 'priority' and x == 'STOP':
+ if x == 'STOP':
  self.close()
- elif queue_tag == 'priority': # implicitly not 'STOP'
- assert isinstance(x, tuple)
- assert len(x) == 2
- assert x[0] in [MessageType.WORKFLOW_INFO, MessageType.TASK_INFO], \
- "_migrate_logs_to_internal can only migrate WORKFLOW_,TASK_INFO message from priority queue, got x[0] == {}".format(x[0])
- self._dispatch_to_internal(x)
- elif queue_tag == 'resource':
- assert isinstance(x, tuple), "_migrate_logs_to_internal was expecting a tuple, got {}".format(x)
- assert x[0] == MessageType.RESOURCE_INFO, (
- "_migrate_logs_to_internal can only migrate RESOURCE_INFO message from resource queue, "
- "got tag {}, message {}".format(x[0], x)
- )
- self._dispatch_to_internal(x)
- elif queue_tag == 'node':
- assert len(x) == 2, "expected message tuple to have exactly two elements"
- assert x[0] == MessageType.NODE_INFO, "_migrate_logs_to_internal can only migrate NODE_INFO messages from node queue"
-
- self._dispatch_to_internal(x)
- elif queue_tag == "block":
- self._dispatch_to_internal(x)
  else:
- logger.error(f"Discarding because unknown queue tag '{queue_tag}', message: {x}")
+ self._dispatch_to_internal(x)

  def _dispatch_to_internal(self, x: Tuple) -> None:
+ assert isinstance(x, tuple)
+ assert len(x) == 2, "expected message tuple to have exactly two elements"
+
  if x[0] in [MessageType.WORKFLOW_INFO, MessageType.TASK_INFO]:
  self.pending_priority_queue.put(cast(Any, x))
  elif x[0] == MessageType.RESOURCE_INFO:
@@ -719,11 +677,9 @@ class DatabaseManager:


  @wrap_with_logs(target="database_manager")
- def dbm_starter(exception_q: "queue.Queue[Tuple[str, str]]",
- priority_msgs: "queue.Queue[TaggedMonitoringMessage]",
- node_msgs: "queue.Queue[MonitoringMessage]",
- block_msgs: "queue.Queue[MonitoringMessage]",
- resource_msgs: "queue.Queue[MonitoringMessage]",
+ @typeguard.typechecked
+ def dbm_starter(exception_q: mpq.Queue,
+ resource_msgs: mpq.Queue,
  db_url: str,
  logdir: str,
  logging_level: int) -> None:
@@ -739,7 +695,7 @@ def dbm_starter(exception_q: "queue.Queue[Tuple[str, str]]",
  logdir=logdir,
  logging_level=logging_level)
  logger.info("Starting dbm in dbm starter")
- dbm.start(priority_msgs, node_msgs, block_msgs, resource_msgs)
+ dbm.start(resource_msgs)
  except KeyboardInterrupt:
  logger.exception("KeyboardInterrupt signal caught")
  dbm.close()
parsl/monitoring/monitoring.py
@@ -7,7 +7,7 @@ import queue
  import time
  from multiprocessing import Event, Process
  from multiprocessing.queues import Queue
- from typing import TYPE_CHECKING, Any, Optional, Tuple, Union, cast
+ from typing import TYPE_CHECKING, Any, Literal, Optional, Tuple, Union, cast

  import typeguard

@@ -138,25 +138,18 @@ class MonitoringHub(RepresentationMixin):
  self.exception_q: Queue[Tuple[str, str]]
  self.exception_q = SizedQueue(maxsize=10)

- self.priority_msgs: Queue[Tuple[Any, int]]
- self.priority_msgs = SizedQueue()
-
- self.resource_msgs: Queue[AddressedMonitoringMessage]
+ self.resource_msgs: Queue[Union[AddressedMonitoringMessage, Tuple[Literal["STOP"], Literal[0]]]]
  self.resource_msgs = SizedQueue()

- self.node_msgs: Queue[AddressedMonitoringMessage]
- self.node_msgs = SizedQueue()
-
- self.block_msgs: Queue[AddressedMonitoringMessage]
- self.block_msgs = SizedQueue()
-
  self.router_exit_event: ms.Event
  self.router_exit_event = Event()

  self.router_proc = ForkProcess(target=router_starter,
- args=(comm_q, self.exception_q, self.priority_msgs, self.node_msgs,
- self.block_msgs, self.resource_msgs, self.router_exit_event),
- kwargs={"hub_address": self.hub_address,
+ kwargs={"comm_q": comm_q,
+ "exception_q": self.exception_q,
+ "resource_msgs": self.resource_msgs,
+ "exit_event": self.router_exit_event,
+ "hub_address": self.hub_address,
  "udp_port": self.hub_port,
  "zmq_port_range": self.hub_port_range,
  "logdir": self.logdir,
@@ -168,7 +161,7 @@ class MonitoringHub(RepresentationMixin):
  self.router_proc.start()

  self.dbm_proc = ForkProcess(target=dbm_starter,
- args=(self.exception_q, self.priority_msgs, self.node_msgs, self.block_msgs, self.resource_msgs,),
+ args=(self.exception_q, self.resource_msgs,),
  kwargs={"logdir": self.logdir,
  "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
  "db_url": self.logging_endpoint,
@@ -187,7 +180,7 @@ class MonitoringHub(RepresentationMixin):
  self.filesystem_proc.start()
  logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")

- self.radio = MultiprocessingQueueRadioSender(self.block_msgs)
+ self.radio = MultiprocessingQueueRadioSender(self.resource_msgs)

  try:
  comm_q_result = comm_q.get(block=True, timeout=120)
@@ -244,7 +237,7 @@ class MonitoringHub(RepresentationMixin):
  logger.debug("Finished waiting for router termination")
  if len(exception_msgs) == 0:
  logger.debug("Sending STOP to DBM")
- self.priority_msgs.put(("STOP", 0))
+ self.resource_msgs.put(("STOP", 0))
  else:
  logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
  logger.debug("Waiting for DB termination")
@@ -262,14 +255,8 @@ class MonitoringHub(RepresentationMixin):
  logger.info("Closing monitoring multiprocessing queues")
  self.exception_q.close()
  self.exception_q.join_thread()
- self.priority_msgs.close()
- self.priority_msgs.join_thread()
  self.resource_msgs.close()
  self.resource_msgs.join_thread()
- self.node_msgs.close()
- self.node_msgs.join_thread()
- self.block_msgs.close()
- self.block_msgs.join_thread()
  logger.info("Closed monitoring multiprocessing queues")

parsl/monitoring/router.py
@@ -1,19 +1,19 @@
  from __future__ import annotations

  import logging
+ import multiprocessing.queues as mpq
  import os
  import pickle
- import queue
  import socket
  import threading
  import time
  from multiprocessing.synchronize import Event
- from typing import Optional, Tuple, Union
+ from typing import Optional, Tuple

+ import typeguard
  import zmq

  from parsl.log_utils import set_file_logger
- from parsl.monitoring.message_type import MessageType
  from parsl.monitoring.types import AddressedMonitoringMessage, TaggedMonitoringMessage
  from parsl.process_loggers import wrap_with_logs
  from parsl.utils import setproctitle
@@ -33,10 +33,7 @@ class MonitoringRouter:
  logdir: str = ".",
  logging_level: int = logging.INFO,
  atexit_timeout: int = 3, # in seconds
- priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
- node_msgs: "queue.Queue[AddressedMonitoringMessage]",
- block_msgs: "queue.Queue[AddressedMonitoringMessage]",
- resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
+ resource_msgs: mpq.Queue,
  exit_event: Event,
  ):
  """ Initializes a monitoring configuration class.
@@ -56,8 +53,8 @@ class MonitoringRouter:
  Logging level as defined in the logging module. Default: logging.INFO
  atexit_timeout : float, optional
  The amount of time in seconds to terminate the hub without receiving any messages, after the last dfk workflow message is received.
- *_msgs : Queue
- Four multiprocessing queues to receive messages, routed by type tag, and sometimes modified according to type tag.
+ resource_msgs : multiprocessing.Queue
+ A multiprocessing queue to receive messages to be routed onwards to the database process

  exit_event : Event
  An event that the main Parsl process will set to signal that the monitoring router should shut down.
@@ -101,9 +98,6 @@ class MonitoringRouter:
  min_port=zmq_port_range[0],
  max_port=zmq_port_range[1])

- self.priority_msgs = priority_msgs
- self.node_msgs = node_msgs
- self.block_msgs = block_msgs
  self.resource_msgs = resource_msgs
  self.exit_event = exit_event

@@ -169,24 +163,7 @@ class MonitoringRouter:
  msg_0: AddressedMonitoringMessage
  msg_0 = (msg, 0)

- if msg[0] == MessageType.NODE_INFO:
- self.node_msgs.put(msg_0)
- elif msg[0] == MessageType.RESOURCE_INFO:
- self.resource_msgs.put(msg_0)
- elif msg[0] == MessageType.BLOCK_INFO:
- self.block_msgs.put(msg_0)
- elif msg[0] == MessageType.TASK_INFO:
- self.priority_msgs.put(msg_0)
- elif msg[0] == MessageType.WORKFLOW_INFO:
- self.priority_msgs.put(msg_0)
- else:
- # There is a type: ignore here because if msg[0]
- # is of the correct type, this code is unreachable,
- # but there is no verification that the message
- # received from zmq_receiver_channel.recv_pyobj() is actually
- # of that type.
- self.logger.error("Discarding message " # type: ignore[unreachable]
- f"from interchange with unknown type {msg[0].value}")
+ self.resource_msgs.put(msg_0)
  except zmq.Again:
  pass
  except Exception:
@@ -202,12 +179,11 @@ class MonitoringRouter:


  @wrap_with_logs
- def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
- exception_q: "queue.Queue[Tuple[str, str]]",
- priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
- node_msgs: "queue.Queue[AddressedMonitoringMessage]",
- block_msgs: "queue.Queue[AddressedMonitoringMessage]",
- resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
+ @typeguard.typechecked
+ def router_starter(*,
+ comm_q: mpq.Queue,
+ exception_q: mpq.Queue,
+ resource_msgs: mpq.Queue,
  exit_event: Event,

  hub_address: str,
@@ -223,9 +199,6 @@ def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
  zmq_port_range=zmq_port_range,
  logdir=logdir,
  logging_level=logging_level,
- priority_msgs=priority_msgs,
- node_msgs=node_msgs,
- block_msgs=block_msgs,
  resource_msgs=resource_msgs,
  exit_event=exit_event)
  except Exception as e:
parsl/providers/slurm/slurm.py
@@ -20,7 +20,7 @@ from parsl.utils import RepresentationMixin, wtime_to_minutes
  logger = logging.getLogger(__name__)

  # From https://slurm.schedmd.com/sacct.html#SECTION_JOB-STATE-CODES
- translate_table = {
+ sacct_translate_table = {
  'PENDING': JobState.PENDING,
  'RUNNING': JobState.RUNNING,
  'CANCELLED': JobState.CANCELLED,
@@ -37,6 +37,20 @@ translate_table = {
  'REQUEUED': JobState.PENDING
  }

+ squeue_translate_table = {
+ 'PD': JobState.PENDING,
+ 'R': JobState.RUNNING,
+ 'CA': JobState.CANCELLED,
+ 'CF': JobState.PENDING, # (configuring),
+ 'CG': JobState.RUNNING, # (completing),
+ 'CD': JobState.COMPLETED,
+ 'F': JobState.FAILED, # (failed),
+ 'TO': JobState.TIMEOUT, # (timeout),
+ 'NF': JobState.FAILED, # (node failure),
+ 'RV': JobState.FAILED, # (revoked) and
+ 'SE': JobState.FAILED # (special exit state)
+ }
+

  class SlurmProvider(ClusterProvider, RepresentationMixin):
  """Slurm Execution Provider
@@ -155,6 +169,23 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):

  self.regex_job_id = regex_job_id
  self.worker_init = worker_init + '\n'
+ # Check if sacct works and if not fall back to squeue
+ cmd = "sacct -X"
+ logger.debug("Executing %s", cmd)
+ retcode, stdout, stderr = self.execute_wait(cmd)
+ # If sacct fails it should return retcode=1 stderr="Slurm accounting storage is disabled"
+ logger.debug(f"sacct returned retcode={retcode} stderr={stderr}")
+ if retcode == 0:
+ logger.debug("using sacct to get job status")
+ # Using state%20 to get enough characters to not truncate output
+ # of the state. Without output can look like "<job_id> CANCELLED+"
+ self._cmd = "sacct -X --noheader --format=jobid,state%20 --job '{0}'"
+ self._translate_table = sacct_translate_table
+ else:
+ logger.debug(f"sacct failed with retcode={retcode}")
+ logger.debug("falling back to using squeue to get job status")
+ self._cmd = "squeue --noheader --format='%i %t' --job '{0}'"
+ self._translate_table = squeue_translate_table

  def _status(self):
  '''Returns the status list for a list of job_ids
@@ -172,16 +203,14 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
  logger.debug('No active jobs, skipping status update')
  return

- # Using state%20 to get enough characters to not truncate output
- # of the state. Without output can look like "<job_id> CANCELLED+"
- cmd = "sacct -X --noheader --format=jobid,state%20 --job '{0}'".format(job_id_list)
+ cmd = self._cmd.format(job_id_list)
  logger.debug("Executing %s", cmd)
  retcode, stdout, stderr = self.execute_wait(cmd)
- logger.debug("sacct returned %s %s", stdout, stderr)
+ logger.debug("sacct/squeue returned %s %s", stdout, stderr)

  # Execute_wait failed. Do no update
  if retcode != 0:
- logger.warning("sacct failed with non-zero exit code {}".format(retcode))
+ logger.warning("sacct/squeue failed with non-zero exit code {}".format(retcode))
  return

  jobs_missing = set(self.resources.keys())
@@ -193,9 +222,9 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
  # For example "<job_id> CANCELLED by <user_id>"
  # This splits and ignores anything past the first two unpacked values
  job_id, slurm_state, *ignore = line.split()
- if slurm_state not in translate_table:
+ if slurm_state not in self._translate_table:
  logger.warning(f"Slurm status {slurm_state} is not recognized")
- status = translate_table.get(slurm_state, JobState.UNKNOWN)
+ status = self._translate_table.get(slurm_state, JobState.UNKNOWN)
  logger.debug("Updating job {} with slurm status {} to parsl state {!s}".format(job_id, slurm_state, status))
  self.resources[job_id]['status'] = JobStatus(status,
  stdout_path=self.resources[job_id]['job_stdout_path'],
@@ -203,9 +232,10 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
  jobs_missing.remove(job_id)

  # sacct can get job info after jobs have completed so this path shouldn't be hit
- # log a warning if there are missing jobs for some reason
+ # squeue does not report on jobs that are not running. So we are filling in the
+ # blanks for missing jobs, we might lose some information about why the jobs failed.
  for missing_job in jobs_missing:
- logger.warning("Updating missing job {} to completed status".format(missing_job))
+ logger.debug("Updating missing job {} to completed status".format(missing_job))
  self.resources[missing_job]['status'] = JobStatus(
  JobState.COMPLETED, stdout_path=self.resources[missing_job]['job_stdout_path'],
  stderr_path=self.resources[missing_job]['job_stderr_path'])
parsl/tests/test_htex/test_multiple_disconnected_blocks.py
@@ -21,16 +21,14 @@ def local_config():
  poll_period=100,
  max_workers_per_node=1,
  provider=LocalProvider(
- worker_init="conda deactivate; export PATH=''; which python; exit 0",
- init_blocks=2,
- max_blocks=4,
- min_blocks=0,
+ worker_init="exit 0",
+ init_blocks=2
  ),
  )
  ],
  run_dir="/tmp/test_htex",
  max_idletime=0.5,
- strategy='htex_auto_scale',
+ strategy='none',
  )

parsl/tests/test_htex/test_resource_spec_validation.py
@@ -0,0 +1,40 @@
+ import queue
+ from unittest import mock
+
+ import pytest
+
+ from parsl.executors import HighThroughputExecutor
+ from parsl.executors.high_throughput.mpi_prefix_composer import (
+ InvalidResourceSpecification,
+ )
+
+
+ def double(x):
+ return x * 2
+
+
+ @pytest.mark.local
+ def test_submit_calls_validate():
+
+ htex = HighThroughputExecutor()
+ htex.outgoing_q = mock.Mock(spec=queue.Queue)
+ htex.validate_resource_spec = mock.Mock(spec=htex.validate_resource_spec)
+
+ res_spec = {}
+ htex.submit(double, res_spec, (5,), {})
+ htex.validate_resource_spec.assert_called()
+
+
+ @pytest.mark.local
+ def test_resource_spec_validation():
+ htex = HighThroughputExecutor()
+ ret_val = htex.validate_resource_spec({})
+ assert ret_val is None
+
+
+ @pytest.mark.local
+ def test_resource_spec_validation_bad_keys():
+ htex = HighThroughputExecutor()
+
+ with pytest.raises(InvalidResourceSpecification):
+ htex.validate_resource_spec({"num_nodes": 2})
parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py
@@ -78,6 +78,6 @@ def test_row_counts(tmpd_cwd, strategy):
  (c, ) = result.first()
  assert c == 1, "There should be a single pending status"

- result = connection.execute(text("SELECT COUNT(*) FROM block WHERE block_id = 0 AND status = 'CANCELLED' AND run_id = :run_id"), binds)
+ result = connection.execute(text("SELECT COUNT(*) FROM block WHERE block_id = 0 AND status = 'SCALED_IN' AND run_id = :run_id"), binds)
  (c, ) = result.first()
  assert c == 1, "There should be a single cancelled status"
parsl/tests/test_mpi_apps/test_bad_mpi_config.py
@@ -1,33 +1,48 @@
  import pytest

  from parsl import Config
- from parsl.executors import HighThroughputExecutor
+ from parsl.executors import MPIExecutor
  from parsl.launchers import AprunLauncher, SimpleLauncher, SrunLauncher
  from parsl.providers import SlurmProvider


  @pytest.mark.local
- def test_bad_launcher_with_mpi_mode():
- """AssertionError if a launcher other than SimpleLauncher is supplied"""
+ def test_bad_launcher():
+ """TypeError if a launcher other than SimpleLauncher is supplied"""

  for launcher in [SrunLauncher(), AprunLauncher()]:
- with pytest.raises(AssertionError):
+ with pytest.raises(TypeError):
  Config(executors=[
- HighThroughputExecutor(
- enable_mpi_mode=True,
+ MPIExecutor(
  provider=SlurmProvider(launcher=launcher),
  )
  ])


  @pytest.mark.local
- def test_correct_launcher_with_mpi_mode():
+ def test_bad_mpi_launcher():
+ """ValueError if an unsupported mpi_launcher is specified"""
+
+ with pytest.raises(ValueError):
+ Config(executors=[
+ MPIExecutor(
+ mpi_launcher="bad_launcher",
+ provider=SlurmProvider(launcher=SimpleLauncher()),
+ )
+ ])
+
+
+ @pytest.mark.local
+ @pytest.mark.parametrize(
+ "mpi_launcher",
+ ["srun", "aprun", "mpiexec"]
+ )
+ def test_correct_launcher_with_mpi_mode(mpi_launcher: str):
  """Confirm that SimpleLauncher works with mpi_mode"""

- config = Config(executors=[
- HighThroughputExecutor(
- enable_mpi_mode=True,
- provider=SlurmProvider(launcher=SimpleLauncher()),
- )
- ])
- assert isinstance(config.executors[0].provider.launcher, SimpleLauncher)
+ executor = MPIExecutor(
+ mpi_launcher=mpi_launcher,
+ provider=SlurmProvider(launcher=SimpleLauncher()),
+ )
+
+ assert isinstance(executor.provider.launcher, SimpleLauncher)
parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py
@@ -6,26 +6,34 @@ from typing import Dict
  import pytest

  import parsl
- from parsl import bash_app, python_app
+ from parsl import Config, bash_app, python_app
+ from parsl.executors import MPIExecutor
  from parsl.executors.high_throughput.mpi_prefix_composer import (
  MissingResourceSpecification,
  )
- from parsl.tests.configs.htex_local import fresh_config
+ from parsl.launchers import SimpleLauncher
+ from parsl.providers import LocalProvider

  EXECUTOR_LABEL = "MPI_TEST"


  def local_setup():
- config = fresh_config()
- config.executors[0].label = EXECUTOR_LABEL
- config.executors[0].max_workers_per_node = 2
- config.executors[0].enable_mpi_mode = True
- config.executors[0].mpi_launcher = "mpiexec"

  cwd = os.path.abspath(os.path.dirname(__file__))
  pbs_nodefile = os.path.join(cwd, "mocks", "pbs_nodefile")

- config.executors[0].provider.worker_init = f"export PBS_NODEFILE={pbs_nodefile}"
+ config = Config(
+ executors=[
+ MPIExecutor(
+ label=EXECUTOR_LABEL,
+ max_workers_per_block=2,
+ mpi_launcher="mpiexec",
+ provider=LocalProvider(
+ worker_init=f"export PBS_NODEFILE={pbs_nodefile}",
+ launcher=SimpleLauncher()
+ )
+ )
+ ])

  parsl.load(config)

parsl/tests/test_mpi_apps/test_mpiex.py
@@ -4,7 +4,6 @@ from pathlib import Path

  import pytest

- import parsl
  from parsl import Config, HighThroughputExecutor
  from parsl.executors.high_throughput.mpi_executor import MPIExecutor
  from parsl.launchers import SimpleLauncher
@@ -42,8 +41,8 @@ def test_docstring():
  def test_init():
  """Ensure all relevant kwargs are copied over from HTEx"""

- new_kwargs = {'max_workers_per_block'}
- excluded_kwargs = {'available_accelerators', 'enable_mpi_mode', 'cores_per_worker', 'max_workers_per_node',
+ new_kwargs = {'max_workers_per_block', 'mpi_launcher'}
+ excluded_kwargs = {'available_accelerators', 'cores_per_worker', 'max_workers_per_node',
  'mem_per_worker', 'cpu_affinity', 'max_workers', 'manager_selector'}

  # Get the kwargs from both HTEx and MPIEx