parsl 2024.8.12__py3-none-any.whl → 2024.8.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/channels/oauth_ssh/oauth_ssh.py +10 -2
- parsl/channels/ssh/ssh.py +16 -6
- parsl/channels/ssh_il/ssh_il.py +12 -2
- parsl/executors/high_throughput/executor.py +18 -27
- parsl/executors/high_throughput/interchange.py +31 -29
- parsl/executors/high_throughput/mpi_executor.py +23 -2
- parsl/executors/high_throughput/mpi_prefix_composer.py +5 -4
- parsl/executors/status_handling.py +5 -2
- parsl/jobs/states.py +6 -1
- parsl/monitoring/db_manager.py +21 -65
- parsl/monitoring/monitoring.py +10 -23
- parsl/monitoring/router.py +12 -39
- parsl/providers/slurm/slurm.py +40 -10
- parsl/tests/test_htex/test_multiple_disconnected_blocks.py +3 -5
- parsl/tests/test_htex/test_resource_spec_validation.py +40 -0
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +1 -1
- parsl/tests/test_mpi_apps/test_bad_mpi_config.py +29 -14
- parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +16 -8
- parsl/tests/test_mpi_apps/test_mpiex.py +2 -3
- parsl/tests/test_mpi_apps/test_resource_spec.py +39 -41
- parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py +85 -0
- parsl/version.py +1 -1
- {parsl-2024.8.12.data → parsl-2024.8.26.data}/scripts/interchange.py +31 -29
- {parsl-2024.8.12.dist-info → parsl-2024.8.26.dist-info}/METADATA +5 -3
- {parsl-2024.8.12.dist-info → parsl-2024.8.26.dist-info}/RECORD +32 -31
- parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +0 -47
- {parsl-2024.8.12.data → parsl-2024.8.26.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.8.12.data → parsl-2024.8.26.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.8.12.data → parsl-2024.8.26.data}/scripts/process_worker_pool.py +0 -0
- {parsl-2024.8.12.dist-info → parsl-2024.8.26.dist-info}/LICENSE +0 -0
- {parsl-2024.8.12.dist-info → parsl-2024.8.26.dist-info}/WHEEL +0 -0
- {parsl-2024.8.12.dist-info → parsl-2024.8.26.dist-info}/entry_points.txt +0 -0
- {parsl-2024.8.12.dist-info → parsl-2024.8.26.dist-info}/top_level.txt +0 -0
parsl/monitoring/db_manager.py
CHANGED
@@ -1,11 +1,14 @@
 import datetime
 import logging
+import multiprocessing.queues as mpq
 import os
 import queue
 import threading
 import time
 from typing import Any, Dict, List, Optional, Set, Tuple, TypeVar, cast

+import typeguard
+
 from parsl.dataflow.states import States
 from parsl.errors import OptionalModuleMissing
 from parsl.log_utils import set_file_logger
@@ -305,39 +308,13 @@ class DatabaseManager:
         self.pending_resource_queue: queue.Queue[MonitoringMessage] = queue.Queue()

     def start(self,
-
-              node_queue: "queue.Queue[MonitoringMessage]",
-              block_queue: "queue.Queue[MonitoringMessage]",
-              resource_queue: "queue.Queue[MonitoringMessage]") -> None:
+              resource_queue: mpq.Queue) -> None:

         self._kill_event = threading.Event()
-        self._priority_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal,
-                                                            args=(
-                                                                priority_queue, 'priority', self._kill_event,),
-                                                            name="Monitoring-migrate-priority",
-                                                            daemon=True,
-                                                            )
-        self._priority_queue_pull_thread.start()
-
-        self._node_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal,
-                                                        args=(
-                                                            node_queue, 'node', self._kill_event,),
-                                                        name="Monitoring-migrate-node",
-                                                        daemon=True,
-                                                        )
-        self._node_queue_pull_thread.start()
-
-        self._block_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal,
-                                                         args=(
-                                                             block_queue, 'block', self._kill_event,),
-                                                         name="Monitoring-migrate-block",
-                                                         daemon=True,
-                                                         )
-        self._block_queue_pull_thread.start()

         self._resource_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal,
                                                             args=(
-                                                                resource_queue,
+                                                                resource_queue, self._kill_event,),
                                                             name="Monitoring-migrate-resource",
                                                             daemon=True,
                                                             )
@@ -369,20 +346,18 @@ class DatabaseManager:
         while (not self._kill_event.is_set() or
                self.pending_priority_queue.qsize() != 0 or self.pending_resource_queue.qsize() != 0 or
                self.pending_node_queue.qsize() != 0 or self.pending_block_queue.qsize() != 0 or
-
-               node_queue.qsize() != 0 or block_queue.qsize() != 0):
+               resource_queue.qsize() != 0):

             """
             WORKFLOW_INFO and TASK_INFO messages (i.e. priority messages)

             """
             try:
-                logger.debug("""Checking STOP conditions: {}, {}, {}, {}, {}, {}
+                logger.debug("""Checking STOP conditions: {}, {}, {}, {}, {}, {}""".format(
                     self._kill_event.is_set(),
                     self.pending_priority_queue.qsize() != 0, self.pending_resource_queue.qsize() != 0,
                     self.pending_node_queue.qsize() != 0, self.pending_block_queue.qsize() != 0,
-
-                    node_queue.qsize() != 0, block_queue.qsize() != 0))
+                    resource_queue.qsize() != 0))

                 # This is the list of resource messages which can be reprocessed as if they
                 # had just arrived because the corresponding first task message has been
@@ -574,43 +549,26 @@ class DatabaseManager:
             raise RuntimeError("An exception happened sometime during database processing and should have been logged in database_manager.log")

     @wrap_with_logs(target="database_manager")
-    def _migrate_logs_to_internal(self, logs_queue: queue.Queue,
-        logger.info("Starting
+    def _migrate_logs_to_internal(self, logs_queue: queue.Queue, kill_event: threading.Event) -> None:
+        logger.info("Starting _migrate_logs_to_internal")

         while not kill_event.is_set() or logs_queue.qsize() != 0:
-            logger.debug("
-
+            logger.debug("Checking STOP conditions: kill event: %s, queue has entries: %s",
+                         kill_event.is_set(), logs_queue.qsize() != 0)
             try:
                 x, addr = logs_queue.get(timeout=0.1)
             except queue.Empty:
                 continue
             else:
-                if
+                if x == 'STOP':
                     self.close()
-                elif queue_tag == 'priority':  # implicitly not 'STOP'
-                    assert isinstance(x, tuple)
-                    assert len(x) == 2
-                    assert x[0] in [MessageType.WORKFLOW_INFO, MessageType.TASK_INFO], \
-                        "_migrate_logs_to_internal can only migrate WORKFLOW_,TASK_INFO message from priority queue, got x[0] == {}".format(x[0])
-                    self._dispatch_to_internal(x)
-                elif queue_tag == 'resource':
-                    assert isinstance(x, tuple), "_migrate_logs_to_internal was expecting a tuple, got {}".format(x)
-                    assert x[0] == MessageType.RESOURCE_INFO, (
-                        "_migrate_logs_to_internal can only migrate RESOURCE_INFO message from resource queue, "
-                        "got tag {}, message {}".format(x[0], x)
-                    )
-                    self._dispatch_to_internal(x)
-                elif queue_tag == 'node':
-                    assert len(x) == 2, "expected message tuple to have exactly two elements"
-                    assert x[0] == MessageType.NODE_INFO, "_migrate_logs_to_internal can only migrate NODE_INFO messages from node queue"
-
-                    self._dispatch_to_internal(x)
-                elif queue_tag == "block":
-                    self._dispatch_to_internal(x)
                 else:
-
+                    self._dispatch_to_internal(x)

     def _dispatch_to_internal(self, x: Tuple) -> None:
+        assert isinstance(x, tuple)
+        assert len(x) == 2, "expected message tuple to have exactly two elements"
+
         if x[0] in [MessageType.WORKFLOW_INFO, MessageType.TASK_INFO]:
             self.pending_priority_queue.put(cast(Any, x))
         elif x[0] == MessageType.RESOURCE_INFO:
@@ -719,11 +677,9 @@ class DatabaseManager:


 @wrap_with_logs(target="database_manager")
-def dbm_starter(exception_q: "queue.Queue[Tuple[str, str]]",
-
-
-                block_msgs: "queue.Queue[MonitoringMessage]",
-                resource_msgs: "queue.Queue[MonitoringMessage]",
+@typeguard.typechecked
+def dbm_starter(exception_q: mpq.Queue,
+                resource_msgs: mpq.Queue,
                 db_url: str,
                 logdir: str,
                 logging_level: int) -> None:
@@ -739,7 +695,7 @@ def dbm_starter(exception_q: "queue.Queue[Tuple[str, str]]",
                               logdir=logdir,
                               logging_level=logging_level)
         logger.info("Starting dbm in dbm starter")
-        dbm.start(
+        dbm.start(resource_msgs)
     except KeyboardInterrupt:
         logger.exception("KeyboardInterrupt signal caught")
         dbm.close()
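Note: the change above collapses the separate priority/node/block/resource queues feeding DatabaseManager into a single resource queue, with routing by message tag moved into _dispatch_to_internal. A minimal standalone sketch of that single-queue dispatch pattern (illustrative only, not parsl's actual code):

import queue
from enum import Enum, auto
from typing import Any, Tuple


class MessageType(Enum):
    TASK_INFO = auto()
    WORKFLOW_INFO = auto()
    RESOURCE_INFO = auto()
    NODE_INFO = auto()
    BLOCK_INFO = auto()


pending_priority_queue: "queue.Queue[Any]" = queue.Queue()
pending_resource_queue: "queue.Queue[Any]" = queue.Queue()


def dispatch_to_internal(x: Tuple) -> None:
    # One dispatcher replaces the former per-queue migration threads:
    # every tagged message arrives on one queue and is sorted by tag here.
    assert isinstance(x, tuple) and len(x) == 2
    if x[0] in (MessageType.WORKFLOW_INFO, MessageType.TASK_INFO):
        pending_priority_queue.put(x)
    elif x[0] == MessageType.RESOURCE_INFO:
        pending_resource_queue.put(x)
    # NODE_INFO and BLOCK_INFO go to their own pending queues in the real class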
parsl/monitoring/monitoring.py
CHANGED
@@ -7,7 +7,7 @@ import queue
 import time
 from multiprocessing import Event, Process
 from multiprocessing.queues import Queue
-from typing import TYPE_CHECKING, Any, Optional, Tuple, Union, cast
+from typing import TYPE_CHECKING, Any, Literal, Optional, Tuple, Union, cast

 import typeguard

@@ -138,25 +138,18 @@ class MonitoringHub(RepresentationMixin):
         self.exception_q: Queue[Tuple[str, str]]
         self.exception_q = SizedQueue(maxsize=10)

-        self.
-        self.priority_msgs = SizedQueue()
-
-        self.resource_msgs: Queue[AddressedMonitoringMessage]
+        self.resource_msgs: Queue[Union[AddressedMonitoringMessage, Tuple[Literal["STOP"], Literal[0]]]]
         self.resource_msgs = SizedQueue()

-        self.node_msgs: Queue[AddressedMonitoringMessage]
-        self.node_msgs = SizedQueue()
-
-        self.block_msgs: Queue[AddressedMonitoringMessage]
-        self.block_msgs = SizedQueue()
-
         self.router_exit_event: ms.Event
         self.router_exit_event = Event()

         self.router_proc = ForkProcess(target=router_starter,
-
-
-
+                                       kwargs={"comm_q": comm_q,
+                                               "exception_q": self.exception_q,
+                                               "resource_msgs": self.resource_msgs,
+                                               "exit_event": self.router_exit_event,
+                                               "hub_address": self.hub_address,
                                                "udp_port": self.hub_port,
                                                "zmq_port_range": self.hub_port_range,
                                                "logdir": self.logdir,
@@ -168,7 +161,7 @@ class MonitoringHub(RepresentationMixin):
         self.router_proc.start()

         self.dbm_proc = ForkProcess(target=dbm_starter,
-                                    args=(self.exception_q, self.
+                                    args=(self.exception_q, self.resource_msgs,),
                                     kwargs={"logdir": self.logdir,
                                             "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
                                             "db_url": self.logging_endpoint,
@@ -187,7 +180,7 @@ class MonitoringHub(RepresentationMixin):
         self.filesystem_proc.start()
         logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")

-        self.radio = MultiprocessingQueueRadioSender(self.
+        self.radio = MultiprocessingQueueRadioSender(self.resource_msgs)

         try:
             comm_q_result = comm_q.get(block=True, timeout=120)
@@ -244,7 +237,7 @@ class MonitoringHub(RepresentationMixin):
             logger.debug("Finished waiting for router termination")
             if len(exception_msgs) == 0:
                 logger.debug("Sending STOP to DBM")
-                self.
+                self.resource_msgs.put(("STOP", 0))
             else:
                 logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
             logger.debug("Waiting for DB termination")
@@ -262,14 +255,8 @@ class MonitoringHub(RepresentationMixin):
         logger.info("Closing monitoring multiprocessing queues")
         self.exception_q.close()
         self.exception_q.join_thread()
-        self.priority_msgs.close()
-        self.priority_msgs.join_thread()
         self.resource_msgs.close()
         self.resource_msgs.join_thread()
-        self.node_msgs.close()
-        self.node_msgs.join_thread()
-        self.block_msgs.close()
-        self.block_msgs.join_thread()
         logger.info("Closed monitoring multiprocessing queues")

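Note: the hub now shuts the database manager down by pushing a ("STOP", 0) sentinel onto the same resource_msgs queue that carries monitoring messages. A small self-contained illustration of that sentinel pattern (names here are illustrative, not parsl's API):

import multiprocessing


def consumer(q: "multiprocessing.Queue") -> None:
    while True:
        msg, addr = q.get()
        if msg == "STOP":      # sentinel: stop consuming and exit
            break
        print("routing", msg, "from", addr)


if __name__ == "__main__":
    q: "multiprocessing.Queue" = multiprocessing.Queue()
    p = multiprocessing.Process(target=consumer, args=(q,))
    p.start()
    q.put(({"type": "RESOURCE_INFO"}, 0))
    q.put(("STOP", 0))         # same shape as the sentinel sent on shutdown
    p.join()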
parsl/monitoring/router.py
CHANGED
@@ -1,19 +1,19 @@
 from __future__ import annotations

 import logging
+import multiprocessing.queues as mpq
 import os
 import pickle
-import queue
 import socket
 import threading
 import time
 from multiprocessing.synchronize import Event
-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple

+import typeguard
 import zmq

 from parsl.log_utils import set_file_logger
-from parsl.monitoring.message_type import MessageType
 from parsl.monitoring.types import AddressedMonitoringMessage, TaggedMonitoringMessage
 from parsl.process_loggers import wrap_with_logs
 from parsl.utils import setproctitle
@@ -33,10 +33,7 @@ class MonitoringRouter:
                  logdir: str = ".",
                  logging_level: int = logging.INFO,
                  atexit_timeout: int = 3,   # in seconds
-
-                 node_msgs: "queue.Queue[AddressedMonitoringMessage]",
-                 block_msgs: "queue.Queue[AddressedMonitoringMessage]",
-                 resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                 resource_msgs: mpq.Queue,
                  exit_event: Event,
                  ):
         """ Initializes a monitoring configuration class.
@@ -56,8 +53,8 @@ class MonitoringRouter:
            Logging level as defined in the logging module. Default: logging.INFO
         atexit_timeout : float, optional
            The amount of time in seconds to terminate the hub without receiving any messages, after the last dfk workflow message is received.
-
-
+        resource_msgs : multiprocessing.Queue
+           A multiprocessing queue to receive messages to be routed onwards to the database process

         exit_event : Event
            An event that the main Parsl process will set to signal that the monitoring router should shut down.
@@ -101,9 +98,6 @@ class MonitoringRouter:
                                                               min_port=zmq_port_range[0],
                                                               max_port=zmq_port_range[1])

-        self.priority_msgs = priority_msgs
-        self.node_msgs = node_msgs
-        self.block_msgs = block_msgs
         self.resource_msgs = resource_msgs
         self.exit_event = exit_event

@@ -169,24 +163,7 @@ class MonitoringRouter:
                     msg_0: AddressedMonitoringMessage
                     msg_0 = (msg, 0)

-
-                        self.node_msgs.put(msg_0)
-                    elif msg[0] == MessageType.RESOURCE_INFO:
-                        self.resource_msgs.put(msg_0)
-                    elif msg[0] == MessageType.BLOCK_INFO:
-                        self.block_msgs.put(msg_0)
-                    elif msg[0] == MessageType.TASK_INFO:
-                        self.priority_msgs.put(msg_0)
-                    elif msg[0] == MessageType.WORKFLOW_INFO:
-                        self.priority_msgs.put(msg_0)
-                    else:
-                        # There is a type: ignore here because if msg[0]
-                        # is of the correct type, this code is unreachable,
-                        # but there is no verification that the message
-                        # received from zmq_receiver_channel.recv_pyobj() is actually
-                        # of that type.
-                        self.logger.error("Discarding message "  # type: ignore[unreachable]
-                                          f"from interchange with unknown type {msg[0].value}")
+                    self.resource_msgs.put(msg_0)
             except zmq.Again:
                 pass
             except Exception:
@@ -202,12 +179,11 @@ class MonitoringRouter:


 @wrap_with_logs
-def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
-
-
-
-
-                   resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
+@typeguard.typechecked
+def router_starter(*,
+                   comm_q: mpq.Queue,
+                   exception_q: mpq.Queue,
+                   resource_msgs: mpq.Queue,
                    exit_event: Event,

                    hub_address: str,
@@ -223,9 +199,6 @@ def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
                                  zmq_port_range=zmq_port_range,
                                  logdir=logdir,
                                  logging_level=logging_level,
-                                 priority_msgs=priority_msgs,
-                                 node_msgs=node_msgs,
-                                 block_msgs=block_msgs,
                                  resource_msgs=resource_msgs,
                                  exit_event=exit_event)
     except Exception as e:
parsl/providers/slurm/slurm.py
CHANGED
@@ -20,7 +20,7 @@ from parsl.utils import RepresentationMixin, wtime_to_minutes
 logger = logging.getLogger(__name__)

 # From https://slurm.schedmd.com/sacct.html#SECTION_JOB-STATE-CODES
-translate_table = {
+sacct_translate_table = {
     'PENDING': JobState.PENDING,
     'RUNNING': JobState.RUNNING,
     'CANCELLED': JobState.CANCELLED,
@@ -37,6 +37,20 @@ translate_table = {
     'REQUEUED': JobState.PENDING
 }

+squeue_translate_table = {
+    'PD': JobState.PENDING,
+    'R': JobState.RUNNING,
+    'CA': JobState.CANCELLED,
+    'CF': JobState.PENDING,  # (configuring),
+    'CG': JobState.RUNNING,  # (completing),
+    'CD': JobState.COMPLETED,
+    'F': JobState.FAILED,  # (failed),
+    'TO': JobState.TIMEOUT,  # (timeout),
+    'NF': JobState.FAILED,  # (node failure),
+    'RV': JobState.FAILED,  # (revoked) and
+    'SE': JobState.FAILED  # (special exit state)
+}
+

 class SlurmProvider(ClusterProvider, RepresentationMixin):
     """Slurm Execution Provider
@@ -155,6 +169,23 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):

         self.regex_job_id = regex_job_id
         self.worker_init = worker_init + '\n'
+        # Check if sacct works and if not fall back to squeue
+        cmd = "sacct -X"
+        logger.debug("Executing %s", cmd)
+        retcode, stdout, stderr = self.execute_wait(cmd)
+        # If sacct fails it should return retcode=1 stderr="Slurm accounting storage is disabled"
+        logger.debug(f"sacct returned retcode={retcode} stderr={stderr}")
+        if retcode == 0:
+            logger.debug("using sacct to get job status")
+            # Using state%20 to get enough characters to not truncate output
+            # of the state. Without output can look like "<job_id> CANCELLED+"
+            self._cmd = "sacct -X --noheader --format=jobid,state%20 --job '{0}'"
+            self._translate_table = sacct_translate_table
+        else:
+            logger.debug(f"sacct failed with retcode={retcode}")
+            logger.debug("falling back to using squeue to get job status")
+            self._cmd = "squeue --noheader --format='%i %t' --job '{0}'"
+            self._translate_table = squeue_translate_table

     def _status(self):
         '''Returns the status list for a list of job_ids
@@ -172,16 +203,14 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
             logger.debug('No active jobs, skipping status update')
             return

-        # Using state%20 to get enough characters to not truncate output
-        # of the state. Without output can look like "<job_id> CANCELLED+"
-        cmd = "sacct -X --noheader --format=jobid,state%20 --job '{0}'".format(job_id_list)
+        cmd = self._cmd.format(job_id_list)
         logger.debug("Executing %s", cmd)
         retcode, stdout, stderr = self.execute_wait(cmd)
-        logger.debug("sacct returned %s %s", stdout, stderr)
+        logger.debug("sacct/squeue returned %s %s", stdout, stderr)

         # Execute_wait failed. Do no update
         if retcode != 0:
-            logger.warning("sacct failed with non-zero exit code {}".format(retcode))
+            logger.warning("sacct/squeue failed with non-zero exit code {}".format(retcode))
             return

         jobs_missing = set(self.resources.keys())
@@ -193,9 +222,9 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
             # For example "<job_id> CANCELLED by <user_id>"
             # This splits and ignores anything past the first two unpacked values
             job_id, slurm_state, *ignore = line.split()
-            if slurm_state not in translate_table:
+            if slurm_state not in self._translate_table:
                 logger.warning(f"Slurm status {slurm_state} is not recognized")
-            status = translate_table.get(slurm_state, JobState.UNKNOWN)
+            status = self._translate_table.get(slurm_state, JobState.UNKNOWN)
             logger.debug("Updating job {} with slurm status {} to parsl state {!s}".format(job_id, slurm_state, status))
             self.resources[job_id]['status'] = JobStatus(status,
                                                          stdout_path=self.resources[job_id]['job_stdout_path'],
@@ -203,9 +232,10 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
             jobs_missing.remove(job_id)

         # sacct can get job info after jobs have completed so this path shouldn't be hit
-        #
+        # squeue does not report on jobs that are not running. So we are filling in the
+        # blanks for missing jobs, we might lose some information about why the jobs failed.
         for missing_job in jobs_missing:
-            logger.
+            logger.debug("Updating missing job {} to completed status".format(missing_job))
             self.resources[missing_job]['status'] = JobStatus(
                 JobState.COMPLETED, stdout_path=self.resources[missing_job]['job_stdout_path'],
                 stderr_path=self.resources[missing_job]['job_stderr_path'])
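Note: the SlurmProvider change above probes `sacct -X` once at construction time and falls back to `squeue` (with its own state-code table) when accounting storage is disabled. A rough standalone sketch of that probe-and-fallback idea, using subprocess instead of the provider's execute_wait:

import subprocess


def pick_status_command() -> str:
    # sacct exits non-zero when Slurm accounting storage is disabled,
    # in which case squeue is the only way left to ask for job states.
    try:
        probe = subprocess.run(["sacct", "-X"], capture_output=True)
        sacct_ok = probe.returncode == 0
    except FileNotFoundError:
        sacct_ok = False
    if sacct_ok:
        return "sacct -X --noheader --format=jobid,state%20 --job '{0}'"
    return "squeue --noheader --format='%i %t' --job '{0}'"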
parsl/tests/test_htex/test_multiple_disconnected_blocks.py
CHANGED
@@ -21,16 +21,14 @@ def local_config():
             poll_period=100,
             max_workers_per_node=1,
             provider=LocalProvider(
-                worker_init="
-                init_blocks=2
-                max_blocks=4,
-                min_blocks=0,
+                worker_init="exit 0",
+                init_blocks=2
             ),
         )
     ],
     run_dir="/tmp/test_htex",
     max_idletime=0.5,
-    strategy='
+    strategy='none',
 )

parsl/tests/test_htex/test_resource_spec_validation.py
ADDED
@@ -0,0 +1,40 @@
+import queue
+from unittest import mock
+
+import pytest
+
+from parsl.executors import HighThroughputExecutor
+from parsl.executors.high_throughput.mpi_prefix_composer import (
+    InvalidResourceSpecification,
+)
+
+
+def double(x):
+    return x * 2
+
+
+@pytest.mark.local
+def test_submit_calls_validate():
+
+    htex = HighThroughputExecutor()
+    htex.outgoing_q = mock.Mock(spec=queue.Queue)
+    htex.validate_resource_spec = mock.Mock(spec=htex.validate_resource_spec)
+
+    res_spec = {}
+    htex.submit(double, res_spec, (5,), {})
+    htex.validate_resource_spec.assert_called()
+
+
+@pytest.mark.local
+def test_resource_spec_validation():
+    htex = HighThroughputExecutor()
+    ret_val = htex.validate_resource_spec({})
+    assert ret_val is None
+
+
+@pytest.mark.local
+def test_resource_spec_validation_bad_keys():
+    htex = HighThroughputExecutor()
+
+    with pytest.raises(InvalidResourceSpecification):
+        htex.validate_resource_spec({"num_nodes": 2})
parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py
CHANGED
@@ -78,6 +78,6 @@ def test_row_counts(tmpd_cwd, strategy):
         (c, ) = result.first()
         assert c == 1, "There should be a single pending status"

-        result = connection.execute(text("SELECT COUNT(*) FROM block WHERE block_id = 0 AND status = '
+        result = connection.execute(text("SELECT COUNT(*) FROM block WHERE block_id = 0 AND status = 'SCALED_IN' AND run_id = :run_id"), binds)
         (c, ) = result.first()
         assert c == 1, "There should be a single cancelled status"
parsl/tests/test_mpi_apps/test_bad_mpi_config.py
CHANGED
@@ -1,33 +1,48 @@
 import pytest

 from parsl import Config
-from parsl.executors import
+from parsl.executors import MPIExecutor
 from parsl.launchers import AprunLauncher, SimpleLauncher, SrunLauncher
 from parsl.providers import SlurmProvider


 @pytest.mark.local
-def
-    """
+def test_bad_launcher():
+    """TypeError if a launcher other than SimpleLauncher is supplied"""

     for launcher in [SrunLauncher(), AprunLauncher()]:
-        with pytest.raises(
+        with pytest.raises(TypeError):
             Config(executors=[
-
-                    enable_mpi_mode=True,
+                MPIExecutor(
                     provider=SlurmProvider(launcher=launcher),
                 )
             ])


 @pytest.mark.local
-def
+def test_bad_mpi_launcher():
+    """ValueError if an unsupported mpi_launcher is specified"""
+
+    with pytest.raises(ValueError):
+        Config(executors=[
+            MPIExecutor(
+                mpi_launcher="bad_launcher",
+                provider=SlurmProvider(launcher=SimpleLauncher()),
+            )
+        ])
+
+
+@pytest.mark.local
+@pytest.mark.parametrize(
+    "mpi_launcher",
+    ["srun", "aprun", "mpiexec"]
+)
+def test_correct_launcher_with_mpi_mode(mpi_launcher: str):
     """Confirm that SimpleLauncher works with mpi_mode"""

-
-
-
-
-
-
-    assert isinstance(config.executors[0].provider.launcher, SimpleLauncher)
+    executor = MPIExecutor(
+        mpi_launcher=mpi_launcher,
+        provider=SlurmProvider(launcher=SimpleLauncher()),
+    )
+
+    assert isinstance(executor.provider.launcher, SimpleLauncher)
parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py
CHANGED
@@ -6,26 +6,34 @@ from typing import Dict
 import pytest

 import parsl
-from parsl import bash_app, python_app
+from parsl import Config, bash_app, python_app
+from parsl.executors import MPIExecutor
 from parsl.executors.high_throughput.mpi_prefix_composer import (
     MissingResourceSpecification,
 )
-from parsl.
+from parsl.launchers import SimpleLauncher
+from parsl.providers import LocalProvider

 EXECUTOR_LABEL = "MPI_TEST"


 def local_setup():
-    config = fresh_config()
-    config.executors[0].label = EXECUTOR_LABEL
-    config.executors[0].max_workers_per_node = 2
-    config.executors[0].enable_mpi_mode = True
-    config.executors[0].mpi_launcher = "mpiexec"

     cwd = os.path.abspath(os.path.dirname(__file__))
     pbs_nodefile = os.path.join(cwd, "mocks", "pbs_nodefile")

-    config
+    config = Config(
+        executors=[
+            MPIExecutor(
+                label=EXECUTOR_LABEL,
+                max_workers_per_block=2,
+                mpi_launcher="mpiexec",
+                provider=LocalProvider(
+                    worker_init=f"export PBS_NODEFILE={pbs_nodefile}",
+                    launcher=SimpleLauncher()
+                )
+            )
+        ])

     parsl.load(config)

parsl/tests/test_mpi_apps/test_mpiex.py
CHANGED
@@ -4,7 +4,6 @@ from pathlib import Path

 import pytest

-import parsl
 from parsl import Config, HighThroughputExecutor
 from parsl.executors.high_throughput.mpi_executor import MPIExecutor
 from parsl.launchers import SimpleLauncher
@@ -42,8 +41,8 @@ def test_docstring():
 def test_init():
     """Ensure all relevant kwargs are copied over from HTEx"""

-    new_kwargs = {'max_workers_per_block'}
-    excluded_kwargs = {'available_accelerators', '
+    new_kwargs = {'max_workers_per_block', 'mpi_launcher'}
+    excluded_kwargs = {'available_accelerators', 'cores_per_worker', 'max_workers_per_node',
                        'mem_per_worker', 'cpu_affinity', 'max_workers', 'manager_selector'}

     # Get the kwargs from both HTEx and MPIEx