parsl 2024.6.3__py3-none-any.whl → 2024.6.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/app/app.py +0 -2
- parsl/config.py +27 -4
- parsl/dataflow/dflow.py +36 -10
- parsl/executors/high_throughput/executor.py +36 -30
- parsl/executors/high_throughput/interchange.py +26 -28
- parsl/providers/kubernetes/kube.py +22 -9
- parsl/providers/slurm/slurm.py +31 -22
- parsl/tests/configs/flux_local.py +11 -0
- parsl/tests/conftest.py +4 -0
- parsl/tests/test_bash_apps/test_stdout.py +20 -2
- parsl/tests/test_htex/test_htex.py +24 -7
- parsl/tests/test_htex/test_zmq_binding.py +22 -6
- parsl/tests/test_python_apps/test_context_manager.py +96 -1
- parsl/tests/test_python_apps/test_dependencies_deep.py +59 -0
- parsl/tests/test_radical/test_mpi_funcs.py +0 -1
- parsl/tests/unit/test_usage_tracking.py +45 -0
- parsl/usage_tracking/levels.py +6 -0
- parsl/usage_tracking/usage.py +54 -23
- parsl/version.py +1 -1
- parsl-2024.6.17.data/scripts/interchange.py +681 -0
- {parsl-2024.6.3.dist-info → parsl-2024.6.17.dist-info}/METADATA +2 -2
- {parsl-2024.6.3.dist-info → parsl-2024.6.17.dist-info}/RECORD +29 -24
- {parsl-2024.6.3.data → parsl-2024.6.17.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.6.3.data → parsl-2024.6.17.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.6.3.data → parsl-2024.6.17.data}/scripts/process_worker_pool.py +0 -0
- {parsl-2024.6.3.dist-info → parsl-2024.6.17.dist-info}/LICENSE +0 -0
- {parsl-2024.6.3.dist-info → parsl-2024.6.17.dist-info}/WHEEL +0 -0
- {parsl-2024.6.3.dist-info → parsl-2024.6.17.dist-info}/entry_points.txt +0 -0
- {parsl-2024.6.3.dist-info → parsl-2024.6.17.dist-info}/top_level.txt +0 -0
parsl/app/app.py
CHANGED
@@ -66,8 +66,6 @@ class AppBase(metaclass=ABCMeta):
             self.kwargs['walltime'] = params['walltime'].default
         if 'parsl_resource_specification' in params:
             self.kwargs['parsl_resource_specification'] = params['parsl_resource_specification'].default
-        self.outputs = params['outputs'].default if 'outputs' in params else []
-        self.inputs = params['inputs'].default if 'inputs' in params else []
 
     @abstractmethod
     def __call__(self, *args: Any, **kwargs: Any) -> AppFuture:
parsl/config.py
CHANGED
@@ -11,6 +11,8 @@ from parsl.executors.base import ParslExecutor
 from parsl.executors.threads import ThreadPoolExecutor
 from parsl.monitoring import MonitoringHub
 from parsl.usage_tracking.api import UsageInformation
+from parsl.usage_tracking.levels import DISABLED as USAGE_TRACKING_DISABLED
+from parsl.usage_tracking.levels import LEVEL_3 as USAGE_TRACKING_LEVEL_3
 from parsl.utils import RepresentationMixin
 
 logger = logging.getLogger(__name__)
@@ -38,6 +40,15 @@ class Config(RepresentationMixin, UsageInformation):
         ``checkpoint_mode='periodic'``.
     dependency_resolver: plugin point for custom dependency resolvers. Default: only resolve Futures,
         using the `SHALLOW_DEPENDENCY_RESOLVER`.
+    exit_mode: str, optional
+        When Parsl is used as a context manager (using ``with parsl.load`` syntax) then this parameter
+        controls what will happen to running tasks and exceptions at exit. The options are:
+
+        * ``cleanup``: cleanup the DFK on exit without waiting for any tasks
+        * ``skip``: skip all shutdown behaviour when exiting the context manager
+        * ``wait``: wait for all tasks to complete when exiting normally, but exit immediately when exiting due to an exception.
+
+        Default is ``cleanup``.
     garbage_collect : bool. optional.
         Delete task records from DFK when tasks have completed. Default: True
     internal_tasks_max_threads : int, optional
@@ -66,9 +77,12 @@ class Config(RepresentationMixin, UsageInformation):
         How often the scaling strategy should be executed. Default is 5 seconds.
     max_idletime : float, optional
         The maximum idle time allowed for an executor before strategy could shut down unused blocks. Default is 120.0 seconds.
-    usage_tracking :
-        Set this field to
-
+    usage_tracking : int, optional
+        Set this field to 1, 2, or 3 to opt-in to Parsl's usage tracking system.
+        The value represents the level of usage tracking detail to be collected.
+        Setting this field to 0 will disable usage tracking. Default (this field is not set): usage tracking is not enabled.
+        Parsl only collects minimal, non personally-identifiable,
+        information used for reporting to our funding agencies.
     initialize_logging : bool, optional
         Make DFK optionally not initialize any logging. Log messages
         will still be passed into the python logging system under the
@@ -92,6 +106,7 @@ class Config(RepresentationMixin, UsageInformation):
                  Literal['manual']] = None,
                  checkpoint_period: Optional[str] = None,
                  dependency_resolver: Optional[DependencyResolver] = None,
+                 exit_mode: Literal['cleanup', 'skip', 'wait'] = 'cleanup',
                  garbage_collect: bool = True,
                  internal_tasks_max_threads: int = 10,
                  retries: int = 0,
@@ -102,7 +117,7 @@ class Config(RepresentationMixin, UsageInformation):
                  strategy_period: Union[float, int] = 5,
                  max_idletime: float = 120.0,
                  monitoring: Optional[MonitoringHub] = None,
-                 usage_tracking:
+                 usage_tracking: int = 0,
                  initialize_logging: bool = True) -> None:
 
         executors = tuple(executors or [])
@@ -128,6 +143,7 @@ class Config(RepresentationMixin, UsageInformation):
             checkpoint_period = "00:30:00"
         self.checkpoint_period = checkpoint_period
         self.dependency_resolver = dependency_resolver
+        self.exit_mode = exit_mode
         self.garbage_collect = garbage_collect
         self.internal_tasks_max_threads = internal_tasks_max_threads
         self.retries = retries
@@ -136,6 +152,7 @@ class Config(RepresentationMixin, UsageInformation):
         self.strategy = strategy
         self.strategy_period = strategy_period
         self.max_idletime = max_idletime
+        self.validate_usage_tracking(usage_tracking)
         self.usage_tracking = usage_tracking
         self.initialize_logging = initialize_logging
         self.monitoring = monitoring
@@ -156,6 +173,12 @@ class Config(RepresentationMixin, UsageInformation):
             raise ConfigurationError('Executors must have unique labels ({})'.format(
                 ', '.join(['label={}'.format(repr(d)) for d in duplicates])))
 
+    def validate_usage_tracking(self, level: int) -> None:
+        if not USAGE_TRACKING_DISABLED <= level <= USAGE_TRACKING_LEVEL_3:
+            raise ConfigurationError(
+                f"Usage Tracking values must be 0, 1, 2, or 3 and not {level}"
+            )
+
     def get_usage_information(self):
         return {"executors_len": len(self.executors),
                 "dependency_resolver": self.dependency_resolver is not None}
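Note: the new ``exit_mode`` and ``usage_tracking`` options introduced above can be combined as in the following sketch. This is illustrative only; the ``ThreadPoolExecutor`` choice and the ``with parsl.load(...)`` pattern are assumptions based on the documented behaviour, not taken from this diff.

```python
# Minimal sketch of the new Config options (assumes a local ThreadPoolExecutor).
import parsl
from parsl.config import Config
from parsl.executors.threads import ThreadPoolExecutor

config = Config(
    executors=[ThreadPoolExecutor(label="local_threads")],
    exit_mode="wait",    # wait for outstanding tasks on a clean context-manager exit
    usage_tracking=3,    # opt in at the most detailed level; 0 disables tracking
)

@parsl.python_app
def double(x):
    return 2 * x

# Leaving this block triggers the exit_mode handling in DataFlowKernel.__exit__
with parsl.load(config):
    print(double(21).result())
```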
parsl/dataflow/dflow.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import atexit
+import concurrent.futures as cf
 import datetime
 import inspect
 import logging
@@ -209,6 +210,8 @@ class DataFlowKernel:
         self.tasks: Dict[int, TaskRecord] = {}
         self.submitter_lock = threading.Lock()
 
+        self.dependency_launch_pool = cf.ThreadPoolExecutor(max_workers=1, thread_name_prefix="Dependency-Launch")
+
         self.dependency_resolver = self.config.dependency_resolver if self.config.dependency_resolver is not None \
             else SHALLOW_DEPENDENCY_RESOLVER
 
@@ -217,9 +220,24 @@ class DataFlowKernel:
     def __enter__(self):
         return self
 
-    def __exit__(self, exc_type, exc_value, traceback):
-
-
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        mode = self.config.exit_mode
+        logger.debug("Exiting context manager, with exit mode '%s'", mode)
+        if mode == "cleanup":
+            logger.info("Calling cleanup for DFK")
+            self.cleanup()
+        elif mode == "skip":
+            logger.info("Skipping all cleanup handling")
+        elif mode == "wait":
+            if exc_type is None:
+                logger.info("Waiting for all tasks to complete")
+                self.wait_for_current_tasks()
+                self.cleanup()
+            else:
+                logger.info("There was an exception - cleaning up without waiting for task completion")
+                self.cleanup()
+        else:
+            raise InternalConsistencyError(f"Exit case for {mode} should be unreachable, validated by typeguard on Config()")
 
     def _send_task_log_info(self, task_record: TaskRecord) -> None:
         if self.monitoring:
@@ -611,9 +629,9 @@ class DataFlowKernel:
         return kwargs.get('_parsl_staging_inhibit', False)
 
     def launch_if_ready(self, task_record: TaskRecord) -> None:
-        """
-
-
+        """Schedules a task record for re-inspection to see if it is ready
+        for launch and for launch if it is ready. The call will return
+        immediately.
 
         This should be called by any piece of the DataFlowKernel that
         thinks a task may have become ready to run.
@@ -622,13 +640,17 @@ class DataFlowKernel:
         ready to run - launch_if_ready will not incorrectly launch that
         task.
 
-        It is also not an error to call launch_if_ready on a task that has
-        already been launched - launch_if_ready will not re-launch that
-        task.
-
         launch_if_ready is thread safe, so may be called from any thread
        or callback.
        """
+        self.dependency_launch_pool.submit(self._launch_if_ready_async, task_record)
+
+    @wrap_with_logs
+    def _launch_if_ready_async(self, task_record: TaskRecord) -> None:
+        """
+        _launch_if_ready will launch the specified task, if it is ready
+        to run (for example, without dependencies, and in pending state).
+        """
         exec_fu = None
 
         task_id = task_record['id']
@@ -1271,6 +1293,10 @@ class DataFlowKernel:
             self.monitoring.close()
             logger.info("Terminated monitoring")
 
+        logger.info("Terminating dependency launch pool")
+        self.dependency_launch_pool.shutdown()
+        logger.info("Terminated dependency launch pool")
+
         logger.info("Unregistering atexit hook")
         atexit.unregister(self.atexit_cleanup)
         logger.info("Unregistered atexit hook")
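Note: ``dependency_launch_pool`` is a single-worker ``concurrent.futures.ThreadPoolExecutor``, so ``launch_if_ready`` returns immediately while all ``_launch_if_ready_async`` bodies run serially on one background thread. A standalone sketch of that pattern (names and timings here are illustrative, not Parsl's):

```python
# Single-worker pool: submit() returns immediately, work is serialised.
import concurrent.futures as cf
import threading
import time

pool = cf.ThreadPoolExecutor(max_workers=1, thread_name_prefix="Dependency-Launch")

def launch_if_ready(task_id: int) -> None:
    """Returns immediately; the real work happens on the pool's single thread."""
    pool.submit(_launch_if_ready_async, task_id)

def _launch_if_ready_async(task_id: int) -> None:
    # With max_workers=1 these bodies never overlap, even when launch_if_ready
    # is called concurrently from many threads or callbacks.
    print(f"launching task {task_id} on {threading.current_thread().name}")
    time.sleep(0.1)

for i in range(3):
    launch_if_ready(i)

pool.shutdown()  # mirrors cleanup(): wait for queued launches, then stop the thread
```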
parsl/executors/high_throughput/executor.py
CHANGED
@@ -1,13 +1,13 @@
 import logging
 import math
 import pickle
+import subprocess
 import threading
 import typing
 import warnings
 from collections import defaultdict
 from concurrent.futures import Future
 from dataclasses import dataclass
-from multiprocessing import Process
 from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union
 
 import typeguard
@@ -18,7 +18,7 @@ from parsl.addresses import get_all_addresses
 from parsl.app.errors import RemoteExceptionWrapper
 from parsl.data_provider.staging import Staging
 from parsl.executors.errors import BadMessage, ScalingFailed
-from parsl.executors.high_throughput import
+from parsl.executors.high_throughput import zmq_pipes
 from parsl.executors.high_throughput.errors import CommandClientTimeoutError
 from parsl.executors.high_throughput.mpi_prefix_composer import (
     VALID_LAUNCHERS,
@@ -26,7 +26,6 @@ from parsl.executors.high_throughput.mpi_prefix_composer import (
 )
 from parsl.executors.status_handling import BlockProviderExecutor
 from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
-from parsl.multiprocessing import ForkProcess
 from parsl.process_loggers import wrap_with_logs
 from parsl.providers import LocalProvider
 from parsl.providers.base import ExecutionProvider
@@ -305,7 +304,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
         self._task_counter = 0
         self.worker_ports = worker_ports
         self.worker_port_range = worker_port_range
-        self.interchange_proc: Optional[
+        self.interchange_proc: Optional[subprocess.Popen] = None
         self.interchange_port_range = interchange_port_range
         self.heartbeat_threshold = heartbeat_threshold
         self.heartbeat_period = heartbeat_period
@@ -520,37 +519,45 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
 
         logger.info("Queue management worker finished")
 
-    def _start_local_interchange_process(self):
+    def _start_local_interchange_process(self) -> None:
         """ Starts the interchange process locally
 
-        Starts the interchange process locally and uses
+        Starts the interchange process locally and uses the command queue to
         get the worker task and result ports that the interchange has bound to.
         """
-        self.interchange_proc = ForkProcess(target=interchange.starter,
-                                            kwargs={"client_ports": (self.outgoing_q.port,
-                                                                     self.incoming_q.port,
-                                                                     self.command_client.port),
-                                                    "interchange_address": self.address,
-                                                    "worker_ports": self.worker_ports,
-                                                    "worker_port_range": self.worker_port_range,
-                                                    "hub_address": self.hub_address,
-                                                    "hub_zmq_port": self.hub_zmq_port,
-                                                    "logdir": self.logdir,
-                                                    "heartbeat_threshold": self.heartbeat_threshold,
-                                                    "poll_period": self.poll_period,
-                                                    "logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
-                                                    "cert_dir": self.cert_dir,
-                                                    },
-                                            daemon=True,
-                                            name="HTEX-Interchange"
-                                            )
-        self.interchange_proc.start()
 
+        interchange_config = {"client_address": "127.0.0.1",
+                              "client_ports": (self.outgoing_q.port,
+                                               self.incoming_q.port,
+                                               self.command_client.port),
+                              "interchange_address": self.address,
+                              "worker_ports": self.worker_ports,
+                              "worker_port_range": self.worker_port_range,
+                              "hub_address": self.hub_address,
+                              "hub_zmq_port": self.hub_zmq_port,
+                              "logdir": self.logdir,
+                              "heartbeat_threshold": self.heartbeat_threshold,
+                              "poll_period": self.poll_period,
+                              "logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
+                              "cert_dir": self.cert_dir,
+                              }
+
+        config_pickle = pickle.dumps(interchange_config)
+
+        self.interchange_proc = subprocess.Popen(b"interchange.py", stdin=subprocess.PIPE)
+        stdin = self.interchange_proc.stdin
+        assert stdin is not None, "Popen should have created an IO object (vs default None) because of PIPE mode"
+
+        logger.debug("Popened interchange process. Writing config object")
+        stdin.write(config_pickle)
+        stdin.flush()
+        logger.debug("Sent config object. Requesting worker ports")
         try:
             (self.worker_task_port, self.worker_result_port) = self.command_client.run("WORKER_PORTS", timeout_s=120)
         except CommandClientTimeoutError:
-            logger.error("Interchange has not completed initialization
+            logger.error("Interchange has not completed initialization. Aborting")
             raise Exception("Interchange failed to start")
+        logger.debug("Got worker ports")
 
     def _start_queue_management_thread(self):
         """Method to start the management thread as a daemon.
@@ -809,13 +816,12 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
         logger.info("Attempting HighThroughputExecutor shutdown")
 
         self.interchange_proc.terminate()
-
-
+        try:
+            self.interchange_proc.wait(timeout=timeout)
+        except subprocess.TimeoutExpired:
             logger.info("Unable to terminate Interchange process; sending SIGKILL")
             self.interchange_proc.kill()
 
-        self.interchange_proc.close()
-
         logger.info("Finished HighThroughputExecutor shutdown attempt")
 
     def get_usage_information(self):
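Note: the interchange is now started as a separate ``interchange.py`` script (shipped in the wheel's scripts directory) and receives its configuration as a pickled dict written to its stdin, instead of being forked in-process. A self-contained sketch of that handshake, using a throwaway inline child in place of Parsl's real interchange:

```python
# Parent pickles a config dict to the child's stdin; the child unpickles it
# from sys.stdin.buffer. The inline child is a stand-in, not Parsl code.
import pickle
import subprocess
import sys

child_code = (
    "import pickle, sys\n"
    "config = pickle.load(sys.stdin.buffer)\n"
    "print('child got config:', config)\n"
)

config = {"interchange_address": None, "poll_period": 10, "logging_level": 20}

proc = subprocess.Popen([sys.executable, "-c", child_code], stdin=subprocess.PIPE)
assert proc.stdin is not None  # PIPE mode guarantees a stream object

proc.stdin.write(pickle.dumps(config))
proc.stdin.flush()
proc.stdin.close()

try:
    proc.wait(timeout=10)       # mirrors the new shutdown path: wait, then kill
except subprocess.TimeoutExpired:
    proc.kill()
```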
parsl/executors/high_throughput/interchange.py
CHANGED
@@ -65,18 +65,19 @@ class Interchange:
     3. Detect workers that have failed using heartbeats
     """
     def __init__(self,
-
-
-
-
-
-
-
-
-
-
-
-
+                 *,
+                 client_address: str,
+                 interchange_address: Optional[str],
+                 client_ports: Tuple[int, int, int],
+                 worker_ports: Optional[Tuple[int, int]],
+                 worker_port_range: Tuple[int, int],
+                 hub_address: Optional[str],
+                 hub_zmq_port: Optional[int],
+                 heartbeat_threshold: int,
+                 logdir: str,
+                 logging_level: int,
+                 poll_period: int,
+                 cert_dir: Optional[str],
                 ) -> None:
        """
        Parameters
@@ -92,34 +93,34 @@ class Interchange:
            The ports at which the client can be reached
 
        worker_ports : tuple(int, int)
-            The specific two ports at which workers will connect to the Interchange.
+            The specific two ports at which workers will connect to the Interchange.
 
        worker_port_range : tuple(int, int)
            The interchange picks ports at random from the range which will be used by workers.
-            This is overridden when the worker_ports option is set.
+            This is overridden when the worker_ports option is set.
 
        hub_address : str
            The IP address at which the interchange can send info about managers to when monitoring is enabled.
-
+            When None, monitoring is disabled.
 
        hub_zmq_port : str
            The port at which the interchange can send info about managers to when monitoring is enabled.
-
+            When None, monitoring is disabled.
 
        heartbeat_threshold : int
            Number of seconds since the last heartbeat after which worker is considered lost.
 
        logdir : str
-            Parsl log directory paths. Logs and temp files go here.
+            Parsl log directory paths. Logs and temp files go here.
 
        logging_level : int
-            Logging level as defined in the logging module.
+            Logging level as defined in the logging module.
 
        poll_period : int
-            The main thread polling period, in milliseconds.
+            The main thread polling period, in milliseconds.
 
        cert_dir : str | None
-            Path to the certificate directory.
+            Path to the certificate directory.
        """
        self.cert_dir = cert_dir
        self.logdir = logdir
@@ -671,13 +672,10 @@ def start_file_logger(filename: str, level: int = logging.DEBUG, format_string: Optional[str] = None):
     logger.addHandler(handler)
 
 
-
-def starter(*args: Any, **kwargs: Any) -> None:
-    """Start the interchange process
-
-    The executor is expected to call this function. The args, kwargs match that of the Interchange.__init__
-    """
+if __name__ == "__main__":
     setproctitle("parsl: HTEX interchange")
-
-
+
+    config = pickle.load(sys.stdin.buffer)
+
+    ic = Interchange(**config)
     ic.start()
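Note: the constructor is now keyword-only (the leading ``*,``), which is what lets the ``__main__`` block apply the unpickled dict directly with ``Interchange(**config)`` while rejecting positional calls. A small illustration of that pattern with a hypothetical class (``Widget`` is not a Parsl name):

```python
# Keyword-only constructor pattern, as used by the reworked Interchange.__init__.
class Widget:
    def __init__(self, *, address: str, poll_period: int = 10) -> None:
        self.address = address
        self.poll_period = poll_period

config = {"address": "127.0.0.1", "poll_period": 5}
w = Widget(**config)           # fine: every argument is passed by name
print(w.address, w.poll_period)

try:
    Widget("127.0.0.1")        # rejected: positional arguments are not accepted
except TypeError as exc:
    print("TypeError:", exc)
```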
parsl/providers/kubernetes/kube.py
CHANGED
@@ -83,6 +83,10 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
     persistent_volumes: list[(str, str)]
         List of tuples describing persistent volumes to be mounted in the pod.
         The tuples consist of (PVC Name, Mount Directory).
+    service_account_name: str
+        Name of the service account to run the pod as.
+    annotations: Dict[str, str]
+        Annotations to set on the pod.
     """
     @typeguard.typechecked
     def __init__(self,
@@ -103,7 +107,9 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
                  group_id: Optional[str] = None,
                  run_as_non_root: bool = False,
                  secret: Optional[str] = None,
-                 persistent_volumes: List[Tuple[str, str]] = []
+                 persistent_volumes: List[Tuple[str, str]] = [],
+                 service_account_name: Optional[str] = None,
+                 annotations: Optional[Dict[str, str]] = None) -> None:
         if not _kubernetes_enabled:
             raise OptionalModuleMissing(['kubernetes'],
                                         "Kubernetes provider requires kubernetes module and config.")
@@ -146,6 +152,8 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
         self.group_id = group_id
         self.run_as_non_root = run_as_non_root
         self.persistent_volumes = persistent_volumes
+        self.service_account_name = service_account_name
+        self.annotations = annotations
 
         self.kube_client = client.CoreV1Api()
 
@@ -184,7 +192,9 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
                          pod_name=pod_name,
                          job_name=job_name,
                          cmd_string=formatted_cmd,
-                         volumes=self.persistent_volumes
+                         volumes=self.persistent_volumes,
+                         service_account_name=self.service_account_name,
+                         annotations=self.annotations)
         self.resources[pod_name] = {'status': JobStatus(JobState.RUNNING)}
 
         return pod_name
@@ -233,13 +243,13 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
         for jid in to_poll_job_ids:
             phase = None
             try:
-
+                pod = self.kube_client.read_namespaced_pod(name=jid, namespace=self.namespace)
             except Exception:
                 logger.exception("Failed to poll pod {} status, most likely because pod was terminated".format(jid))
                 if self.resources[jid]['status'] is JobStatus(JobState.RUNNING):
                     phase = 'Unknown'
             else:
-                phase =
+                phase = pod.status.phase
             if phase:
                 status = translate_table.get(phase, JobState.UNKNOWN)
                 logger.debug("Updating pod {} with status {} to parsl status {}".format(jid,
@@ -253,7 +263,9 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
                     job_name,
                     port=80,
                     cmd_string=None,
-                    volumes=[]
+                    volumes=[],
+                    service_account_name=None,
+                    annotations=None):
        """ Create a kubernetes pod for the job.
        Args:
            - image (string) : Docker image to launch
@@ -274,7 +286,7 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
        # Create the environment variables and command to initiate IPP
        environment_vars = client.V1EnvVar(name="TEST", value="SOME DATA")
 
-        launch_args = ["-c", "{0}
+        launch_args = ["-c", "{0}".format(cmd_string)]
 
        volume_mounts = []
        # Create mount paths for the volumes
@@ -311,11 +323,12 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
                                                           claim_name=volume[0])))
 
        metadata = client.V1ObjectMeta(name=pod_name,
-                                       labels={"app": job_name}
+                                       labels={"app": job_name},
+                                       annotations=annotations)
        spec = client.V1PodSpec(containers=[container],
                                image_pull_secrets=[secret],
-                                volumes=volume_defs
-                                )
+                                volumes=volume_defs,
+                                service_account_name=service_account_name)
 
        pod = client.V1Pod(spec=spec, metadata=metadata)
        api_response = self.kube_client.create_namespaced_pod(namespace=self.namespace,
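Note: the new ``service_account_name`` and ``annotations`` options flow from the provider into ``V1PodSpec`` and ``V1ObjectMeta`` as shown above. A hedged configuration sketch follows; the image, namespace and annotation values are placeholders, and running it requires the ``kubernetes`` package plus a reachable cluster config.

```python
# Sketch of a Config using the new KubernetesProvider options (placeholder values).
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.providers import KubernetesProvider

config = Config(
    executors=[
        HighThroughputExecutor(
            label="kube_htex",
            provider=KubernetesProvider(
                image="python:3.11",
                namespace="parsl-jobs",
                service_account_name="parsl-runner",   # pod runs under this service account
                annotations={"owner": "data-team"},    # applied to the pod's V1ObjectMeta
            ),
        )
    ]
)
```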
parsl/providers/slurm/slurm.py
CHANGED
@@ -19,25 +19,29 @@ from parsl.utils import RepresentationMixin, wtime_to_minutes
 
 logger = logging.getLogger(__name__)
 
+# From https://slurm.schedmd.com/sacct.html#SECTION_JOB-STATE-CODES
 translate_table = {
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
+    'PENDING': JobState.PENDING,
+    'RUNNING': JobState.RUNNING,
+    'CANCELLED': JobState.CANCELLED,
+    'COMPLETED': JobState.COMPLETED,
+    'FAILED': JobState.FAILED,
+    'NODE_FAIL': JobState.FAILED,
+    'BOOT_FAIL': JobState.FAILED,
+    'DEADLINE': JobState.TIMEOUT,
+    'TIMEOUT': JobState.TIMEOUT,
+    'REVOKED': JobState.FAILED,
+    'OUT_OF_MEMORY': JobState.FAILED,
+    'SUSPENDED': JobState.HELD,
+    'PREEMPTED': JobState.TIMEOUT,
+    'REQUEUED': JobState.PENDING
 }
 
 
 class SlurmProvider(ClusterProvider, RepresentationMixin):
     """Slurm Execution Provider
 
-    This provider uses sbatch to submit,
+    This provider uses sbatch to submit, sacct for status and scancel to cancel
     jobs. The sbatch script to be used is created from a template file in this
     same module.
 
@@ -168,14 +172,16 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
             logger.debug('No active jobs, skipping status update')
             return
 
-
+        # Using state%20 to get enough characters to not truncate output
+        # of the state. Without output can look like "<job_id> CANCELLED+"
+        cmd = "sacct -X --noheader --format=jobid,state%20 --job '{0}'".format(job_id_list)
         logger.debug("Executing %s", cmd)
         retcode, stdout, stderr = self.execute_wait(cmd)
-        logger.debug("
+        logger.debug("sacct returned %s %s", stdout, stderr)
 
         # Execute_wait failed. Do no update
         if retcode != 0:
-            logger.warning("
+            logger.warning("sacct failed with non-zero exit code {}".format(retcode))
             return
 
         jobs_missing = set(self.resources.keys())
@@ -183,7 +189,10 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
             if not line:
                 # Blank line
                 continue
-
+            # Sacct includes extra information in some outputs
+            # For example "<job_id> CANCELLED by <user_id>"
+            # This splits and ignores anything past the first two unpacked values
+            job_id, slurm_state, *ignore = line.split()
             if slurm_state not in translate_table:
                 logger.warning(f"Slurm status {slurm_state} is not recognized")
             status = translate_table.get(slurm_state, JobState.UNKNOWN)
@@ -193,13 +202,13 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
                                                          stderr_path=self.resources[job_id]['job_stderr_path'])
             jobs_missing.remove(job_id)
 
-        #
-        #
+        # sacct can get job info after jobs have completed so this path shouldn't be hit
+        # log a warning if there are missing jobs for some reason
         for missing_job in jobs_missing:
-            logger.
-            self.resources[missing_job]['status'] = JobStatus(
-
-
+            logger.warning("Updating missing job {} to completed status".format(missing_job))
+            self.resources[missing_job]['status'] = JobStatus(
+                JobState.COMPLETED, stdout_path=self.resources[missing_job]['job_stdout_path'],
+                stderr_path=self.resources[missing_job]['job_stderr_path'])
 
     def submit(self, command: str, tasks_per_node: int, job_name="parsl.slurm") -> str:
         """Submit the command as a slurm job.
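Note: with the switch from squeue to sacct, status lines can carry trailing detail such as ``123457 CANCELLED by 1000``, which is why the parser unpacks only the first two whitespace-separated fields. A standalone sketch of that parsing; the sample lines are invented, real output would come from ``sacct -X --noheader --format=jobid,state%20 --job <ids>``.

```python
# Parse sacct-style "<jobid> <state> [extra...]" lines, ignoring trailing fields.
sample_stdout = """\
123456      COMPLETED
123457      CANCELLED by 1000
123458      RUNNING
"""

for line in sample_stdout.split('\n'):
    if not line:
        continue
    job_id, slurm_state, *ignore = line.split()  # e.g. ignore == ['by', '1000']
    print(job_id, slurm_state)
```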
parsl/tests/conftest.py
CHANGED
@@ -151,6 +151,10 @@ def pytest_configure(config):
         'markers',
         'multiple_cores_required: Marks tests that require multiple cores, such as htex affinity'
     )
+    config.addinivalue_line(
+        'markers',
+        'unix_filesystem_permissions_required: Marks tests that require unix-level filesystem permission enforcement'
+    )
     config.addinivalue_line(
         'markers',
         'issue3328: Marks tests broken by issue #3328'
parsl/tests/test_bash_apps/test_stdout.py
CHANGED
@@ -16,7 +16,6 @@ def echo_to_streams(msg, stderr=None, stdout=None):
 whitelist = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'configs', '*threads*')
 
 speclist = (
-    '/bad/dir/t.out',
     ['t3.out', 'w'],
     ('t4.out', None),
     (42, 'w'),
@@ -26,7 +25,6 @@ speclist = (
 )
 
 testids = [
-    'nonexistent_dir',
     'list_not_tuple',
     'null_mode',
     'not_a_string',
@@ -55,6 +53,26 @@ def test_bad_stdout_specs(spec):
 
 
 @pytest.mark.issue3328
+@pytest.mark.unix_filesystem_permissions_required
+def test_bad_stdout_file():
+    """Testing bad stderr file"""
+
+    o = "/bad/dir/t2.out"
+
+    fn = echo_to_streams("Hello world", stdout=o, stderr='t.err')
+
+    try:
+        fn.result()
+    except perror.BadStdStreamFile:
+        pass
+    else:
+        assert False, "Did not raise expected exception BadStdStreamFile"
+
+    return
+
+
+@pytest.mark.issue3328
+@pytest.mark.unix_filesystem_permissions_required
 def test_bad_stderr_file():
     """Testing bad stderr file"""
 
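Note: the ``unix_filesystem_permissions_required`` marker registered in conftest.py lets environments that do not enforce POSIX permissions (for example, test runs as root) deselect these cases with ``pytest -m "not unix_filesystem_permissions_required"``. An illustrative test (not part of Parsl) using the marker:

```python
# Illustrative use of the new marker; requires real unix permission enforcement,
# so it is expected to fail (and should be deselected) when running as root.
import pytest

@pytest.mark.unix_filesystem_permissions_required
def test_cannot_write_into_readonly_dir(tmp_path):
    target = tmp_path / "readonly"
    target.mkdir()
    target.chmod(0o500)  # owner: read + execute only, no write
    with pytest.raises(PermissionError):
        (target / "out.txt").write_text("hello")
```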