parsl 2024.10.14__py3-none-any.whl → 2024.10.28__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/channels/base.py +0 -11
- parsl/channels/errors.py +0 -17
- parsl/channels/local/local.py +3 -16
- parsl/channels/ssh/ssh.py +0 -11
- parsl/dataflow/dflow.py +1 -1
- parsl/executors/high_throughput/executor.py +16 -9
- parsl/executors/high_throughput/interchange.py +8 -5
- parsl/executors/high_throughput/manager_selector.py +30 -0
- parsl/executors/high_throughput/process_worker_pool.py +1 -9
- parsl/monitoring/db_manager.py +1 -1
- parsl/monitoring/monitoring.py +5 -5
- parsl/monitoring/radios.py +2 -2
- parsl/monitoring/router.py +4 -7
- parsl/monitoring/types.py +3 -6
- parsl/providers/base.py +0 -16
- parsl/providers/kubernetes/kube.py +35 -28
- parsl/tests/{integration/test_channels → test_channels}/test_local_channel.py +4 -8
- parsl/tests/test_htex/test_block_manager_selector_unit.py +20 -0
- parsl/tests/test_htex/test_drain.py +6 -4
- parsl/tests/test_htex/test_manager_selector_by_block.py +53 -0
- parsl/tests/test_htex/test_resource_spec_validation.py +7 -0
- parsl/tests/test_providers/test_kubernetes_provider.py +102 -0
- parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +92 -0
- parsl/tests/test_serialization/test_3495_deserialize_managerlost.py +1 -1
- parsl/tests/test_utils/test_sanitize_dns.py +76 -0
- parsl/utils.py +78 -0
- parsl/version.py +1 -1
- {parsl-2024.10.14.data → parsl-2024.10.28.data}/scripts/interchange.py +8 -5
- {parsl-2024.10.14.data → parsl-2024.10.28.data}/scripts/process_worker_pool.py +1 -9
- {parsl-2024.10.14.dist-info → parsl-2024.10.28.dist-info}/METADATA +2 -2
- {parsl-2024.10.14.dist-info → parsl-2024.10.28.dist-info}/RECORD +37 -33
- parsl/tests/integration/test_channels/test_channels.py +0 -17
- {parsl-2024.10.14.data → parsl-2024.10.28.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.10.14.data → parsl-2024.10.28.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.10.14.dist-info → parsl-2024.10.28.dist-info}/LICENSE +0 -0
- {parsl-2024.10.14.dist-info → parsl-2024.10.28.dist-info}/WHEEL +0 -0
- {parsl-2024.10.14.dist-info → parsl-2024.10.28.dist-info}/entry_points.txt +0 -0
- {parsl-2024.10.14.dist-info → parsl-2024.10.28.dist-info}/top_level.txt +0 -0
parsl/channels/base.py
CHANGED
@@ -120,14 +120,3 @@ class Channel(metaclass=ABCMeta):
|
|
120
120
|
Path of directory to check.
|
121
121
|
"""
|
122
122
|
pass
|
123
|
-
|
124
|
-
@abstractmethod
|
125
|
-
def abspath(self, path: str) -> str:
|
126
|
-
"""Return the absolute path.
|
127
|
-
|
128
|
-
Parameters
|
129
|
-
----------
|
130
|
-
path : str
|
131
|
-
Path for which the absolute path will be returned.
|
132
|
-
"""
|
133
|
-
pass
|
parsl/channels/errors.py
CHANGED
@@ -1,7 +1,5 @@
|
|
1
1
|
''' Exceptions raised by Apps.
|
2
2
|
'''
|
3
|
-
from typing import Optional
|
4
|
-
|
5
3
|
from parsl.errors import ParslError
|
6
4
|
|
7
5
|
|
@@ -60,21 +58,6 @@ class BadPermsScriptPath(ChannelError):
|
|
60
58
|
super().__init__("User does not have permissions to access the script_dir", e, hostname)
|
61
59
|
|
62
60
|
|
63
|
-
class FileExists(ChannelError):
|
64
|
-
''' Push or pull of file over channel fails since a file of the name already
|
65
|
-
exists on the destination.
|
66
|
-
|
67
|
-
Contains:
|
68
|
-
reason(string)
|
69
|
-
e (paramiko exception object)
|
70
|
-
hostname (string)
|
71
|
-
'''
|
72
|
-
|
73
|
-
def __init__(self, e: Exception, hostname: str, filename: Optional[str] = None) -> None:
|
74
|
-
super().__init__("File name collision in channel transport phase: {}".format(filename),
|
75
|
-
e, hostname)
|
76
|
-
|
77
|
-
|
78
61
|
class AuthException(ChannelError):
|
79
62
|
''' An error raised during execution of an app.
|
80
63
|
What this exception contains depends entirely on context
|
parsl/channels/local/local.py
CHANGED
@@ -37,19 +37,16 @@ class LocalChannel(Channel, RepresentationMixin):
|
|
37
37
|
|
38
38
|
Args:
|
39
39
|
- cmd (string) : Commandline string to execute
|
40
|
-
- walltime (int) : walltime in seconds
|
40
|
+
- walltime (int) : walltime in seconds
|
41
41
|
|
42
42
|
Kwargs:
|
43
43
|
- envs (dict) : Dictionary of env variables. This will be used
|
44
44
|
to override the envs set at channel initialization.
|
45
45
|
|
46
46
|
Returns:
|
47
|
-
- retcode : Return code from the execution
|
47
|
+
- retcode : Return code from the execution
|
48
48
|
- stdout : stdout string
|
49
49
|
- stderr : stderr string
|
50
|
-
|
51
|
-
Raises:
|
52
|
-
None.
|
53
50
|
'''
|
54
51
|
current_env = copy.deepcopy(self._envs)
|
55
52
|
current_env.update(envs)
|
@@ -145,16 +142,6 @@ class LocalChannel(Channel, RepresentationMixin):
|
|
145
142
|
|
146
143
|
return os.makedirs(path, mode, exist_ok)
|
147
144
|
|
148
|
-
def abspath(self, path):
|
149
|
-
"""Return the absolute path.
|
150
|
-
|
151
|
-
Parameters
|
152
|
-
----------
|
153
|
-
path : str
|
154
|
-
Path for which the absolute path will be returned.
|
155
|
-
"""
|
156
|
-
return os.path.abspath(path)
|
157
|
-
|
158
145
|
@property
|
159
146
|
def script_dir(self):
|
160
147
|
return self._script_dir
|
@@ -162,5 +149,5 @@ class LocalChannel(Channel, RepresentationMixin):
|
|
162
149
|
@script_dir.setter
|
163
150
|
def script_dir(self, value):
|
164
151
|
if value is not None:
|
165
|
-
value =
|
152
|
+
value = os.path.abspath(value)
|
166
153
|
self._script_dir = value
|
parsl/channels/ssh/ssh.py
CHANGED
@@ -214,7 +214,6 @@ class DeprecatedSSHChannel(Channel, RepresentationMixin):
|
|
214
214
|
- str: Local path to file
|
215
215
|
|
216
216
|
Raises:
|
217
|
-
- FileExists : Name collision at local directory.
|
218
217
|
- FileCopyException : FileCopy failed.
|
219
218
|
'''
|
220
219
|
|
@@ -287,16 +286,6 @@ class DeprecatedSSHChannel(Channel, RepresentationMixin):
|
|
287
286
|
self.execute_wait('mkdir -p {}'.format(path))
|
288
287
|
self._valid_sftp_client().chmod(path, mode)
|
289
288
|
|
290
|
-
def abspath(self, path):
|
291
|
-
"""Return the absolute path on the remote side.
|
292
|
-
|
293
|
-
Parameters
|
294
|
-
----------
|
295
|
-
path : str
|
296
|
-
Path for which the absolute path will be returned.
|
297
|
-
"""
|
298
|
-
return self._valid_sftp_client().normalize(path)
|
299
|
-
|
300
289
|
@property
|
301
290
|
def script_dir(self):
|
302
291
|
return self._script_dir
|
parsl/dataflow/dflow.py
CHANGED
@@ -146,6 +146,11 @@ GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionP
|
|
146
146
|
|
147
147
|
encrypted : bool
|
148
148
|
Flag to enable/disable encryption (CurveZMQ). Default is False.
|
149
|
+
|
150
|
+
manager_selector: ManagerSelector
|
151
|
+
Determines what strategy the interchange uses to select managers during task distribution.
|
152
|
+
See API reference under "Manager Selectors" regarding the various manager selectors.
|
153
|
+
Default: 'RandomManagerSelector'
|
149
154
|
""" # Documentation for params used by both HTEx and MPIEx
|
150
155
|
|
151
156
|
|
@@ -341,15 +346,17 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
|
|
341
346
|
return self.logdir
|
342
347
|
|
343
348
|
def validate_resource_spec(self, resource_specification: dict):
|
344
|
-
"""HTEX
|
345
|
-
|
349
|
+
"""HTEX supports the following *Optional* resource specifications:
|
350
|
+
priority: lower value is higher priority"""
|
346
351
|
if resource_specification:
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
352
|
+
acceptable_fields = {'priority'}
|
353
|
+
keys = set(resource_specification.keys())
|
354
|
+
invalid_keys = keys - acceptable_fields
|
355
|
+
if invalid_keys:
|
356
|
+
message = "Task resource specification only accepts these types of resources: {}".format(
|
357
|
+
', '.join(acceptable_fields))
|
358
|
+
logger.error(message)
|
359
|
+
raise InvalidResourceSpecification(set(invalid_keys), message)
|
353
360
|
return
|
354
361
|
|
355
362
|
def initialize_scaling(self):
|
@@ -657,7 +664,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
|
|
657
664
|
except TypeError:
|
658
665
|
raise SerializationError(func.__name__)
|
659
666
|
|
660
|
-
msg = {"task_id": task_id, "buffer": fn_buf}
|
667
|
+
msg = {"task_id": task_id, "resource_spec": resource_specification, "buffer": fn_buf}
|
661
668
|
|
662
669
|
# Post task to the outgoing queue
|
663
670
|
self.outgoing_q.put(msg)
|
@@ -66,7 +66,7 @@ class Interchange:
|
|
66
66
|
If specified the interchange will only listen on this address for connections from workers
|
67
67
|
else, it binds to all addresses.
|
68
68
|
|
69
|
-
client_ports :
|
69
|
+
client_ports : tuple(int, int, int)
|
70
70
|
The ports at which the client can be reached
|
71
71
|
|
72
72
|
worker_ports : tuple(int, int)
|
@@ -104,7 +104,6 @@ class Interchange:
|
|
104
104
|
os.makedirs(self.logdir, exist_ok=True)
|
105
105
|
|
106
106
|
start_file_logger("{}/interchange.log".format(self.logdir), level=logging_level)
|
107
|
-
logger.propagate = False
|
108
107
|
logger.debug("Initializing Interchange process")
|
109
108
|
|
110
109
|
self.client_address = client_address
|
@@ -437,9 +436,13 @@ class Interchange:
|
|
437
436
|
logger.info(f"Manager {manager_id!r} has compatible Parsl version {msg['parsl_v']}")
|
438
437
|
logger.info(f"Manager {manager_id!r} has compatible Python version {msg['python_v'].rsplit('.', 1)[0]}")
|
439
438
|
elif msg['type'] == 'heartbeat':
|
440
|
-
|
441
|
-
|
442
|
-
|
439
|
+
manager = self._ready_managers.get(manager_id)
|
440
|
+
if manager:
|
441
|
+
manager['last_heartbeat'] = time.time()
|
442
|
+
logger.debug("Manager %r sent heartbeat via tasks connection", manager_id)
|
443
|
+
self.task_outgoing.send_multipart([manager_id, b'', PKL_HEARTBEAT_CODE])
|
444
|
+
else:
|
445
|
+
logger.warning("Received heartbeat via tasks connection for not-registered manager %r", manager_id)
|
443
446
|
elif msg['type'] == 'drain':
|
444
447
|
self._ready_managers[manager_id]['draining'] = True
|
445
448
|
logger.debug("Manager %r requested drain", manager_id)
|
@@ -19,7 +19,37 @@ class ManagerSelector(metaclass=ABCMeta):
|
|
19
19
|
|
20
20
|
class RandomManagerSelector(ManagerSelector):
|
21
21
|
|
22
|
+
"""Returns a shuffled list of interesting_managers
|
23
|
+
|
24
|
+
By default this strategy is used by the interchange. Works well
|
25
|
+
in distributing workloads equally across all available compute
|
26
|
+
resources. The random workload strategy is not effective in
|
27
|
+
conjunction with elastic scaling behavior as the even task
|
28
|
+
distribution does not allow the scaling down of blocks, leading
|
29
|
+
to wasted resource consumption.
|
30
|
+
"""
|
31
|
+
|
22
32
|
def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
|
23
33
|
c_manager_list = list(manager_list)
|
24
34
|
random.shuffle(c_manager_list)
|
25
35
|
return c_manager_list
|
36
|
+
|
37
|
+
|
38
|
+
class BlockIdManagerSelector(ManagerSelector):
|
39
|
+
|
40
|
+
"""Returns an interesting_managers list sorted by block ID
|
41
|
+
|
42
|
+
Observations:
|
43
|
+
1. BlockID manager selector helps with workloads that see a varying
|
44
|
+
amount of tasks over time. New blocks are prioritized with the
|
45
|
+
blockID manager selector, when used with 'htex_auto_scaling', results
|
46
|
+
in compute cost savings.
|
47
|
+
|
48
|
+
2. Doesn't really work with bag-of-tasks workloads. When all the tasks
|
49
|
+
are put into the queue upfront, all blocks operate at near full
|
50
|
+
utilization for the majority of the workload, which task goes where
|
51
|
+
doesn't really matter.
|
52
|
+
"""
|
53
|
+
|
54
|
+
def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
|
55
|
+
return sorted(manager_list, key=lambda x: (ready_managers[x]['block_id'] is not None, ready_managers[x]['block_id']))
|
@@ -362,7 +362,7 @@ class Manager:
|
|
362
362
|
if tasks == HEARTBEAT_CODE:
|
363
363
|
logger.debug("Got heartbeat from interchange")
|
364
364
|
elif tasks == DRAINED_CODE:
|
365
|
-
logger.info("Got
|
365
|
+
logger.info("Got fully drained message from interchange - setting kill flag")
|
366
366
|
kill_event.set()
|
367
367
|
else:
|
368
368
|
task_recv_counter += len(tasks)
|
@@ -650,14 +650,6 @@ def worker(
|
|
650
650
|
debug: bool,
|
651
651
|
mpi_launcher: str,
|
652
652
|
):
|
653
|
-
"""
|
654
|
-
|
655
|
-
Put request token into queue
|
656
|
-
Get task from task_queue
|
657
|
-
Pop request from queue
|
658
|
-
Put result into result_queue
|
659
|
-
"""
|
660
|
-
|
661
653
|
# override the global logger inherited from the __main__ process (which
|
662
654
|
# usually logs to manager.log) with one specific to this worker.
|
663
655
|
global logger
|
parsl/monitoring/db_manager.py
CHANGED
@@ -556,7 +556,7 @@ class DatabaseManager:
|
|
556
556
|
logger.debug("Checking STOP conditions: kill event: %s, queue has entries: %s",
|
557
557
|
kill_event.is_set(), logs_queue.qsize() != 0)
|
558
558
|
try:
|
559
|
-
x
|
559
|
+
x = logs_queue.get(timeout=0.1)
|
560
560
|
except queue.Empty:
|
561
561
|
continue
|
562
562
|
else:
|
parsl/monitoring/monitoring.py
CHANGED
@@ -16,7 +16,7 @@ from parsl.monitoring.errors import MonitoringHubStartError
|
|
16
16
|
from parsl.monitoring.message_type import MessageType
|
17
17
|
from parsl.monitoring.radios import MultiprocessingQueueRadioSender
|
18
18
|
from parsl.monitoring.router import router_starter
|
19
|
-
from parsl.monitoring.types import
|
19
|
+
from parsl.monitoring.types import TaggedMonitoringMessage
|
20
20
|
from parsl.multiprocessing import ForkProcess, SizedQueue
|
21
21
|
from parsl.process_loggers import wrap_with_logs
|
22
22
|
from parsl.serialize import deserialize
|
@@ -138,7 +138,7 @@ class MonitoringHub(RepresentationMixin):
|
|
138
138
|
self.exception_q: Queue[Tuple[str, str]]
|
139
139
|
self.exception_q = SizedQueue(maxsize=10)
|
140
140
|
|
141
|
-
self.resource_msgs: Queue[Union[
|
141
|
+
self.resource_msgs: Queue[Union[TaggedMonitoringMessage, Literal["STOP"]]]
|
142
142
|
self.resource_msgs = SizedQueue()
|
143
143
|
|
144
144
|
self.router_exit_event: ms.Event
|
@@ -237,7 +237,7 @@ class MonitoringHub(RepresentationMixin):
|
|
237
237
|
logger.debug("Finished waiting for router termination")
|
238
238
|
if len(exception_msgs) == 0:
|
239
239
|
logger.debug("Sending STOP to DBM")
|
240
|
-
self.resource_msgs.put(
|
240
|
+
self.resource_msgs.put("STOP")
|
241
241
|
else:
|
242
242
|
logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
|
243
243
|
logger.debug("Waiting for DB termination")
|
@@ -261,7 +261,7 @@ class MonitoringHub(RepresentationMixin):
|
|
261
261
|
|
262
262
|
|
263
263
|
@wrap_with_logs
|
264
|
-
def filesystem_receiver(logdir: str, q: "queue.Queue[
|
264
|
+
def filesystem_receiver(logdir: str, q: "queue.Queue[TaggedMonitoringMessage]", run_dir: str) -> None:
|
265
265
|
logger = set_file_logger("{}/monitoring_filesystem_radio.log".format(logdir),
|
266
266
|
name="monitoring_filesystem_radio",
|
267
267
|
level=logging.INFO)
|
@@ -288,7 +288,7 @@ def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]
|
|
288
288
|
message = deserialize(f.read())
|
289
289
|
logger.debug(f"Message received is: {message}")
|
290
290
|
assert isinstance(message, tuple)
|
291
|
-
q.put(cast(
|
291
|
+
q.put(cast(TaggedMonitoringMessage, message))
|
292
292
|
os.remove(full_path_filename)
|
293
293
|
except Exception:
|
294
294
|
logger.exception(f"Exception processing {filename} - probably will be retried next iteration")
|
parsl/monitoring/radios.py
CHANGED
@@ -58,7 +58,7 @@ class FilesystemRadioSender(MonitoringRadioSender):
|
|
58
58
|
|
59
59
|
tmp_filename = f"{self.tmp_path}/{unique_id}"
|
60
60
|
new_filename = f"{self.new_path}/{unique_id}"
|
61
|
-
buffer =
|
61
|
+
buffer = message
|
62
62
|
|
63
63
|
# this will write the message out then atomically
|
64
64
|
# move it into new/, so that a partially written
|
@@ -187,7 +187,7 @@ class MultiprocessingQueueRadioSender(MonitoringRadioSender):
|
|
187
187
|
self.queue = queue
|
188
188
|
|
189
189
|
def send(self, message: object) -> None:
|
190
|
-
self.queue.put(
|
190
|
+
self.queue.put(message)
|
191
191
|
|
192
192
|
|
193
193
|
class ZMQRadioSender(MonitoringRadioSender):
|
parsl/monitoring/router.py
CHANGED
@@ -14,7 +14,7 @@ import typeguard
|
|
14
14
|
import zmq
|
15
15
|
|
16
16
|
from parsl.log_utils import set_file_logger
|
17
|
-
from parsl.monitoring.types import
|
17
|
+
from parsl.monitoring.types import TaggedMonitoringMessage
|
18
18
|
from parsl.process_loggers import wrap_with_logs
|
19
19
|
from parsl.utils import setproctitle
|
20
20
|
|
@@ -125,7 +125,7 @@ class MonitoringRouter:
|
|
125
125
|
data, addr = self.udp_sock.recvfrom(2048)
|
126
126
|
resource_msg = pickle.loads(data)
|
127
127
|
self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
|
128
|
-
self.resource_msgs.put(
|
128
|
+
self.resource_msgs.put(resource_msg)
|
129
129
|
except socket.timeout:
|
130
130
|
pass
|
131
131
|
|
@@ -136,7 +136,7 @@ class MonitoringRouter:
|
|
136
136
|
data, addr = self.udp_sock.recvfrom(2048)
|
137
137
|
msg = pickle.loads(data)
|
138
138
|
self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
|
139
|
-
self.resource_msgs.put(
|
139
|
+
self.resource_msgs.put(msg)
|
140
140
|
last_msg_received_time = time.time()
|
141
141
|
except socket.timeout:
|
142
142
|
pass
|
@@ -160,10 +160,7 @@ class MonitoringRouter:
|
|
160
160
|
assert len(msg) >= 1, "ZMQ Receiver expects tuples of length at least 1, got {}".format(msg)
|
161
161
|
assert len(msg) == 2, "ZMQ Receiver expects message tuples of exactly length 2, got {}".format(msg)
|
162
162
|
|
163
|
-
|
164
|
-
msg_0 = (msg, 0)
|
165
|
-
|
166
|
-
self.resource_msgs.put(msg_0)
|
163
|
+
self.resource_msgs.put(msg)
|
167
164
|
except zmq.Again:
|
168
165
|
pass
|
169
166
|
except Exception:
|
parsl/monitoring/types.py
CHANGED
@@ -1,14 +1,11 @@
|
|
1
|
-
from typing import Any, Dict, Tuple
|
1
|
+
from typing import Any, Dict, Tuple
|
2
2
|
|
3
3
|
from typing_extensions import TypeAlias
|
4
4
|
|
5
5
|
from parsl.monitoring.message_type import MessageType
|
6
6
|
|
7
|
-
# A
|
8
|
-
#
|
9
|
-
# a TaggedMonitoringMessage, and then that can be further tagged with
|
10
|
-
# an often unused sender address, giving an AddressedMonitoringMessage.
|
7
|
+
# A MonitoringMessage dictionary can be tagged, giving a
|
8
|
+
# TaggedMonitoringMessage.
|
11
9
|
|
12
10
|
MonitoringMessage: TypeAlias = Dict[str, Any]
|
13
11
|
TaggedMonitoringMessage: TypeAlias = Tuple[MessageType, MonitoringMessage]
|
14
|
-
AddressedMonitoringMessage: TypeAlias = Tuple[TaggedMonitoringMessage, Union[str, int]]
|
parsl/providers/base.py
CHANGED
@@ -2,7 +2,6 @@ import logging
|
|
2
2
|
from abc import ABCMeta, abstractmethod, abstractproperty
|
3
3
|
from typing import Any, Dict, List, Optional
|
4
4
|
|
5
|
-
from parsl.channels.base import Channel
|
6
5
|
from parsl.jobs.states import JobStatus
|
7
6
|
|
8
7
|
logger = logging.getLogger(__name__)
|
@@ -154,18 +153,3 @@ class ExecutionProvider(metaclass=ABCMeta):
|
|
154
153
|
:return: the number of seconds to wait between calls to status()
|
155
154
|
"""
|
156
155
|
pass
|
157
|
-
|
158
|
-
|
159
|
-
class Channeled():
|
160
|
-
"""A marker type to indicate that parsl should manage a Channel for this provider"""
|
161
|
-
def __init__(self) -> None:
|
162
|
-
self.channel: Channel
|
163
|
-
pass
|
164
|
-
|
165
|
-
|
166
|
-
class MultiChanneled():
|
167
|
-
"""A marker type to indicate that parsl should manage multiple Channels for this provider"""
|
168
|
-
|
169
|
-
def __init__(self) -> None:
|
170
|
-
self.channels: List[Channel]
|
171
|
-
pass
|
@@ -1,10 +1,5 @@
|
|
1
1
|
import logging
|
2
|
-
import
|
3
|
-
|
4
|
-
from parsl.providers.kubernetes.template import template_string
|
5
|
-
|
6
|
-
logger = logging.getLogger(__name__)
|
7
|
-
|
2
|
+
import uuid
|
8
3
|
from typing import Any, Dict, List, Optional, Tuple
|
9
4
|
|
10
5
|
import typeguard
|
@@ -12,7 +7,8 @@ import typeguard
|
|
12
7
|
from parsl.errors import OptionalModuleMissing
|
13
8
|
from parsl.jobs.states import JobState, JobStatus
|
14
9
|
from parsl.providers.base import ExecutionProvider
|
15
|
-
from parsl.
|
10
|
+
from parsl.providers.kubernetes.template import template_string
|
11
|
+
from parsl.utils import RepresentationMixin, sanitize_dns_subdomain_rfc1123
|
16
12
|
|
17
13
|
try:
|
18
14
|
from kubernetes import client, config
|
@@ -20,6 +16,8 @@ try:
|
|
20
16
|
except (ImportError, NameError, FileNotFoundError):
|
21
17
|
_kubernetes_enabled = False
|
22
18
|
|
19
|
+
logger = logging.getLogger(__name__)
|
20
|
+
|
23
21
|
translate_table = {
|
24
22
|
'Running': JobState.RUNNING,
|
25
23
|
'Pending': JobState.PENDING,
|
@@ -161,7 +159,7 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
|
|
161
159
|
self.resources: Dict[object, Dict[str, Any]]
|
162
160
|
self.resources = {}
|
163
161
|
|
164
|
-
def submit(self, cmd_string, tasks_per_node, job_name="parsl"):
|
162
|
+
def submit(self, cmd_string: str, tasks_per_node: int, job_name: str = "parsl.kube"):
|
165
163
|
""" Submit a job
|
166
164
|
Args:
|
167
165
|
- cmd_string :(String) - Name of the container to initiate
|
@@ -173,15 +171,19 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
|
|
173
171
|
Returns:
|
174
172
|
- job_id: (string) Identifier for the job
|
175
173
|
"""
|
174
|
+
job_id = uuid.uuid4().hex[:8]
|
176
175
|
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
176
|
+
pod_name = self.pod_name or job_name
|
177
|
+
try:
|
178
|
+
pod_name = sanitize_dns_subdomain_rfc1123(pod_name)
|
179
|
+
except ValueError:
|
180
|
+
logger.warning(
|
181
|
+
f"Invalid pod name '{pod_name}' for job '{job_id}', falling back to 'parsl.kube'"
|
182
|
+
)
|
183
|
+
pod_name = "parsl.kube"
|
184
|
+
pod_name = pod_name[:253 - 1 - len(job_id)] # Leave room for the job ID
|
185
|
+
pod_name = pod_name.rstrip(".-") # Remove trailing dot or hyphen after trim
|
186
|
+
pod_name = f"{pod_name}.{job_id}"
|
185
187
|
|
186
188
|
formatted_cmd = template_string.format(command=cmd_string,
|
187
189
|
worker_init=self.worker_init)
|
@@ -189,14 +191,14 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
|
|
189
191
|
logger.debug("Pod name: %s", pod_name)
|
190
192
|
self._create_pod(image=self.image,
|
191
193
|
pod_name=pod_name,
|
192
|
-
|
194
|
+
job_id=job_id,
|
193
195
|
cmd_string=formatted_cmd,
|
194
196
|
volumes=self.persistent_volumes,
|
195
197
|
service_account_name=self.service_account_name,
|
196
198
|
annotations=self.annotations)
|
197
|
-
self.resources[
|
199
|
+
self.resources[job_id] = {'status': JobStatus(JobState.RUNNING), 'pod_name': pod_name}
|
198
200
|
|
199
|
-
return
|
201
|
+
return job_id
|
200
202
|
|
201
203
|
def status(self, job_ids):
|
202
204
|
""" Get the status of a list of jobs identified by the job identifiers
|
@@ -212,6 +214,9 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
|
|
212
214
|
self._status()
|
213
215
|
return [self.resources[jid]['status'] for jid in job_ids]
|
214
216
|
|
217
|
+
def _get_pod_name(self, job_id: str) -> str:
|
218
|
+
return self.resources[job_id]['pod_name']
|
219
|
+
|
215
220
|
def cancel(self, job_ids):
|
216
221
|
""" Cancels the jobs specified by a list of job ids
|
217
222
|
Args:
|
@@ -221,7 +226,8 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
|
|
221
226
|
"""
|
222
227
|
for job in job_ids:
|
223
228
|
logger.debug("Terminating job/pod: {0}".format(job))
|
224
|
-
self.
|
229
|
+
pod_name = self._get_pod_name(job)
|
230
|
+
self._delete_pod(pod_name)
|
225
231
|
|
226
232
|
self.resources[job]['status'] = JobStatus(JobState.CANCELLED)
|
227
233
|
rets = [True for i in job_ids]
|
@@ -242,7 +248,8 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
|
|
242
248
|
for jid in to_poll_job_ids:
|
243
249
|
phase = None
|
244
250
|
try:
|
245
|
-
|
251
|
+
pod_name = self._get_pod_name(jid)
|
252
|
+
pod = self.kube_client.read_namespaced_pod(name=pod_name, namespace=self.namespace)
|
246
253
|
except Exception:
|
247
254
|
logger.exception("Failed to poll pod {} status, most likely because pod was terminated".format(jid))
|
248
255
|
if self.resources[jid]['status'] is JobStatus(JobState.RUNNING):
|
@@ -257,10 +264,10 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
|
|
257
264
|
self.resources[jid]['status'] = JobStatus(status)
|
258
265
|
|
259
266
|
def _create_pod(self,
|
260
|
-
image,
|
261
|
-
pod_name,
|
262
|
-
|
263
|
-
port=80,
|
267
|
+
image: str,
|
268
|
+
pod_name: str,
|
269
|
+
job_id: str,
|
270
|
+
port: int = 80,
|
264
271
|
cmd_string=None,
|
265
272
|
volumes=[],
|
266
273
|
service_account_name=None,
|
@@ -269,7 +276,7 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
|
|
269
276
|
Args:
|
270
277
|
- image (string) : Docker image to launch
|
271
278
|
- pod_name (string) : Name of the pod
|
272
|
-
-
|
279
|
+
- job_id (string) : Job ID
|
273
280
|
KWargs:
|
274
281
|
- port (integer) : Container port
|
275
282
|
Returns:
|
@@ -299,7 +306,7 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
|
|
299
306
|
)
|
300
307
|
# Configure Pod template container
|
301
308
|
container = client.V1Container(
|
302
|
-
name=
|
309
|
+
name=job_id,
|
303
310
|
image=image,
|
304
311
|
resources=resources,
|
305
312
|
ports=[client.V1ContainerPort(container_port=port)],
|
@@ -322,7 +329,7 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
|
|
322
329
|
claim_name=volume[0])))
|
323
330
|
|
324
331
|
metadata = client.V1ObjectMeta(name=pod_name,
|
325
|
-
labels={"
|
332
|
+
labels={"parsl-job-id": job_id},
|
326
333
|
annotations=annotations)
|
327
334
|
spec = client.V1PodSpec(containers=[container],
|
328
335
|
image_pull_secrets=[secret],
|
@@ -1,6 +1,9 @@
|
|
1
|
+
import pytest
|
2
|
+
|
1
3
|
from parsl.channels.local.local import LocalChannel
|
2
4
|
|
3
5
|
|
6
|
+
@pytest.mark.local
|
4
7
|
def test_env():
|
5
8
|
''' Regression testing for issue #27
|
6
9
|
'''
|
@@ -15,9 +18,8 @@ def test_env():
|
|
15
18
|
x = [s for s in stdout if s.startswith("HOME=")]
|
16
19
|
assert x, "HOME not found"
|
17
20
|
|
18
|
-
print("RC:{} \nSTDOUT:{} \nSTDERR:{}".format(rc, stdout, stderr))
|
19
|
-
|
20
21
|
|
22
|
+
@pytest.mark.local
|
21
23
|
def test_env_mod():
|
22
24
|
''' Testing for env update at execute time.
|
23
25
|
'''
|
@@ -34,9 +36,3 @@ def test_env_mod():
|
|
34
36
|
|
35
37
|
x = [s for s in stdout if s.startswith("TEST_ENV=fooo")]
|
36
38
|
assert x, "User set env missing"
|
37
|
-
|
38
|
-
|
39
|
-
if __name__ == "__main__":
|
40
|
-
|
41
|
-
test_env()
|
42
|
-
test_env_mod()
|
@@ -0,0 +1,20 @@
|
|
1
|
+
import pytest
|
2
|
+
|
3
|
+
from parsl.executors.high_throughput.manager_record import ManagerRecord
|
4
|
+
from parsl.executors.high_throughput.manager_selector import BlockIdManagerSelector
|
5
|
+
|
6
|
+
|
7
|
+
@pytest.mark.local
|
8
|
+
def test_sort_managers():
|
9
|
+
ready_managers = {
|
10
|
+
b'manager1': {'block_id': 1},
|
11
|
+
b'manager2': {'block_id': None},
|
12
|
+
b'manager3': {'block_id': 3},
|
13
|
+
b'manager4': {'block_id': 2}
|
14
|
+
}
|
15
|
+
|
16
|
+
manager_list = {b'manager1', b'manager2', b'manager3', b'manager4'}
|
17
|
+
expected_sorted_list = [b'manager2', b'manager1', b'manager4', b'manager3']
|
18
|
+
manager_selector = BlockIdManagerSelector()
|
19
|
+
sorted_managers = manager_selector.sort_managers(ready_managers, manager_list)
|
20
|
+
assert sorted_managers == expected_sorted_list
|