parsl 2024.10.14__py3-none-any.whl → 2024.10.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. parsl/channels/base.py +0 -11
  2. parsl/channels/errors.py +0 -17
  3. parsl/channels/local/local.py +3 -16
  4. parsl/channels/ssh/ssh.py +0 -11
  5. parsl/dataflow/dflow.py +1 -1
  6. parsl/executors/high_throughput/executor.py +16 -9
  7. parsl/executors/high_throughput/interchange.py +8 -5
  8. parsl/executors/high_throughput/manager_selector.py +30 -0
  9. parsl/executors/high_throughput/process_worker_pool.py +1 -9
  10. parsl/monitoring/db_manager.py +1 -1
  11. parsl/monitoring/monitoring.py +5 -5
  12. parsl/monitoring/radios.py +2 -2
  13. parsl/monitoring/router.py +4 -7
  14. parsl/monitoring/types.py +3 -6
  15. parsl/providers/base.py +0 -16
  16. parsl/providers/kubernetes/kube.py +35 -28
  17. parsl/tests/{integration/test_channels → test_channels}/test_local_channel.py +4 -8
  18. parsl/tests/test_htex/test_block_manager_selector_unit.py +20 -0
  19. parsl/tests/test_htex/test_drain.py +6 -4
  20. parsl/tests/test_htex/test_manager_selector_by_block.py +53 -0
  21. parsl/tests/test_htex/test_resource_spec_validation.py +7 -0
  22. parsl/tests/test_providers/test_kubernetes_provider.py +102 -0
  23. parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +92 -0
  24. parsl/tests/test_serialization/test_3495_deserialize_managerlost.py +1 -1
  25. parsl/tests/test_utils/test_sanitize_dns.py +76 -0
  26. parsl/utils.py +78 -0
  27. parsl/version.py +1 -1
  28. {parsl-2024.10.14.data → parsl-2024.10.28.data}/scripts/interchange.py +8 -5
  29. {parsl-2024.10.14.data → parsl-2024.10.28.data}/scripts/process_worker_pool.py +1 -9
  30. {parsl-2024.10.14.dist-info → parsl-2024.10.28.dist-info}/METADATA +2 -2
  31. {parsl-2024.10.14.dist-info → parsl-2024.10.28.dist-info}/RECORD +37 -33
  32. parsl/tests/integration/test_channels/test_channels.py +0 -17
  33. {parsl-2024.10.14.data → parsl-2024.10.28.data}/scripts/exec_parsl_function.py +0 -0
  34. {parsl-2024.10.14.data → parsl-2024.10.28.data}/scripts/parsl_coprocess.py +0 -0
  35. {parsl-2024.10.14.dist-info → parsl-2024.10.28.dist-info}/LICENSE +0 -0
  36. {parsl-2024.10.14.dist-info → parsl-2024.10.28.dist-info}/WHEEL +0 -0
  37. {parsl-2024.10.14.dist-info → parsl-2024.10.28.dist-info}/entry_points.txt +0 -0
  38. {parsl-2024.10.14.dist-info → parsl-2024.10.28.dist-info}/top_level.txt +0 -0
parsl/channels/base.py CHANGED
@@ -120,14 +120,3 @@ class Channel(metaclass=ABCMeta):
         Path of directory to check.
         """
         pass
-
-    @abstractmethod
-    def abspath(self, path: str) -> str:
-        """Return the absolute path.
-
-        Parameters
-        ----------
-        path : str
-            Path for which the absolute path will be returned.
-        """
-        pass
parsl/channels/errors.py CHANGED
@@ -1,7 +1,5 @@
 ''' Exceptions raise by Apps.
 '''
-from typing import Optional
-
 from parsl.errors import ParslError
 
 
@@ -60,21 +58,6 @@ class BadPermsScriptPath(ChannelError):
         super().__init__("User does not have permissions to access the script_dir", e, hostname)
 
 
-class FileExists(ChannelError):
-    ''' Push or pull of file over channel fails since a file of the name already
-    exists on the destination.
-
-    Contains:
-    reason(string)
-    e (paramiko exception object)
-    hostname (string)
-    '''
-
-    def __init__(self, e: Exception, hostname: str, filename: Optional[str] = None) -> None:
-        super().__init__("File name collision in channel transport phase: {}".format(filename),
-                         e, hostname)
-
-
 class AuthException(ChannelError):
     ''' An error raised during execution of an app.
     What this exception contains depends entirely on context
parsl/channels/local/local.py CHANGED
@@ -37,19 +37,16 @@ class LocalChannel(Channel, RepresentationMixin):
 
         Args:
             - cmd (string) : Commandline string to execute
-            - walltime (int) : walltime in seconds, this is not really used now.
+            - walltime (int) : walltime in seconds
 
         Kwargs:
             - envs (dict) : Dictionary of env variables. This will be used
              to override the envs set at channel initialization.
 
         Returns:
-            - retcode : Return code from the execution, -1 on fail
+            - retcode : Return code from the execution
            - stdout  : stdout string
            - stderr  : stderr string
-
-        Raises:
-        None.
        '''
        current_env = copy.deepcopy(self._envs)
        current_env.update(envs)
@@ -145,16 +142,6 @@ class LocalChannel(Channel, RepresentationMixin):
 
        return os.makedirs(path, mode, exist_ok)
 
-    def abspath(self, path):
-        """Return the absolute path.
-
-        Parameters
-        ----------
-        path : str
-            Path for which the absolute path will be returned.
-        """
-        return os.path.abspath(path)
-
    @property
    def script_dir(self):
        return self._script_dir
@@ -162,5 +149,5 @@ class LocalChannel(Channel, RepresentationMixin):
    @script_dir.setter
    def script_dir(self, value):
        if value is not None:
-            value = self.abspath(value)
+            value = os.path.abspath(value)
        self._script_dir = value
parsl/channels/ssh/ssh.py CHANGED
@@ -214,7 +214,6 @@ class DeprecatedSSHChannel(Channel, RepresentationMixin):
             - str: Local path to file
 
         Raises:
-            - FileExists : Name collision at local directory.
             - FileCopyException : FileCopy failed.
         '''
 
@@ -287,16 +286,6 @@ class DeprecatedSSHChannel(Channel, RepresentationMixin):
         self.execute_wait('mkdir -p {}'.format(path))
         self._valid_sftp_client().chmod(path, mode)
 
-    def abspath(self, path):
-        """Return the absolute path on the remote side.
-
-        Parameters
-        ----------
-        path : str
-            Path for which the absolute path will be returned.
-        """
-        return self._valid_sftp_client().normalize(path)
-
     @property
     def script_dir(self):
         return self._script_dir
parsl/dataflow/dflow.py CHANGED
@@ -987,7 +987,7 @@ class DataFlowKernel:
             - app_kwargs (dict) : Rest of the kwargs to the fn passed as dict.
 
         Returns:
-            (AppFuture) [DataFutures,]
+            AppFuture
 
         """
 
parsl/executors/high_throughput/executor.py CHANGED
@@ -146,6 +146,11 @@ GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionP
 
     encrypted : bool
         Flag to enable/disable encryption (CurveZMQ). Default is False.
+
+    manager_selector: ManagerSelector
+        Determines what strategy the interchange uses to select managers during task distribution.
+        See API reference under "Manager Selectors" regarding the various manager selectors.
+        Default: 'RandomManagerSelector'
 """  # Documentation for params used by both HTEx and MPIEx
 
 
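As a usage sketch for the new parameter (the import path follows the manager_selector module added in this release; the surrounding config is an assumption, not taken from the diff):

    from parsl.config import Config
    from parsl.executors import HighThroughputExecutor
    from parsl.executors.high_throughput.manager_selector import BlockIdManagerSelector

    config = Config(
        executors=[
            HighThroughputExecutor(
                label="htex",
                # Overrides the default RandomManagerSelector.
                manager_selector=BlockIdManagerSelector(),
            )
        ]
    )
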
@@ -341,15 +346,17 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
         return self.logdir
 
     def validate_resource_spec(self, resource_specification: dict):
-        """HTEX does not support *any* resource_specification options and
-        will raise InvalidResourceSpecification is any are passed to it"""
+        """HTEX supports the following *Optional* resource specifications:
+        priority: lower value is higher priority"""
         if resource_specification:
-            raise InvalidResourceSpecification(
-                set(resource_specification.keys()),
-                ("HTEX does not support the supplied resource_specifications."
-                 "For MPI applications consider using the MPIExecutor. "
-                 "For specifications for core count/memory/walltime, consider using WorkQueueExecutor. ")
-            )
+            acceptable_fields = {'priority'}
+            keys = set(resource_specification.keys())
+            invalid_keys = keys - acceptable_fields
+            if invalid_keys:
+                message = "Task resource specification only accepts these types of resources: {}".format(
+                    ', '.join(acceptable_fields))
+                logger.error(message)
+                raise InvalidResourceSpecification(set(invalid_keys), message)
         return
 
     def initialize_scaling(self):
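A minimal sketch of what a task carrying the new optional priority field might look like at the app layer, assuming the usual parsl_resource_specification keyword mechanism (the exact wiring is an assumption; it is not shown in this diff):

    from parsl import python_app

    @python_app
    def double(x, parsl_resource_specification={}):
        return x * 2

    # Lower value means higher priority, per the docstring above.
    future = double(7, parsl_resource_specification={'priority': 1})
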
@@ -657,7 +664,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
         except TypeError:
             raise SerializationError(func.__name__)
 
-        msg = {"task_id": task_id, "buffer": fn_buf}
+        msg = {"task_id": task_id, "resource_spec": resource_specification, "buffer": fn_buf}
 
         # Post task to the outgoing queue
         self.outgoing_q.put(msg)
parsl/executors/high_throughput/interchange.py CHANGED
@@ -66,7 +66,7 @@ class Interchange:
         If specified the interchange will only listen on this address for connections from workers
         else, it binds to all addresses.
 
-    client_ports : triple(int, int, int)
+    client_ports : tuple(int, int, int)
         The ports at which the client can be reached
 
     worker_ports : tuple(int, int)
@@ -104,7 +104,6 @@ class Interchange:
         os.makedirs(self.logdir, exist_ok=True)
 
         start_file_logger("{}/interchange.log".format(self.logdir), level=logging_level)
-        logger.propagate = False
         logger.debug("Initializing Interchange process")
 
         self.client_address = client_address
@@ -437,9 +436,13 @@ class Interchange:
             logger.info(f"Manager {manager_id!r} has compatible Parsl version {msg['parsl_v']}")
             logger.info(f"Manager {manager_id!r} has compatible Python version {msg['python_v'].rsplit('.', 1)[0]}")
         elif msg['type'] == 'heartbeat':
-            self._ready_managers[manager_id]['last_heartbeat'] = time.time()
-            logger.debug("Manager %r sent heartbeat via tasks connection", manager_id)
-            self.task_outgoing.send_multipart([manager_id, b'', PKL_HEARTBEAT_CODE])
+            manager = self._ready_managers.get(manager_id)
+            if manager:
+                manager['last_heartbeat'] = time.time()
+                logger.debug("Manager %r sent heartbeat via tasks connection", manager_id)
+                self.task_outgoing.send_multipart([manager_id, b'', PKL_HEARTBEAT_CODE])
+            else:
+                logger.warning("Received heartbeat via tasks connection for not-registered manager %r", manager_id)
         elif msg['type'] == 'drain':
             self._ready_managers[manager_id]['draining'] = True
             logger.debug("Manager %r requested drain", manager_id)
parsl/executors/high_throughput/manager_selector.py CHANGED
@@ -19,7 +19,37 @@ class ManagerSelector(metaclass=ABCMeta):
 
 class RandomManagerSelector(ManagerSelector):
 
+    """Returns a shuffled list of interesting_managers
+
+    By default this strategy is used by the interchange. Works well
+    in distributing workloads equally across all available compute
+    resources. The random workload strategy is not effective in
+    conjunction with elastic scaling behavior as the even task
+    distribution does not allow the scaling down of blocks, leading
+    to wasted resource consumption.
+    """
+
     def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
         c_manager_list = list(manager_list)
         random.shuffle(c_manager_list)
         return c_manager_list
+
+
+class BlockIdManagerSelector(ManagerSelector):
+
+    """Returns an interesting_managers list sorted by block ID
+
+    Observations:
+    1. BlockID manager selector helps with workloads that see a varying
+       amount of tasks over time. New blocks are prioritized with the
+       blockID manager selector; when used with 'htex_auto_scaling', this
+       results in compute cost savings.
+
+    2. Doesn't really work with bag-of-tasks workloads. When all the tasks
+       are put into the queue upfront, all blocks operate at near full
+       utilization for the majority of the workload, and which task goes
+       where doesn't really matter.
+    """
+
+    def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
+        return sorted(manager_list, key=lambda x: (ready_managers[x]['block_id'] is not None, ready_managers[x]['block_id']))
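The two-element sort key above makes managers whose block_id is still None sort ahead of all numbered blocks without ever comparing None against an int; for example:

    >>> sorted([3, None, 1, 2], key=lambda x: (x is not None, x))
    [None, 1, 2, 3]
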
parsl/executors/high_throughput/process_worker_pool.py CHANGED
@@ -362,7 +362,7 @@ class Manager:
             if tasks == HEARTBEAT_CODE:
                 logger.debug("Got heartbeat from interchange")
             elif tasks == DRAINED_CODE:
-                logger.info("Got fulled drained message from interchange - setting kill flag")
+                logger.info("Got fully drained message from interchange - setting kill flag")
                 kill_event.set()
             else:
                 task_recv_counter += len(tasks)
@@ -650,14 +650,6 @@ def worker(
         debug: bool,
         mpi_launcher: str,
 ):
-    """
-
-    Put request token into queue
-    Get task from task_queue
-    Pop request from queue
-    Put result into result_queue
-    """
-
     # override the global logger inherited from the __main__ process (which
     # usually logs to manager.log) with one specific to this worker.
     global logger
parsl/monitoring/db_manager.py CHANGED
@@ -556,7 +556,7 @@ class DatabaseManager:
             logger.debug("Checking STOP conditions: kill event: %s, queue has entries: %s",
                          kill_event.is_set(), logs_queue.qsize() != 0)
             try:
-                x, addr = logs_queue.get(timeout=0.1)
+                x = logs_queue.get(timeout=0.1)
             except queue.Empty:
                 continue
             else:
parsl/monitoring/monitoring.py CHANGED
@@ -16,7 +16,7 @@ from parsl.monitoring.errors import MonitoringHubStartError
 from parsl.monitoring.message_type import MessageType
 from parsl.monitoring.radios import MultiprocessingQueueRadioSender
 from parsl.monitoring.router import router_starter
-from parsl.monitoring.types import AddressedMonitoringMessage
+from parsl.monitoring.types import TaggedMonitoringMessage
 from parsl.multiprocessing import ForkProcess, SizedQueue
 from parsl.process_loggers import wrap_with_logs
 from parsl.serialize import deserialize
@@ -138,7 +138,7 @@ class MonitoringHub(RepresentationMixin):
         self.exception_q: Queue[Tuple[str, str]]
         self.exception_q = SizedQueue(maxsize=10)
 
-        self.resource_msgs: Queue[Union[AddressedMonitoringMessage, Tuple[Literal["STOP"], Literal[0]]]]
+        self.resource_msgs: Queue[Union[TaggedMonitoringMessage, Literal["STOP"]]]
         self.resource_msgs = SizedQueue()
 
         self.router_exit_event: ms.Event
@@ -237,7 +237,7 @@ class MonitoringHub(RepresentationMixin):
         logger.debug("Finished waiting for router termination")
         if len(exception_msgs) == 0:
             logger.debug("Sending STOP to DBM")
-            self.resource_msgs.put(("STOP", 0))
+            self.resource_msgs.put("STOP")
         else:
             logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
         logger.debug("Waiting for DB termination")
@@ -261,7 +261,7 @@ class MonitoringHub(RepresentationMixin):
 
 
 @wrap_with_logs
-def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]", run_dir: str) -> None:
+def filesystem_receiver(logdir: str, q: "queue.Queue[TaggedMonitoringMessage]", run_dir: str) -> None:
     logger = set_file_logger("{}/monitoring_filesystem_radio.log".format(logdir),
                              name="monitoring_filesystem_radio",
                              level=logging.INFO)
@@ -288,7 +288,7 @@ def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]
                 message = deserialize(f.read())
                 logger.debug(f"Message received is: {message}")
                 assert isinstance(message, tuple)
-                q.put(cast(AddressedMonitoringMessage, message))
+                q.put(cast(TaggedMonitoringMessage, message))
                 os.remove(full_path_filename)
             except Exception:
                 logger.exception(f"Exception processing {filename} - probably will be retried next iteration")
parsl/monitoring/radios.py CHANGED
@@ -58,7 +58,7 @@ class FilesystemRadioSender(MonitoringRadioSender):
 
         tmp_filename = f"{self.tmp_path}/{unique_id}"
         new_filename = f"{self.new_path}/{unique_id}"
-        buffer = (message, "NA")
+        buffer = message
 
         # this will write the message out then atomically
         # move it into new/, so that a partially written
@@ -187,7 +187,7 @@ class MultiprocessingQueueRadioSender(MonitoringRadioSender):
         self.queue = queue
 
     def send(self, message: object) -> None:
-        self.queue.put((message, 0))
+        self.queue.put(message)
 
 
 class ZMQRadioSender(MonitoringRadioSender):
parsl/monitoring/router.py CHANGED
@@ -14,7 +14,7 @@ import typeguard
 import zmq
 
 from parsl.log_utils import set_file_logger
-from parsl.monitoring.types import AddressedMonitoringMessage, TaggedMonitoringMessage
+from parsl.monitoring.types import TaggedMonitoringMessage
 from parsl.process_loggers import wrap_with_logs
 from parsl.utils import setproctitle
 
@@ -125,7 +125,7 @@ class MonitoringRouter:
                 data, addr = self.udp_sock.recvfrom(2048)
                 resource_msg = pickle.loads(data)
                 self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
-                self.resource_msgs.put((resource_msg, addr))
+                self.resource_msgs.put(resource_msg)
             except socket.timeout:
                 pass
 
@@ -136,7 +136,7 @@ class MonitoringRouter:
                 data, addr = self.udp_sock.recvfrom(2048)
                 msg = pickle.loads(data)
                 self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
-                self.resource_msgs.put((msg, addr))
+                self.resource_msgs.put(msg)
                 last_msg_received_time = time.time()
             except socket.timeout:
                 pass
@@ -160,10 +160,7 @@ class MonitoringRouter:
                 assert len(msg) >= 1, "ZMQ Receiver expects tuples of length at least 1, got {}".format(msg)
                 assert len(msg) == 2, "ZMQ Receiver expects message tuples of exactly length 2, got {}".format(msg)
 
-                msg_0: AddressedMonitoringMessage
-                msg_0 = (msg, 0)
-
-                self.resource_msgs.put(msg_0)
+                self.resource_msgs.put(msg)
             except zmq.Again:
                 pass
             except Exception:
parsl/monitoring/types.py CHANGED
@@ -1,14 +1,11 @@
-from typing import Any, Dict, Tuple, Union
+from typing import Any, Dict, Tuple
 
 from typing_extensions import TypeAlias
 
 from parsl.monitoring.message_type import MessageType
 
-# A basic parsl monitoring message is wrapped by up to two wrappers:
-# The basic monitoring message dictionary can first be tagged, giving
-# a TaggedMonitoringMessage, and then that can be further tagged with
-# an often unused sender address, giving an AddressedMonitoringMessage.
+# A MonitoringMessage dictionary can be tagged, giving a
+# TaggedMonitoringMessage.
 
 MonitoringMessage: TypeAlias = Dict[str, Any]
 TaggedMonitoringMessage: TypeAlias = Tuple[MessageType, MonitoringMessage]
-AddressedMonitoringMessage: TypeAlias = Tuple[TaggedMonitoringMessage, Union[str, int]]
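After this change a monitoring queue entry is just the (MessageType, dict) pair itself, with no outer addressed wrapper; a hypothetical value (field names illustrative only):

    from parsl.monitoring.message_type import MessageType

    msg = (MessageType.TASK_INFO, {"run_id": "abc123", "task_id": 3})
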
parsl/providers/base.py CHANGED
@@ -2,7 +2,6 @@ import logging
 from abc import ABCMeta, abstractmethod, abstractproperty
 from typing import Any, Dict, List, Optional
 
-from parsl.channels.base import Channel
 from parsl.jobs.states import JobStatus
 
 logger = logging.getLogger(__name__)
@@ -154,18 +153,3 @@ class ExecutionProvider(metaclass=ABCMeta):
         :return: the number of seconds to wait between calls to status()
         """
         pass
-
-
-class Channeled():
-    """A marker type to indicate that parsl should manage a Channel for this provider"""
-    def __init__(self) -> None:
-        self.channel: Channel
-        pass
-
-
-class MultiChanneled():
-    """A marker type to indicate that parsl should manage multiple Channels for this provider"""
-
-    def __init__(self) -> None:
-        self.channels: List[Channel]
-        pass
parsl/providers/kubernetes/kube.py CHANGED
@@ -1,10 +1,5 @@
 import logging
-import time
-
-from parsl.providers.kubernetes.template import template_string
-
-logger = logging.getLogger(__name__)
-
+import uuid
 from typing import Any, Dict, List, Optional, Tuple
 
 import typeguard
@@ -12,7 +7,8 @@ import typeguard
 from parsl.errors import OptionalModuleMissing
 from parsl.jobs.states import JobState, JobStatus
 from parsl.providers.base import ExecutionProvider
-from parsl.utils import RepresentationMixin
+from parsl.providers.kubernetes.template import template_string
+from parsl.utils import RepresentationMixin, sanitize_dns_subdomain_rfc1123
 
 try:
     from kubernetes import client, config
@@ -20,6 +16,8 @@ try:
 except (ImportError, NameError, FileNotFoundError):
     _kubernetes_enabled = False
 
+logger = logging.getLogger(__name__)
+
 translate_table = {
     'Running': JobState.RUNNING,
     'Pending': JobState.PENDING,
@@ -161,7 +159,7 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
         self.resources: Dict[object, Dict[str, Any]]
         self.resources = {}
 
-    def submit(self, cmd_string, tasks_per_node, job_name="parsl"):
+    def submit(self, cmd_string: str, tasks_per_node: int, job_name: str = "parsl.kube"):
         """ Submit a job
         Args:
             - cmd_string :(String) - Name of the container to initiate
@@ -173,15 +171,19 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
         Returns:
             - job_id: (string) Identifier for the job
         """
+        job_id = uuid.uuid4().hex[:8]
 
-        cur_timestamp = str(time.time() * 1000).split(".")[0]
-        job_name = "{0}-{1}".format(job_name, cur_timestamp)
-
-        if not self.pod_name:
-            pod_name = '{}'.format(job_name)
-        else:
-            pod_name = '{}-{}'.format(self.pod_name,
-                                      cur_timestamp)
+        pod_name = self.pod_name or job_name
+        try:
+            pod_name = sanitize_dns_subdomain_rfc1123(pod_name)
+        except ValueError:
+            logger.warning(
+                f"Invalid pod name '{pod_name}' for job '{job_id}', falling back to 'parsl.kube'"
+            )
+            pod_name = "parsl.kube"
+        pod_name = pod_name[:253 - 1 - len(job_id)]  # Leave room for the job ID
+        pod_name = pod_name.rstrip(".-")  # Remove trailing dot or hyphen after trim
+        pod_name = f"{pod_name}.{job_id}"
 
         formatted_cmd = template_string.format(command=cmd_string,
                                                worker_init=self.worker_init)
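The sanitize_dns_subdomain_rfc1123 helper used above lives in parsl/utils.py (file 26, +78 lines, not shown in this listing). As a rough sketch of what RFC 1123 subdomain sanitization involves — an assumption about the helper, not its actual implementation:

    import re

    def sanitize_dns_subdomain_rfc1123(raw_string: str) -> str:
        # Assumed behavior: lowercase, map invalid characters to '-',
        # trim to 253 chars, and require alphanumeric start/end.
        name = re.sub(r"[^a-z0-9.-]", "-", raw_string.lower())
        name = name.strip(".-")[:253].rstrip(".-")
        if not name:
            raise ValueError(f"Could not sanitize '{raw_string}' to a DNS subdomain")
        return name
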
@@ -189,14 +191,14 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
         logger.debug("Pod name: %s", pod_name)
         self._create_pod(image=self.image,
                          pod_name=pod_name,
-                         job_name=job_name,
+                         job_id=job_id,
                          cmd_string=formatted_cmd,
                          volumes=self.persistent_volumes,
                          service_account_name=self.service_account_name,
                          annotations=self.annotations)
-        self.resources[pod_name] = {'status': JobStatus(JobState.RUNNING)}
+        self.resources[job_id] = {'status': JobStatus(JobState.RUNNING), 'pod_name': pod_name}
 
-        return pod_name
+        return job_id
 
     def status(self, job_ids):
         """ Get the status of a list of jobs identified by the job identifiers
@@ -212,6 +214,9 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
         self._status()
         return [self.resources[jid]['status'] for jid in job_ids]
 
+    def _get_pod_name(self, job_id: str) -> str:
+        return self.resources[job_id]['pod_name']
+
     def cancel(self, job_ids):
         """ Cancels the jobs specified by a list of job ids
         Args:
@@ -221,7 +226,8 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
         """
         for job in job_ids:
             logger.debug("Terminating job/pod: {0}".format(job))
-            self._delete_pod(job)
+            pod_name = self._get_pod_name(job)
+            self._delete_pod(pod_name)
 
             self.resources[job]['status'] = JobStatus(JobState.CANCELLED)
         rets = [True for i in job_ids]
@@ -242,7 +248,8 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
         for jid in to_poll_job_ids:
             phase = None
             try:
-                pod = self.kube_client.read_namespaced_pod(name=jid, namespace=self.namespace)
+                pod_name = self._get_pod_name(jid)
+                pod = self.kube_client.read_namespaced_pod(name=pod_name, namespace=self.namespace)
             except Exception:
                 logger.exception("Failed to poll pod {} status, most likely because pod was terminated".format(jid))
                 if self.resources[jid]['status'] is JobStatus(JobState.RUNNING):
@@ -257,10 +264,10 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
             self.resources[jid]['status'] = JobStatus(status)
 
     def _create_pod(self,
-                    image,
-                    pod_name,
-                    job_name,
-                    port=80,
+                    image: str,
+                    pod_name: str,
+                    job_id: str,
+                    port: int = 80,
                     cmd_string=None,
                     volumes=[],
                     service_account_name=None,
@@ -269,7 +276,7 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
         Args:
             - image (string) : Docker image to launch
            - pod_name (string) : Name of the pod
-            - job_name (string) : App label
+            - job_id (string) : Job ID
        KWargs:
            - port (integer) : Container port
        Returns:
@@ -299,7 +306,7 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
         )
         # Configure Pod template container
         container = client.V1Container(
-            name=pod_name,
+            name=job_id,
             image=image,
             resources=resources,
             ports=[client.V1ContainerPort(container_port=port)],
@@ -322,7 +329,7 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
                                                          claim_name=volume[0])))
 
         metadata = client.V1ObjectMeta(name=pod_name,
-                                       labels={"app": job_name},
+                                       labels={"parsl-job-id": job_id},
                                        annotations=annotations)
         spec = client.V1PodSpec(containers=[container],
                                 image_pull_secrets=[secret],
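With pods now labeled by parsl-job-id rather than an app label, a pod can be located from its job ID with a standard label selector; a sketch using the kubernetes Python client (namespace and job ID are hypothetical):

    from kubernetes import client, config

    config.load_kube_config()
    v1 = client.CoreV1Api()
    pods = v1.list_namespaced_pod(
        namespace="default",
        label_selector="parsl-job-id=1a2b3c4d",
    )
    for pod in pods.items:
        print(pod.metadata.name, pod.status.phase)
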
parsl/tests/{integration/test_channels → test_channels}/test_local_channel.py RENAMED
@@ -1,6 +1,9 @@
+import pytest
+
 from parsl.channels.local.local import LocalChannel
 
 
+@pytest.mark.local
 def test_env():
     ''' Regression testing for issue #27
     '''
@@ -15,9 +18,8 @@ def test_env():
     x = [s for s in stdout if s.startswith("HOME=")]
     assert x, "HOME not found"
 
-    print("RC:{} \nSTDOUT:{} \nSTDERR:{}".format(rc, stdout, stderr))
-
 
+@pytest.mark.local
 def test_env_mod():
     ''' Testing for env update at execute time.
     '''
@@ -34,9 +36,3 @@ def test_env_mod():
 
     x = [s for s in stdout if s.startswith("TEST_ENV=fooo")]
     assert x, "User set env missing"
-
-
-if __name__ == "__main__":
-
-    test_env()
-    test_env_mod()
parsl/tests/test_htex/test_block_manager_selector_unit.py ADDED
@@ -0,0 +1,20 @@
+import pytest
+
+from parsl.executors.high_throughput.manager_record import ManagerRecord
+from parsl.executors.high_throughput.manager_selector import BlockIdManagerSelector
+
+
+@pytest.mark.local
+def test_sort_managers():
+    ready_managers = {
+        b'manager1': {'block_id': 1},
+        b'manager2': {'block_id': None},
+        b'manager3': {'block_id': 3},
+        b'manager4': {'block_id': 2}
+    }
+
+    manager_list = {b'manager1', b'manager2', b'manager3', b'manager4'}
+    expected_sorted_list = [b'manager2', b'manager1', b'manager4', b'manager3']
+    manager_selector = BlockIdManagerSelector()
+    sorted_managers = manager_selector.sort_managers(ready_managers, manager_list)
+    assert sorted_managers == expected_sorted_list