parsl 2024.6.3__py3-none-any.whl → 2024.6.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. parsl/app/app.py +0 -2
  2. parsl/config.py +27 -4
  3. parsl/dataflow/dflow.py +36 -10
  4. parsl/executors/high_throughput/executor.py +36 -30
  5. parsl/executors/high_throughput/interchange.py +26 -28
  6. parsl/providers/kubernetes/kube.py +22 -9
  7. parsl/providers/slurm/slurm.py +31 -22
  8. parsl/tests/configs/flux_local.py +11 -0
  9. parsl/tests/conftest.py +4 -0
  10. parsl/tests/test_bash_apps/test_stdout.py +20 -2
  11. parsl/tests/test_htex/test_htex.py +24 -7
  12. parsl/tests/test_htex/test_zmq_binding.py +22 -6
  13. parsl/tests/test_python_apps/test_context_manager.py +96 -1
  14. parsl/tests/test_python_apps/test_dependencies_deep.py +59 -0
  15. parsl/tests/test_radical/test_mpi_funcs.py +0 -1
  16. parsl/tests/unit/test_usage_tracking.py +45 -0
  17. parsl/usage_tracking/levels.py +6 -0
  18. parsl/usage_tracking/usage.py +54 -23
  19. parsl/version.py +1 -1
  20. parsl-2024.6.17.data/scripts/interchange.py +681 -0
  21. {parsl-2024.6.3.dist-info → parsl-2024.6.17.dist-info}/METADATA +2 -2
  22. {parsl-2024.6.3.dist-info → parsl-2024.6.17.dist-info}/RECORD +29 -24
  23. {parsl-2024.6.3.data → parsl-2024.6.17.data}/scripts/exec_parsl_function.py +0 -0
  24. {parsl-2024.6.3.data → parsl-2024.6.17.data}/scripts/parsl_coprocess.py +0 -0
  25. {parsl-2024.6.3.data → parsl-2024.6.17.data}/scripts/process_worker_pool.py +0 -0
  26. {parsl-2024.6.3.dist-info → parsl-2024.6.17.dist-info}/LICENSE +0 -0
  27. {parsl-2024.6.3.dist-info → parsl-2024.6.17.dist-info}/WHEEL +0 -0
  28. {parsl-2024.6.3.dist-info → parsl-2024.6.17.dist-info}/entry_points.txt +0 -0
  29. {parsl-2024.6.3.dist-info → parsl-2024.6.17.dist-info}/top_level.txt +0 -0
parsl/app/app.py CHANGED
@@ -66,8 +66,6 @@ class AppBase(metaclass=ABCMeta):
             self.kwargs['walltime'] = params['walltime'].default
         if 'parsl_resource_specification' in params:
             self.kwargs['parsl_resource_specification'] = params['parsl_resource_specification'].default
-        self.outputs = params['outputs'].default if 'outputs' in params else []
-        self.inputs = params['inputs'].default if 'inputs' in params else []
 
     @abstractmethod
     def __call__(self, *args: Any, **kwargs: Any) -> AppFuture:
parsl/config.py CHANGED
@@ -11,6 +11,8 @@ from parsl.executors.base import ParslExecutor
 from parsl.executors.threads import ThreadPoolExecutor
 from parsl.monitoring import MonitoringHub
 from parsl.usage_tracking.api import UsageInformation
+from parsl.usage_tracking.levels import DISABLED as USAGE_TRACKING_DISABLED
+from parsl.usage_tracking.levels import LEVEL_3 as USAGE_TRACKING_LEVEL_3
 from parsl.utils import RepresentationMixin
 
 logger = logging.getLogger(__name__)
@@ -38,6 +40,15 @@ class Config(RepresentationMixin, UsageInformation):
        ``checkpoint_mode='periodic'``.
    dependency_resolver: plugin point for custom dependency resolvers. Default: only resolve Futures,
        using the `SHALLOW_DEPENDENCY_RESOLVER`.
+    exit_mode: str, optional
+        When Parsl is used as a context manager (using ``with parsl.load`` syntax) then this parameter
+        controls what will happen to running tasks and exceptions at exit. The options are:
+
+        * ``cleanup``: cleanup the DFK on exit without waiting for any tasks
+        * ``skip``: skip all shutdown behaviour when exiting the context manager
+        * ``wait``: wait for all tasks to complete when exiting normally, but exit immediately when exiting due to an exception.
+
+        Default is ``cleanup``.
    garbage_collect : bool. optional.
        Delete task records from DFK when tasks have completed. Default: True
    internal_tasks_max_threads : int, optional
@@ -66,9 +77,12 @@ class Config(RepresentationMixin, UsageInformation):
        How often the scaling strategy should be executed. Default is 5 seconds.
    max_idletime : float, optional
        The maximum idle time allowed for an executor before strategy could shut down unused blocks. Default is 120.0 seconds.
-    usage_tracking : bool, optional
-        Set this field to True to opt-in to Parsl's usage tracking system. Parsl only collects minimal, non personally-identifiable,
-        information used for reporting to our funding agencies. Default is False.
+    usage_tracking : int, optional
+        Set this field to 1, 2, or 3 to opt-in to Parsl's usage tracking system.
+        The value represents the level of usage tracking detail to be collected.
+        Setting this field to 0 will disable usage tracking. Default (this field is not set): usage tracking is not enabled.
+        Parsl only collects minimal, non personally-identifiable,
+        information used for reporting to our funding agencies.
    initialize_logging : bool, optional
        Make DFK optionally not initialize any logging. Log messages
        will still be passed into the python logging system under the
@@ -92,6 +106,7 @@ class Config(RepresentationMixin, UsageInformation):
                                        Literal['manual']] = None,
                 checkpoint_period: Optional[str] = None,
                 dependency_resolver: Optional[DependencyResolver] = None,
+                 exit_mode: Literal['cleanup', 'skip', 'wait'] = 'cleanup',
                 garbage_collect: bool = True,
                 internal_tasks_max_threads: int = 10,
                 retries: int = 0,
@@ -102,7 +117,7 @@ class Config(RepresentationMixin, UsageInformation):
                 strategy_period: Union[float, int] = 5,
                 max_idletime: float = 120.0,
                 monitoring: Optional[MonitoringHub] = None,
-                 usage_tracking: bool = False,
+                 usage_tracking: int = 0,
                 initialize_logging: bool = True) -> None:
 
        executors = tuple(executors or [])
@@ -128,6 +143,7 @@ class Config(RepresentationMixin, UsageInformation):
            checkpoint_period = "00:30:00"
        self.checkpoint_period = checkpoint_period
        self.dependency_resolver = dependency_resolver
+        self.exit_mode = exit_mode
        self.garbage_collect = garbage_collect
        self.internal_tasks_max_threads = internal_tasks_max_threads
        self.retries = retries
@@ -136,6 +152,7 @@ class Config(RepresentationMixin, UsageInformation):
        self.strategy = strategy
        self.strategy_period = strategy_period
        self.max_idletime = max_idletime
+        self.validate_usage_tracking(usage_tracking)
        self.usage_tracking = usage_tracking
        self.initialize_logging = initialize_logging
        self.monitoring = monitoring
@@ -156,6 +173,12 @@ class Config(RepresentationMixin, UsageInformation):
            raise ConfigurationError('Executors must have unique labels ({})'.format(
                ', '.join(['label={}'.format(repr(d)) for d in duplicates])))
 
+    def validate_usage_tracking(self, level: int) -> None:
+        if not USAGE_TRACKING_DISABLED <= level <= USAGE_TRACKING_LEVEL_3:
+            raise ConfigurationError(
+                f"Usage Tracking values must be 0, 1, 2, or 3 and not {level}"
+            )
+
    def get_usage_information(self):
        return {"executors_len": len(self.executors),
                "dependency_resolver": self.dependency_resolver is not None}
parsl/dataflow/dflow.py CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import atexit
+import concurrent.futures as cf
 import datetime
 import inspect
 import logging
@@ -209,6 +210,8 @@ class DataFlowKernel:
        self.tasks: Dict[int, TaskRecord] = {}
        self.submitter_lock = threading.Lock()
 
+        self.dependency_launch_pool = cf.ThreadPoolExecutor(max_workers=1, thread_name_prefix="Dependency-Launch")
+
        self.dependency_resolver = self.config.dependency_resolver if self.config.dependency_resolver is not None \
            else SHALLOW_DEPENDENCY_RESOLVER
 
@@ -217,9 +220,24 @@
    def __enter__(self):
        return self
 
-    def __exit__(self, exc_type, exc_value, traceback):
-        logger.debug("Exiting the context manager, calling cleanup for DFK")
-        self.cleanup()
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        mode = self.config.exit_mode
+        logger.debug("Exiting context manager, with exit mode '%s'", mode)
+        if mode == "cleanup":
+            logger.info("Calling cleanup for DFK")
+            self.cleanup()
+        elif mode == "skip":
+            logger.info("Skipping all cleanup handling")
+        elif mode == "wait":
+            if exc_type is None:
+                logger.info("Waiting for all tasks to complete")
+                self.wait_for_current_tasks()
+                self.cleanup()
+            else:
+                logger.info("There was an exception - cleaning up without waiting for task completion")
+                self.cleanup()
+        else:
+            raise InternalConsistencyError(f"Exit case for {mode} should be unreachable, validated by typeguard on Config()")
 
    def _send_task_log_info(self, task_record: TaskRecord) -> None:
        if self.monitoring:
@@ -611,9 +629,9 @@ class DataFlowKernel:
        return kwargs.get('_parsl_staging_inhibit', False)
 
    def launch_if_ready(self, task_record: TaskRecord) -> None:
-        """
-        launch_if_ready will launch the specified task, if it is ready
-        to run (for example, without dependencies, and in pending state).
+        """Schedules a task record for re-inspection to see if it is ready
+        for launch and for launch if it is ready. The call will return
+        immediately.
 
        This should be called by any piece of the DataFlowKernel that
        thinks a task may have become ready to run.
@@ -622,13 +640,17 @@
        ready to run - launch_if_ready will not incorrectly launch that
        task.
 
-        It is also not an error to call launch_if_ready on a task that has
-        already been launched - launch_if_ready will not re-launch that
-        task.
-
        launch_if_ready is thread safe, so may be called from any thread
        or callback.
        """
+        self.dependency_launch_pool.submit(self._launch_if_ready_async, task_record)
+
+    @wrap_with_logs
+    def _launch_if_ready_async(self, task_record: TaskRecord) -> None:
+        """
+        _launch_if_ready will launch the specified task, if it is ready
+        to run (for example, without dependencies, and in pending state).
+        """
        exec_fu = None
 
        task_id = task_record['id']
@@ -1271,6 +1293,10 @@
            self.monitoring.close()
            logger.info("Terminated monitoring")
 
+        logger.info("Terminating dependency launch pool")
+        self.dependency_launch_pool.shutdown()
+        logger.info("Terminated dependency launch pool")
+
        logger.info("Unregistering atexit hook")
        atexit.unregister(self.atexit_cleanup)
        logger.info("Unregistered atexit hook")
parsl/executors/high_throughput/executor.py CHANGED
@@ -1,13 +1,13 @@
 import logging
 import math
 import pickle
+import subprocess
 import threading
 import typing
 import warnings
 from collections import defaultdict
 from concurrent.futures import Future
 from dataclasses import dataclass
-from multiprocessing import Process
 from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union
 
 import typeguard
@@ -18,7 +18,7 @@ from parsl.addresses import get_all_addresses
 from parsl.app.errors import RemoteExceptionWrapper
 from parsl.data_provider.staging import Staging
 from parsl.executors.errors import BadMessage, ScalingFailed
-from parsl.executors.high_throughput import interchange, zmq_pipes
+from parsl.executors.high_throughput import zmq_pipes
 from parsl.executors.high_throughput.errors import CommandClientTimeoutError
 from parsl.executors.high_throughput.mpi_prefix_composer import (
     VALID_LAUNCHERS,
@@ -26,7 +26,6 @@ from parsl.executors.high_throughput.mpi_prefix_composer import (
 )
 from parsl.executors.status_handling import BlockProviderExecutor
 from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
-from parsl.multiprocessing import ForkProcess
 from parsl.process_loggers import wrap_with_logs
 from parsl.providers import LocalProvider
 from parsl.providers.base import ExecutionProvider
@@ -305,7 +304,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
        self._task_counter = 0
        self.worker_ports = worker_ports
        self.worker_port_range = worker_port_range
-        self.interchange_proc: Optional[Process] = None
+        self.interchange_proc: Optional[subprocess.Popen] = None
        self.interchange_port_range = interchange_port_range
        self.heartbeat_threshold = heartbeat_threshold
        self.heartbeat_period = heartbeat_period
@@ -520,37 +519,45 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
 
        logger.info("Queue management worker finished")
 
-    def _start_local_interchange_process(self):
+    def _start_local_interchange_process(self) -> None:
        """ Starts the interchange process locally
 
-        Starts the interchange process locally and uses an internal command queue to
+        Starts the interchange process locally and uses the command queue to
        get the worker task and result ports that the interchange has bound to.
        """
-        self.interchange_proc = ForkProcess(target=interchange.starter,
-                                            kwargs={"client_ports": (self.outgoing_q.port,
-                                                                     self.incoming_q.port,
-                                                                     self.command_client.port),
-                                                    "interchange_address": self.address,
-                                                    "worker_ports": self.worker_ports,
-                                                    "worker_port_range": self.worker_port_range,
-                                                    "hub_address": self.hub_address,
-                                                    "hub_zmq_port": self.hub_zmq_port,
-                                                    "logdir": self.logdir,
-                                                    "heartbeat_threshold": self.heartbeat_threshold,
-                                                    "poll_period": self.poll_period,
-                                                    "logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
-                                                    "cert_dir": self.cert_dir,
-                                                    },
-                                            daemon=True,
-                                            name="HTEX-Interchange"
-                                            )
-        self.interchange_proc.start()
 
+        interchange_config = {"client_address": "127.0.0.1",
+                              "client_ports": (self.outgoing_q.port,
+                                               self.incoming_q.port,
+                                               self.command_client.port),
+                              "interchange_address": self.address,
+                              "worker_ports": self.worker_ports,
+                              "worker_port_range": self.worker_port_range,
+                              "hub_address": self.hub_address,
+                              "hub_zmq_port": self.hub_zmq_port,
+                              "logdir": self.logdir,
+                              "heartbeat_threshold": self.heartbeat_threshold,
+                              "poll_period": self.poll_period,
+                              "logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
+                              "cert_dir": self.cert_dir,
+                              }
+
+        config_pickle = pickle.dumps(interchange_config)
+
+        self.interchange_proc = subprocess.Popen(b"interchange.py", stdin=subprocess.PIPE)
+        stdin = self.interchange_proc.stdin
+        assert stdin is not None, "Popen should have created an IO object (vs default None) because of PIPE mode"
+
+        logger.debug("Popened interchange process. Writing config object")
+        stdin.write(config_pickle)
+        stdin.flush()
+        logger.debug("Sent config object. Requesting worker ports")
        try:
            (self.worker_task_port, self.worker_result_port) = self.command_client.run("WORKER_PORTS", timeout_s=120)
        except CommandClientTimeoutError:
-            logger.error("Interchange has not completed initialization in 120s. Aborting")
+            logger.error("Interchange has not completed initialization. Aborting")
            raise Exception("Interchange failed to start")
+        logger.debug("Got worker ports")
 
    def _start_queue_management_thread(self):
        """Method to start the management thread as a daemon.
@@ -809,13 +816,12 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
        logger.info("Attempting HighThroughputExecutor shutdown")
 
        self.interchange_proc.terminate()
-        self.interchange_proc.join(timeout=timeout)
-        if self.interchange_proc.is_alive():
+        try:
+            self.interchange_proc.wait(timeout=timeout)
+        except subprocess.TimeoutExpired:
            logger.info("Unable to terminate Interchange process; sending SIGKILL")
            self.interchange_proc.kill()
 
-        self.interchange_proc.close()
-
        logger.info("Finished HighThroughputExecutor shutdown attempt")
 
    def get_usage_information(self):
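
With this change the interchange runs as a separately launched interchange.py script (shipped in the wheel's scripts directory, see file 20 in the list above) and receives its configuration as a pickled dict over stdin. A stand-alone sketch of the parent side of that handshake, with python -c standing in for the real child script:

import pickle
import subprocess
import sys

# Serialise a small config dict and hand it to a child process over a stdin pipe.
config = {"logdir": ".", "poll_period": 10}

child = subprocess.Popen(
    [sys.executable, "-c",
     "import pickle, sys; print(pickle.load(sys.stdin.buffer))"],
    stdin=subprocess.PIPE,
)
assert child.stdin is not None  # PIPE mode guarantees a real file object here
child.stdin.write(pickle.dumps(config))
child.stdin.flush()
child.stdin.close()
child.wait(timeout=10)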
parsl/executors/high_throughput/interchange.py CHANGED
@@ -65,18 +65,19 @@ class Interchange:
    3. Detect workers that have failed using heartbeats
    """
    def __init__(self,
-                 client_address: str = "127.0.0.1",
-                 interchange_address: Optional[str] = None,
-                 client_ports: Tuple[int, int, int] = (50055, 50056, 50057),
-                 worker_ports: Optional[Tuple[int, int]] = None,
-                 worker_port_range: Tuple[int, int] = (54000, 55000),
-                 hub_address: Optional[str] = None,
-                 hub_zmq_port: Optional[int] = None,
-                 heartbeat_threshold: int = 60,
-                 logdir: str = ".",
-                 logging_level: int = logging.INFO,
-                 poll_period: int = 10,
-                 cert_dir: Optional[str] = None,
+                 *,
+                 client_address: str,
+                 interchange_address: Optional[str],
+                 client_ports: Tuple[int, int, int],
+                 worker_ports: Optional[Tuple[int, int]],
+                 worker_port_range: Tuple[int, int],
+                 hub_address: Optional[str],
+                 hub_zmq_port: Optional[int],
+                 heartbeat_threshold: int,
+                 logdir: str,
+                 logging_level: int,
+                 poll_period: int,
+                 cert_dir: Optional[str],
                 ) -> None:
        """
        Parameters
@@ -92,34 +93,34 @@
            The ports at which the client can be reached
 
        worker_ports : tuple(int, int)
-            The specific two ports at which workers will connect to the Interchange. Default: None
+            The specific two ports at which workers will connect to the Interchange.
 
        worker_port_range : tuple(int, int)
            The interchange picks ports at random from the range which will be used by workers.
-            This is overridden when the worker_ports option is set. Default: (54000, 55000)
+            This is overridden when the worker_ports option is set.
 
        hub_address : str
            The IP address at which the interchange can send info about managers to when monitoring is enabled.
-            Default: None (meaning monitoring disabled)
+            When None, monitoring is disabled.
 
        hub_zmq_port : str
            The port at which the interchange can send info about managers to when monitoring is enabled.
-            Default: None (meaning monitoring disabled)
+            When None, monitoring is disabled.
 
        heartbeat_threshold : int
            Number of seconds since the last heartbeat after which worker is considered lost.
 
        logdir : str
-            Parsl log directory paths. Logs and temp files go here. Default: '.'
+            Parsl log directory paths. Logs and temp files go here.
 
        logging_level : int
-            Logging level as defined in the logging module. Default: logging.INFO
+            Logging level as defined in the logging module.
 
        poll_period : int
-            The main thread polling period, in milliseconds. Default: 10ms
+            The main thread polling period, in milliseconds.
 
        cert_dir : str | None
-            Path to the certificate directory. Default: None
+            Path to the certificate directory.
        """
        self.cert_dir = cert_dir
        self.logdir = logdir
@@ -671,13 +672,10 @@ def start_file_logger(filename: str, level: int = logging.DEBUG, format_string:
    logger.addHandler(handler)
 
 
-@wrap_with_logs(target="interchange")
-def starter(*args: Any, **kwargs: Any) -> None:
-    """Start the interchange process
-
-    The executor is expected to call this function. The args, kwargs match that of the Interchange.__init__
-    """
+if __name__ == "__main__":
    setproctitle("parsl: HTEX interchange")
-    # logger = multiprocessing.get_logger()
-    ic = Interchange(*args, **kwargs)
+
+    config = pickle.load(sys.stdin.buffer)
+
+    ic = Interchange(**config)
    ic.start()
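
The child-side counterpart of that handshake is the __main__ block above: read a pickled keyword-argument dict from stdin and construct the object from it. A hypothetical stand-alone version of the same idea (not the real interchange entry point):

# child_side.py (hypothetical): build an object from keyword arguments pickled to stdin.
import pickle
import sys


class Worker:
    def __init__(self, *, logdir: str, poll_period: int) -> None:
        self.logdir = logdir
        self.poll_period = poll_period

    def start(self) -> None:
        print(f"running with logdir={self.logdir} poll_period={self.poll_period}")


if __name__ == "__main__":
    config = pickle.load(sys.stdin.buffer)  # dict written by the parent process
    Worker(**config).start()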
parsl/providers/kubernetes/kube.py CHANGED
@@ -83,6 +83,10 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
    persistent_volumes: list[(str, str)]
        List of tuples describing persistent volumes to be mounted in the pod.
        The tuples consist of (PVC Name, Mount Directory).
+    service_account_name: str
+        Name of the service account to run the pod as.
+    annotations: Dict[str, str]
+        Annotations to set on the pod.
    """
    @typeguard.typechecked
    def __init__(self,
@@ -103,7 +107,9 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
                 group_id: Optional[str] = None,
                 run_as_non_root: bool = False,
                 secret: Optional[str] = None,
-                 persistent_volumes: List[Tuple[str, str]] = []) -> None:
+                 persistent_volumes: List[Tuple[str, str]] = [],
+                 service_account_name: Optional[str] = None,
+                 annotations: Optional[Dict[str, str]] = None) -> None:
        if not _kubernetes_enabled:
            raise OptionalModuleMissing(['kubernetes'],
                                        "Kubernetes provider requires kubernetes module and config.")
@@ -146,6 +152,8 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
        self.group_id = group_id
        self.run_as_non_root = run_as_non_root
        self.persistent_volumes = persistent_volumes
+        self.service_account_name = service_account_name
+        self.annotations = annotations
 
        self.kube_client = client.CoreV1Api()
 
@@ -184,7 +192,9 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
                                    pod_name=pod_name,
                                    job_name=job_name,
                                    cmd_string=formatted_cmd,
-                                    volumes=self.persistent_volumes)
+                                    volumes=self.persistent_volumes,
+                                    service_account_name=self.service_account_name,
+                                    annotations=self.annotations)
        self.resources[pod_name] = {'status': JobStatus(JobState.RUNNING)}
 
        return pod_name
@@ -233,13 +243,13 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
        for jid in to_poll_job_ids:
            phase = None
            try:
-                pod_status = self.kube_client.read_namespaced_pod_status(name=jid, namespace=self.namespace)
+                pod = self.kube_client.read_namespaced_pod(name=jid, namespace=self.namespace)
            except Exception:
                logger.exception("Failed to poll pod {} status, most likely because pod was terminated".format(jid))
                if self.resources[jid]['status'] is JobStatus(JobState.RUNNING):
                    phase = 'Unknown'
            else:
-                phase = pod_status.status.phase
+                phase = pod.status.phase
            if phase:
                status = translate_table.get(phase, JobState.UNKNOWN)
                logger.debug("Updating pod {} with status {} to parsl status {}".format(jid,
@@ -253,7 +263,9 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
                    job_name,
                    port=80,
                    cmd_string=None,
-                    volumes=[]):
+                    volumes=[],
+                    service_account_name=None,
+                    annotations=None):
        """ Create a kubernetes pod for the job.
        Args:
            - image (string) : Docker image to launch
@@ -274,7 +286,7 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
        # Create the environment variables and command to initiate IPP
        environment_vars = client.V1EnvVar(name="TEST", value="SOME DATA")
 
-        launch_args = ["-c", "{0};".format(cmd_string)]
+        launch_args = ["-c", "{0}".format(cmd_string)]
 
        volume_mounts = []
        # Create mount paths for the volumes
@@ -311,11 +323,12 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
                                                       claim_name=volume[0])))
 
        metadata = client.V1ObjectMeta(name=pod_name,
-                                       labels={"app": job_name})
+                                       labels={"app": job_name},
+                                       annotations=annotations)
        spec = client.V1PodSpec(containers=[container],
                                image_pull_secrets=[secret],
-                                volumes=volume_defs
-                                )
+                                volumes=volume_defs,
+                                service_account_name=service_account_name)
 
        pod = client.V1Pod(spec=spec, metadata=metadata)
        api_response = self.kube_client.create_namespaced_pod(namespace=self.namespace,
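
The two new provider options can be passed straight through from user configuration. A minimal sketch (image, namespace, service account, and annotation values are placeholders):

from parsl.providers import KubernetesProvider

provider = KubernetesProvider(
    image="python:3.11",                         # container image for the worker pods
    namespace="parsl-jobs",
    service_account_name="parsl-runner",         # new in this release
    annotations={"example.com/owner": "parsl"},  # new in this release
)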
parsl/providers/slurm/slurm.py CHANGED
@@ -19,25 +19,29 @@ from parsl.utils import RepresentationMixin, wtime_to_minutes
 
 logger = logging.getLogger(__name__)
 
+# From https://slurm.schedmd.com/sacct.html#SECTION_JOB-STATE-CODES
 translate_table = {
-    'PD': JobState.PENDING,
-    'R': JobState.RUNNING,
-    'CA': JobState.CANCELLED,
-    'CF': JobState.PENDING,  # (configuring),
-    'CG': JobState.RUNNING,  # (completing),
-    'CD': JobState.COMPLETED,
-    'F': JobState.FAILED,  # (failed),
-    'TO': JobState.TIMEOUT,  # (timeout),
-    'NF': JobState.FAILED,  # (node failure),
-    'RV': JobState.FAILED,  # (revoked) and
-    'SE': JobState.FAILED  # (special exit state)
+    'PENDING': JobState.PENDING,
+    'RUNNING': JobState.RUNNING,
+    'CANCELLED': JobState.CANCELLED,
+    'COMPLETED': JobState.COMPLETED,
+    'FAILED': JobState.FAILED,
+    'NODE_FAIL': JobState.FAILED,
+    'BOOT_FAIL': JobState.FAILED,
+    'DEADLINE': JobState.TIMEOUT,
+    'TIMEOUT': JobState.TIMEOUT,
+    'REVOKED': JobState.FAILED,
+    'OUT_OF_MEMORY': JobState.FAILED,
+    'SUSPENDED': JobState.HELD,
+    'PREEMPTED': JobState.TIMEOUT,
+    'REQUEUED': JobState.PENDING
 }
 
 
 class SlurmProvider(ClusterProvider, RepresentationMixin):
    """Slurm Execution Provider
 
-    This provider uses sbatch to submit, squeue for status and scancel to cancel
+    This provider uses sbatch to submit, sacct for status and scancel to cancel
    jobs. The sbatch script to be used is created from a template file in this
    same module.
 
@@ -168,14 +172,16 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
            logger.debug('No active jobs, skipping status update')
            return
 
-        cmd = "squeue --noheader --format='%i %t' --job '{0}'".format(job_id_list)
+        # Using state%20 to get enough characters to not truncate output
+        # of the state. Without output can look like "<job_id> CANCELLED+"
+        cmd = "sacct -X --noheader --format=jobid,state%20 --job '{0}'".format(job_id_list)
        logger.debug("Executing %s", cmd)
        retcode, stdout, stderr = self.execute_wait(cmd)
-        logger.debug("squeue returned %s %s", stdout, stderr)
+        logger.debug("sacct returned %s %s", stdout, stderr)
 
        # Execute_wait failed. Do no update
        if retcode != 0:
-            logger.warning("squeue failed with non-zero exit code {}".format(retcode))
+            logger.warning("sacct failed with non-zero exit code {}".format(retcode))
            return
 
        jobs_missing = set(self.resources.keys())
@@ -183,7 +189,10 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
            if not line:
                # Blank line
                continue
-            job_id, slurm_state = line.split()
+            # Sacct includes extra information in some outputs
+            # For example "<job_id> CANCELLED by <user_id>"
+            # This splits and ignores anything past the first two unpacked values
+            job_id, slurm_state, *ignore = line.split()
            if slurm_state not in translate_table:
                logger.warning(f"Slurm status {slurm_state} is not recognized")
            status = translate_table.get(slurm_state, JobState.UNKNOWN)
@@ -193,13 +202,13 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
                                                             stderr_path=self.resources[job_id]['job_stderr_path'])
            jobs_missing.remove(job_id)
 
-        # squeue does not report on jobs that are not running. So we are filling in the
-        # blanks for missing jobs, we might lose some information about why the jobs failed.
+        # sacct can get job info after jobs have completed so this path shouldn't be hit
+        # log a warning if there are missing jobs for some reason
        for missing_job in jobs_missing:
-            logger.debug("Updating missing job {} to completed status".format(missing_job))
-            self.resources[missing_job]['status'] = JobStatus(JobState.COMPLETED,
-                                                              stdout_path=self.resources[missing_job]['job_stdout_path'],
-                                                              stderr_path=self.resources[missing_job]['job_stderr_path'])
+            logger.warning("Updating missing job {} to completed status".format(missing_job))
+            self.resources[missing_job]['status'] = JobStatus(
+                JobState.COMPLETED, stdout_path=self.resources[missing_job]['job_stdout_path'],
+                stderr_path=self.resources[missing_job]['job_stderr_path'])
 
    def submit(self, command: str, tasks_per_node: int, job_name="parsl.slurm") -> str:
        """Submit the command as a slurm job.
parsl/tests/configs/flux_local.py ADDED
@@ -0,0 +1,11 @@
+from parsl.config import Config
+from parsl.executors import FluxExecutor
+
+
+def fresh_config():
+    return Config(
+        executors=[FluxExecutor()],
+    )
+
+
+config = fresh_config()
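
This new test configuration follows the usual pattern for modules under parsl/tests/configs: a fresh_config() factory plus a module-level config. Assuming a working local Flux installation, it can be loaded like any other config; a sketch:

import parsl
from parsl.tests.configs.flux_local import fresh_config

with parsl.load(fresh_config()):
    pass  # FluxExecutor start-up will fail here if Flux is not installed locally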
parsl/tests/conftest.py CHANGED
@@ -151,6 +151,10 @@ def pytest_configure(config):
        'markers',
        'multiple_cores_required: Marks tests that require multiple cores, such as htex affinity'
    )
+    config.addinivalue_line(
+        'markers',
+        'unix_filesystem_permissions_required: Marks tests that require unix-level filesystem permission enforcement'
+    )
    config.addinivalue_line(
        'markers',
        'issue3328: Marks tests broken by issue #3328'
parsl/tests/test_bash_apps/test_stdout.py CHANGED
@@ -16,7 +16,6 @@ def echo_to_streams(msg, stderr=None, stdout=None):
 whitelist = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'configs', '*threads*')
 
 speclist = (
-    '/bad/dir/t.out',
    ['t3.out', 'w'],
    ('t4.out', None),
    (42, 'w'),
@@ -26,7 +25,6 @@ speclist = (
 )
 
 testids = [
-    'nonexistent_dir',
    'list_not_tuple',
    'null_mode',
    'not_a_string',
@@ -55,6 +53,26 @@ def test_bad_stdout_specs(spec):
 
 
 @pytest.mark.issue3328
+@pytest.mark.unix_filesystem_permissions_required
+def test_bad_stdout_file():
+    """Testing bad stderr file"""
+
+    o = "/bad/dir/t2.out"
+
+    fn = echo_to_streams("Hello world", stdout=o, stderr='t.err')
+
+    try:
+        fn.result()
+    except perror.BadStdStreamFile:
+        pass
+    else:
+        assert False, "Did not raise expected exception BadStdStreamFile"
+
+    return
+
+
+@pytest.mark.issue3328
+@pytest.mark.unix_filesystem_permissions_required
 def test_bad_stderr_file():
    """Testing bad stderr file"""