parsl 2024.2.12__py3-none-any.whl → 2024.2.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. parsl/channels/errors.py +1 -4
  2. parsl/configs/{comet.py → expanse.py} +5 -5
  3. parsl/dataflow/dflow.py +12 -12
  4. parsl/executors/flux/executor.py +5 -3
  5. parsl/executors/high_throughput/executor.py +56 -10
  6. parsl/executors/high_throughput/mpi_prefix_composer.py +137 -0
  7. parsl/executors/high_throughput/mpi_resource_management.py +217 -0
  8. parsl/executors/high_throughput/process_worker_pool.py +65 -9
  9. parsl/executors/radical/executor.py +6 -3
  10. parsl/executors/radical/rpex_worker.py +2 -2
  11. parsl/jobs/states.py +5 -5
  12. parsl/monitoring/db_manager.py +2 -1
  13. parsl/monitoring/monitoring.py +7 -4
  14. parsl/multiprocessing.py +3 -4
  15. parsl/providers/cobalt/cobalt.py +6 -0
  16. parsl/providers/pbspro/pbspro.py +18 -4
  17. parsl/providers/pbspro/template.py +2 -2
  18. parsl/providers/slurm/slurm.py +17 -4
  19. parsl/providers/slurm/template.py +2 -2
  20. parsl/serialize/__init__.py +7 -2
  21. parsl/serialize/facade.py +32 -1
  22. parsl/tests/test_error_handling/test_resource_spec.py +6 -0
  23. parsl/tests/test_htex/test_htex.py +66 -3
  24. parsl/tests/test_monitoring/test_incomplete_futures.py +65 -0
  25. parsl/tests/test_mpi_apps/__init__.py +0 -0
  26. parsl/tests/test_mpi_apps/test_bad_mpi_config.py +41 -0
  27. parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +51 -0
  28. parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +171 -0
  29. parsl/tests/test_mpi_apps/test_mpi_prefix.py +71 -0
  30. parsl/tests/test_mpi_apps/test_mpi_scheduler.py +158 -0
  31. parsl/tests/test_mpi_apps/test_resource_spec.py +145 -0
  32. parsl/tests/test_providers/test_cobalt_deprecation_warning.py +16 -0
  33. parsl/tests/test_providers/test_pbspro_template.py +28 -0
  34. parsl/tests/test_providers/test_slurm_template.py +29 -0
  35. parsl/tests/test_radical/test_mpi_funcs.py +1 -0
  36. parsl/tests/test_scaling/test_scale_down.py +6 -5
  37. parsl/tests/test_serialization/test_htex_code_cache.py +57 -0
  38. parsl/tests/test_serialization/test_pack_resource_spec.py +22 -0
  39. parsl/usage_tracking/usage.py +29 -55
  40. parsl/utils.py +12 -35
  41. parsl/version.py +1 -1
  42. {parsl-2024.2.12.data → parsl-2024.2.26.data}/scripts/process_worker_pool.py +65 -9
  43. {parsl-2024.2.12.dist-info → parsl-2024.2.26.dist-info}/METADATA +2 -2
  44. {parsl-2024.2.12.dist-info → parsl-2024.2.26.dist-info}/RECORD +50 -37
  45. parsl/configs/cooley.py +0 -29
  46. parsl/configs/theta.py +0 -33
  47. {parsl-2024.2.12.data → parsl-2024.2.26.data}/scripts/exec_parsl_function.py +0 -0
  48. {parsl-2024.2.12.data → parsl-2024.2.26.data}/scripts/parsl_coprocess.py +0 -0
  49. {parsl-2024.2.12.dist-info → parsl-2024.2.26.dist-info}/LICENSE +0 -0
  50. {parsl-2024.2.12.dist-info → parsl-2024.2.26.dist-info}/WHEEL +0 -0
  51. {parsl-2024.2.12.dist-info → parsl-2024.2.26.dist-info}/entry_points.txt +0 -0
  52. {parsl-2024.2.12.dist-info → parsl-2024.2.26.dist-info}/top_level.txt +0 -0
@@ -10,7 +10,7 @@ import pickle
10
10
  import time
11
11
  import queue
12
12
  import uuid
13
- from typing import Sequence, Optional
13
+ from typing import Sequence, Optional, Dict, List
14
14
 
15
15
  import zmq
16
16
  import math
@@ -27,7 +27,13 @@ from parsl.app.errors import RemoteExceptionWrapper
27
27
  from parsl.executors.high_throughput.errors import WorkerLost
28
28
  from parsl.executors.high_throughput.probe import probe_addresses
29
29
  from parsl.multiprocessing import SpawnContext
30
- from parsl.serialize import unpack_apply_message, serialize
30
+ from parsl.serialize import unpack_res_spec_apply_message, serialize
31
+ from parsl.executors.high_throughput.mpi_resource_management import (
32
+ TaskScheduler,
33
+ MPITaskScheduler
34
+ )
35
+
36
+ from parsl.executors.high_throughput.mpi_prefix_composer import compose_all, VALID_LAUNCHERS
31
37
 
32
38
  HEARTBEAT_CODE = (2 ** 32) - 1
33
39
 
@@ -64,6 +70,8 @@ class Manager:
64
70
  heartbeat_period,
65
71
  poll_period,
66
72
  cpu_affinity,
73
+ enable_mpi_mode: bool = False,
74
+ mpi_launcher: str = "mpiexec",
67
75
  available_accelerators: Sequence[str],
68
76
  cert_dir: Optional[str]):
69
77
  """
@@ -120,6 +128,14 @@ class Manager:
120
128
  available_accelerators: list of str
121
129
  List of accelerators available to the workers.
122
130
 
131
+ enable_mpi_mode: bool
132
+ When set to true, the manager assumes ownership of the batch job and each worker
133
+ claims a subset of nodes from a shared pool to execute multi-node mpi tasks. Node
134
+ info is made available to workers via env vars.
135
+
136
+ mpi_launcher: str
137
+ Set to one of the supported MPI launchers: ("srun", "aprun", "mpiexec")
138
+
123
139
  cert_dir : str | None
124
140
  Path to the certificate directory.
125
141
  """
@@ -159,6 +175,9 @@ class Manager:
159
175
  self.uid = uid
160
176
  self.block_id = block_id
161
177
 
178
+ self.enable_mpi_mode = enable_mpi_mode
179
+ self.mpi_launcher = mpi_launcher
180
+
162
181
  if os.environ.get('PARSL_CORES'):
163
182
  cores_on_node = int(os.environ['PARSL_CORES'])
164
183
  else:
@@ -186,6 +205,17 @@ class Manager:
186
205
  self.monitoring_queue = self._mp_manager.Queue()
187
206
  self.pending_task_queue = SpawnContext.Queue()
188
207
  self.pending_result_queue = SpawnContext.Queue()
208
+ self.task_scheduler: TaskScheduler
209
+ if self.enable_mpi_mode:
210
+ self.task_scheduler = MPITaskScheduler(
211
+ self.pending_task_queue,
212
+ self.pending_result_queue,
213
+ )
214
+ else:
215
+ self.task_scheduler = TaskScheduler(
216
+ self.pending_task_queue,
217
+ self.pending_result_queue
218
+ )
189
219
  self.ready_worker_count = SpawnContext.Value("i", 0)
190
220
 
191
221
  self.max_queue_size = self.prefetch_capacity + self.worker_count
@@ -286,9 +316,7 @@ class Manager:
286
316
  logger.debug("Got executor tasks: {}, cumulative count of tasks: {}".format([t['task_id'] for t in tasks], task_recv_counter))
287
317
 
288
318
  for task in tasks:
289
- self.pending_task_queue.put(task)
290
- # logger.debug("Ready tasks: {}".format(
291
- # [i['task_id'] for i in self.pending_task_queue]))
319
+ self.task_scheduler.put_task(task)
292
320
 
293
321
  else:
294
322
  logger.debug("No incoming tasks")
@@ -327,7 +355,7 @@ class Manager:
327
355
  while not kill_event.is_set():
328
356
  try:
329
357
  logger.debug("Starting pending_result_queue get")
330
- r = self.pending_result_queue.get(block=True, timeout=push_poll_period)
358
+ r = self.task_scheduler.get_result(block=True, timeout=push_poll_period)
331
359
  logger.debug("Got a result item")
332
360
  items.append(r)
333
361
  except queue.Empty:
@@ -497,6 +525,7 @@ class Manager:
497
525
  os.getpid(),
498
526
  args.logdir,
499
527
  args.debug,
528
+ self.mpi_launcher,
500
529
  ),
501
530
  name="HTEX-Worker-{}".format(worker_id),
502
531
  )
@@ -504,7 +533,13 @@ class Manager:
504
533
  return p
505
534
 
506
535
 
507
- def execute_task(bufs):
536
+ def update_resource_spec_env_vars(mpi_launcher: str, resource_spec: Dict, node_info: List[str]) -> None:
537
+ prefix_table = compose_all(mpi_launcher, resource_spec=resource_spec, node_hostnames=node_info)
538
+ for key in prefix_table:
539
+ os.environ[key] = prefix_table[key]
540
+
541
+
542
+ def execute_task(bufs, mpi_launcher: Optional[str] = None):
508
543
  """Deserialize the buffer and execute the task.
509
544
 
510
545
  Returns the result or throws exception.
@@ -512,8 +547,20 @@ def execute_task(bufs):
512
547
  user_ns = locals()
513
548
  user_ns.update({'__builtins__': __builtins__})
514
549
 
515
- f, args, kwargs = unpack_apply_message(bufs, user_ns, copy=False)
550
+ f, args, kwargs, resource_spec = unpack_res_spec_apply_message(bufs, user_ns, copy=False)
551
+
552
+ for varname in resource_spec:
553
+ envname = "PARSL_" + str(varname).upper()
554
+ os.environ[envname] = str(resource_spec[varname])
516
555
 
556
+ if resource_spec.get("MPI_NODELIST"):
557
+ worker_id = os.environ['PARSL_WORKER_RANK']
558
+ nodes_for_task = resource_spec["MPI_NODELIST"].split(',')
559
+ logger.info(f"Launching task on provisioned nodes: {nodes_for_task}")
560
+ assert mpi_launcher
561
+ update_resource_spec_env_vars(mpi_launcher,
562
+ resource_spec=resource_spec,
563
+ node_info=nodes_for_task)
517
564
  # We might need to look into callability of the function from itself
518
565
  # since we change it's name in the new namespace
519
566
  prefix = "parsl_"
@@ -550,6 +597,7 @@ def worker(
550
597
  manager_pid: int,
551
598
  logdir: str,
552
599
  debug: bool,
600
+ mpi_launcher: str,
553
601
  ):
554
602
  """
555
603
 
@@ -668,7 +716,7 @@ def worker(
668
716
  worker_enqueued = False
669
717
 
670
718
  try:
671
- result = execute_task(req['buffer'])
719
+ result = execute_task(req['buffer'], mpi_launcher=mpi_launcher)
672
720
  serialized_result = serialize(result, buffer_threshold=1000000)
673
721
  except Exception as e:
674
722
  logger.info('Caught an exception: {}'.format(e))
@@ -768,6 +816,10 @@ if __name__ == "__main__":
768
816
  help="Whether/how workers should control CPU affinity.")
769
817
  parser.add_argument("--available-accelerators", type=str, nargs="*",
770
818
  help="Names of available accelerators")
819
+ parser.add_argument("--enable_mpi_mode", action='store_true',
820
+ help="Enable MPI mode")
821
+ parser.add_argument("--mpi-launcher", type=str, choices=VALID_LAUNCHERS,
822
+ help="MPI launcher to use iff enable_mpi_mode=true")
771
823
 
772
824
  args = parser.parse_args()
773
825
 
@@ -797,6 +849,8 @@ if __name__ == "__main__":
797
849
  logger.info("Heartbeat period: {}".format(args.hb_period))
798
850
  logger.info("CPU affinity: {}".format(args.cpu_affinity))
799
851
  logger.info("Accelerators: {}".format(" ".join(args.available_accelerators)))
852
+ logger.info("enable_mpi_mode: {}".format(args.enable_mpi_mode))
853
+ logger.info("mpi_launcher: {}".format(args.mpi_launcher))
800
854
 
801
855
  manager = Manager(task_port=args.task_port,
802
856
  result_port=args.result_port,
@@ -812,6 +866,8 @@ if __name__ == "__main__":
812
866
  heartbeat_period=int(args.hb_period),
813
867
  poll_period=int(args.poll),
814
868
  cpu_affinity=args.cpu_affinity,
869
+ enable_mpi_mode=args.enable_mpi_mode,
870
+ mpi_launcher=args.mpi_launcher,
815
871
  available_accelerators=args.available_accelerators,
816
872
  cert_dir=None if args.cert_dir == "None" else args.cert_dir)
817
873
  manager.start()
@@ -23,7 +23,7 @@ from parsl.utils import RepresentationMixin
23
23
  from parsl.app.errors import BashExitFailure
24
24
  from parsl.executors.base import ParslExecutor
25
25
  from parsl.app.errors import RemoteExceptionWrapper
26
- from parsl.serialize import pack_apply_message, deserialize
26
+ from parsl.serialize import deserialize, pack_res_spec_apply_message
27
27
  from parsl.serialize.errors import SerializationError, DeserializationError
28
28
 
29
29
  try:
@@ -400,8 +400,11 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
400
400
 
401
401
  def _pack_and_apply_message(self, func, args, kwargs):
402
402
  try:
403
- buffer = pack_apply_message(func, args, kwargs,
404
- buffer_threshold=1024 * 1024)
403
+ buffer = pack_res_spec_apply_message(func,
404
+ args,
405
+ kwargs,
406
+ resource_specification={},
407
+ buffer_threshold=1024 * 1024)
405
408
  task_func = rp.utils.serialize_bson(buffer)
406
409
  except TypeError:
407
410
  raise SerializationError(func.__name__)
@@ -3,7 +3,7 @@ import radical.pilot as rp
3
3
 
4
4
  import parsl.app.errors as pe
5
5
  from parsl.app.bash import remote_side_bash_executor
6
- from parsl.serialize import unpack_apply_message, serialize
6
+ from parsl.serialize import unpack_res_spec_apply_message, serialize
7
7
  from parsl.executors.high_throughput.process_worker_pool import execute_task
8
8
 
9
9
 
@@ -32,7 +32,7 @@ class ParslWorker:
32
32
 
33
33
  try:
34
34
  buffer = rp.utils.deserialize_bson(task['description']['executable'])
35
- func, args, kwargs = unpack_apply_message(buffer, {}, copy=False)
35
+ func, args, kwargs, _resource_spec = unpack_res_spec_apply_message(buffer, {}, copy=False)
36
36
  ret = remote_side_bash_executor(func, *args, **kwargs)
37
37
  exc = (None, None)
38
38
  val = None
parsl/jobs/states.py CHANGED
@@ -47,7 +47,7 @@ class JobState(IntEnum):
47
47
  """
48
48
 
49
49
  def __str__(self) -> str:
50
- return self.__class__.__name__ + "." + self.name
50
+ return f"{self.__class__.__name__}.{self.name}"
51
51
 
52
52
 
53
53
  TERMINAL_STATES = [JobState.CANCELLED, JobState.COMPLETED, JobState.FAILED,
@@ -84,16 +84,16 @@ class JobStatus:
84
84
 
85
85
  def __repr__(self) -> str:
86
86
  if self.message is not None:
87
- extra = f"state={self.state} message={self.message}".format(self.state, self.message)
87
+ extra = f"state={self.state} message={self.message}"
88
88
  else:
89
- extra = f"state={self.state}".format(self.state)
89
+ extra = f"state={self.state}"
90
90
  return f"<{type(self).__module__}.{type(self).__qualname__} object at {hex(id(self))}, {extra}>"
91
91
 
92
92
  def __str__(self) -> str:
93
93
  if self.message is not None:
94
- return "{} ({})".format(self.state, self.message)
94
+ return f"{self.state} ({self.message})"
95
95
  else:
96
- return "{}".format(self.state)
96
+ return f"{self.state}"
97
97
 
98
98
  @property
99
99
  def stdout(self) -> Optional[str]:
@@ -444,7 +444,8 @@ class DatabaseManager:
444
444
  'run_id', 'task_id',
445
445
  'task_fail_count',
446
446
  'task_fail_cost',
447
- 'task_hashsum'],
447
+ 'task_hashsum',
448
+ 'task_inputs'],
448
449
  messages=task_info_update_messages)
449
450
  logger.debug("Inserting {} task_info_all_messages into status table".format(len(task_info_all_messages)))
450
451
 
@@ -84,7 +84,7 @@ class MonitoringHub(RepresentationMixin):
84
84
 
85
85
  workflow_name: Optional[str] = None,
86
86
  workflow_version: Optional[str] = None,
87
- logging_endpoint: str = 'sqlite:///runinfo/monitoring.db',
87
+ logging_endpoint: Optional[str] = None,
88
88
  logdir: Optional[str] = None,
89
89
  monitoring_debug: bool = False,
90
90
  resource_monitoring_enabled: bool = True,
@@ -118,7 +118,7 @@ class MonitoringHub(RepresentationMixin):
118
118
  logging_endpoint : str
119
119
  The database connection url for monitoring to log the information.
120
120
  These URLs follow RFC-1738, and can include username, password, hostname, database name.
121
- Default: 'sqlite:///monitoring.db'
121
+ Default: sqlite, in the configured run_dir.
122
122
  logdir : str
123
123
  Parsl log directory paths. Logs and temp files go here. Default: '.'
124
124
  monitoring_debug : Bool
@@ -162,11 +162,14 @@ class MonitoringHub(RepresentationMixin):
162
162
  self.resource_monitoring_enabled = resource_monitoring_enabled
163
163
  self.resource_monitoring_interval = resource_monitoring_interval
164
164
 
165
- def start(self, run_id: str, run_dir: str) -> int:
165
+ def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> int:
166
166
 
167
167
  if self.logdir is None:
168
168
  self.logdir = "."
169
169
 
170
+ if self.logging_endpoint is None:
171
+ self.logging_endpoint = f"sqlite:///{os.fspath(config_run_dir)}/monitoring.db"
172
+
170
173
  os.makedirs(self.logdir, exist_ok=True)
171
174
 
172
175
  # Initialize the ZMQ pipe to the Parsl Client
@@ -231,7 +234,7 @@ class MonitoringHub(RepresentationMixin):
231
234
  self.logger.info("Started the router process {} and DBM process {}".format(self.router_proc.pid, self.dbm_proc.pid))
232
235
 
233
236
  self.filesystem_proc = Process(target=filesystem_receiver,
234
- args=(self.logdir, self.resource_msgs, run_dir),
237
+ args=(self.logdir, self.resource_msgs, dfk_run_dir),
235
238
  name="Monitoring-Filesystem-Process",
236
239
  daemon=True
237
240
  )
parsl/multiprocessing.py CHANGED
@@ -5,17 +5,16 @@ import logging
5
5
  import multiprocessing
6
6
  import multiprocessing.queues
7
7
  import platform
8
+ from multiprocessing.context import ForkProcess as ForkProcessType
8
9
 
9
- from typing import Callable, Type
10
+ from typing import Callable
10
11
 
11
12
  logger = logging.getLogger(__name__)
12
13
 
13
14
  ForkContext = multiprocessing.get_context("fork")
14
15
  SpawnContext = multiprocessing.get_context("spawn")
15
16
 
16
- # maybe ForkProcess should be: Callable[..., Process] so as to make
17
- # it clear that it returns a Process always to the type checker?
18
- ForkProcess: Type = ForkContext.Process
17
+ ForkProcess: Callable[..., ForkProcessType] = ForkContext.Process
19
18
 
20
19
 
21
20
  class MacSafeQueue(multiprocessing.queues.Queue):
@@ -1,6 +1,7 @@
1
1
  import logging
2
2
  import os
3
3
  import time
4
+ import warnings
4
5
 
5
6
  from parsl.providers.errors import ScaleOutFailed
6
7
  from parsl.channels import LocalChannel
@@ -24,6 +25,8 @@ translate_table = {
24
25
  class CobaltProvider(ClusterProvider, RepresentationMixin):
25
26
  """ Cobalt Execution Provider
26
27
 
28
+ WARNING: CobaltProvider is deprecated and will be removed by 2024.04
29
+
27
30
  This provider uses cobalt to submit (qsub), obtain the status of (qstat), and cancel (qdel)
28
31
  jobs. Theo script to be used is created from a template file in this
29
32
  same module.
@@ -86,6 +89,9 @@ class CobaltProvider(ClusterProvider, RepresentationMixin):
86
89
  self.queue = queue
87
90
  self.scheduler_options = scheduler_options
88
91
  self.worker_init = worker_init
92
+ warnings.warn("CobaltProvider is deprecated; This will be removed after 2024-04",
93
+ DeprecationWarning,
94
+ stacklevel=2)
89
95
 
90
96
  def _status(self):
91
97
  """Returns the status list for a list of job_ids
@@ -119,13 +119,17 @@ class PBSProProvider(TorqueProvider):
119
119
 
120
120
  job_state = job.get('job_state', JobState.UNKNOWN)
121
121
  state = translate_table.get(job_state, JobState.UNKNOWN)
122
- self.resources[job_id]['status'] = JobStatus(state)
122
+ self.resources[job_id]['status'] = JobStatus(state,
123
+ stdout_path=self.resources[job_id]['job_stdout_path'],
124
+ stderr_path=self.resources[job_id]['job_stderr_path'])
123
125
  jobs_missing.remove(job_id)
124
126
 
125
127
  # squeue does not report on jobs that are not running. So we are filling in the
126
128
  # blanks for missing jobs, we might lose some information about why the jobs failed.
127
129
  for missing_job in jobs_missing:
128
- self.resources[missing_job]['status'] = JobStatus(JobState.COMPLETED)
130
+ self.resources[missing_job]['status'] = JobStatus(JobState.COMPLETED,
131
+ stdout_path=self.resources[missing_job]['job_stdout_path'],
132
+ stderr_path=self.resources[missing_job]['job_stderr_path'])
129
133
 
130
134
  def submit(self, command, tasks_per_node, job_name="parsl"):
131
135
  """Submits the command job.
@@ -149,7 +153,11 @@ class PBSProProvider(TorqueProvider):
149
153
 
150
154
  job_name = "{0}.{1}".format(job_name, time.time())
151
155
 
152
- script_path = os.path.abspath("{0}/{1}.submit".format(self.script_dir, job_name))
156
+ assert self.script_dir, "Expected script_dir to be set"
157
+ script_path = os.path.join(self.script_dir, job_name)
158
+ script_path = os.path.abspath(script_path)
159
+ job_stdout_path = script_path + ".stdout"
160
+ job_stderr_path = script_path + ".stderr"
153
161
 
154
162
  logger.debug("Requesting {} nodes_per_block, {} tasks_per_node".format(
155
163
  self.nodes_per_block, tasks_per_node)
@@ -163,6 +171,8 @@ class PBSProProvider(TorqueProvider):
163
171
  job_config["scheduler_options"] = self.scheduler_options
164
172
  job_config["worker_init"] = self.worker_init
165
173
  job_config["user_script"] = command
174
+ job_config["job_stdout_path"] = job_stdout_path
175
+ job_config["job_stderr_path"] = job_stderr_path
166
176
 
167
177
  # Add a colon to select_options if one isn't included
168
178
  if self.select_options and not self.select_options.startswith(":"):
@@ -194,7 +204,11 @@ class PBSProProvider(TorqueProvider):
194
204
  for line in stdout.split('\n'):
195
205
  if line.strip():
196
206
  job_id = line.strip()
197
- self.resources[job_id] = {'job_id': job_id, 'status': JobStatus(JobState.PENDING)}
207
+ self.resources[job_id] = {'job_id': job_id,
208
+ 'status': JobStatus(JobState.PENDING),
209
+ 'job_stdout_path': job_stdout_path,
210
+ 'job_stderr_path': job_stderr_path,
211
+ }
198
212
  else:
199
213
  message = "Command '{}' failed with return code {}".format(launch_cmd, retcode)
200
214
  if (stdout is not None) and (stderr is not None):
@@ -5,8 +5,8 @@ template_string = '''#!/bin/bash
5
5
  #PBS -m n
6
6
  #PBS -l walltime=$walltime
7
7
  #PBS -l select=${nodes_per_block}:ncpus=${ncpus}${select_options}
8
- #PBS -o ${submit_script_dir}/${jobname}.submit.stdout
9
- #PBS -e ${submit_script_dir}/${jobname}.submit.stderr
8
+ #PBS -o ${job_stdout_path}
9
+ #PBS -e ${job_stderr_path}
10
10
  ${scheduler_options}
11
11
 
12
12
  ${worker_init}
@@ -188,14 +188,18 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
188
188
  logger.warning(f"Slurm status {slurm_state} is not recognized")
189
189
  status = translate_table.get(slurm_state, JobState.UNKNOWN)
190
190
  logger.debug("Updating job {} with slurm status {} to parsl state {!s}".format(job_id, slurm_state, status))
191
- self.resources[job_id]['status'] = JobStatus(status)
191
+ self.resources[job_id]['status'] = JobStatus(status,
192
+ stdout_path=self.resources[job_id]['job_stdout_path'],
193
+ stderr_path=self.resources[job_id]['job_stderr_path'])
192
194
  jobs_missing.remove(job_id)
193
195
 
194
196
  # squeue does not report on jobs that are not running. So we are filling in the
195
197
  # blanks for missing jobs, we might lose some information about why the jobs failed.
196
198
  for missing_job in jobs_missing:
197
199
  logger.debug("Updating missing job {} to completed status".format(missing_job))
198
- self.resources[missing_job]['status'] = JobStatus(JobState.COMPLETED)
200
+ self.resources[missing_job]['status'] = JobStatus(JobState.COMPLETED,
201
+ stdout_path=self.resources[missing_job]['job_stdout_path'],
202
+ stderr_path=self.resources[missing_job]['job_stderr_path'])
199
203
 
200
204
  def submit(self, command: str, tasks_per_node: int, job_name="parsl.slurm") -> str:
201
205
  """Submit the command as a slurm job.
@@ -226,8 +230,11 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
226
230
 
227
231
  job_name = "{0}.{1}".format(job_name, time.time())
228
232
 
229
- script_path = "{0}/{1}.submit".format(self.script_dir, job_name)
233
+ assert self.script_dir, "Expected script_dir to be set"
234
+ script_path = os.path.join(self.script_dir, job_name)
230
235
  script_path = os.path.abspath(script_path)
236
+ job_stdout_path = script_path + ".stdout"
237
+ job_stderr_path = script_path + ".stderr"
231
238
 
232
239
  logger.debug("Requesting one block with {} nodes".format(self.nodes_per_block))
233
240
 
@@ -239,6 +246,8 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
239
246
  job_config["scheduler_options"] = scheduler_options
240
247
  job_config["worker_init"] = worker_init
241
248
  job_config["user_script"] = command
249
+ job_config["job_stdout_path"] = job_stdout_path
250
+ job_config["job_stderr_path"] = job_stderr_path
242
251
 
243
252
  # Wrap the command
244
253
  job_config["user_script"] = self.launcher(command,
@@ -262,7 +271,11 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
262
271
  match = re.match(self.regex_job_id, line)
263
272
  if match:
264
273
  job_id = match.group("id")
265
- self.resources[job_id] = {'job_id': job_id, 'status': JobStatus(JobState.PENDING)}
274
+ self.resources[job_id] = {'job_id': job_id,
275
+ 'status': JobStatus(JobState.PENDING),
276
+ 'job_stdout_path': job_stdout_path,
277
+ 'job_stderr_path': job_stderr_path,
278
+ }
266
279
  return job_id
267
280
  else:
268
281
  logger.error("Could not read job ID from submit command standard output.")
@@ -1,8 +1,8 @@
1
1
  template_string = '''#!/bin/bash
2
2
 
3
3
  #SBATCH --job-name=${jobname}
4
- #SBATCH --output=${submit_script_dir}/${jobname}.submit.stdout
5
- #SBATCH --error=${submit_script_dir}/${jobname}.submit.stderr
4
+ #SBATCH --output=${job_stdout_path}
5
+ #SBATCH --error=${job_stderr_path}
6
6
  #SBATCH --nodes=${nodes}
7
7
  #SBATCH --time=${walltime}
8
8
  #SBATCH --ntasks-per-node=${tasks_per_node}
@@ -1,6 +1,11 @@
1
- from parsl.serialize.facade import serialize, deserialize, pack_apply_message, unpack_apply_message
1
+ from parsl.serialize.facade import (serialize, deserialize, pack_apply_message,
2
+ unpack_apply_message, unpack_res_spec_apply_message,
3
+ pack_res_spec_apply_message)
2
4
 
3
5
  __all__ = ['serialize',
4
6
  'deserialize',
5
7
  'pack_apply_message',
6
- 'unpack_apply_message']
8
+ 'unpack_apply_message',
9
+ 'unpack_res_spec_apply_message',
10
+ 'pack_res_spec_apply_message'
11
+ ]
parsl/serialize/facade.py CHANGED
@@ -62,13 +62,44 @@ def pack_apply_message(func: Any, args: Any, kwargs: Any, buffer_threshold: int
62
62
  return packed_buffer
63
63
 
64
64
 
65
+ def pack_res_spec_apply_message(func: Any, args: Any, kwargs: Any, resource_specification: Any, buffer_threshold: int = int(128 * 1e6)) -> bytes:
66
+ """Serialize and pack function, parameters, and resource_specification
67
+
68
+ Parameters
69
+ ----------
70
+
71
+ func: Function
72
+ A function to ship
73
+
74
+ args: Tuple/list of objects
75
+ positional parameters as a list
76
+
77
+ kwargs: Dict
78
+ Dict containing named parameters
79
+
80
+ resource_specification: Dict
81
+ Dict containing application resource specification
82
+
83
+ buffer_threshold: int
84
+ Limits buffer to specified size in bytes. Exceeding this limit would give you
85
+ a warning in the log. Default is 128MB.
86
+ """
87
+ return pack_apply_message(func, args, (kwargs, resource_specification), buffer_threshold=buffer_threshold)
88
+
89
+
65
90
  def unpack_apply_message(packed_buffer: bytes, user_ns: Any = None, copy: Any = False) -> List[Any]:
66
91
  """ Unpack and deserialize function and parameters
67
-
68
92
  """
69
93
  return [deserialize(buf) for buf in unpack_buffers(packed_buffer)]
70
94
 
71
95
 
96
+ def unpack_res_spec_apply_message(packed_buffer: bytes, user_ns: Any = None, copy: Any = False) -> List[Any]:
97
+ """ Unpack and deserialize function, parameters, and resource_specification
98
+ """
99
+ func, args, (kwargs, resource_spec) = unpack_apply_message(packed_buffer, user_ns=user_ns, copy=copy)
100
+ return [func, args, kwargs, resource_spec]
101
+
102
+
72
103
  def serialize(obj: Any, buffer_threshold: int = int(1e6)) -> bytes:
73
104
  """ Try available serialization methods one at a time
74
105
 
@@ -2,6 +2,8 @@ import parsl
2
2
  from parsl.app.app import python_app
3
3
  from parsl.executors.errors import UnsupportedFeatureError, ExecutorError
4
4
  from parsl.executors import WorkQueueExecutor
5
+ from parsl.executors.high_throughput.mpi_prefix_composer import InvalidResourceSpecification
6
+ from parsl.executors.high_throughput.executor import HighThroughputExecutor
5
7
 
6
8
 
7
9
  @python_app
@@ -22,6 +24,8 @@ def test_resource(n=2):
22
24
  fut = double(n, parsl_resource_specification=spec)
23
25
  try:
24
26
  fut.result()
27
+ except InvalidResourceSpecification:
28
+ assert isinstance(executor, HighThroughputExecutor)
25
29
  except UnsupportedFeatureError:
26
30
  assert not isinstance(executor, WorkQueueExecutor)
27
31
  except Exception as e:
@@ -33,6 +37,8 @@ def test_resource(n=2):
33
37
  fut = double(n, parsl_resource_specification=spec)
34
38
  try:
35
39
  fut.result()
40
+ except InvalidResourceSpecification:
41
+ assert isinstance(executor, HighThroughputExecutor)
36
42
  except UnsupportedFeatureError:
37
43
  assert not isinstance(executor, WorkQueueExecutor)
38
44
  except Exception as e: