parsl 2024.2.12__py3-none-any.whl → 2024.2.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/channels/errors.py +1 -4
- parsl/configs/{comet.py → expanse.py} +5 -5
- parsl/dataflow/dflow.py +12 -12
- parsl/executors/flux/executor.py +5 -3
- parsl/executors/high_throughput/executor.py +56 -10
- parsl/executors/high_throughput/mpi_prefix_composer.py +137 -0
- parsl/executors/high_throughput/mpi_resource_management.py +217 -0
- parsl/executors/high_throughput/process_worker_pool.py +65 -9
- parsl/executors/radical/executor.py +6 -3
- parsl/executors/radical/rpex_worker.py +2 -2
- parsl/jobs/states.py +5 -5
- parsl/monitoring/db_manager.py +2 -1
- parsl/monitoring/monitoring.py +7 -4
- parsl/multiprocessing.py +3 -4
- parsl/providers/cobalt/cobalt.py +6 -0
- parsl/providers/pbspro/pbspro.py +18 -4
- parsl/providers/pbspro/template.py +2 -2
- parsl/providers/slurm/slurm.py +17 -4
- parsl/providers/slurm/template.py +2 -2
- parsl/serialize/__init__.py +7 -2
- parsl/serialize/facade.py +32 -1
- parsl/tests/test_error_handling/test_resource_spec.py +6 -0
- parsl/tests/test_htex/test_htex.py +66 -3
- parsl/tests/test_monitoring/test_incomplete_futures.py +65 -0
- parsl/tests/test_mpi_apps/__init__.py +0 -0
- parsl/tests/test_mpi_apps/test_bad_mpi_config.py +41 -0
- parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +51 -0
- parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +171 -0
- parsl/tests/test_mpi_apps/test_mpi_prefix.py +71 -0
- parsl/tests/test_mpi_apps/test_mpi_scheduler.py +158 -0
- parsl/tests/test_mpi_apps/test_resource_spec.py +145 -0
- parsl/tests/test_providers/test_cobalt_deprecation_warning.py +16 -0
- parsl/tests/test_providers/test_pbspro_template.py +28 -0
- parsl/tests/test_providers/test_slurm_template.py +29 -0
- parsl/tests/test_radical/test_mpi_funcs.py +1 -0
- parsl/tests/test_scaling/test_scale_down.py +6 -5
- parsl/tests/test_serialization/test_htex_code_cache.py +57 -0
- parsl/tests/test_serialization/test_pack_resource_spec.py +22 -0
- parsl/usage_tracking/usage.py +29 -55
- parsl/utils.py +12 -35
- parsl/version.py +1 -1
- {parsl-2024.2.12.data → parsl-2024.2.26.data}/scripts/process_worker_pool.py +65 -9
- {parsl-2024.2.12.dist-info → parsl-2024.2.26.dist-info}/METADATA +2 -2
- {parsl-2024.2.12.dist-info → parsl-2024.2.26.dist-info}/RECORD +50 -37
- parsl/configs/cooley.py +0 -29
- parsl/configs/theta.py +0 -33
- {parsl-2024.2.12.data → parsl-2024.2.26.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.2.12.data → parsl-2024.2.26.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.2.12.dist-info → parsl-2024.2.26.dist-info}/LICENSE +0 -0
- {parsl-2024.2.12.dist-info → parsl-2024.2.26.dist-info}/WHEEL +0 -0
- {parsl-2024.2.12.dist-info → parsl-2024.2.26.dist-info}/entry_points.txt +0 -0
- {parsl-2024.2.12.dist-info → parsl-2024.2.26.dist-info}/top_level.txt +0 -0
@@ -10,7 +10,7 @@ import pickle
|
|
10
10
|
import time
|
11
11
|
import queue
|
12
12
|
import uuid
|
13
|
-
from typing import Sequence, Optional
|
13
|
+
from typing import Sequence, Optional, Dict, List
|
14
14
|
|
15
15
|
import zmq
|
16
16
|
import math
|
@@ -27,7 +27,13 @@ from parsl.app.errors import RemoteExceptionWrapper
|
|
27
27
|
from parsl.executors.high_throughput.errors import WorkerLost
|
28
28
|
from parsl.executors.high_throughput.probe import probe_addresses
|
29
29
|
from parsl.multiprocessing import SpawnContext
|
30
|
-
from parsl.serialize import
|
30
|
+
from parsl.serialize import unpack_res_spec_apply_message, serialize
|
31
|
+
from parsl.executors.high_throughput.mpi_resource_management import (
|
32
|
+
TaskScheduler,
|
33
|
+
MPITaskScheduler
|
34
|
+
)
|
35
|
+
|
36
|
+
from parsl.executors.high_throughput.mpi_prefix_composer import compose_all, VALID_LAUNCHERS
|
31
37
|
|
32
38
|
HEARTBEAT_CODE = (2 ** 32) - 1
|
33
39
|
|
@@ -64,6 +70,8 @@ class Manager:
|
|
64
70
|
heartbeat_period,
|
65
71
|
poll_period,
|
66
72
|
cpu_affinity,
|
73
|
+
enable_mpi_mode: bool = False,
|
74
|
+
mpi_launcher: str = "mpiexec",
|
67
75
|
available_accelerators: Sequence[str],
|
68
76
|
cert_dir: Optional[str]):
|
69
77
|
"""
|
@@ -120,6 +128,14 @@ class Manager:
|
|
120
128
|
available_accelerators: list of str
|
121
129
|
List of accelerators available to the workers.
|
122
130
|
|
131
|
+
enable_mpi_mode: bool
|
132
|
+
When set to true, the manager assumes ownership of the batch job and each worker
|
133
|
+
claims a subset of nodes from a shared pool to execute multi-node mpi tasks. Node
|
134
|
+
info is made available to workers via env vars.
|
135
|
+
|
136
|
+
mpi_launcher: str
|
137
|
+
Set to one of the supported MPI launchers: ("srun", "aprun", "mpiexec")
|
138
|
+
|
123
139
|
cert_dir : str | None
|
124
140
|
Path to the certificate directory.
|
125
141
|
"""
|
@@ -159,6 +175,9 @@ class Manager:
|
|
159
175
|
self.uid = uid
|
160
176
|
self.block_id = block_id
|
161
177
|
|
178
|
+
self.enable_mpi_mode = enable_mpi_mode
|
179
|
+
self.mpi_launcher = mpi_launcher
|
180
|
+
|
162
181
|
if os.environ.get('PARSL_CORES'):
|
163
182
|
cores_on_node = int(os.environ['PARSL_CORES'])
|
164
183
|
else:
|
@@ -186,6 +205,17 @@ class Manager:
|
|
186
205
|
self.monitoring_queue = self._mp_manager.Queue()
|
187
206
|
self.pending_task_queue = SpawnContext.Queue()
|
188
207
|
self.pending_result_queue = SpawnContext.Queue()
|
208
|
+
self.task_scheduler: TaskScheduler
|
209
|
+
if self.enable_mpi_mode:
|
210
|
+
self.task_scheduler = MPITaskScheduler(
|
211
|
+
self.pending_task_queue,
|
212
|
+
self.pending_result_queue,
|
213
|
+
)
|
214
|
+
else:
|
215
|
+
self.task_scheduler = TaskScheduler(
|
216
|
+
self.pending_task_queue,
|
217
|
+
self.pending_result_queue
|
218
|
+
)
|
189
219
|
self.ready_worker_count = SpawnContext.Value("i", 0)
|
190
220
|
|
191
221
|
self.max_queue_size = self.prefetch_capacity + self.worker_count
|
@@ -286,9 +316,7 @@ class Manager:
|
|
286
316
|
logger.debug("Got executor tasks: {}, cumulative count of tasks: {}".format([t['task_id'] for t in tasks], task_recv_counter))
|
287
317
|
|
288
318
|
for task in tasks:
|
289
|
-
self.
|
290
|
-
# logger.debug("Ready tasks: {}".format(
|
291
|
-
# [i['task_id'] for i in self.pending_task_queue]))
|
319
|
+
self.task_scheduler.put_task(task)
|
292
320
|
|
293
321
|
else:
|
294
322
|
logger.debug("No incoming tasks")
|
@@ -327,7 +355,7 @@ class Manager:
|
|
327
355
|
while not kill_event.is_set():
|
328
356
|
try:
|
329
357
|
logger.debug("Starting pending_result_queue get")
|
330
|
-
r = self.
|
358
|
+
r = self.task_scheduler.get_result(block=True, timeout=push_poll_period)
|
331
359
|
logger.debug("Got a result item")
|
332
360
|
items.append(r)
|
333
361
|
except queue.Empty:
|
@@ -497,6 +525,7 @@ class Manager:
|
|
497
525
|
os.getpid(),
|
498
526
|
args.logdir,
|
499
527
|
args.debug,
|
528
|
+
self.mpi_launcher,
|
500
529
|
),
|
501
530
|
name="HTEX-Worker-{}".format(worker_id),
|
502
531
|
)
|
@@ -504,7 +533,13 @@ class Manager:
|
|
504
533
|
return p
|
505
534
|
|
506
535
|
|
507
|
-
def
|
536
|
+
def update_resource_spec_env_vars(mpi_launcher: str, resource_spec: Dict, node_info: List[str]) -> None:
|
537
|
+
prefix_table = compose_all(mpi_launcher, resource_spec=resource_spec, node_hostnames=node_info)
|
538
|
+
for key in prefix_table:
|
539
|
+
os.environ[key] = prefix_table[key]
|
540
|
+
|
541
|
+
|
542
|
+
def execute_task(bufs, mpi_launcher: Optional[str] = None):
|
508
543
|
"""Deserialize the buffer and execute the task.
|
509
544
|
|
510
545
|
Returns the result or throws exception.
|
@@ -512,8 +547,20 @@ def execute_task(bufs):
|
|
512
547
|
user_ns = locals()
|
513
548
|
user_ns.update({'__builtins__': __builtins__})
|
514
549
|
|
515
|
-
f, args, kwargs =
|
550
|
+
f, args, kwargs, resource_spec = unpack_res_spec_apply_message(bufs, user_ns, copy=False)
|
551
|
+
|
552
|
+
for varname in resource_spec:
|
553
|
+
envname = "PARSL_" + str(varname).upper()
|
554
|
+
os.environ[envname] = str(resource_spec[varname])
|
516
555
|
|
556
|
+
if resource_spec.get("MPI_NODELIST"):
|
557
|
+
worker_id = os.environ['PARSL_WORKER_RANK']
|
558
|
+
nodes_for_task = resource_spec["MPI_NODELIST"].split(',')
|
559
|
+
logger.info(f"Launching task on provisioned nodes: {nodes_for_task}")
|
560
|
+
assert mpi_launcher
|
561
|
+
update_resource_spec_env_vars(mpi_launcher,
|
562
|
+
resource_spec=resource_spec,
|
563
|
+
node_info=nodes_for_task)
|
517
564
|
# We might need to look into callability of the function from itself
|
518
565
|
# since we change its name in the new namespace
|
519
566
|
prefix = "parsl_"
|
@@ -550,6 +597,7 @@ def worker(
|
|
550
597
|
manager_pid: int,
|
551
598
|
logdir: str,
|
552
599
|
debug: bool,
|
600
|
+
mpi_launcher: str,
|
553
601
|
):
|
554
602
|
"""
|
555
603
|
|
@@ -668,7 +716,7 @@ def worker(
|
|
668
716
|
worker_enqueued = False
|
669
717
|
|
670
718
|
try:
|
671
|
-
result = execute_task(req['buffer'])
|
719
|
+
result = execute_task(req['buffer'], mpi_launcher=mpi_launcher)
|
672
720
|
serialized_result = serialize(result, buffer_threshold=1000000)
|
673
721
|
except Exception as e:
|
674
722
|
logger.info('Caught an exception: {}'.format(e))
|
@@ -768,6 +816,10 @@ if __name__ == "__main__":
|
|
768
816
|
help="Whether/how workers should control CPU affinity.")
|
769
817
|
parser.add_argument("--available-accelerators", type=str, nargs="*",
|
770
818
|
help="Names of available accelerators")
|
819
|
+
parser.add_argument("--enable_mpi_mode", action='store_true',
|
820
|
+
help="Enable MPI mode")
|
821
|
+
parser.add_argument("--mpi-launcher", type=str, choices=VALID_LAUNCHERS,
|
822
|
+
help="MPI launcher to use iff enable_mpi_mode=true")
|
771
823
|
|
772
824
|
args = parser.parse_args()
|
773
825
|
|
@@ -797,6 +849,8 @@ if __name__ == "__main__":
|
|
797
849
|
logger.info("Heartbeat period: {}".format(args.hb_period))
|
798
850
|
logger.info("CPU affinity: {}".format(args.cpu_affinity))
|
799
851
|
logger.info("Accelerators: {}".format(" ".join(args.available_accelerators)))
|
852
|
+
logger.info("enable_mpi_mode: {}".format(args.enable_mpi_mode))
|
853
|
+
logger.info("mpi_launcher: {}".format(args.mpi_launcher))
|
800
854
|
|
801
855
|
manager = Manager(task_port=args.task_port,
|
802
856
|
result_port=args.result_port,
|
@@ -812,6 +866,8 @@ if __name__ == "__main__":
|
|
812
866
|
heartbeat_period=int(args.hb_period),
|
813
867
|
poll_period=int(args.poll),
|
814
868
|
cpu_affinity=args.cpu_affinity,
|
869
|
+
enable_mpi_mode=args.enable_mpi_mode,
|
870
|
+
mpi_launcher=args.mpi_launcher,
|
815
871
|
available_accelerators=args.available_accelerators,
|
816
872
|
cert_dir=None if args.cert_dir == "None" else args.cert_dir)
|
817
873
|
manager.start()
|
@@ -23,7 +23,7 @@ from parsl.utils import RepresentationMixin
|
|
23
23
|
from parsl.app.errors import BashExitFailure
|
24
24
|
from parsl.executors.base import ParslExecutor
|
25
25
|
from parsl.app.errors import RemoteExceptionWrapper
|
26
|
-
from parsl.serialize import
|
26
|
+
from parsl.serialize import deserialize, pack_res_spec_apply_message
|
27
27
|
from parsl.serialize.errors import SerializationError, DeserializationError
|
28
28
|
|
29
29
|
try:
|
@@ -400,8 +400,11 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
|
|
400
400
|
|
401
401
|
def _pack_and_apply_message(self, func, args, kwargs):
|
402
402
|
try:
|
403
|
-
buffer =
|
404
|
-
|
403
|
+
buffer = pack_res_spec_apply_message(func,
|
404
|
+
args,
|
405
|
+
kwargs,
|
406
|
+
resource_specification={},
|
407
|
+
buffer_threshold=1024 * 1024)
|
405
408
|
task_func = rp.utils.serialize_bson(buffer)
|
406
409
|
except TypeError:
|
407
410
|
raise SerializationError(func.__name__)
|
@@ -3,7 +3,7 @@ import radical.pilot as rp
|
|
3
3
|
|
4
4
|
import parsl.app.errors as pe
|
5
5
|
from parsl.app.bash import remote_side_bash_executor
|
6
|
-
from parsl.serialize import
|
6
|
+
from parsl.serialize import unpack_res_spec_apply_message, serialize
|
7
7
|
from parsl.executors.high_throughput.process_worker_pool import execute_task
|
8
8
|
|
9
9
|
|
@@ -32,7 +32,7 @@ class ParslWorker:
|
|
32
32
|
|
33
33
|
try:
|
34
34
|
buffer = rp.utils.deserialize_bson(task['description']['executable'])
|
35
|
-
func, args, kwargs =
|
35
|
+
func, args, kwargs, _resource_spec = unpack_res_spec_apply_message(buffer, {}, copy=False)
|
36
36
|
ret = remote_side_bash_executor(func, *args, **kwargs)
|
37
37
|
exc = (None, None)
|
38
38
|
val = None
|
parsl/jobs/states.py
CHANGED
@@ -47,7 +47,7 @@ class JobState(IntEnum):
|
|
47
47
|
"""
|
48
48
|
|
49
49
|
def __str__(self) -> str:
|
50
|
-
return self.__class__.__name__
|
50
|
+
return f"{self.__class__.__name__}.{self.name}"
|
51
51
|
|
52
52
|
|
53
53
|
TERMINAL_STATES = [JobState.CANCELLED, JobState.COMPLETED, JobState.FAILED,
|
@@ -84,16 +84,16 @@ class JobStatus:
|
|
84
84
|
|
85
85
|
def __repr__(self) -> str:
|
86
86
|
if self.message is not None:
|
87
|
-
extra = f"state={self.state} message={self.message}"
|
87
|
+
extra = f"state={self.state} message={self.message}"
|
88
88
|
else:
|
89
|
-
extra = f"state={self.state}"
|
89
|
+
extra = f"state={self.state}"
|
90
90
|
return f"<{type(self).__module__}.{type(self).__qualname__} object at {hex(id(self))}, {extra}>"
|
91
91
|
|
92
92
|
def __str__(self) -> str:
|
93
93
|
if self.message is not None:
|
94
|
-
return "{} ({
|
94
|
+
return f"{self.state} ({self.message})"
|
95
95
|
else:
|
96
|
-
return "{
|
96
|
+
return f"{self.state}"
|
97
97
|
|
98
98
|
@property
|
99
99
|
def stdout(self) -> Optional[str]:
|
parsl/monitoring/db_manager.py
CHANGED
@@ -444,7 +444,8 @@ class DatabaseManager:
|
|
444
444
|
'run_id', 'task_id',
|
445
445
|
'task_fail_count',
|
446
446
|
'task_fail_cost',
|
447
|
-
'task_hashsum'
|
447
|
+
'task_hashsum',
|
448
|
+
'task_inputs'],
|
448
449
|
messages=task_info_update_messages)
|
449
450
|
logger.debug("Inserting {} task_info_all_messages into status table".format(len(task_info_all_messages)))
|
450
451
|
|
parsl/monitoring/monitoring.py
CHANGED
@@ -84,7 +84,7 @@ class MonitoringHub(RepresentationMixin):
|
|
84
84
|
|
85
85
|
workflow_name: Optional[str] = None,
|
86
86
|
workflow_version: Optional[str] = None,
|
87
|
-
logging_endpoint: str =
|
87
|
+
logging_endpoint: Optional[str] = None,
|
88
88
|
logdir: Optional[str] = None,
|
89
89
|
monitoring_debug: bool = False,
|
90
90
|
resource_monitoring_enabled: bool = True,
|
@@ -118,7 +118,7 @@ class MonitoringHub(RepresentationMixin):
|
|
118
118
|
logging_endpoint : str
|
119
119
|
The database connection url for monitoring to log the information.
|
120
120
|
These URLs follow RFC-1738, and can include username, password, hostname, database name.
|
121
|
-
Default:
|
121
|
+
Default: sqlite, in the configured run_dir.
|
122
122
|
logdir : str
|
123
123
|
Parsl log directory paths. Logs and temp files go here. Default: '.'
|
124
124
|
monitoring_debug : Bool
|
@@ -162,11 +162,14 @@ class MonitoringHub(RepresentationMixin):
|
|
162
162
|
self.resource_monitoring_enabled = resource_monitoring_enabled
|
163
163
|
self.resource_monitoring_interval = resource_monitoring_interval
|
164
164
|
|
165
|
-
def start(self, run_id: str,
|
165
|
+
def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> int:
|
166
166
|
|
167
167
|
if self.logdir is None:
|
168
168
|
self.logdir = "."
|
169
169
|
|
170
|
+
if self.logging_endpoint is None:
|
171
|
+
self.logging_endpoint = f"sqlite:///{os.fspath(config_run_dir)}/monitoring.db"
|
172
|
+
|
170
173
|
os.makedirs(self.logdir, exist_ok=True)
|
171
174
|
|
172
175
|
# Initialize the ZMQ pipe to the Parsl Client
|
@@ -231,7 +234,7 @@ class MonitoringHub(RepresentationMixin):
|
|
231
234
|
self.logger.info("Started the router process {} and DBM process {}".format(self.router_proc.pid, self.dbm_proc.pid))
|
232
235
|
|
233
236
|
self.filesystem_proc = Process(target=filesystem_receiver,
|
234
|
-
args=(self.logdir, self.resource_msgs,
|
237
|
+
args=(self.logdir, self.resource_msgs, dfk_run_dir),
|
235
238
|
name="Monitoring-Filesystem-Process",
|
236
239
|
daemon=True
|
237
240
|
)
|
parsl/multiprocessing.py
CHANGED
@@ -5,17 +5,16 @@ import logging
|
|
5
5
|
import multiprocessing
|
6
6
|
import multiprocessing.queues
|
7
7
|
import platform
|
8
|
+
from multiprocessing.context import ForkProcess as ForkProcessType
|
8
9
|
|
9
|
-
from typing import Callable
|
10
|
+
from typing import Callable
|
10
11
|
|
11
12
|
logger = logging.getLogger(__name__)
|
12
13
|
|
13
14
|
ForkContext = multiprocessing.get_context("fork")
|
14
15
|
SpawnContext = multiprocessing.get_context("spawn")
|
15
16
|
|
16
|
-
|
17
|
-
# it clear that it returns a Process always to the type checker?
|
18
|
-
ForkProcess: Type = ForkContext.Process
|
17
|
+
ForkProcess: Callable[..., ForkProcessType] = ForkContext.Process
|
19
18
|
|
20
19
|
|
21
20
|
class MacSafeQueue(multiprocessing.queues.Queue):
|
parsl/providers/cobalt/cobalt.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
import logging
|
2
2
|
import os
|
3
3
|
import time
|
4
|
+
import warnings
|
4
5
|
|
5
6
|
from parsl.providers.errors import ScaleOutFailed
|
6
7
|
from parsl.channels import LocalChannel
|
@@ -24,6 +25,8 @@ translate_table = {
|
|
24
25
|
class CobaltProvider(ClusterProvider, RepresentationMixin):
|
25
26
|
""" Cobalt Execution Provider
|
26
27
|
|
28
|
+
WARNING: CobaltProvider is deprecated and will be removed by 2024.04
|
29
|
+
|
27
30
|
This provider uses cobalt to submit (qsub), obtain the status of (qstat), and cancel (qdel)
|
28
31
|
jobs. The script to be used is created from a template file in this
|
29
32
|
same module.
|
@@ -86,6 +89,9 @@ class CobaltProvider(ClusterProvider, RepresentationMixin):
|
|
86
89
|
self.queue = queue
|
87
90
|
self.scheduler_options = scheduler_options
|
88
91
|
self.worker_init = worker_init
|
92
|
+
warnings.warn("CobaltProvider is deprecated; This will be removed after 2024-04",
|
93
|
+
DeprecationWarning,
|
94
|
+
stacklevel=2)
|
89
95
|
|
90
96
|
def _status(self):
|
91
97
|
"""Returns the status list for a list of job_ids
|
parsl/providers/pbspro/pbspro.py
CHANGED
@@ -119,13 +119,17 @@ class PBSProProvider(TorqueProvider):
|
|
119
119
|
|
120
120
|
job_state = job.get('job_state', JobState.UNKNOWN)
|
121
121
|
state = translate_table.get(job_state, JobState.UNKNOWN)
|
122
|
-
self.resources[job_id]['status'] = JobStatus(state
|
122
|
+
self.resources[job_id]['status'] = JobStatus(state,
|
123
|
+
stdout_path=self.resources[job_id]['job_stdout_path'],
|
124
|
+
stderr_path=self.resources[job_id]['job_stderr_path'])
|
123
125
|
jobs_missing.remove(job_id)
|
124
126
|
|
125
127
|
# squeue does not report on jobs that are not running. So we are filling in the
|
126
128
|
# blanks for missing jobs, we might lose some information about why the jobs failed.
|
127
129
|
for missing_job in jobs_missing:
|
128
|
-
self.resources[missing_job]['status'] = JobStatus(JobState.COMPLETED
|
130
|
+
self.resources[missing_job]['status'] = JobStatus(JobState.COMPLETED,
|
131
|
+
stdout_path=self.resources[missing_job]['job_stdout_path'],
|
132
|
+
stderr_path=self.resources[missing_job]['job_stderr_path'])
|
129
133
|
|
130
134
|
def submit(self, command, tasks_per_node, job_name="parsl"):
|
131
135
|
"""Submits the command job.
|
@@ -149,7 +153,11 @@ class PBSProProvider(TorqueProvider):
|
|
149
153
|
|
150
154
|
job_name = "{0}.{1}".format(job_name, time.time())
|
151
155
|
|
152
|
-
|
156
|
+
assert self.script_dir, "Expected script_dir to be set"
|
157
|
+
script_path = os.path.join(self.script_dir, job_name)
|
158
|
+
script_path = os.path.abspath(script_path)
|
159
|
+
job_stdout_path = script_path + ".stdout"
|
160
|
+
job_stderr_path = script_path + ".stderr"
|
153
161
|
|
154
162
|
logger.debug("Requesting {} nodes_per_block, {} tasks_per_node".format(
|
155
163
|
self.nodes_per_block, tasks_per_node)
|
@@ -163,6 +171,8 @@ class PBSProProvider(TorqueProvider):
|
|
163
171
|
job_config["scheduler_options"] = self.scheduler_options
|
164
172
|
job_config["worker_init"] = self.worker_init
|
165
173
|
job_config["user_script"] = command
|
174
|
+
job_config["job_stdout_path"] = job_stdout_path
|
175
|
+
job_config["job_stderr_path"] = job_stderr_path
|
166
176
|
|
167
177
|
# Add a colon to select_options if one isn't included
|
168
178
|
if self.select_options and not self.select_options.startswith(":"):
|
@@ -194,7 +204,11 @@ class PBSProProvider(TorqueProvider):
|
|
194
204
|
for line in stdout.split('\n'):
|
195
205
|
if line.strip():
|
196
206
|
job_id = line.strip()
|
197
|
-
self.resources[job_id] = {'job_id': job_id,
|
207
|
+
self.resources[job_id] = {'job_id': job_id,
|
208
|
+
'status': JobStatus(JobState.PENDING),
|
209
|
+
'job_stdout_path': job_stdout_path,
|
210
|
+
'job_stderr_path': job_stderr_path,
|
211
|
+
}
|
198
212
|
else:
|
199
213
|
message = "Command '{}' failed with return code {}".format(launch_cmd, retcode)
|
200
214
|
if (stdout is not None) and (stderr is not None):
|
@@ -5,8 +5,8 @@ template_string = '''#!/bin/bash
|
|
5
5
|
#PBS -m n
|
6
6
|
#PBS -l walltime=$walltime
|
7
7
|
#PBS -l select=${nodes_per_block}:ncpus=${ncpus}${select_options}
|
8
|
-
#PBS -o ${
|
9
|
-
#PBS -e ${
|
8
|
+
#PBS -o ${job_stdout_path}
|
9
|
+
#PBS -e ${job_stderr_path}
|
10
10
|
${scheduler_options}
|
11
11
|
|
12
12
|
${worker_init}
|
parsl/providers/slurm/slurm.py
CHANGED
@@ -188,14 +188,18 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
|
|
188
188
|
logger.warning(f"Slurm status {slurm_state} is not recognized")
|
189
189
|
status = translate_table.get(slurm_state, JobState.UNKNOWN)
|
190
190
|
logger.debug("Updating job {} with slurm status {} to parsl state {!s}".format(job_id, slurm_state, status))
|
191
|
-
self.resources[job_id]['status'] = JobStatus(status
|
191
|
+
self.resources[job_id]['status'] = JobStatus(status,
|
192
|
+
stdout_path=self.resources[job_id]['job_stdout_path'],
|
193
|
+
stderr_path=self.resources[job_id]['job_stderr_path'])
|
192
194
|
jobs_missing.remove(job_id)
|
193
195
|
|
194
196
|
# squeue does not report on jobs that are not running. So we are filling in the
|
195
197
|
# blanks for missing jobs, we might lose some information about why the jobs failed.
|
196
198
|
for missing_job in jobs_missing:
|
197
199
|
logger.debug("Updating missing job {} to completed status".format(missing_job))
|
198
|
-
self.resources[missing_job]['status'] = JobStatus(JobState.COMPLETED
|
200
|
+
self.resources[missing_job]['status'] = JobStatus(JobState.COMPLETED,
|
201
|
+
stdout_path=self.resources[missing_job]['job_stdout_path'],
|
202
|
+
stderr_path=self.resources[missing_job]['job_stderr_path'])
|
199
203
|
|
200
204
|
def submit(self, command: str, tasks_per_node: int, job_name="parsl.slurm") -> str:
|
201
205
|
"""Submit the command as a slurm job.
|
@@ -226,8 +230,11 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
|
|
226
230
|
|
227
231
|
job_name = "{0}.{1}".format(job_name, time.time())
|
228
232
|
|
229
|
-
|
233
|
+
assert self.script_dir, "Expected script_dir to be set"
|
234
|
+
script_path = os.path.join(self.script_dir, job_name)
|
230
235
|
script_path = os.path.abspath(script_path)
|
236
|
+
job_stdout_path = script_path + ".stdout"
|
237
|
+
job_stderr_path = script_path + ".stderr"
|
231
238
|
|
232
239
|
logger.debug("Requesting one block with {} nodes".format(self.nodes_per_block))
|
233
240
|
|
@@ -239,6 +246,8 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
|
|
239
246
|
job_config["scheduler_options"] = scheduler_options
|
240
247
|
job_config["worker_init"] = worker_init
|
241
248
|
job_config["user_script"] = command
|
249
|
+
job_config["job_stdout_path"] = job_stdout_path
|
250
|
+
job_config["job_stderr_path"] = job_stderr_path
|
242
251
|
|
243
252
|
# Wrap the command
|
244
253
|
job_config["user_script"] = self.launcher(command,
|
@@ -262,7 +271,11 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
|
|
262
271
|
match = re.match(self.regex_job_id, line)
|
263
272
|
if match:
|
264
273
|
job_id = match.group("id")
|
265
|
-
self.resources[job_id] = {'job_id': job_id,
|
274
|
+
self.resources[job_id] = {'job_id': job_id,
|
275
|
+
'status': JobStatus(JobState.PENDING),
|
276
|
+
'job_stdout_path': job_stdout_path,
|
277
|
+
'job_stderr_path': job_stderr_path,
|
278
|
+
}
|
266
279
|
return job_id
|
267
280
|
else:
|
268
281
|
logger.error("Could not read job ID from submit command standard output.")
|
@@ -1,8 +1,8 @@
|
|
1
1
|
template_string = '''#!/bin/bash
|
2
2
|
|
3
3
|
#SBATCH --job-name=${jobname}
|
4
|
-
#SBATCH --output=${
|
5
|
-
#SBATCH --error=${
|
4
|
+
#SBATCH --output=${job_stdout_path}
|
5
|
+
#SBATCH --error=${job_stderr_path}
|
6
6
|
#SBATCH --nodes=${nodes}
|
7
7
|
#SBATCH --time=${walltime}
|
8
8
|
#SBATCH --ntasks-per-node=${tasks_per_node}
|
parsl/serialize/__init__.py
CHANGED
@@ -1,6 +1,11 @@
|
|
1
|
-
from parsl.serialize.facade import serialize, deserialize, pack_apply_message,
|
1
|
+
from parsl.serialize.facade import (serialize, deserialize, pack_apply_message,
|
2
|
+
unpack_apply_message, unpack_res_spec_apply_message,
|
3
|
+
pack_res_spec_apply_message)
|
2
4
|
|
3
5
|
__all__ = ['serialize',
|
4
6
|
'deserialize',
|
5
7
|
'pack_apply_message',
|
6
|
-
'unpack_apply_message'
|
8
|
+
'unpack_apply_message',
|
9
|
+
'unpack_res_spec_apply_message',
|
10
|
+
'pack_res_spec_apply_message'
|
11
|
+
]
|
parsl/serialize/facade.py
CHANGED
@@ -62,13 +62,44 @@ def pack_apply_message(func: Any, args: Any, kwargs: Any, buffer_threshold: int
|
|
62
62
|
return packed_buffer
|
63
63
|
|
64
64
|
|
65
|
+
def pack_res_spec_apply_message(func: Any, args: Any, kwargs: Any, resource_specification: Any, buffer_threshold: int = int(128 * 1e6)) -> bytes:
|
66
|
+
"""Serialize and pack function, parameters, and resource_specification
|
67
|
+
|
68
|
+
Parameters
|
69
|
+
----------
|
70
|
+
|
71
|
+
func: Function
|
72
|
+
A function to ship
|
73
|
+
|
74
|
+
args: Tuple/list of objects
|
75
|
+
positional parameters as a list
|
76
|
+
|
77
|
+
kwargs: Dict
|
78
|
+
Dict containing named parameters
|
79
|
+
|
80
|
+
resource_specification: Dict
|
81
|
+
Dict containing application resource specification
|
82
|
+
|
83
|
+
buffer_threshold: int
|
84
|
+
Limits buffer to specified size in bytes. Exceeding this limit would give you
|
85
|
+
a warning in the log. Default is 128MB.
|
86
|
+
"""
|
87
|
+
return pack_apply_message(func, args, (kwargs, resource_specification), buffer_threshold=buffer_threshold)
|
88
|
+
|
89
|
+
|
65
90
|
def unpack_apply_message(packed_buffer: bytes, user_ns: Any = None, copy: Any = False) -> List[Any]:
|
66
91
|
""" Unpack and deserialize function and parameters
|
67
|
-
|
68
92
|
"""
|
69
93
|
return [deserialize(buf) for buf in unpack_buffers(packed_buffer)]
|
70
94
|
|
71
95
|
|
96
|
+
def unpack_res_spec_apply_message(packed_buffer: bytes, user_ns: Any = None, copy: Any = False) -> List[Any]:
|
97
|
+
""" Unpack and deserialize function, parameters, and resource_specification
|
98
|
+
"""
|
99
|
+
func, args, (kwargs, resource_spec) = unpack_apply_message(packed_buffer, user_ns=user_ns, copy=copy)
|
100
|
+
return [func, args, kwargs, resource_spec]
|
101
|
+
|
102
|
+
|
72
103
|
def serialize(obj: Any, buffer_threshold: int = int(1e6)) -> bytes:
|
73
104
|
""" Try available serialization methods one at a time
|
74
105
|
|
@@ -2,6 +2,8 @@ import parsl
|
|
2
2
|
from parsl.app.app import python_app
|
3
3
|
from parsl.executors.errors import UnsupportedFeatureError, ExecutorError
|
4
4
|
from parsl.executors import WorkQueueExecutor
|
5
|
+
from parsl.executors.high_throughput.mpi_prefix_composer import InvalidResourceSpecification
|
6
|
+
from parsl.executors.high_throughput.executor import HighThroughputExecutor
|
5
7
|
|
6
8
|
|
7
9
|
@python_app
|
@@ -22,6 +24,8 @@ def test_resource(n=2):
|
|
22
24
|
fut = double(n, parsl_resource_specification=spec)
|
23
25
|
try:
|
24
26
|
fut.result()
|
27
|
+
except InvalidResourceSpecification:
|
28
|
+
assert isinstance(executor, HighThroughputExecutor)
|
25
29
|
except UnsupportedFeatureError:
|
26
30
|
assert not isinstance(executor, WorkQueueExecutor)
|
27
31
|
except Exception as e:
|
@@ -33,6 +37,8 @@ def test_resource(n=2):
|
|
33
37
|
fut = double(n, parsl_resource_specification=spec)
|
34
38
|
try:
|
35
39
|
fut.result()
|
40
|
+
except InvalidResourceSpecification:
|
41
|
+
assert isinstance(executor, HighThroughputExecutor)
|
36
42
|
except UnsupportedFeatureError:
|
37
43
|
assert not isinstance(executor, WorkQueueExecutor)
|
38
44
|
except Exception as e:
|