hpcflow-new2 0.2.0a189__py3-none-any.whl → 0.2.0a190__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hpcflow/__pyinstaller/hook-hpcflow.py +8 -6
- hpcflow/_version.py +1 -1
- hpcflow/app.py +1 -0
- hpcflow/data/scripts/main_script_test_hdf5_in_obj.py +1 -1
- hpcflow/data/scripts/main_script_test_hdf5_out_obj.py +1 -1
- hpcflow/sdk/__init__.py +21 -15
- hpcflow/sdk/app.py +2133 -770
- hpcflow/sdk/cli.py +281 -250
- hpcflow/sdk/cli_common.py +6 -2
- hpcflow/sdk/config/__init__.py +1 -1
- hpcflow/sdk/config/callbacks.py +77 -42
- hpcflow/sdk/config/cli.py +126 -103
- hpcflow/sdk/config/config.py +578 -311
- hpcflow/sdk/config/config_file.py +131 -95
- hpcflow/sdk/config/errors.py +112 -85
- hpcflow/sdk/config/types.py +145 -0
- hpcflow/sdk/core/actions.py +1054 -994
- hpcflow/sdk/core/app_aware.py +24 -0
- hpcflow/sdk/core/cache.py +81 -63
- hpcflow/sdk/core/command_files.py +275 -185
- hpcflow/sdk/core/commands.py +111 -107
- hpcflow/sdk/core/element.py +724 -503
- hpcflow/sdk/core/enums.py +192 -0
- hpcflow/sdk/core/environment.py +74 -93
- hpcflow/sdk/core/errors.py +398 -51
- hpcflow/sdk/core/json_like.py +540 -272
- hpcflow/sdk/core/loop.py +380 -334
- hpcflow/sdk/core/loop_cache.py +160 -43
- hpcflow/sdk/core/object_list.py +370 -207
- hpcflow/sdk/core/parameters.py +728 -600
- hpcflow/sdk/core/rule.py +59 -41
- hpcflow/sdk/core/run_dir_files.py +33 -22
- hpcflow/sdk/core/task.py +1546 -1325
- hpcflow/sdk/core/task_schema.py +240 -196
- hpcflow/sdk/core/test_utils.py +126 -88
- hpcflow/sdk/core/types.py +387 -0
- hpcflow/sdk/core/utils.py +410 -305
- hpcflow/sdk/core/validation.py +82 -9
- hpcflow/sdk/core/workflow.py +1192 -1028
- hpcflow/sdk/core/zarr_io.py +98 -137
- hpcflow/sdk/demo/cli.py +46 -33
- hpcflow/sdk/helper/cli.py +18 -16
- hpcflow/sdk/helper/helper.py +75 -63
- hpcflow/sdk/helper/watcher.py +61 -28
- hpcflow/sdk/log.py +83 -59
- hpcflow/sdk/persistence/__init__.py +8 -31
- hpcflow/sdk/persistence/base.py +988 -586
- hpcflow/sdk/persistence/defaults.py +6 -0
- hpcflow/sdk/persistence/discovery.py +38 -0
- hpcflow/sdk/persistence/json.py +408 -153
- hpcflow/sdk/persistence/pending.py +158 -123
- hpcflow/sdk/persistence/store_resource.py +37 -22
- hpcflow/sdk/persistence/types.py +307 -0
- hpcflow/sdk/persistence/utils.py +14 -11
- hpcflow/sdk/persistence/zarr.py +477 -420
- hpcflow/sdk/runtime.py +44 -41
- hpcflow/sdk/submission/{jobscript_info.py → enums.py} +39 -12
- hpcflow/sdk/submission/jobscript.py +444 -404
- hpcflow/sdk/submission/schedulers/__init__.py +133 -40
- hpcflow/sdk/submission/schedulers/direct.py +97 -71
- hpcflow/sdk/submission/schedulers/sge.py +132 -126
- hpcflow/sdk/submission/schedulers/slurm.py +263 -268
- hpcflow/sdk/submission/schedulers/utils.py +7 -2
- hpcflow/sdk/submission/shells/__init__.py +14 -15
- hpcflow/sdk/submission/shells/base.py +102 -29
- hpcflow/sdk/submission/shells/bash.py +72 -55
- hpcflow/sdk/submission/shells/os_version.py +31 -30
- hpcflow/sdk/submission/shells/powershell.py +37 -29
- hpcflow/sdk/submission/submission.py +203 -257
- hpcflow/sdk/submission/types.py +143 -0
- hpcflow/sdk/typing.py +163 -12
- hpcflow/tests/conftest.py +8 -6
- hpcflow/tests/schedulers/slurm/test_slurm_submission.py +5 -2
- hpcflow/tests/scripts/test_main_scripts.py +60 -30
- hpcflow/tests/shells/wsl/test_wsl_submission.py +6 -4
- hpcflow/tests/unit/test_action.py +86 -75
- hpcflow/tests/unit/test_action_rule.py +9 -4
- hpcflow/tests/unit/test_app.py +13 -6
- hpcflow/tests/unit/test_cli.py +1 -1
- hpcflow/tests/unit/test_command.py +71 -54
- hpcflow/tests/unit/test_config.py +20 -15
- hpcflow/tests/unit/test_config_file.py +21 -18
- hpcflow/tests/unit/test_element.py +58 -62
- hpcflow/tests/unit/test_element_iteration.py +3 -1
- hpcflow/tests/unit/test_element_set.py +29 -19
- hpcflow/tests/unit/test_group.py +4 -2
- hpcflow/tests/unit/test_input_source.py +116 -93
- hpcflow/tests/unit/test_input_value.py +29 -24
- hpcflow/tests/unit/test_json_like.py +44 -35
- hpcflow/tests/unit/test_loop.py +65 -58
- hpcflow/tests/unit/test_object_list.py +17 -12
- hpcflow/tests/unit/test_parameter.py +16 -7
- hpcflow/tests/unit/test_persistence.py +48 -35
- hpcflow/tests/unit/test_resources.py +20 -18
- hpcflow/tests/unit/test_run.py +8 -3
- hpcflow/tests/unit/test_runtime.py +2 -1
- hpcflow/tests/unit/test_schema_input.py +23 -15
- hpcflow/tests/unit/test_shell.py +3 -2
- hpcflow/tests/unit/test_slurm.py +8 -7
- hpcflow/tests/unit/test_submission.py +39 -19
- hpcflow/tests/unit/test_task.py +352 -247
- hpcflow/tests/unit/test_task_schema.py +33 -20
- hpcflow/tests/unit/test_utils.py +9 -11
- hpcflow/tests/unit/test_value_sequence.py +15 -12
- hpcflow/tests/unit/test_workflow.py +114 -83
- hpcflow/tests/unit/test_workflow_template.py +0 -1
- hpcflow/tests/workflows/test_jobscript.py +2 -1
- hpcflow/tests/workflows/test_workflows.py +18 -13
- {hpcflow_new2-0.2.0a189.dist-info → hpcflow_new2-0.2.0a190.dist-info}/METADATA +2 -1
- hpcflow_new2-0.2.0a190.dist-info/RECORD +165 -0
- hpcflow/sdk/core/parallel.py +0 -21
- hpcflow_new2-0.2.0a189.dist-info/RECORD +0 -158
- {hpcflow_new2-0.2.0a189.dist-info → hpcflow_new2-0.2.0a190.dist-info}/LICENSE +0 -0
- {hpcflow_new2-0.2.0a189.dist-info → hpcflow_new2-0.2.0a190.dist-info}/WHEEL +0 -0
- {hpcflow_new2-0.2.0a189.dist-info → hpcflow_new2-0.2.0a190.dist-info}/entry_points.txt +0 -0
hpcflow/sdk/submission/schedulers/slurm.py

@@ -2,25 +2,36 @@
 An interface to SLURM.
 """
 
-from
+from __future__ import annotations
 import subprocess
 import time
-from typing import
+from typing import TYPE_CHECKING
+from typing_extensions import override
+from hpcflow.sdk.typing import hydrate
+from hpcflow.sdk.core.enums import ParallelMode
 from hpcflow.sdk.core.errors import (
     IncompatibleParallelModeError,
     IncompatibleSLURMArgumentsError,
     IncompatibleSLURMPartitionError,
     UnknownSLURMPartitionError,
 )
-from hpcflow.sdk.core.parameters import ParallelMode
 from hpcflow.sdk.log import TimeIt
-from hpcflow.sdk.submission.
-from hpcflow.sdk.submission.schedulers import
+from hpcflow.sdk.submission.enums import JobscriptElementState
+from hpcflow.sdk.submission.schedulers import QueuedScheduler
 from hpcflow.sdk.submission.schedulers.utils import run_cmd
-from hpcflow.sdk.submission.shells.base import Shell
 
+if TYPE_CHECKING:
+    from collections.abc import Collection, Iterable, Iterator, Mapping, Sequence
+    from typing import Any, ClassVar
+    from ...config.types import SchedulerConfigDescriptor, SLURMPartitionsDescriptor
+    from ...core.element import ElementResources
+    from ..jobscript import Jobscript
+    from ..types import VersionInfo
+    from ..shells.base import Shell
 
-class SlurmPosix(Scheduler):
+
+@hydrate
+class SlurmPosix(QueuedScheduler):
     """
     A scheduler that uses SLURM.
 
@@ -48,27 +59,29 @@ class SlurmPosix(Scheduler):
 
     """
 
-    _app_attr = "app"
-
     #: Default shell.
-    DEFAULT_SHELL_EXECUTABLE = "/bin/bash"
+    DEFAULT_SHELL_EXECUTABLE: ClassVar[str] = "/bin/bash"
     #: Default args for shebang line.
-    DEFAULT_SHEBANG_ARGS = ""
+    DEFAULT_SHEBANG_ARGS: ClassVar[str] = ""
     #: Default submission command.
-    DEFAULT_SUBMIT_CMD = "sbatch"
+    DEFAULT_SUBMIT_CMD: ClassVar[str] = "sbatch"
     #: Default command to show the queue state.
-    DEFAULT_SHOW_CMD =
+    DEFAULT_SHOW_CMD: ClassVar[Sequence[str]] = ("squeue", "--me")
     #: Default cancel command.
-    DEFAULT_DEL_CMD = "scancel"
+    DEFAULT_DEL_CMD: ClassVar[str] = "scancel"
     #: Default job control directive prefix.
-    DEFAULT_JS_CMD = "#SBATCH"
+    DEFAULT_JS_CMD: ClassVar[str] = "#SBATCH"
     #: Default prefix to enable array processing.
-    DEFAULT_ARRAY_SWITCH = "--array"
+    DEFAULT_ARRAY_SWITCH: ClassVar[str] = "--array"
     #: Default shell variable with array ID.
-    DEFAULT_ARRAY_ITEM_VAR = "SLURM_ARRAY_TASK_ID"
+    DEFAULT_ARRAY_ITEM_VAR: ClassVar[str] = "SLURM_ARRAY_TASK_ID"
+    #: Number of times to try when querying the state.
+    NUM_STATE_QUERY_TRIES: ClassVar[int] = 5
+    #: Delay (in seconds) between attempts to query the state.
+    INTER_STATE_QUERY_DELAY: ClassVar[float] = 0.5
 
     #: Maps scheduler state codes to :py:class:`JobscriptElementState` values.
-    state_lookup = {
+    state_lookup: ClassVar[Mapping[str, JobscriptElementState]] = {
         "PENDING": JobscriptElementState.pending,
         "RUNNING": JobscriptElementState.running,
         "COMPLETING": JobscriptElementState.running,
@@ -79,16 +92,17 @@ class SlurmPosix(Scheduler):
         "TIMEOUT": JobscriptElementState.errored,
     }
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
     @classmethod
+    @override
     @TimeIt.decorator
-    def process_resources(
+    def process_resources(
+        cls, resources: ElementResources, scheduler_config: SchedulerConfigDescriptor
+    ) -> None:
         """Perform scheduler-specific processing to the element resources.
 
-        Note
-
+        Note
+        ----
+        This mutates `resources`.
         """
         if resources.is_parallel:
             if resources.parallel_mode is None:
@@ -97,21 +111,17 @@ class SlurmPosix(Scheduler):
 
             if resources.parallel_mode is ParallelMode.SHARED:
                 if (resources.num_nodes and resources.num_nodes > 1) or (
-                    resources.
+                    resources.SLURM_num_nodes and resources.SLURM_num_nodes > 1
                 ):
-                    raise IncompatibleParallelModeError(
-                        f"For the {resources.parallel_mode.name.lower()} parallel mode, "
-                        f"only a single node may be requested."
-                    )
+                    raise IncompatibleParallelModeError(resources.parallel_mode)
                 # consider `num_cores` and `num_threads` synonyms in this case:
-                if resources.SLURM_num_tasks and resources.
+                if resources.SLURM_num_tasks and resources.SLURM_num_tasks != 1:
                     raise IncompatibleSLURMArgumentsError(
                         f"For the {resources.parallel_mode.name.lower()} parallel mode, "
                         f"`SLURM_num_tasks` must be set to 1 (to ensure all requested "
                         f"cores reside on the same node)."
                     )
-
-                resources.SLURM_num_tasks = 1
+                resources.SLURM_num_tasks = 1
 
                 if resources.SLURM_num_cpus_per_task == 1:
                     raise IncompatibleSLURMArgumentsError(
@@ -120,28 +130,24 @@ class SlurmPosix(Scheduler):
                         f"number of threads/cores to use, and so must be greater than 1, "
                         f"but {resources.SLURM_num_cpus_per_task!r} was specified."
                     )
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    resources.num_threads and
-
-
-
-
-                        f"({resources.SLURM_num_cpus_per_task}) for the "
-                        f"{resources.parallel_mode.name.lower()} parallel mode."
-                    )
-                resources.SLURM_num_cpus_per_task = resources.num_threads
+                resources.num_threads = resources.num_threads or resources.num_cores
+                if not resources.num_threads and not resources.SLURM_num_cpus_per_task:
+                    raise ValueError(
+                        f"For the {resources.parallel_mode.name.lower()} parallel "
+                        f"mode, specify `num_threads` (or its synonym for this "
+                        f"parallel mode: `num_cores`), or the SLURM-specific "
+                        f"parameter `SLURM_num_cpus_per_task`."
+                    )
+                elif (resources.num_threads and resources.SLURM_num_cpus_per_task) and (
+                    resources.num_threads != resources.SLURM_num_cpus_per_task
+                ):
+                    raise IncompatibleSLURMArgumentsError(
+                        f"Incompatible parameters for `num_cores`/`num_threads` "
+                        f"({resources.num_threads}) and `SLURM_num_cpus_per_task` "
+                        f"({resources.SLURM_num_cpus_per_task}) for the "
+                        f"{resources.parallel_mode.name.lower()} parallel mode."
+                    )
+                resources.SLURM_num_cpus_per_task = resources.num_threads
 
             elif resources.parallel_mode is ParallelMode.DISTRIBUTED:
                 if resources.num_threads:
@@ -197,9 +203,9 @@ class SlurmPosix(Scheduler):
         else:
             if resources.SLURM_is_parallel:
                 raise IncompatibleSLURMArgumentsError(
-
-
-
+                    "Some specified SLURM-specific arguments (which indicate a parallel "
+                    "job) conflict with the scheduler-agnostic arguments (which "
+                    "indicate a serial job)."
                 )
             if not resources.SLURM_num_tasks:
                 resources.SLURM_num_tasks = 1
@@ -228,155 +234,138 @@ class SlurmPosix(Scheduler):
             try:
                 part = all_parts[resources.SLURM_partition]
             except KeyError:
-                raise UnknownSLURMPartitionError(
-                    f"The SLURM partition {resources.SLURM_partition!r} is not "
-                    f"specified in the configuration. Specified partitions are "
-                    f"{list(all_parts.keys())!r}."
-                )
+                raise UnknownSLURMPartitionError(resources.SLURM_partition, all_parts)
             # TODO: we when we support ParallelMode.HYBRID, these checks will have to
             # consider the total number of cores requested per node
             # (num_cores_per_node * num_threads)?
-            part_num_cores = part.get("num_cores")
-            part_num_cores_per_node = part.get("num_cores_per_node")
-            part_num_nodes = part.get("num_nodes")
-            part_para_modes = part.get("parallel_modes",
-            if (
-                num_cores
-                and part_num_cores
-                and not cls.is_num_cores_supported(num_cores, part_num_cores)
-            ):
+            part_num_cores = part.get("num_cores", ())
+            part_num_cores_per_node = part.get("num_cores_per_node", ())
+            part_num_nodes = part.get("num_nodes", ())
+            part_para_modes = part.get("parallel_modes", ())
+            if cls.__is_present_unsupported(num_cores, part_num_cores):
                 raise IncompatibleSLURMPartitionError(
-
-                    f"compatible with the number of cores requested: {num_cores!r}."
-                )
-            if (
-                num_cores_per_node
-                and part_num_cores_per_node
-                and not cls.is_num_cores_supported(
-                    num_cores_per_node, part_num_cores_per_node
+                    resources.SLURM_partition, "number of cores", num_cores
                 )
-            ):
+            if cls.__is_present_unsupported(num_cores_per_node, part_num_cores_per_node):
                 raise IncompatibleSLURMPartitionError(
-
-
-
+                    resources.SLURM_partition,
+                    "number of cores per node",
+                    num_cores_per_node,
                 )
-            if (
-                num_nodes
-                and part_num_nodes
-                and not cls.is_num_cores_supported(num_nodes, part_num_nodes)
-            ):
+            if cls.__is_present_unsupported(num_nodes, part_num_nodes):
                 raise IncompatibleSLURMPartitionError(
-
-                    f"compatible with the number of nodes requested: {num_nodes!r}."
+                    resources.SLURM_partition, "number of nodes", num_nodes
                 )
             if para_mode and para_mode.name.lower() not in part_para_modes:
                 raise IncompatibleSLURMPartitionError(
-
-                    f"compatible with the parallel mode requested: {para_mode!r}."
+                    resources.SLURM_partition, "parallel mode", para_mode
                 )
         else:
             # find the first compatible partition if one exists:
             # TODO: bug here? not finding correct partition?
-            part_match = False
             for part_name, part_info in all_parts.items():
-
-
-                part_num_nodes = part_info.get("num_nodes")
-                part_para_modes = part_info.get("parallel_modes", [])
-                if (
-                    num_cores
-                    and part_num_cores
-                    and cls.is_num_cores_supported(num_cores, part_num_cores)
-                ):
-                    part_match = True
-                else:
-                    part_match = False
-                    continue
-                if (
-                    num_cores_per_node
-                    and part_num_cores_per_node
-                    and cls.is_num_cores_supported(
-                        num_cores_per_node, part_num_cores_per_node
-                    )
+                if cls.__partition_matches(
+                    num_cores, num_cores_per_node, num_nodes, para_mode, part_info
                 ):
-
-                else:
-                    part_match = False
-                    continue
-                if (
-                    num_nodes
-                    and part_num_nodes
-                    and cls.is_num_cores_supported(num_nodes, part_num_nodes)
-                ):
-                    part_match = True
-                else:
-                    part_match = False
-                    continue
-                if part_match:
-                    part_match = part_name
-                    break
-                if para_mode and para_mode.name.lower() not in part_para_modes:
-                    part_match = False
-                    continue
-                if part_match:
-                    part_match = part_name
+                    resources.SLURM_partition = str(part_name)
                     break
-            if part_match:
-                resources.SLURM_partition = part_match
 
-
-
-
-
+    @classmethod
+    def __is_present_unsupported(
+        cls, num_req: int | None, part_have: Sequence[int] | None
+    ) -> bool:
+        """
+        Test if information is present on both sides, but doesn't match.
+        """
+        return bool(
+            num_req and part_have and not cls.is_num_cores_supported(num_req, part_have)
+        )
 
-
-
+    @classmethod
+    def __is_present_supported(
+        cls, num_req: int | None, part_have: Sequence[int] | None
+    ) -> bool:
+        """
+        Test if information is present on both sides, and also matches.
+        """
+        return bool(
+            num_req and part_have and cls.is_num_cores_supported(num_req, part_have)
+        )
 
+    @classmethod
+    def __partition_matches(
+        cls,
+        num_cores: int | None,
+        num_cores_per_node: int | None,
+        num_nodes: int | None,
+        para_mode: ParallelMode | None,
+        part_info: SLURMPartitionsDescriptor,
+    ) -> bool:
+        """
+        Check whether a partition (part_name, part_info) matches the requested number
+        of cores and nodes.
+        """
+        part_num_cores = part_info.get("num_cores", [])
+        part_num_cores_per_node = part_info.get("num_cores_per_node", [])
+        part_num_nodes = part_info.get("num_nodes", [])
+        part_para_modes = part_info.get("parallel_modes", [])
+        if (
+            not cls.__is_present_supported(num_cores, part_num_cores)
+            or not cls.__is_present_supported(num_cores_per_node, part_num_cores_per_node)
+            or not cls.__is_present_supported(num_nodes, part_num_nodes)
+        ):
+            return False
+        # FIXME: Does the next check come above or below the check below?
+        # Surely not both!
+        part_match = True
+        if part_match:
+            return True
+        if para_mode and para_mode.name.lower() not in part_para_modes:
+            return False
+        if part_match:
+            return True
+        return False
+
+    def __format_core_request_lines(self, resources: ElementResources) -> Iterator[str]:
+        if resources.SLURM_partition:
+            yield f"{self.js_cmd} --partition {resources.SLURM_partition}"
+        if resources.SLURM_num_nodes:  # TODO: option for --exclusive ?
+            yield f"{self.js_cmd} --nodes {resources.SLURM_num_nodes}"
         if resources.SLURM_num_tasks:
-
-
+            yield f"{self.js_cmd} --ntasks {resources.SLURM_num_tasks}"
         if resources.SLURM_num_tasks_per_node:
-
-                f"{self.js_cmd} --ntasks-per-node {resources.SLURM_num_tasks_per_node}"
-            )
-
+            yield f"{self.js_cmd} --ntasks-per-node {resources.SLURM_num_tasks_per_node}"
         if resources.SLURM_num_cpus_per_task:
-
-                f"{self.js_cmd} --cpus-per-task {resources.SLURM_num_cpus_per_task}"
-            )
-
-        return lns
+            yield f"{self.js_cmd} --cpus-per-task {resources.SLURM_num_cpus_per_task}"
 
-    def
+    def __format_array_request(self, num_elements: int, resources: ElementResources):
         # TODO: Slurm docs start indices at zero, why are we starting at one?
         # https://slurm.schedmd.com/sbatch.html#OPT_array
         max_str = f"%{resources.max_array_items}" if resources.max_array_items else ""
         return f"{self.js_cmd} {self.array_switch} 1-{num_elements}{max_str}"
 
-    def
-
-
-
-
-
-
-
-
-
-
-
-
-    def format_options(self, resources, num_elements, is_array, sub_idx):
+    def __format_std_stream_file_option_lines(
+        self, is_array: bool, sub_idx: int
+    ) -> Iterator[str]:
+        pattern = R"%x_%A.%a" if is_array else R"%x_%j"
+        base = f"./artifacts/submissions/{sub_idx}/{pattern}"
+        yield f"{self.js_cmd} -o {base}.out"
+        yield f"{self.js_cmd} -e {base}.err"
+
+    @override
+    def format_options(
+        self, resources: ElementResources, num_elements: int, is_array: bool, sub_idx: int
+    ) -> str:
         """
         Format the options to the scheduler.
         """
-        opts = []
-        opts.extend(self.
+        opts: list[str] = []
+        opts.extend(self.__format_core_request_lines(resources))
+
         if is_array:
-            opts.append(self.
+            opts.append(self.__format_array_request(num_elements, resources))
 
-        opts.extend(self.
+        opts.extend(self.__format_std_stream_file_option_lines(is_array, sub_idx))
 
         for opt_k, opt_v in self.options.items():
             if isinstance(opt_v, list):
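As an aside, the refactored helpers above now build the `#SBATCH` directive block by yielding one line per requested resource instead of accumulating a list. Below is a minimal standalone sketch of the same pattern, not the hpcflow API itself; the resource values and the helper name are invented for illustration, while `"#SBATCH"` matches `DEFAULT_JS_CMD` above.

```python
from typing import Iterator, Optional

JS_CMD = "#SBATCH"  # matches DEFAULT_JS_CMD above


def core_request_lines(
    partition: Optional[str],
    num_nodes: Optional[int],
    num_tasks: Optional[int],
    num_cpus_per_task: Optional[int],
) -> Iterator[str]:
    # Yield one directive per resource that was actually requested.
    if partition:
        yield f"{JS_CMD} --partition {partition}"
    if num_nodes:
        yield f"{JS_CMD} --nodes {num_nodes}"
    if num_tasks:
        yield f"{JS_CMD} --ntasks {num_tasks}"
    if num_cpus_per_task:
        yield f"{JS_CMD} --cpus-per-task {num_cpus_per_task}"


print("\n".join(core_request_lines("compute", 1, 1, 8)))
# #SBATCH --partition compute
# #SBATCH --nodes 1
# #SBATCH --ntasks 1
# #SBATCH --cpus-per-task 8
```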
@@ -389,8 +378,9 @@ class SlurmPosix(Scheduler):
 
         return "\n".join(opts) + "\n"
 
+    @override
     @TimeIt.decorator
-    def get_version_info(self):
+    def get_version_info(self) -> VersionInfo:
         vers_cmd = [self.submit_cmd, "--version"]
         proc = subprocess.run(
             args=vers_cmd,
@@ -402,18 +392,18 @@ class SlurmPosix(Scheduler):
         if stderr:
             print(stderr)
         name, version = stdout.split()
-
+        return {
             "scheduler_name": name,
             "scheduler_version": version,
         }
-        return out
 
+    @override
     def get_submit_command(
         self,
         shell: Shell,
         js_path: str,
-        deps:
-    ) ->
+        deps: dict[Any, tuple[Any, ...]],
+    ) -> list[str]:
         """
         Get the command to use to submit a job to the scheduler.
 
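For context, the reworked `get_version_info` above returns the parsed `sbatch --version` output directly rather than building an intermediate variable. A hedged sketch of the same parsing follows; it assumes a machine with SLURM installed and the usual `"slurm <version>"` output format.

```python
import subprocess

# Illustration only: mirrors the parsing shown in get_version_info above.
# Assumes `sbatch --version` prints something like "slurm 23.02.1".
proc = subprocess.run(
    ["sbatch", "--version"], capture_output=True, text=True, check=True
)
name, version = proc.stdout.split()
print({"scheduler_name": name, "scheduler_version": version})
# e.g. {'scheduler_name': 'slurm', 'scheduler_version': '23.02.1'}
```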
@@ -422,94 +412,96 @@ class SlurmPosix(Scheduler):
         List of argument words.
         """
         cmd = [self.submit_cmd, "--parsable"]
+        if deps:
+            cmd.append("--dependency")
+            cmd.append(",".join(self.__dependency_args(deps)))
+        cmd.append(js_path)
+        return cmd
 
-
+    @staticmethod
+    def __dependency_args(deps: dict[Any, tuple[Any, ...]]) -> Iterator[str]:
         for job_ID, is_array_dep in deps.values():
-            dep_i_str = ""
             if is_array_dep:  # array dependency
-
+                yield f"aftercorr:{job_ID}"
             else:
-
-                dep_i_str += str(job_ID)
-            dep_cmd.append(dep_i_str)
-
-        if dep_cmd:
-            cmd.append(f"--dependency")
-            cmd.append(",".join(dep_cmd))
-
-        cmd.append(js_path)
-
-        return cmd
+                yield f"afterany:{job_ID}"
 
     def parse_submission_output(self, stdout: str) -> str:
         """Extract scheduler reference for a newly submitted jobscript"""
         if ";" in stdout:
-
-
-
-            return job_ID
+            return stdout.split(";")[0]  # since we submit with "--parsable"
+        # Try using the whole thing
+        return stdout
 
     @staticmethod
-    def _parse_job_IDs(job_ID_str: str):
-        """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def _parse_job_IDs(job_ID_str: str) -> tuple[str, None | list[int]]:
+        """
+        Parse the job ID column from the `squeue` command (the `%i` format option).
+
+        Returns
+        -------
+        job_id
+            The job identifier.
+        array_indices
+            The indices into the job array.
+        """
+        base_job_ID, *arr_idx_data = job_ID_str.split("_")
+        if not arr_idx_data:
+            return base_job_ID, None
+        arr_idx = arr_idx_data[0]
+        try:
+            return base_job_ID, [int(arr_idx) - 1]  # zero-index
+        except ValueError:
+            pass
+        # split on commas (e.g. "[5,8-40]")
+        _arr_idx: list[int] = []
+        for i_range_str in arr_idx.strip("[]").split(","):
+            if "-" in i_range_str:
+                _from, _to = i_range_str.split("-")
+                if "%" in _to:
+                    # indicates max concurrent array items; not needed
+                    _to = _to.split("%")[0]
+                _arr_idx.extend(range(int(_from) - 1, int(_to)))
+            else:
+                _arr_idx.append(int(i_range_str) - 1)
+        return base_job_ID, _arr_idx
+
+    def __parse_job_states(
+        self, stdout: str
+    ) -> dict[str, dict[int | None, JobscriptElementState]]:
         """Parse output from Slurm `squeue` command with a simple format."""
-        info = {}
+        info: dict[str, dict[int | None, JobscriptElementState]] = {}
         for ln in stdout.split("\n"):
             if not ln:
                 continue
-
-            base_job_ID, arr_idx = self._parse_job_IDs(
-            state = self.state_lookup.get(
+            job_id, job_state, *_ = ln.split()
+            base_job_ID, arr_idx = self._parse_job_IDs(job_id)
+            state = self.state_lookup.get(job_state, JobscriptElementState.errored)
 
-
-
-
-            for arr_idx_i in arr_idx or [None]:
-                info[base_job_ID][arr_idx_i] = state
+            entry = info.setdefault(base_job_ID, {})
+            for arr_idx_i in arr_idx or ():
+                entry[arr_idx_i] = state
 
         return info
 
-    def
+    def __query_job_states(self, job_IDs: Iterable[str]) -> tuple[str, str]:
         """Query the state of the specified jobs."""
         cmd = [
-
-            "--me",
+            *self.show_cmd,
             "--noheader",
             "--format",
-
+            R"%40i %30T",
             "--jobs",
             ",".join(job_IDs),
         ]
-        return run_cmd(cmd, logger=self.
+        return run_cmd(cmd, logger=self._app.submission_logger)
 
-    def
+    def __get_job_valid_IDs(self, job_IDs: Collection[str] | None = None) -> set[str]:
         """Get a list of job IDs that are known by the scheduler, optionally filtered by
         specified job IDs."""
 
-        cmd = [
-        stdout, stderr = run_cmd(cmd, logger=self.
+        cmd = [*self.show_cmd, "--noheader", "--format", r"%F"]
+        stdout, stderr = run_cmd(cmd, logger=self._app.submission_logger)
         if stderr:
             raise ValueError(
                 f"Could not get query Slurm jobs. Command was: {cmd!r}; stderr was: "
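To make the new zero-indexed array parsing in `_parse_job_IDs` above concrete, here is a lightly condensed standalone copy of that logic, with made-up `squeue` job ID strings; the function name is hypothetical and this is not the hpcflow API itself.

```python
from __future__ import annotations


def parse_job_ids(job_id_str: str) -> tuple[str, list[int] | None]:
    # Condensed copy of the _parse_job_IDs logic above (illustration only).
    base_job_id, *arr_idx_data = job_id_str.split("_")
    if not arr_idx_data:
        return base_job_id, None  # not an array job
    arr_idx = arr_idx_data[0]
    try:
        return base_job_id, [int(arr_idx) - 1]  # single index, zero-indexed
    except ValueError:
        pass
    indices: list[int] = []
    for part in arr_idx.strip("[]").split(","):  # e.g. "[5,8-40%2]"
        if "-" in part:
            start, stop = part.split("-")
            stop = stop.split("%")[0]  # "%N" caps concurrent array items; drop it
            indices.extend(range(int(start) - 1, int(stop)))
        else:
            indices.append(int(part) - 1)
    return base_job_id, indices


print(parse_job_ids("3397752"))             # ('3397752', None)
print(parse_job_ids("3397752_7"))           # ('3397752', [6])
print(parse_job_ids("3397752_[5,8-10%2]"))  # ('3397752', [4, 7, 8, 9])
```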
@@ -517,64 +509,67 @@ class SlurmPosix(Scheduler):
             )
         else:
             known_jobs = set(i.strip() for i in stdout.split("\n") if i.strip())
-
-
-        return job_IDs
+        if job_IDs is None:
+            return known_jobs
+        return known_jobs.intersection(job_IDs)
 
+    @override
     def get_job_state_info(
-        self, js_refs:
-    ) ->
+        self, *, js_refs: Sequence[str] | None = None, num_js_elements: int = 0
+    ) -> Mapping[str, Mapping[int | None, JobscriptElementState]]:
         """Query the scheduler to get the states of all of this user's jobs, optionally
         filtering by specified job IDs.
 
         Jobs that are not in the scheduler's status output will not appear in the output
         of this method.
-
         """
 
         # if job_IDs are passed, then assume they are existant, otherwise retrieve valid
         # jobs:
-
-            js_refs = self._get_job_valid_IDs()
-        if not js_refs:
-            return {}
+        refs: Collection[str] = js_refs or self.__get_job_valid_IDs()
 
-        stdout, stderr = self._query_job_states(js_refs)
         count = 0
-        while
-
-
-
-
-
-
-
-            time.sleep(0.5)
-            js_refs = self._get_job_valid_IDs(js_refs)
-            if not js_refs:
-                return {}
-            stdout, stderr = self._query_job_states(js_refs)
-            count += 1
-        else:
+        while refs:
+            stdout, stderr = self.__query_job_states(refs)
+            if not stderr:
+                return self.__parse_job_states(stdout)
+            if (
+                "Invalid job id specified" not in stderr
+                or count >= self.NUM_STATE_QUERY_TRIES
+            ):
                 raise ValueError(f"Could not get Slurm job states. Stderr was: {stderr}")
 
-
-
+            # the job might have finished; this only seems to happen if a single
+            # non-existant job ID is specified; for multiple non-existant jobs, no
+            # error is produced;
+            self._app.submission_logger.info(
+                "A specified job ID is non-existant; refreshing known job IDs..."
+            )
+            time.sleep(self.INTER_STATE_QUERY_DELAY)
+            refs = self.__get_job_valid_IDs(refs)
+            count += 1
+        return {}
 
-
+    @override
+    def cancel_jobs(
+        self,
+        js_refs: list[str],
+        jobscripts: list[Jobscript] | None = None,
+        num_js_elements: int = 0,  # Ignored!
+    ):
         """
         Cancel submitted jobs.
         """
-        cmd = [self.del_cmd
-        self.
+        cmd = [self.del_cmd, *js_refs]
+        self._app.submission_logger.info(
            f"cancelling {self.__class__.__name__} jobscripts with command: {cmd}."
         )
-        stdout, stderr = run_cmd(cmd, logger=self.
+        stdout, stderr = run_cmd(cmd, logger=self._app.submission_logger)
         if stderr:
             raise ValueError(
                 f"Could not get query {self.__class__.__name__} jobs. Command was: "
                 f"{cmd!r}; stderr was: {stderr}"
             )
-        self.
+        self._app.submission_logger.info(
             f"jobscripts cancel command executed; stdout was: {stdout}."
         )