hpcflow-new2 0.2.0a189__py3-none-any.whl → 0.2.0a190__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (115)
  1. hpcflow/__pyinstaller/hook-hpcflow.py +8 -6
  2. hpcflow/_version.py +1 -1
  3. hpcflow/app.py +1 -0
  4. hpcflow/data/scripts/main_script_test_hdf5_in_obj.py +1 -1
  5. hpcflow/data/scripts/main_script_test_hdf5_out_obj.py +1 -1
  6. hpcflow/sdk/__init__.py +21 -15
  7. hpcflow/sdk/app.py +2133 -770
  8. hpcflow/sdk/cli.py +281 -250
  9. hpcflow/sdk/cli_common.py +6 -2
  10. hpcflow/sdk/config/__init__.py +1 -1
  11. hpcflow/sdk/config/callbacks.py +77 -42
  12. hpcflow/sdk/config/cli.py +126 -103
  13. hpcflow/sdk/config/config.py +578 -311
  14. hpcflow/sdk/config/config_file.py +131 -95
  15. hpcflow/sdk/config/errors.py +112 -85
  16. hpcflow/sdk/config/types.py +145 -0
  17. hpcflow/sdk/core/actions.py +1054 -994
  18. hpcflow/sdk/core/app_aware.py +24 -0
  19. hpcflow/sdk/core/cache.py +81 -63
  20. hpcflow/sdk/core/command_files.py +275 -185
  21. hpcflow/sdk/core/commands.py +111 -107
  22. hpcflow/sdk/core/element.py +724 -503
  23. hpcflow/sdk/core/enums.py +192 -0
  24. hpcflow/sdk/core/environment.py +74 -93
  25. hpcflow/sdk/core/errors.py +398 -51
  26. hpcflow/sdk/core/json_like.py +540 -272
  27. hpcflow/sdk/core/loop.py +380 -334
  28. hpcflow/sdk/core/loop_cache.py +160 -43
  29. hpcflow/sdk/core/object_list.py +370 -207
  30. hpcflow/sdk/core/parameters.py +728 -600
  31. hpcflow/sdk/core/rule.py +59 -41
  32. hpcflow/sdk/core/run_dir_files.py +33 -22
  33. hpcflow/sdk/core/task.py +1546 -1325
  34. hpcflow/sdk/core/task_schema.py +240 -196
  35. hpcflow/sdk/core/test_utils.py +126 -88
  36. hpcflow/sdk/core/types.py +387 -0
  37. hpcflow/sdk/core/utils.py +410 -305
  38. hpcflow/sdk/core/validation.py +82 -9
  39. hpcflow/sdk/core/workflow.py +1192 -1028
  40. hpcflow/sdk/core/zarr_io.py +98 -137
  41. hpcflow/sdk/demo/cli.py +46 -33
  42. hpcflow/sdk/helper/cli.py +18 -16
  43. hpcflow/sdk/helper/helper.py +75 -63
  44. hpcflow/sdk/helper/watcher.py +61 -28
  45. hpcflow/sdk/log.py +83 -59
  46. hpcflow/sdk/persistence/__init__.py +8 -31
  47. hpcflow/sdk/persistence/base.py +988 -586
  48. hpcflow/sdk/persistence/defaults.py +6 -0
  49. hpcflow/sdk/persistence/discovery.py +38 -0
  50. hpcflow/sdk/persistence/json.py +408 -153
  51. hpcflow/sdk/persistence/pending.py +158 -123
  52. hpcflow/sdk/persistence/store_resource.py +37 -22
  53. hpcflow/sdk/persistence/types.py +307 -0
  54. hpcflow/sdk/persistence/utils.py +14 -11
  55. hpcflow/sdk/persistence/zarr.py +477 -420
  56. hpcflow/sdk/runtime.py +44 -41
  57. hpcflow/sdk/submission/{jobscript_info.py → enums.py} +39 -12
  58. hpcflow/sdk/submission/jobscript.py +444 -404
  59. hpcflow/sdk/submission/schedulers/__init__.py +133 -40
  60. hpcflow/sdk/submission/schedulers/direct.py +97 -71
  61. hpcflow/sdk/submission/schedulers/sge.py +132 -126
  62. hpcflow/sdk/submission/schedulers/slurm.py +263 -268
  63. hpcflow/sdk/submission/schedulers/utils.py +7 -2
  64. hpcflow/sdk/submission/shells/__init__.py +14 -15
  65. hpcflow/sdk/submission/shells/base.py +102 -29
  66. hpcflow/sdk/submission/shells/bash.py +72 -55
  67. hpcflow/sdk/submission/shells/os_version.py +31 -30
  68. hpcflow/sdk/submission/shells/powershell.py +37 -29
  69. hpcflow/sdk/submission/submission.py +203 -257
  70. hpcflow/sdk/submission/types.py +143 -0
  71. hpcflow/sdk/typing.py +163 -12
  72. hpcflow/tests/conftest.py +8 -6
  73. hpcflow/tests/schedulers/slurm/test_slurm_submission.py +5 -2
  74. hpcflow/tests/scripts/test_main_scripts.py +60 -30
  75. hpcflow/tests/shells/wsl/test_wsl_submission.py +6 -4
  76. hpcflow/tests/unit/test_action.py +86 -75
  77. hpcflow/tests/unit/test_action_rule.py +9 -4
  78. hpcflow/tests/unit/test_app.py +13 -6
  79. hpcflow/tests/unit/test_cli.py +1 -1
  80. hpcflow/tests/unit/test_command.py +71 -54
  81. hpcflow/tests/unit/test_config.py +20 -15
  82. hpcflow/tests/unit/test_config_file.py +21 -18
  83. hpcflow/tests/unit/test_element.py +58 -62
  84. hpcflow/tests/unit/test_element_iteration.py +3 -1
  85. hpcflow/tests/unit/test_element_set.py +29 -19
  86. hpcflow/tests/unit/test_group.py +4 -2
  87. hpcflow/tests/unit/test_input_source.py +116 -93
  88. hpcflow/tests/unit/test_input_value.py +29 -24
  89. hpcflow/tests/unit/test_json_like.py +44 -35
  90. hpcflow/tests/unit/test_loop.py +65 -58
  91. hpcflow/tests/unit/test_object_list.py +17 -12
  92. hpcflow/tests/unit/test_parameter.py +16 -7
  93. hpcflow/tests/unit/test_persistence.py +48 -35
  94. hpcflow/tests/unit/test_resources.py +20 -18
  95. hpcflow/tests/unit/test_run.py +8 -3
  96. hpcflow/tests/unit/test_runtime.py +2 -1
  97. hpcflow/tests/unit/test_schema_input.py +23 -15
  98. hpcflow/tests/unit/test_shell.py +3 -2
  99. hpcflow/tests/unit/test_slurm.py +8 -7
  100. hpcflow/tests/unit/test_submission.py +39 -19
  101. hpcflow/tests/unit/test_task.py +352 -247
  102. hpcflow/tests/unit/test_task_schema.py +33 -20
  103. hpcflow/tests/unit/test_utils.py +9 -11
  104. hpcflow/tests/unit/test_value_sequence.py +15 -12
  105. hpcflow/tests/unit/test_workflow.py +114 -83
  106. hpcflow/tests/unit/test_workflow_template.py +0 -1
  107. hpcflow/tests/workflows/test_jobscript.py +2 -1
  108. hpcflow/tests/workflows/test_workflows.py +18 -13
  109. {hpcflow_new2-0.2.0a189.dist-info → hpcflow_new2-0.2.0a190.dist-info}/METADATA +2 -1
  110. hpcflow_new2-0.2.0a190.dist-info/RECORD +165 -0
  111. hpcflow/sdk/core/parallel.py +0 -21
  112. hpcflow_new2-0.2.0a189.dist-info/RECORD +0 -158
  113. {hpcflow_new2-0.2.0a189.dist-info → hpcflow_new2-0.2.0a190.dist-info}/LICENSE +0 -0
  114. {hpcflow_new2-0.2.0a189.dist-info → hpcflow_new2-0.2.0a190.dist-info}/WHEEL +0 -0
  115. {hpcflow_new2-0.2.0a189.dist-info → hpcflow_new2-0.2.0a190.dist-info}/entry_points.txt +0 -0
@@ -2,25 +2,36 @@
 An interface to SLURM.
 """
 
-from pathlib import Path
+from __future__ import annotations
 import subprocess
 import time
-from typing import Dict, List, Tuple
+from typing import TYPE_CHECKING
+from typing_extensions import override
+from hpcflow.sdk.typing import hydrate
+from hpcflow.sdk.core.enums import ParallelMode
 from hpcflow.sdk.core.errors import (
     IncompatibleParallelModeError,
     IncompatibleSLURMArgumentsError,
     IncompatibleSLURMPartitionError,
     UnknownSLURMPartitionError,
 )
-from hpcflow.sdk.core.parameters import ParallelMode
 from hpcflow.sdk.log import TimeIt
-from hpcflow.sdk.submission.jobscript_info import JobscriptElementState
-from hpcflow.sdk.submission.schedulers import Scheduler
+from hpcflow.sdk.submission.enums import JobscriptElementState
+from hpcflow.sdk.submission.schedulers import QueuedScheduler
 from hpcflow.sdk.submission.schedulers.utils import run_cmd
-from hpcflow.sdk.submission.shells.base import Shell
 
+if TYPE_CHECKING:
+    from collections.abc import Collection, Iterable, Iterator, Mapping, Sequence
+    from typing import Any, ClassVar
+    from ...config.types import SchedulerConfigDescriptor, SLURMPartitionsDescriptor
+    from ...core.element import ElementResources
+    from ..jobscript import Jobscript
+    from ..types import VersionInfo
+    from ..shells.base import Shell
 
-class SlurmPosix(Scheduler):
+
+@hydrate
+class SlurmPosix(QueuedScheduler):
     """
     A scheduler that uses SLURM.
 
@@ -48,27 +59,29 @@ class SlurmPosix(Scheduler):
 
     """
 
-    _app_attr = "app"
-
     #: Default shell.
-    DEFAULT_SHELL_EXECUTABLE = "/bin/bash"
+    DEFAULT_SHELL_EXECUTABLE: ClassVar[str] = "/bin/bash"
     #: Default args for shebang line.
-    DEFAULT_SHEBANG_ARGS = ""
+    DEFAULT_SHEBANG_ARGS: ClassVar[str] = ""
     #: Default submission command.
-    DEFAULT_SUBMIT_CMD = "sbatch"
+    DEFAULT_SUBMIT_CMD: ClassVar[str] = "sbatch"
     #: Default command to show the queue state.
-    DEFAULT_SHOW_CMD = ["squeue", "--me"]
+    DEFAULT_SHOW_CMD: ClassVar[Sequence[str]] = ("squeue", "--me")
     #: Default cancel command.
-    DEFAULT_DEL_CMD = "scancel"
+    DEFAULT_DEL_CMD: ClassVar[str] = "scancel"
     #: Default job control directive prefix.
-    DEFAULT_JS_CMD = "#SBATCH"
+    DEFAULT_JS_CMD: ClassVar[str] = "#SBATCH"
     #: Default prefix to enable array processing.
-    DEFAULT_ARRAY_SWITCH = "--array"
+    DEFAULT_ARRAY_SWITCH: ClassVar[str] = "--array"
     #: Default shell variable with array ID.
-    DEFAULT_ARRAY_ITEM_VAR = "SLURM_ARRAY_TASK_ID"
+    DEFAULT_ARRAY_ITEM_VAR: ClassVar[str] = "SLURM_ARRAY_TASK_ID"
+    #: Number of times to try when querying the state.
+    NUM_STATE_QUERY_TRIES: ClassVar[int] = 5
+    #: Delay (in seconds) between attempts to query the state.
+    INTER_STATE_QUERY_DELAY: ClassVar[float] = 0.5
 
     #: Maps scheduler state codes to :py:class:`JobscriptElementState` values.
-    state_lookup = {
+    state_lookup: ClassVar[Mapping[str, JobscriptElementState]] = {
         "PENDING": JobscriptElementState.pending,
         "RUNNING": JobscriptElementState.running,
         "COMPLETING": JobscriptElementState.running,
@@ -79,16 +92,17 @@ class SlurmPosix(Scheduler):
         "TIMEOUT": JobscriptElementState.errored,
     }
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
     @classmethod
+    @override
     @TimeIt.decorator
-    def process_resources(cls, resources, scheduler_config: Dict) -> None:
+    def process_resources(
+        cls, resources: ElementResources, scheduler_config: SchedulerConfigDescriptor
+    ) -> None:
         """Perform scheduler-specific processing to the element resources.
 
-        Note: this mutates `resources`.
-
+        Note
+        ----
+        This mutates `resources`.
         """
         if resources.is_parallel:
             if resources.parallel_mode is None:
@@ -97,21 +111,17 @@ class SlurmPosix(Scheduler):
 
             if resources.parallel_mode is ParallelMode.SHARED:
                 if (resources.num_nodes and resources.num_nodes > 1) or (
-                    resources.SLURM_node_nodes and resources.SLURM_num_nodes > 1
+                    resources.SLURM_num_nodes and resources.SLURM_num_nodes > 1
                 ):
-                    raise IncompatibleParallelModeError(
-                        f"For the {resources.parallel_mode.name.lower()} parallel mode, "
-                        f"only a single node may be requested."
-                    )
+                    raise IncompatibleParallelModeError(resources.parallel_mode)
                 # consider `num_cores` and `num_threads` synonyms in this case:
-                if resources.SLURM_num_tasks and resources.SLURM_num_task != 1:
+                if resources.SLURM_num_tasks and resources.SLURM_num_tasks != 1:
                     raise IncompatibleSLURMArgumentsError(
                         f"For the {resources.parallel_mode.name.lower()} parallel mode, "
                         f"`SLURM_num_tasks` must be set to 1 (to ensure all requested "
                         f"cores reside on the same node)."
                     )
-                else:
-                    resources.SLURM_num_tasks = 1
+                resources.SLURM_num_tasks = 1
 
                 if resources.SLURM_num_cpus_per_task == 1:
                     raise IncompatibleSLURMArgumentsError(
@@ -120,28 +130,24 @@ class SlurmPosix(Scheduler):
                         f"number of threads/cores to use, and so must be greater than 1, "
                         f"but {resources.SLURM_num_cpus_per_task!r} was specified."
                     )
-                else:
-                    resources.num_threads = resources.num_threads or resources.num_cores
-                    if (
-                        not resources.num_threads
-                        and not resources.SLURM_num_cpus_per_task
-                    ):
-                        raise ValueError(
-                            f"For the {resources.parallel_mode.name.lower()} parallel "
-                            f"mode, specify `num_threads` (or its synonym for this "
-                            f"parallel mode: `num_cores`), or the SLURM-specific "
-                            f"parameter `SLURM_num_cpus_per_task`."
-                        )
-                    elif (
-                        resources.num_threads and resources.SLURM_num_cpus_per_task
-                    ) and (resources.num_threads != resources.SLURM_num_cpus_per_task):
-                        raise IncompatibleSLURMArgumentsError(
-                            f"Incompatible parameters for `num_cores`/`num_threads` "
-                            f"({resources.num_threads}) and `SLURM_num_cpus_per_task` "
-                            f"({resources.SLURM_num_cpus_per_task}) for the "
-                            f"{resources.parallel_mode.name.lower()} parallel mode."
-                        )
-                    resources.SLURM_num_cpus_per_task = resources.num_threads
+                resources.num_threads = resources.num_threads or resources.num_cores
+                if not resources.num_threads and not resources.SLURM_num_cpus_per_task:
+                    raise ValueError(
+                        f"For the {resources.parallel_mode.name.lower()} parallel "
+                        f"mode, specify `num_threads` (or its synonym for this "
+                        f"parallel mode: `num_cores`), or the SLURM-specific "
+                        f"parameter `SLURM_num_cpus_per_task`."
+                    )
+                elif (resources.num_threads and resources.SLURM_num_cpus_per_task) and (
+                    resources.num_threads != resources.SLURM_num_cpus_per_task
+                ):
+                    raise IncompatibleSLURMArgumentsError(
+                        f"Incompatible parameters for `num_cores`/`num_threads` "
+                        f"({resources.num_threads}) and `SLURM_num_cpus_per_task` "
+                        f"({resources.SLURM_num_cpus_per_task}) for the "
+                        f"{resources.parallel_mode.name.lower()} parallel mode."
+                    )
+                resources.SLURM_num_cpus_per_task = resources.num_threads
 
             elif resources.parallel_mode is ParallelMode.DISTRIBUTED:
                 if resources.num_threads:
@@ -197,9 +203,9 @@ class SlurmPosix(Scheduler):
         else:
             if resources.SLURM_is_parallel:
                 raise IncompatibleSLURMArgumentsError(
-                    f"Some specified SLURM-specific arguments (which indicate a parallel "
-                    f"job) conflict with the scheduler-agnostic arguments (which "
-                    f"indicate a serial job)."
+                    "Some specified SLURM-specific arguments (which indicate a parallel "
+                    "job) conflict with the scheduler-agnostic arguments (which "
+                    "indicate a serial job)."
                 )
             if not resources.SLURM_num_tasks:
                 resources.SLURM_num_tasks = 1
@@ -228,155 +234,138 @@ class SlurmPosix(Scheduler):
             try:
                 part = all_parts[resources.SLURM_partition]
             except KeyError:
-                raise UnknownSLURMPartitionError(
-                    f"The SLURM partition {resources.SLURM_partition!r} is not "
-                    f"specified in the configuration. Specified partitions are "
-                    f"{list(all_parts.keys())!r}."
-                )
+                raise UnknownSLURMPartitionError(resources.SLURM_partition, all_parts)
             # TODO: we when we support ParallelMode.HYBRID, these checks will have to
             # consider the total number of cores requested per node
             # (num_cores_per_node * num_threads)?
-            part_num_cores = part.get("num_cores")
-            part_num_cores_per_node = part.get("num_cores_per_node")
-            part_num_nodes = part.get("num_nodes")
-            part_para_modes = part.get("parallel_modes", [])
-            if (
-                num_cores
-                and part_num_cores
-                and not cls.is_num_cores_supported(num_cores, part_num_cores)
-            ):
+            part_num_cores = part.get("num_cores", ())
+            part_num_cores_per_node = part.get("num_cores_per_node", ())
+            part_num_nodes = part.get("num_nodes", ())
+            part_para_modes = part.get("parallel_modes", ())
+            if cls.__is_present_unsupported(num_cores, part_num_cores):
                 raise IncompatibleSLURMPartitionError(
-                    f"The SLURM partition {resources.SLURM_partition!r} is not "
-                    f"compatible with the number of cores requested: {num_cores!r}."
-                )
-            if (
-                num_cores_per_node
-                and part_num_cores_per_node
-                and not cls.is_num_cores_supported(
-                    num_cores_per_node, part_num_cores_per_node
+                    resources.SLURM_partition, "number of cores", num_cores
                 )
-            ):
+            if cls.__is_present_unsupported(num_cores_per_node, part_num_cores_per_node):
                 raise IncompatibleSLURMPartitionError(
-                    f"The SLURM partition {resources.SLURM_partition!r} is not "
-                    f"compatible with the number of cores per node requested: "
-                    f"{num_cores_per_node!r}."
+                    resources.SLURM_partition,
+                    "number of cores per node",
+                    num_cores_per_node,
                 )
-            if (
-                num_nodes
-                and part_num_nodes
-                and not cls.is_num_cores_supported(num_nodes, part_num_nodes)
-            ):
+            if cls.__is_present_unsupported(num_nodes, part_num_nodes):
                 raise IncompatibleSLURMPartitionError(
-                    f"The SLURM partition {resources.SLURM_partition!r} is not "
-                    f"compatible with the number of nodes requested: {num_nodes!r}."
+                    resources.SLURM_partition, "number of nodes", num_nodes
                 )
             if para_mode and para_mode.name.lower() not in part_para_modes:
                 raise IncompatibleSLURMPartitionError(
-                    f"The SLURM partition {resources.SLURM_partition!r} is not "
-                    f"compatible with the parallel mode requested: {para_mode!r}."
+                    resources.SLURM_partition, "parallel mode", para_mode
                 )
         else:
             # find the first compatible partition if one exists:
             # TODO: bug here? not finding correct partition?
-            part_match = False
             for part_name, part_info in all_parts.items():
-                part_num_cores = part_info.get("num_cores")
-                part_num_cores_per_node = part_info.get("num_cores_per_node")
-                part_num_nodes = part_info.get("num_nodes")
-                part_para_modes = part_info.get("parallel_modes", [])
-                if (
-                    num_cores
-                    and part_num_cores
-                    and cls.is_num_cores_supported(num_cores, part_num_cores)
-                ):
-                    part_match = True
-                else:
-                    part_match = False
-                    continue
-                if (
-                    num_cores_per_node
-                    and part_num_cores_per_node
-                    and cls.is_num_cores_supported(
-                        num_cores_per_node, part_num_cores_per_node
-                    )
+                if cls.__partition_matches(
+                    num_cores, num_cores_per_node, num_nodes, para_mode, part_info
                 ):
-                    part_match = True
-                else:
-                    part_match = False
-                    continue
-                if (
-                    num_nodes
-                    and part_num_nodes
-                    and cls.is_num_cores_supported(num_nodes, part_num_nodes)
-                ):
-                    part_match = True
-                else:
-                    part_match = False
-                    continue
-                if part_match:
-                    part_match = part_name
-                    break
-                if para_mode and para_mode.name.lower() not in part_para_modes:
-                    part_match = False
-                    continue
-                if part_match:
-                    part_match = part_name
+                    resources.SLURM_partition = str(part_name)
                     break
-            if part_match:
-                resources.SLURM_partition = part_match
 
-    def _format_core_request_lines(self, resources):
-        lns = []
-        if resources.SLURM_partition:
-            lns.append(f"{self.js_cmd} --partition {resources.SLURM_partition}")
+    @classmethod
+    def __is_present_unsupported(
+        cls, num_req: int | None, part_have: Sequence[int] | None
+    ) -> bool:
+        """
+        Test if information is present on both sides, but doesn't match.
+        """
+        return bool(
+            num_req and part_have and not cls.is_num_cores_supported(num_req, part_have)
+        )
 
-        if resources.SLURM_num_nodes:  # TODO: option for --exclusive ?
-            lns.append(f"{self.js_cmd} --nodes {resources.SLURM_num_nodes}")
+    @classmethod
+    def __is_present_supported(
+        cls, num_req: int | None, part_have: Sequence[int] | None
+    ) -> bool:
+        """
+        Test if information is present on both sides, and also matches.
+        """
+        return bool(
+            num_req and part_have and cls.is_num_cores_supported(num_req, part_have)
+        )
 
+    @classmethod
+    def __partition_matches(
+        cls,
+        num_cores: int | None,
+        num_cores_per_node: int | None,
+        num_nodes: int | None,
+        para_mode: ParallelMode | None,
+        part_info: SLURMPartitionsDescriptor,
+    ) -> bool:
+        """
+        Check whether a partition (part_name, part_info) matches the requested number
+        of cores and nodes.
+        """
+        part_num_cores = part_info.get("num_cores", [])
+        part_num_cores_per_node = part_info.get("num_cores_per_node", [])
+        part_num_nodes = part_info.get("num_nodes", [])
+        part_para_modes = part_info.get("parallel_modes", [])
+        if (
+            not cls.__is_present_supported(num_cores, part_num_cores)
+            or not cls.__is_present_supported(num_cores_per_node, part_num_cores_per_node)
+            or not cls.__is_present_supported(num_nodes, part_num_nodes)
+        ):
+            return False
+        # FIXME: Does the next check come above or below the check below?
+        # Surely not both!
+        part_match = True
+        if part_match:
+            return True
+        if para_mode and para_mode.name.lower() not in part_para_modes:
+            return False
+        if part_match:
+            return True
+        return False
+
+    def __format_core_request_lines(self, resources: ElementResources) -> Iterator[str]:
+        if resources.SLURM_partition:
+            yield f"{self.js_cmd} --partition {resources.SLURM_partition}"
+        if resources.SLURM_num_nodes:  # TODO: option for --exclusive ?
+            yield f"{self.js_cmd} --nodes {resources.SLURM_num_nodes}"
         if resources.SLURM_num_tasks:
-            lns.append(f"{self.js_cmd} --ntasks {resources.SLURM_num_tasks}")
-
+            yield f"{self.js_cmd} --ntasks {resources.SLURM_num_tasks}"
         if resources.SLURM_num_tasks_per_node:
-            lns.append(
-                f"{self.js_cmd} --ntasks-per-node {resources.SLURM_num_tasks_per_node}"
-            )
-
+            yield f"{self.js_cmd} --ntasks-per-node {resources.SLURM_num_tasks_per_node}"
         if resources.SLURM_num_cpus_per_task:
-            lns.append(
-                f"{self.js_cmd} --cpus-per-task {resources.SLURM_num_cpus_per_task}"
-            )
-
-        return lns
+            yield f"{self.js_cmd} --cpus-per-task {resources.SLURM_num_cpus_per_task}"
 
-    def _format_array_request(self, num_elements, resources):
+    def __format_array_request(self, num_elements: int, resources: ElementResources):
        # TODO: Slurm docs start indices at zero, why are we starting at one?
        # https://slurm.schedmd.com/sbatch.html#OPT_array
        max_str = f"%{resources.max_array_items}" if resources.max_array_items else ""
        return f"{self.js_cmd} {self.array_switch} 1-{num_elements}{max_str}"
 
-    def _format_std_stream_file_option_lines(self, is_array, sub_idx):
-        base = r"%x_"
-        if is_array:
-            base += r"%A.%a"
-        else:
-            base += r"%j"
-
-        base = f"./artifacts/submissions/{sub_idx}/{base}"
-        return [
-            f"{self.js_cmd} -o {base}.out",
-            f"{self.js_cmd} -e {base}.err",
-        ]
-
-    def format_options(self, resources, num_elements, is_array, sub_idx):
+    def __format_std_stream_file_option_lines(
+        self, is_array: bool, sub_idx: int
+    ) -> Iterator[str]:
+        pattern = R"%x_%A.%a" if is_array else R"%x_%j"
+        base = f"./artifacts/submissions/{sub_idx}/{pattern}"
+        yield f"{self.js_cmd} -o {base}.out"
+        yield f"{self.js_cmd} -e {base}.err"
+
+    @override
+    def format_options(
+        self, resources: ElementResources, num_elements: int, is_array: bool, sub_idx: int
+    ) -> str:
         """
         Format the options to the scheduler.
         """
-        opts = []
-        opts.extend(self._format_core_request_lines(resources))
+        opts: list[str] = []
+        opts.extend(self.__format_core_request_lines(resources))
+
         if is_array:
-            opts.append(self._format_array_request(num_elements, resources))
+            opts.append(self.__format_array_request(num_elements, resources))
 
-        opts.extend(self._format_std_stream_file_option_lines(is_array, sub_idx))
+        opts.extend(self.__format_std_stream_file_option_lines(is_array, sub_idx))
 
         for opt_k, opt_v in self.options.items():
             if isinstance(opt_v, list):
@@ -389,8 +378,9 @@ class SlurmPosix(Scheduler):
 
         return "\n".join(opts) + "\n"
 
+    @override
     @TimeIt.decorator
-    def get_version_info(self):
+    def get_version_info(self) -> VersionInfo:
         vers_cmd = [self.submit_cmd, "--version"]
         proc = subprocess.run(
             args=vers_cmd,
@@ -402,18 +392,18 @@ class SlurmPosix(Scheduler):
         if stderr:
             print(stderr)
         name, version = stdout.split()
-        out = {
+        return {
             "scheduler_name": name,
             "scheduler_version": version,
         }
-        return out
 
+    @override
     def get_submit_command(
         self,
         shell: Shell,
         js_path: str,
-        deps: List[Tuple],
-    ) -> List[str]:
+        deps: dict[Any, tuple[Any, ...]],
+    ) -> list[str]:
         """
         Get the command to use to submit a job to the scheduler.
 
@@ -422,94 +412,96 @@ class SlurmPosix(Scheduler):
         List of argument words.
         """
         cmd = [self.submit_cmd, "--parsable"]
+        if deps:
+            cmd.append("--dependency")
+            cmd.append(",".join(self.__dependency_args(deps)))
+        cmd.append(js_path)
+        return cmd
 
-        dep_cmd = []
+    @staticmethod
+    def __dependency_args(deps: dict[Any, tuple[Any, ...]]) -> Iterator[str]:
         for job_ID, is_array_dep in deps.values():
-            dep_i_str = ""
             if is_array_dep:  # array dependency
-                dep_i_str += "aftercorr:"
+                yield f"aftercorr:{job_ID}"
             else:
-                dep_i_str += "afterany:"
-            dep_i_str += str(job_ID)
-            dep_cmd.append(dep_i_str)
-
-        if dep_cmd:
-            cmd.append(f"--dependency")
-            cmd.append(",".join(dep_cmd))
-
-        cmd.append(js_path)
-
-        return cmd
+                yield f"afterany:{job_ID}"
 
     def parse_submission_output(self, stdout: str) -> str:
         """Extract scheduler reference for a newly submitted jobscript"""
         if ";" in stdout:
-            job_ID, _ = stdout.split(";")  # since we submit with "--parsable"
-        else:
-            job_ID = stdout
-        return job_ID
+            return stdout.split(";")[0]  # since we submit with "--parsable"
+        # Try using the whole thing
+        return stdout
 
     @staticmethod
-    def _parse_job_IDs(job_ID_str: str):
-        """Parse the job ID column from the `squeue` command (the `%i` format option)."""
-        parts = job_ID_str.split("_")
-        base_job_ID, arr_idx = parts if len(parts) == 2 else (parts[0], None)
-        if arr_idx is not None:
-            try:
-                arr_idx = [int(arr_idx) - 1]  # zero-index
-            except ValueError:
-                # split on commas (e.g. "[5,8-40]")
-                _arr_idx = []
-                for i_range_str in arr_idx.strip("[]").split(","):
-                    if "-" in i_range_str:
-                        range_parts = i_range_str.split("-")
-                        if "%" in range_parts[1]:
-                            # indicates max concurrent array items; not needed
-                            range_parts[1] = range_parts[1].split("%")[0]
-                        i_args = [int(j) - 1 for j in range_parts]
-                        _arr_idx.extend(list(range(i_args[0], i_args[1] + 1)))
-                    else:
-                        _arr_idx.append(int(i_range_str) - 1)
-                arr_idx = _arr_idx
-        return base_job_ID, arr_idx
-
-    def _parse_job_states(self, stdout) -> Dict[str, Dict[int, JobscriptElementState]]:
+    def _parse_job_IDs(job_ID_str: str) -> tuple[str, None | list[int]]:
+        """
+        Parse the job ID column from the `squeue` command (the `%i` format option).
+
+        Returns
+        -------
+        job_id
+            The job identifier.
+        array_indices
+            The indices into the job array.
+        """
+        base_job_ID, *arr_idx_data = job_ID_str.split("_")
+        if not arr_idx_data:
+            return base_job_ID, None
+        arr_idx = arr_idx_data[0]
+        try:
+            return base_job_ID, [int(arr_idx) - 1]  # zero-index
+        except ValueError:
+            pass
+        # split on commas (e.g. "[5,8-40]")
+        _arr_idx: list[int] = []
+        for i_range_str in arr_idx.strip("[]").split(","):
+            if "-" in i_range_str:
+                _from, _to = i_range_str.split("-")
+                if "%" in _to:
+                    # indicates max concurrent array items; not needed
+                    _to = _to.split("%")[0]
+                _arr_idx.extend(range(int(_from) - 1, int(_to)))
+            else:
+                _arr_idx.append(int(i_range_str) - 1)
+        return base_job_ID, _arr_idx
+
+    def __parse_job_states(
+        self, stdout: str
+    ) -> dict[str, dict[int | None, JobscriptElementState]]:
         """Parse output from Slurm `squeue` command with a simple format."""
-        info = {}
+        info: dict[str, dict[int | None, JobscriptElementState]] = {}
         for ln in stdout.split("\n"):
             if not ln:
                 continue
-            ln_s = [i.strip() for i in ln.split()]
-            base_job_ID, arr_idx = self._parse_job_IDs(ln_s[0])
-            state = self.state_lookup.get(ln_s[1], None)
+            job_id, job_state, *_ = ln.split()
+            base_job_ID, arr_idx = self._parse_job_IDs(job_id)
+            state = self.state_lookup.get(job_state, JobscriptElementState.errored)
 
-            if base_job_ID not in info:
-                info[base_job_ID] = {}
-
-            for arr_idx_i in arr_idx or [None]:
-                info[base_job_ID][arr_idx_i] = state
+            entry = info.setdefault(base_job_ID, {})
+            for arr_idx_i in arr_idx or ():
+                entry[arr_idx_i] = state
 
         return info
 
-    def _query_job_states(self, job_IDs):
+    def __query_job_states(self, job_IDs: Iterable[str]) -> tuple[str, str]:
         """Query the state of the specified jobs."""
         cmd = [
-            "squeue",
-            "--me",
+            *self.show_cmd,
             "--noheader",
             "--format",
-            r"%40i %30T",
+            R"%40i %30T",
             "--jobs",
             ",".join(job_IDs),
         ]
-        return run_cmd(cmd, logger=self.app.submission_logger)
+        return run_cmd(cmd, logger=self._app.submission_logger)
 
-    def _get_job_valid_IDs(self, job_IDs=None):
+    def __get_job_valid_IDs(self, job_IDs: Collection[str] | None = None) -> set[str]:
         """Get a list of job IDs that are known by the scheduler, optionally filtered by
         specified job IDs."""
 
-        cmd = ["squeue", "--me", "--noheader", "--format", r"%F"]
-        stdout, stderr = run_cmd(cmd, logger=self.app.submission_logger)
+        cmd = [*self.show_cmd, "--noheader", "--format", r"%F"]
+        stdout, stderr = run_cmd(cmd, logger=self._app.submission_logger)
         if stderr:
             raise ValueError(
                 f"Could not get query Slurm jobs. Command was: {cmd!r}; stderr was: "
@@ -517,64 +509,67 @@ class SlurmPosix(Scheduler):
             )
         else:
             known_jobs = set(i.strip() for i in stdout.split("\n") if i.strip())
-            job_IDs = known_jobs.intersection(job_IDs or [])
-
-        return job_IDs
+        if job_IDs is None:
+            return known_jobs
+        return known_jobs.intersection(job_IDs)
 
+    @override
     def get_job_state_info(
-        self, js_refs: List[str] = None
-    ) -> Dict[str, Dict[int, JobscriptElementState]]:
+        self, *, js_refs: Sequence[str] | None = None, num_js_elements: int = 0
+    ) -> Mapping[str, Mapping[int | None, JobscriptElementState]]:
         """Query the scheduler to get the states of all of this user's jobs, optionally
         filtering by specified job IDs.
 
         Jobs that are not in the scheduler's status output will not appear in the output
         of this method.
-
         """
 
         # if job_IDs are passed, then assume they are existant, otherwise retrieve valid
         # jobs:
-        if not js_refs:
-            js_refs = self._get_job_valid_IDs()
-            if not js_refs:
-                return {}
+        refs: Collection[str] = js_refs or self.__get_job_valid_IDs()
 
-        stdout, stderr = self._query_job_states(js_refs)
         count = 0
-        while stderr:
-            if "Invalid job id specified" in stderr and count < 5:
-                # the job might have finished; this only seems to happen if a single
-                # non-existant job ID is specified; for multiple non-existant jobs, no
-                # error is produced;
-                self.app.submission_logger.info(
-                    f"A specified job ID is non-existant; refreshing known job IDs..."
-                )
-                time.sleep(0.5)
-                js_refs = self._get_job_valid_IDs(js_refs)
-                if not js_refs:
-                    return {}
-                stdout, stderr = self._query_job_states(js_refs)
-                count += 1
-            else:
+        while refs:
+            stdout, stderr = self.__query_job_states(refs)
+            if not stderr:
+                return self.__parse_job_states(stdout)
+            if (
+                "Invalid job id specified" not in stderr
+                or count >= self.NUM_STATE_QUERY_TRIES
+            ):
                 raise ValueError(f"Could not get Slurm job states. Stderr was: {stderr}")
 
-        info = self._parse_job_states(stdout)
-        return info
+            # the job might have finished; this only seems to happen if a single
+            # non-existant job ID is specified; for multiple non-existant jobs, no
+            # error is produced;
+            self._app.submission_logger.info(
+                "A specified job ID is non-existant; refreshing known job IDs..."
+            )
+            time.sleep(self.INTER_STATE_QUERY_DELAY)
+            refs = self.__get_job_valid_IDs(refs)
+            count += 1
+        return {}
 
-    def cancel_jobs(self, js_refs: List[str], jobscripts: List = None):
+    @override
+    def cancel_jobs(
+        self,
+        js_refs: list[str],
+        jobscripts: list[Jobscript] | None = None,
+        num_js_elements: int = 0,  # Ignored!
+    ):
         """
         Cancel submitted jobs.
         """
-        cmd = [self.del_cmd] + js_refs
-        self.app.submission_logger.info(
+        cmd = [self.del_cmd, *js_refs]
+        self._app.submission_logger.info(
             f"cancelling {self.__class__.__name__} jobscripts with command: {cmd}."
         )
-        stdout, stderr = run_cmd(cmd, logger=self.app.submission_logger)
+        stdout, stderr = run_cmd(cmd, logger=self._app.submission_logger)
         if stderr:
             raise ValueError(
                 f"Could not get query {self.__class__.__name__} jobs. Command was: "
                 f"{cmd!r}; stderr was: {stderr}"
            )
-        self.app.submission_logger.info(
+        self._app.submission_logger.info(
            f"jobscripts cancel command executed; stdout was: {stdout}."
        )
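
Note: the refactored dependency handling in get_submit_command above can be illustrated with a small standalone sketch. The shape of the deps mapping and the "aftercorr"/"afterany" prefixes are taken from the diff; the job IDs and jobscript path below are invented for illustration only.

from collections.abc import Iterator
from typing import Any


def dependency_args(deps: dict[Any, tuple[Any, ...]]) -> Iterator[str]:
    # Mirrors the new SlurmPosix.__dependency_args: an array dependency uses
    # "aftercorr" (element N waits on element N of the parent array job);
    # otherwise "afterany" waits on completion of the whole job.
    for job_ID, is_array_dep in deps.values():
        yield f"aftercorr:{job_ID}" if is_array_dep else f"afterany:{job_ID}"


deps = {0: ("1234", True), 1: ("1235", False)}  # hypothetical job IDs
cmd = ["sbatch", "--parsable"]
if deps:
    cmd += ["--dependency", ",".join(dependency_args(deps))]
cmd.append("js_0.sh")  # hypothetical jobscript path
# cmd == ['sbatch', '--parsable', '--dependency', 'aftercorr:1234,afterany:1235', 'js_0.sh']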