hpcflow-new2 0.2.0a189__py3-none-any.whl → 0.2.0a199__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. hpcflow/__pyinstaller/hook-hpcflow.py +9 -6
  2. hpcflow/_version.py +1 -1
  3. hpcflow/app.py +1 -0
  4. hpcflow/data/scripts/bad_script.py +2 -0
  5. hpcflow/data/scripts/do_nothing.py +2 -0
  6. hpcflow/data/scripts/env_specifier_test/input_file_generator_pass_env_spec.py +4 -0
  7. hpcflow/data/scripts/env_specifier_test/main_script_test_pass_env_spec.py +8 -0
  8. hpcflow/data/scripts/env_specifier_test/output_file_parser_pass_env_spec.py +4 -0
  9. hpcflow/data/scripts/env_specifier_test/v1/input_file_generator_basic.py +4 -0
  10. hpcflow/data/scripts/env_specifier_test/v1/main_script_test_direct_in_direct_out.py +7 -0
  11. hpcflow/data/scripts/env_specifier_test/v1/output_file_parser_basic.py +4 -0
  12. hpcflow/data/scripts/env_specifier_test/v2/main_script_test_direct_in_direct_out.py +7 -0
  13. hpcflow/data/scripts/input_file_generator_basic.py +3 -0
  14. hpcflow/data/scripts/input_file_generator_basic_FAIL.py +3 -0
  15. hpcflow/data/scripts/input_file_generator_test_stdout_stderr.py +8 -0
  16. hpcflow/data/scripts/main_script_test_direct_in.py +3 -0
  17. hpcflow/data/scripts/main_script_test_direct_in_direct_out_2.py +6 -0
  18. hpcflow/data/scripts/main_script_test_direct_in_direct_out_2_fail_allowed.py +6 -0
  19. hpcflow/data/scripts/main_script_test_direct_in_direct_out_2_fail_allowed_group.py +7 -0
  20. hpcflow/data/scripts/main_script_test_direct_in_direct_out_3.py +6 -0
  21. hpcflow/data/scripts/main_script_test_direct_in_group_direct_out_3.py +6 -0
  22. hpcflow/data/scripts/main_script_test_direct_in_group_one_fail_direct_out_3.py +6 -0
  23. hpcflow/data/scripts/main_script_test_hdf5_in_obj.py +1 -1
  24. hpcflow/data/scripts/main_script_test_hdf5_in_obj_2.py +12 -0
  25. hpcflow/data/scripts/main_script_test_hdf5_out_obj.py +1 -1
  26. hpcflow/data/scripts/main_script_test_json_out_FAIL.py +3 -0
  27. hpcflow/data/scripts/main_script_test_shell_env_vars.py +12 -0
  28. hpcflow/data/scripts/main_script_test_std_out_std_err.py +6 -0
  29. hpcflow/data/scripts/output_file_parser_basic.py +3 -0
  30. hpcflow/data/scripts/output_file_parser_basic_FAIL.py +7 -0
  31. hpcflow/data/scripts/output_file_parser_test_stdout_stderr.py +8 -0
  32. hpcflow/data/scripts/script_exit_test.py +5 -0
  33. hpcflow/data/template_components/environments.yaml +1 -1
  34. hpcflow/sdk/__init__.py +26 -15
  35. hpcflow/sdk/app.py +2192 -768
  36. hpcflow/sdk/cli.py +506 -296
  37. hpcflow/sdk/cli_common.py +105 -7
  38. hpcflow/sdk/config/__init__.py +1 -1
  39. hpcflow/sdk/config/callbacks.py +115 -43
  40. hpcflow/sdk/config/cli.py +126 -103
  41. hpcflow/sdk/config/config.py +674 -318
  42. hpcflow/sdk/config/config_file.py +131 -95
  43. hpcflow/sdk/config/errors.py +125 -84
  44. hpcflow/sdk/config/types.py +148 -0
  45. hpcflow/sdk/core/__init__.py +25 -1
  46. hpcflow/sdk/core/actions.py +1771 -1059
  47. hpcflow/sdk/core/app_aware.py +24 -0
  48. hpcflow/sdk/core/cache.py +139 -79
  49. hpcflow/sdk/core/command_files.py +263 -287
  50. hpcflow/sdk/core/commands.py +145 -112
  51. hpcflow/sdk/core/element.py +828 -535
  52. hpcflow/sdk/core/enums.py +192 -0
  53. hpcflow/sdk/core/environment.py +74 -93
  54. hpcflow/sdk/core/errors.py +455 -52
  55. hpcflow/sdk/core/execute.py +207 -0
  56. hpcflow/sdk/core/json_like.py +540 -272
  57. hpcflow/sdk/core/loop.py +751 -347
  58. hpcflow/sdk/core/loop_cache.py +164 -47
  59. hpcflow/sdk/core/object_list.py +370 -207
  60. hpcflow/sdk/core/parameters.py +1100 -627
  61. hpcflow/sdk/core/rule.py +59 -41
  62. hpcflow/sdk/core/run_dir_files.py +21 -37
  63. hpcflow/sdk/core/skip_reason.py +7 -0
  64. hpcflow/sdk/core/task.py +1649 -1339
  65. hpcflow/sdk/core/task_schema.py +308 -196
  66. hpcflow/sdk/core/test_utils.py +191 -114
  67. hpcflow/sdk/core/types.py +440 -0
  68. hpcflow/sdk/core/utils.py +485 -309
  69. hpcflow/sdk/core/validation.py +82 -9
  70. hpcflow/sdk/core/workflow.py +2544 -1178
  71. hpcflow/sdk/core/zarr_io.py +98 -137
  72. hpcflow/sdk/data/workflow_spec_schema.yaml +2 -0
  73. hpcflow/sdk/demo/cli.py +53 -33
  74. hpcflow/sdk/helper/cli.py +18 -15
  75. hpcflow/sdk/helper/helper.py +75 -63
  76. hpcflow/sdk/helper/watcher.py +61 -28
  77. hpcflow/sdk/log.py +122 -71
  78. hpcflow/sdk/persistence/__init__.py +8 -31
  79. hpcflow/sdk/persistence/base.py +1360 -606
  80. hpcflow/sdk/persistence/defaults.py +6 -0
  81. hpcflow/sdk/persistence/discovery.py +38 -0
  82. hpcflow/sdk/persistence/json.py +568 -188
  83. hpcflow/sdk/persistence/pending.py +382 -179
  84. hpcflow/sdk/persistence/store_resource.py +39 -23
  85. hpcflow/sdk/persistence/types.py +318 -0
  86. hpcflow/sdk/persistence/utils.py +14 -11
  87. hpcflow/sdk/persistence/zarr.py +1337 -433
  88. hpcflow/sdk/runtime.py +44 -41
  89. hpcflow/sdk/submission/{jobscript_info.py → enums.py} +39 -12
  90. hpcflow/sdk/submission/jobscript.py +1651 -692
  91. hpcflow/sdk/submission/schedulers/__init__.py +167 -39
  92. hpcflow/sdk/submission/schedulers/direct.py +121 -81
  93. hpcflow/sdk/submission/schedulers/sge.py +170 -129
  94. hpcflow/sdk/submission/schedulers/slurm.py +291 -268
  95. hpcflow/sdk/submission/schedulers/utils.py +12 -2
  96. hpcflow/sdk/submission/shells/__init__.py +14 -15
  97. hpcflow/sdk/submission/shells/base.py +150 -29
  98. hpcflow/sdk/submission/shells/bash.py +283 -173
  99. hpcflow/sdk/submission/shells/os_version.py +31 -30
  100. hpcflow/sdk/submission/shells/powershell.py +228 -170
  101. hpcflow/sdk/submission/submission.py +1014 -335
  102. hpcflow/sdk/submission/types.py +140 -0
  103. hpcflow/sdk/typing.py +182 -12
  104. hpcflow/sdk/utils/arrays.py +71 -0
  105. hpcflow/sdk/utils/deferred_file.py +55 -0
  106. hpcflow/sdk/utils/hashing.py +16 -0
  107. hpcflow/sdk/utils/patches.py +12 -0
  108. hpcflow/sdk/utils/strings.py +33 -0
  109. hpcflow/tests/api/test_api.py +32 -0
  110. hpcflow/tests/conftest.py +27 -6
  111. hpcflow/tests/data/multi_path_sequences.yaml +29 -0
  112. hpcflow/tests/data/workflow_test_run_abort.yaml +34 -35
  113. hpcflow/tests/schedulers/sge/test_sge_submission.py +36 -0
  114. hpcflow/tests/schedulers/slurm/test_slurm_submission.py +5 -2
  115. hpcflow/tests/scripts/test_input_file_generators.py +282 -0
  116. hpcflow/tests/scripts/test_main_scripts.py +866 -85
  117. hpcflow/tests/scripts/test_non_snippet_script.py +46 -0
  118. hpcflow/tests/scripts/test_ouput_file_parsers.py +353 -0
  119. hpcflow/tests/shells/wsl/test_wsl_submission.py +12 -4
  120. hpcflow/tests/unit/test_action.py +262 -75
  121. hpcflow/tests/unit/test_action_rule.py +9 -4
  122. hpcflow/tests/unit/test_app.py +33 -6
  123. hpcflow/tests/unit/test_cache.py +46 -0
  124. hpcflow/tests/unit/test_cli.py +134 -1
  125. hpcflow/tests/unit/test_command.py +71 -54
  126. hpcflow/tests/unit/test_config.py +142 -16
  127. hpcflow/tests/unit/test_config_file.py +21 -18
  128. hpcflow/tests/unit/test_element.py +58 -62
  129. hpcflow/tests/unit/test_element_iteration.py +50 -1
  130. hpcflow/tests/unit/test_element_set.py +29 -19
  131. hpcflow/tests/unit/test_group.py +4 -2
  132. hpcflow/tests/unit/test_input_source.py +116 -93
  133. hpcflow/tests/unit/test_input_value.py +29 -24
  134. hpcflow/tests/unit/test_jobscript_unit.py +757 -0
  135. hpcflow/tests/unit/test_json_like.py +44 -35
  136. hpcflow/tests/unit/test_loop.py +1396 -84
  137. hpcflow/tests/unit/test_meta_task.py +325 -0
  138. hpcflow/tests/unit/test_multi_path_sequences.py +229 -0
  139. hpcflow/tests/unit/test_object_list.py +17 -12
  140. hpcflow/tests/unit/test_parameter.py +29 -7
  141. hpcflow/tests/unit/test_persistence.py +237 -42
  142. hpcflow/tests/unit/test_resources.py +20 -18
  143. hpcflow/tests/unit/test_run.py +117 -6
  144. hpcflow/tests/unit/test_run_directories.py +29 -0
  145. hpcflow/tests/unit/test_runtime.py +2 -1
  146. hpcflow/tests/unit/test_schema_input.py +23 -15
  147. hpcflow/tests/unit/test_shell.py +23 -2
  148. hpcflow/tests/unit/test_slurm.py +8 -7
  149. hpcflow/tests/unit/test_submission.py +38 -89
  150. hpcflow/tests/unit/test_task.py +352 -247
  151. hpcflow/tests/unit/test_task_schema.py +33 -20
  152. hpcflow/tests/unit/test_utils.py +9 -11
  153. hpcflow/tests/unit/test_value_sequence.py +15 -12
  154. hpcflow/tests/unit/test_workflow.py +114 -83
  155. hpcflow/tests/unit/test_workflow_template.py +0 -1
  156. hpcflow/tests/unit/utils/test_arrays.py +40 -0
  157. hpcflow/tests/unit/utils/test_deferred_file_writer.py +34 -0
  158. hpcflow/tests/unit/utils/test_hashing.py +65 -0
  159. hpcflow/tests/unit/utils/test_patches.py +5 -0
  160. hpcflow/tests/unit/utils/test_redirect_std.py +50 -0
  161. hpcflow/tests/workflows/__init__.py +0 -0
  162. hpcflow/tests/workflows/test_directory_structure.py +31 -0
  163. hpcflow/tests/workflows/test_jobscript.py +334 -1
  164. hpcflow/tests/workflows/test_run_status.py +198 -0
  165. hpcflow/tests/workflows/test_skip_downstream.py +696 -0
  166. hpcflow/tests/workflows/test_submission.py +140 -0
  167. hpcflow/tests/workflows/test_workflows.py +160 -15
  168. hpcflow/tests/workflows/test_zip.py +18 -0
  169. hpcflow/viz_demo.ipynb +6587 -3
  170. {hpcflow_new2-0.2.0a189.dist-info → hpcflow_new2-0.2.0a199.dist-info}/METADATA +8 -4
  171. hpcflow_new2-0.2.0a199.dist-info/RECORD +221 -0
  172. hpcflow/sdk/core/parallel.py +0 -21
  173. hpcflow_new2-0.2.0a189.dist-info/RECORD +0 -158
  174. {hpcflow_new2-0.2.0a189.dist-info → hpcflow_new2-0.2.0a199.dist-info}/LICENSE +0 -0
  175. {hpcflow_new2-0.2.0a189.dist-info → hpcflow_new2-0.2.0a199.dist-info}/WHEEL +0 -0
  176. {hpcflow_new2-0.2.0a189.dist-info → hpcflow_new2-0.2.0a199.dist-info}/entry_points.txt +0 -0
hpcflow/sdk/submission/schedulers/slurm.py

@@ -2,25 +2,36 @@
 An interface to SLURM.
 """
 
-from pathlib import Path
+from __future__ import annotations
 import subprocess
 import time
-from typing import Dict, List, Tuple
+from typing import cast, TYPE_CHECKING
+from typing_extensions import override
+from hpcflow.sdk.typing import hydrate
+from hpcflow.sdk.core.enums import ParallelMode
 from hpcflow.sdk.core.errors import (
     IncompatibleParallelModeError,
     IncompatibleSLURMArgumentsError,
     IncompatibleSLURMPartitionError,
     UnknownSLURMPartitionError,
 )
-from hpcflow.sdk.core.parameters import ParallelMode
 from hpcflow.sdk.log import TimeIt
-from hpcflow.sdk.submission.jobscript_info import JobscriptElementState
-from hpcflow.sdk.submission.schedulers import Scheduler
+from hpcflow.sdk.submission.enums import JobscriptElementState
+from hpcflow.sdk.submission.schedulers import QueuedScheduler
 from hpcflow.sdk.submission.schedulers.utils import run_cmd
-from hpcflow.sdk.submission.shells.base import Shell
 
+if TYPE_CHECKING:
+    from collections.abc import Collection, Iterable, Iterator, Mapping, Sequence
+    from typing import Any, ClassVar
+    from ...config.types import SchedulerConfigDescriptor, SLURMPartitionsDescriptor
+    from ...core.element import ElementResources
+    from ..jobscript import Jobscript
+    from ..types import VersionInfo
+    from ..shells.base import Shell
 
-class SlurmPosix(Scheduler):
+
+@hydrate
+class SlurmPosix(QueuedScheduler):
     """
     A scheduler that uses SLURM.
 
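The rewritten import block defers typing-only imports behind TYPE_CHECKING, so that annotations (made lazy by `from __future__ import annotations`) can name heavy or circularly-dependent modules without importing them at runtime. A minimal standalone sketch of the pattern, using a hypothetical module name:

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only evaluated by static type checkers; never executed at runtime,
    # which avoids import cycles and import-time cost.
    from mypackage.heavy_module import HeavyResource  # hypothetical

def describe(resource: HeavyResource) -> str:
    # With PEP 563 semantics the annotation is a plain string at runtime,
    # so HeavyResource need not be importable here.
    return f"resource: {resource!r}"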
@@ -48,27 +59,29 @@ class SlurmPosix(Scheduler):
 
     """
 
-    _app_attr = "app"
-
     #: Default shell.
-    DEFAULT_SHELL_EXECUTABLE = "/bin/bash"
+    DEFAULT_SHELL_EXECUTABLE: ClassVar[str] = "/bin/bash"
     #: Default args for shebang line.
-    DEFAULT_SHEBANG_ARGS = ""
+    DEFAULT_SHEBANG_ARGS: ClassVar[str] = ""
     #: Default submission command.
-    DEFAULT_SUBMIT_CMD = "sbatch"
+    DEFAULT_SUBMIT_CMD: ClassVar[str] = "sbatch"
    #: Default command to show the queue state.
-    DEFAULT_SHOW_CMD = ["squeue", "--me"]
+    DEFAULT_SHOW_CMD: ClassVar[Sequence[str]] = ("squeue", "--me")
     #: Default cancel command.
-    DEFAULT_DEL_CMD = "scancel"
+    DEFAULT_DEL_CMD: ClassVar[str] = "scancel"
     #: Default job control directive prefix.
-    DEFAULT_JS_CMD = "#SBATCH"
+    DEFAULT_JS_CMD: ClassVar[str] = "#SBATCH"
     #: Default prefix to enable array processing.
-    DEFAULT_ARRAY_SWITCH = "--array"
+    DEFAULT_ARRAY_SWITCH: ClassVar[str] = "--array"
     #: Default shell variable with array ID.
-    DEFAULT_ARRAY_ITEM_VAR = "SLURM_ARRAY_TASK_ID"
+    DEFAULT_ARRAY_ITEM_VAR: ClassVar[str] = "SLURM_ARRAY_TASK_ID"
+    #: Number of times to try when querying the state.
+    NUM_STATE_QUERY_TRIES: ClassVar[int] = 5
+    #: Delay (in seconds) between attempts to query the state.
+    INTER_STATE_QUERY_DELAY: ClassVar[float] = 0.5
 
     #: Maps scheduler state codes to :py:class:`JobscriptElementState` values.
-    state_lookup = {
+    state_lookup: ClassVar[Mapping[str, JobscriptElementState]] = {
         "PENDING": JobscriptElementState.pending,
         "RUNNING": JobscriptElementState.running,
         "COMPLETING": JobscriptElementState.running,
@@ -79,16 +92,17 @@ class SlurmPosix(Scheduler):
         "TIMEOUT": JobscriptElementState.errored,
     }
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
     @classmethod
+    @override
     @TimeIt.decorator
-    def process_resources(cls, resources, scheduler_config: Dict) -> None:
+    def process_resources(
+        cls, resources: ElementResources, scheduler_config: SchedulerConfigDescriptor
+    ) -> None:
         """Perform scheduler-specific processing to the element resources.
 
-        Note: this mutates `resources`.
-
+        Note
+        ----
+        This mutates `resources`.
         """
         if resources.is_parallel:
             if resources.parallel_mode is None:
@@ -97,21 +111,17 @@ class SlurmPosix(Scheduler):
 
             if resources.parallel_mode is ParallelMode.SHARED:
                 if (resources.num_nodes and resources.num_nodes > 1) or (
-                    resources.SLURM_node_nodes and resources.SLURM_num_nodes > 1
+                    resources.SLURM_num_nodes and resources.SLURM_num_nodes > 1
                 ):
-                    raise IncompatibleParallelModeError(
-                        f"For the {resources.parallel_mode.name.lower()} parallel mode, "
-                        f"only a single node may be requested."
-                    )
+                    raise IncompatibleParallelModeError(resources.parallel_mode)
                 # consider `num_cores` and `num_threads` synonyms in this case:
-                if resources.SLURM_num_tasks and resources.SLURM_num_task != 1:
+                if resources.SLURM_num_tasks and resources.SLURM_num_tasks != 1:
                     raise IncompatibleSLURMArgumentsError(
                         f"For the {resources.parallel_mode.name.lower()} parallel mode, "
                         f"`SLURM_num_tasks` must be set to 1 (to ensure all requested "
                         f"cores reside on the same node)."
                     )
-                else:
-                    resources.SLURM_num_tasks = 1
+                resources.SLURM_num_tasks = 1
 
                 if resources.SLURM_num_cpus_per_task == 1:
                     raise IncompatibleSLURMArgumentsError(
@@ -120,28 +130,24 @@ class SlurmPosix(Scheduler):
                         f"number of threads/cores to use, and so must be greater than 1, "
                         f"but {resources.SLURM_num_cpus_per_task!r} was specified."
                     )
-                else:
-                    resources.num_threads = resources.num_threads or resources.num_cores
-                    if (
-                        not resources.num_threads
-                        and not resources.SLURM_num_cpus_per_task
-                    ):
-                        raise ValueError(
-                            f"For the {resources.parallel_mode.name.lower()} parallel "
-                            f"mode, specify `num_threads` (or its synonym for this "
-                            f"parallel mode: `num_cores`), or the SLURM-specific "
-                            f"parameter `SLURM_num_cpus_per_task`."
-                        )
-                    elif (
-                        resources.num_threads and resources.SLURM_num_cpus_per_task
-                    ) and (resources.num_threads != resources.SLURM_num_cpus_per_task):
-                        raise IncompatibleSLURMArgumentsError(
-                            f"Incompatible parameters for `num_cores`/`num_threads` "
-                            f"({resources.num_threads}) and `SLURM_num_cpus_per_task` "
-                            f"({resources.SLURM_num_cpus_per_task}) for the "
-                            f"{resources.parallel_mode.name.lower()} parallel mode."
-                        )
-                    resources.SLURM_num_cpus_per_task = resources.num_threads
+                resources.num_threads = resources.num_threads or resources.num_cores
+                if not resources.num_threads and not resources.SLURM_num_cpus_per_task:
+                    raise ValueError(
+                        f"For the {resources.parallel_mode.name.lower()} parallel "
+                        f"mode, specify `num_threads` (or its synonym for this "
+                        f"parallel mode: `num_cores`), or the SLURM-specific "
+                        f"parameter `SLURM_num_cpus_per_task`."
+                    )
+                elif (resources.num_threads and resources.SLURM_num_cpus_per_task) and (
+                    resources.num_threads != resources.SLURM_num_cpus_per_task
+                ):
+                    raise IncompatibleSLURMArgumentsError(
+                        f"Incompatible parameters for `num_cores`/`num_threads` "
+                        f"({resources.num_threads}) and `SLURM_num_cpus_per_task` "
+                        f"({resources.SLURM_num_cpus_per_task}) for the "
+                        f"{resources.parallel_mode.name.lower()} parallel mode."
+                    )
+                resources.SLURM_num_cpus_per_task = resources.num_threads
 
             elif resources.parallel_mode is ParallelMode.DISTRIBUTED:
                 if resources.num_threads:
@@ -197,9 +203,9 @@ class SlurmPosix(Scheduler):
         else:
             if resources.SLURM_is_parallel:
                 raise IncompatibleSLURMArgumentsError(
-                    f"Some specified SLURM-specific arguments (which indicate a parallel "
-                    f"job) conflict with the scheduler-agnostic arguments (which "
-                    f"indicate a serial job)."
+                    "Some specified SLURM-specific arguments (which indicate a parallel "
+                    "job) conflict with the scheduler-agnostic arguments (which "
+                    "indicate a serial job)."
                 )
             if not resources.SLURM_num_tasks:
                 resources.SLURM_num_tasks = 1
@@ -228,155 +234,162 @@ class SlurmPosix(Scheduler):
             try:
                 part = all_parts[resources.SLURM_partition]
             except KeyError:
-                raise UnknownSLURMPartitionError(
-                    f"The SLURM partition {resources.SLURM_partition!r} is not "
-                    f"specified in the configuration. Specified partitions are "
-                    f"{list(all_parts.keys())!r}."
-                )
+                raise UnknownSLURMPartitionError(resources.SLURM_partition, all_parts)
             # TODO: we when we support ParallelMode.HYBRID, these checks will have to
             # consider the total number of cores requested per node
             # (num_cores_per_node * num_threads)?
-            part_num_cores = part.get("num_cores")
-            part_num_cores_per_node = part.get("num_cores_per_node")
-            part_num_nodes = part.get("num_nodes")
-            part_para_modes = part.get("parallel_modes", [])
-            if (
-                num_cores
-                and part_num_cores
-                and not cls.is_num_cores_supported(num_cores, part_num_cores)
-            ):
+            part_num_cores = part.get("num_cores", ())
+            part_num_cores_per_node = part.get("num_cores_per_node", ())
+            part_num_nodes = part.get("num_nodes", ())
+            part_para_modes = part.get("parallel_modes", ())
+            if cls.__is_present_unsupported(num_cores, part_num_cores):
                 raise IncompatibleSLURMPartitionError(
-                    f"The SLURM partition {resources.SLURM_partition!r} is not "
-                    f"compatible with the number of cores requested: {num_cores!r}."
+                    resources.SLURM_partition, "number of cores", num_cores
                 )
-            if (
-                num_cores_per_node
-                and part_num_cores_per_node
-                and not cls.is_num_cores_supported(
-                    num_cores_per_node, part_num_cores_per_node
-                )
-            ):
+            if cls.__is_present_unsupported(num_cores_per_node, part_num_cores_per_node):
                 raise IncompatibleSLURMPartitionError(
-                    f"The SLURM partition {resources.SLURM_partition!r} is not "
-                    f"compatible with the number of cores per node requested: "
-                    f"{num_cores_per_node!r}."
+                    resources.SLURM_partition,
+                    "number of cores per node",
+                    num_cores_per_node,
                 )
-            if (
-                num_nodes
-                and part_num_nodes
-                and not cls.is_num_cores_supported(num_nodes, part_num_nodes)
-            ):
+            if cls.__is_present_unsupported(num_nodes, part_num_nodes):
                 raise IncompatibleSLURMPartitionError(
-                    f"The SLURM partition {resources.SLURM_partition!r} is not "
-                    f"compatible with the number of nodes requested: {num_nodes!r}."
+                    resources.SLURM_partition, "number of nodes", num_nodes
                 )
             if para_mode and para_mode.name.lower() not in part_para_modes:
                 raise IncompatibleSLURMPartitionError(
-                    f"The SLURM partition {resources.SLURM_partition!r} is not "
-                    f"compatible with the parallel mode requested: {para_mode!r}."
+                    resources.SLURM_partition, "parallel mode", para_mode
                 )
         else:
             # find the first compatible partition if one exists:
             # TODO: bug here? not finding correct partition?
-            part_match = False
             for part_name, part_info in all_parts.items():
-                part_num_cores = part_info.get("num_cores")
-                part_num_cores_per_node = part_info.get("num_cores_per_node")
-                part_num_nodes = part_info.get("num_nodes")
-                part_para_modes = part_info.get("parallel_modes", [])
-                if (
-                    num_cores
-                    and part_num_cores
-                    and cls.is_num_cores_supported(num_cores, part_num_cores)
-                ):
-                    part_match = True
-                else:
-                    part_match = False
-                    continue
-                if (
-                    num_cores_per_node
-                    and part_num_cores_per_node
-                    and cls.is_num_cores_supported(
-                        num_cores_per_node, part_num_cores_per_node
-                    )
-                ):
-                    part_match = True
-                else:
-                    part_match = False
-                    continue
-                if (
-                    num_nodes
-                    and part_num_nodes
-                    and cls.is_num_cores_supported(num_nodes, part_num_nodes)
+                if cls.__partition_matches(
+                    num_cores, num_cores_per_node, num_nodes, para_mode, part_info
                 ):
-                    part_match = True
-                else:
-                    part_match = False
-                    continue
-                if part_match:
-                    part_match = part_name
+                    resources.SLURM_partition = str(part_name)
                     break
-                if para_mode and para_mode.name.lower() not in part_para_modes:
-                    part_match = False
-                    continue
-                if part_match:
-                    part_match = part_name
-                    break
-            if part_match:
-                resources.SLURM_partition = part_match
 
-    def _format_core_request_lines(self, resources):
-        lns = []
-        if resources.SLURM_partition:
-            lns.append(f"{self.js_cmd} --partition {resources.SLURM_partition}")
+    @classmethod
+    def __is_present_unsupported(
+        cls, num_req: int | None, part_have: Sequence[int] | None
+    ) -> bool:
+        """
+        Test if information is present on both sides, but doesn't match.
+        """
+        return bool(
+            num_req and part_have and not cls.is_num_cores_supported(num_req, part_have)
+        )
 
-        if resources.SLURM_num_nodes:  # TODO: option for --exclusive ?
-            lns.append(f"{self.js_cmd} --nodes {resources.SLURM_num_nodes}")
+    @classmethod
+    def __is_present_supported(
+        cls, num_req: int | None, part_have: Sequence[int] | None
+    ) -> bool:
+        """
+        Test if information is present on both sides, and also matches.
+        """
+        return bool(
+            num_req and part_have and cls.is_num_cores_supported(num_req, part_have)
+        )
 
+    @classmethod
+    def __partition_matches(
+        cls,
+        num_cores: int | None,
+        num_cores_per_node: int | None,
+        num_nodes: int | None,
+        para_mode: ParallelMode | None,
+        part_info: SLURMPartitionsDescriptor,
+    ) -> bool:
+        """
+        Check whether a partition (part_name, part_info) matches the requested number
+        of cores and nodes.
+        """
+        part_num_cores = part_info.get("num_cores", [])
+        part_num_cores_per_node = part_info.get("num_cores_per_node", [])
+        part_num_nodes = part_info.get("num_nodes", [])
+        part_para_modes = part_info.get("parallel_modes", [])
+        if (
+            not cls.__is_present_supported(num_cores, part_num_cores)
+            or not cls.__is_present_supported(num_cores_per_node, part_num_cores_per_node)
+            or not cls.__is_present_supported(num_nodes, part_num_nodes)
+        ):
+            return False
+        # FIXME: Does the next check come above or below the check below?
+        # Surely not both!
+        part_match = True
+        if part_match:
+            return True
+        if para_mode and para_mode.name.lower() not in part_para_modes:
+            return False
+        if part_match:
+            return True
+        return False
+
+    def __format_core_request_lines(self, resources: ElementResources) -> Iterator[str]:
+        if resources.SLURM_partition:
+            yield f"{self.js_cmd} --partition {resources.SLURM_partition}"
+        if resources.SLURM_num_nodes:  # TODO: option for --exclusive ?
+            yield f"{self.js_cmd} --nodes {resources.SLURM_num_nodes}"
         if resources.SLURM_num_tasks:
-            lns.append(f"{self.js_cmd} --ntasks {resources.SLURM_num_tasks}")
-
+            yield f"{self.js_cmd} --ntasks {resources.SLURM_num_tasks}"
         if resources.SLURM_num_tasks_per_node:
-            lns.append(
-                f"{self.js_cmd} --ntasks-per-node {resources.SLURM_num_tasks_per_node}"
-            )
-
+            yield f"{self.js_cmd} --ntasks-per-node {resources.SLURM_num_tasks_per_node}"
         if resources.SLURM_num_cpus_per_task:
-            lns.append(
-                f"{self.js_cmd} --cpus-per-task {resources.SLURM_num_cpus_per_task}"
-            )
-
-        return lns
+            yield f"{self.js_cmd} --cpus-per-task {resources.SLURM_num_cpus_per_task}"
 
-    def _format_array_request(self, num_elements, resources):
+    def __format_array_request(self, num_elements: int, resources: ElementResources):
         # TODO: Slurm docs start indices at zero, why are we starting at one?
         # https://slurm.schedmd.com/sbatch.html#OPT_array
         max_str = f"%{resources.max_array_items}" if resources.max_array_items else ""
         return f"{self.js_cmd} {self.array_switch} 1-{num_elements}{max_str}"
 
-    def _format_std_stream_file_option_lines(self, is_array, sub_idx):
-        base = r"%x_"
-        if is_array:
-            base += r"%A.%a"
-        else:
-            base += r"%j"
-
-        base = f"./artifacts/submissions/{sub_idx}/{base}"
-        return [
-            f"{self.js_cmd} -o {base}.out",
-            f"{self.js_cmd} -e {base}.err",
-        ]
-
-    def format_options(self, resources, num_elements, is_array, sub_idx):
+    def get_stdout_filename(
+        self, js_idx: int, job_ID: str, array_idx: int | None = None
+    ) -> str:
+        """File name of the standard output stream file."""
+        array_idx_str = f".{array_idx}" if array_idx is not None else ""
+        return f"js_{js_idx}.sh_{job_ID}{array_idx_str}.out"
+
+    def get_stderr_filename(
+        self, js_idx: int, job_ID: str, array_idx: int | None = None
+    ) -> str:
+        """File name of the standard error stream file."""
+        array_idx_str = f".{array_idx}" if array_idx is not None else ""
+        return f"js_{js_idx}.sh_{job_ID}{array_idx_str}.err"
+
+    def __format_std_stream_file_option_lines(
+        self, is_array: bool, sub_idx: int, js_idx: int, combine_std: bool
+    ) -> Iterator[str]:
+        pattern = R"%x_%A.%a" if is_array else R"%x_%j"
+        base = f"./artifacts/submissions/{sub_idx}/js_std/{js_idx}/{pattern}"
+        yield f"{self.js_cmd} --output {base}.out"
+        if not combine_std:
+            yield f"{self.js_cmd} --error {base}.err"
+
+    @override
+    def format_options(
+        self,
+        resources: ElementResources,
+        num_elements: int,
+        is_array: bool,
+        sub_idx: int,
+        js_idx: int,
+    ) -> str:
         """
         Format the options to the scheduler.
         """
-        opts = []
-        opts.extend(self._format_core_request_lines(resources))
+        opts: list[str] = []
+        opts.extend(self.__format_core_request_lines(resources))
+
         if is_array:
-            opts.append(self._format_array_request(num_elements, resources))
+            opts.append(self.__format_array_request(num_elements, resources))
 
-        opts.extend(self._format_std_stream_file_option_lines(is_array, sub_idx))
+        opts.extend(
+            self.__format_std_stream_file_option_lines(
+                is_array, sub_idx, js_idx, resources.combine_jobscript_std
+            )
+        )
 
         for opt_k, opt_v in self.options.items():
             if isinstance(opt_v, list):
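The jobscript-option formatting now builds #SBATCH directive lines with generators rather than accumulating lists. A standalone sketch of the same approach, with simplified keyword parameters standing in for the real ElementResources object:

from __future__ import annotations

from collections.abc import Iterator

def format_core_request_lines(
    js_cmd: str = "#SBATCH",
    partition: str | None = None,
    num_nodes: int | None = None,
    num_tasks: int | None = None,
) -> Iterator[str]:
    # Yield one directive per requested resource; the caller joins them.
    if partition:
        yield f"{js_cmd} --partition {partition}"
    if num_nodes:
        yield f"{js_cmd} --nodes {num_nodes}"
    if num_tasks:
        yield f"{js_cmd} --ntasks {num_tasks}"

# list(format_core_request_lines(partition="compute", num_tasks=4))
# -> ['#SBATCH --partition compute', '#SBATCH --ntasks 4']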
@@ -389,8 +402,9 @@ class SlurmPosix(Scheduler):
 
         return "\n".join(opts) + "\n"
 
+    @override
     @TimeIt.decorator
-    def get_version_info(self):
+    def get_version_info(self) -> VersionInfo:
         vers_cmd = [self.submit_cmd, "--version"]
         proc = subprocess.run(
             args=vers_cmd,
@@ -402,18 +416,18 @@ class SlurmPosix(Scheduler):
         if stderr:
             print(stderr)
         name, version = stdout.split()
-        out = {
+        return {
             "scheduler_name": name,
             "scheduler_version": version,
         }
-        return out
 
+    @override
     def get_submit_command(
         self,
         shell: Shell,
         js_path: str,
-        deps: List[Tuple],
-    ) -> List[str]:
+        deps: dict[Any, tuple[Any, ...]],
+    ) -> list[str]:
         """
         Get the command to use to submit a job to the scheduler.
 
@@ -422,94 +436,101 @@ class SlurmPosix(Scheduler):
         List of argument words.
         """
         cmd = [self.submit_cmd, "--parsable"]
+        if deps:
+            cmd.append("--dependency")
+            cmd.append(",".join(self.__dependency_args(deps)))
+        cmd.append(js_path)
+        return cmd
 
-        dep_cmd = []
+    @staticmethod
+    def __dependency_args(deps: dict[Any, tuple[Any, ...]]) -> Iterator[str]:
         for job_ID, is_array_dep in deps.values():
-            dep_i_str = ""
             if is_array_dep:  # array dependency
-                dep_i_str += "aftercorr:"
+                yield f"aftercorr:{job_ID}"
             else:
-                dep_i_str += "afterany:"
-            dep_i_str += str(job_ID)
-            dep_cmd.append(dep_i_str)
-
-        if dep_cmd:
-            cmd.append(f"--dependency")
-            cmd.append(",".join(dep_cmd))
-
-        cmd.append(js_path)
-
-        return cmd
+                yield f"afterany:{job_ID}"
 
     def parse_submission_output(self, stdout: str) -> str:
         """Extract scheduler reference for a newly submitted jobscript"""
         if ";" in stdout:
-            job_ID, _ = stdout.split(";")  # since we submit with "--parsable"
-        else:
-            job_ID = stdout
-        return job_ID
+            return stdout.split(";")[0]  # since we submit with "--parsable"
+        # Try using the whole thing
+        return stdout
 
     @staticmethod
-    def _parse_job_IDs(job_ID_str: str):
-        """Parse the job ID column from the `squeue` command (the `%i` format option)."""
-        parts = job_ID_str.split("_")
-        base_job_ID, arr_idx = parts if len(parts) == 2 else (parts[0], None)
-        if arr_idx is not None:
-            try:
-                arr_idx = [int(arr_idx) - 1]  # zero-index
-            except ValueError:
-                # split on commas (e.g. "[5,8-40]")
-                _arr_idx = []
-                for i_range_str in arr_idx.strip("[]").split(","):
-                    if "-" in i_range_str:
-                        range_parts = i_range_str.split("-")
-                        if "%" in range_parts[1]:
-                            # indicates max concurrent array items; not needed
-                            range_parts[1] = range_parts[1].split("%")[0]
-                        i_args = [int(j) - 1 for j in range_parts]
-                        _arr_idx.extend(list(range(i_args[0], i_args[1] + 1)))
-                    else:
-                        _arr_idx.append(int(i_range_str) - 1)
-                arr_idx = _arr_idx
-        return base_job_ID, arr_idx
-
-    def _parse_job_states(self, stdout) -> Dict[str, Dict[int, JobscriptElementState]]:
+    def _parse_job_IDs(job_ID_str: str) -> tuple[str, None | list[int]]:
+        """
+        Parse the job ID column from the `squeue` command (the `%i` format option).
+
+        Returns
+        -------
+        job_id
+            The job identifier.
+        array_indices
+            The indices into the job array.
+        """
+        base_job_ID, *arr_idx_data = job_ID_str.split("_")
+        if not arr_idx_data:
+            return base_job_ID, None
+        arr_idx = arr_idx_data[0]
+        try:
+            return base_job_ID, [int(arr_idx) - 1]  # zero-index
+        except ValueError:
+            pass
+        # split on commas (e.g. "[5,8-40]")
+        _arr_idx: list[int] = []
+        for i_range_str in arr_idx.strip("[]").split(","):
+            if "-" in i_range_str:
+                _from, _to = i_range_str.split("-")
+                if "%" in _to:
+                    # indicates max concurrent array items; not needed
+                    _to = _to.split("%")[0]
+                _arr_idx.extend(range(int(_from) - 1, int(_to)))
+            else:
+                _arr_idx.append(int(i_range_str) - 1)
+        return base_job_ID, _arr_idx
+
+    def __parse_job_states(
+        self, stdout: str
+    ) -> dict[str, JobscriptElementState | dict[int, JobscriptElementState]]:
         """Parse output from Slurm `squeue` command with a simple format."""
-        info = {}
+        info: dict[str, JobscriptElementState | dict[int, JobscriptElementState]] = {}
         for ln in stdout.split("\n"):
             if not ln:
                 continue
-            ln_s = [i.strip() for i in ln.split()]
-            base_job_ID, arr_idx = self._parse_job_IDs(ln_s[0])
-            state = self.state_lookup.get(ln_s[1], None)
+            job_id, job_state, *_ = ln.split()
+            base_job_ID, arr_idx = self._parse_job_IDs(job_id)
+            state = self.state_lookup.get(job_state, JobscriptElementState.errored)
 
-            if base_job_ID not in info:
-                info[base_job_ID] = {}
-
-            for arr_idx_i in arr_idx or [None]:
-                info[base_job_ID][arr_idx_i] = state
+            if arr_idx is not None:
+                entry = cast(
+                    dict[int, JobscriptElementState], info.setdefault(base_job_ID, {})
+                )
+                for arr_idx_i in arr_idx:
+                    entry[arr_idx_i] = state
+            else:
+                info[base_job_ID] = state
 
         return info
 
-    def _query_job_states(self, job_IDs):
+    def __query_job_states(self, job_IDs: Iterable[str]) -> tuple[str, str]:
         """Query the state of the specified jobs."""
         cmd = [
-            "squeue",
-            "--me",
+            *self.show_cmd,
             "--noheader",
             "--format",
-            r"%40i %30T",
+            R"%200i %30T",  # job ID (<base_job_id>_<index> for array job) and job state
             "--jobs",
             ",".join(job_IDs),
         ]
-        return run_cmd(cmd, logger=self.app.submission_logger)
+        return run_cmd(cmd, logger=self._app.submission_logger)
 
-    def _get_job_valid_IDs(self, job_IDs=None):
+    def __get_job_valid_IDs(self, job_IDs: Collection[str] | None = None) -> set[str]:
         """Get a list of job IDs that are known by the scheduler, optionally filtered by
         specified job IDs."""
 
-        cmd = ["squeue", "--me", "--noheader", "--format", r"%F"]
-        stdout, stderr = run_cmd(cmd, logger=self.app.submission_logger)
+        cmd = [*self.show_cmd, "--noheader", "--format", r"%F"]
+        stdout, stderr = run_cmd(cmd, logger=self._app.submission_logger)
         if stderr:
             raise ValueError(
                 f"Could not get query Slurm jobs. Command was: {cmd!r}; stderr was: "
@@ -517,64 +538,66 @@ class SlurmPosix(Scheduler):
             )
         else:
             known_jobs = set(i.strip() for i in stdout.split("\n") if i.strip())
-            job_IDs = known_jobs.intersection(job_IDs or [])
-
-        return job_IDs
+            if job_IDs is None:
+                return known_jobs
+            return known_jobs.intersection(job_IDs)
 
+    @override
     def get_job_state_info(
-        self, js_refs: List[str] = None
-    ) -> Dict[str, Dict[int, JobscriptElementState]]:
+        self, *, js_refs: Sequence[str] | None = None
+    ) -> Mapping[str, JobscriptElementState | Mapping[int, JobscriptElementState]]:
         """Query the scheduler to get the states of all of this user's jobs, optionally
         filtering by specified job IDs.
 
         Jobs that are not in the scheduler's status output will not appear in the output
         of this method.
-
         """
 
         # if job_IDs are passed, then assume they are existant, otherwise retrieve valid
         # jobs:
-        if not js_refs:
-            js_refs = self._get_job_valid_IDs()
-        if not js_refs:
-            return {}
+        refs: Collection[str] = js_refs or self.__get_job_valid_IDs()
 
-        stdout, stderr = self._query_job_states(js_refs)
         count = 0
-        while stderr:
-            if "Invalid job id specified" in stderr and count < 5:
-                # the job might have finished; this only seems to happen if a single
-                # non-existant job ID is specified; for multiple non-existant jobs, no
-                # error is produced;
-                self.app.submission_logger.info(
-                    f"A specified job ID is non-existant; refreshing known job IDs..."
-                )
-                time.sleep(0.5)
-                js_refs = self._get_job_valid_IDs(js_refs)
-                if not js_refs:
-                    return {}
-                stdout, stderr = self._query_job_states(js_refs)
-                count += 1
-            else:
+        while refs:
+            stdout, stderr = self.__query_job_states(refs)
+            if not stderr:
+                return self.__parse_job_states(stdout)
+            if (
+                "Invalid job id specified" not in stderr
+                or count >= self.NUM_STATE_QUERY_TRIES
+            ):
                 raise ValueError(f"Could not get Slurm job states. Stderr was: {stderr}")
 
-        info = self._parse_job_states(stdout)
-        return info
+            # the job might have finished; this only seems to happen if a single
+            # non-existant job ID is specified; for multiple non-existant jobs, no
+            # error is produced;
+            self._app.submission_logger.info(
+                "A specified job ID is non-existant; refreshing known job IDs..."
+            )
+            time.sleep(self.INTER_STATE_QUERY_DELAY)
+            refs = self.__get_job_valid_IDs(refs)
+            count += 1
+        return {}
 
-    def cancel_jobs(self, js_refs: List[str], jobscripts: List = None):
+    @override
+    def cancel_jobs(
+        self,
+        js_refs: list[str],
+        jobscripts: list[Jobscript] | None = None,
+    ):
         """
         Cancel submitted jobs.
         """
-        cmd = [self.del_cmd] + js_refs
-        self.app.submission_logger.info(
+        cmd = [self.del_cmd, *js_refs]
+        self._app.submission_logger.info(
             f"cancelling {self.__class__.__name__} jobscripts with command: {cmd}."
         )
-        stdout, stderr = run_cmd(cmd, logger=self.app.submission_logger)
+        stdout, stderr = run_cmd(cmd, logger=self._app.submission_logger)
         if stderr:
             raise ValueError(
                 f"Could not get query {self.__class__.__name__} jobs. Command was: "
                 f"{cmd!r}; stderr was: {stderr}"
             )
-        self.app.submission_logger.info(
+        self._app.submission_logger.info(
             f"jobscripts cancel command executed; stdout was: {stdout}."
         )
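The state query in get_job_state_info is now retried a bounded number of times (NUM_STATE_QUERY_TRIES, sleeping INTER_STATE_QUERY_DELAY between attempts) instead of looping against a hard-coded count. The shape of that loop, sketched with stubbed query/revalidate callables rather than the real squeue helpers:

import time
from collections.abc import Callable, Collection

NUM_STATE_QUERY_TRIES = 5
INTER_STATE_QUERY_DELAY = 0.5  # seconds

def query_with_retry(
    refs: Collection[str],
    query: Callable[[Collection[str]], tuple[str, str]],
    revalidate: Callable[[Collection[str]], set[str]],
) -> dict[str, str]:
    """Retry while stale job IDs make the query fail; `query` returns a
    (stdout, stderr) pair, as run_cmd does in the diff above."""
    count = 0
    while refs:
        stdout, stderr = query(refs)
        if not stderr:
            return {"raw": stdout}  # stand-in for parsed job states
        if "Invalid job id specified" not in stderr or count >= NUM_STATE_QUERY_TRIES:
            raise ValueError(f"could not get job states; stderr: {stderr}")
        # A job may have left the queue; re-validate the IDs and retry.
        time.sleep(INTER_STATE_QUERY_DELAY)
        refs = revalidate(refs)
        count += 1
    return {}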