hpcflow-new2 0.2.0a190__py3-none-any.whl → 0.2.0a199__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hpcflow/__pyinstaller/hook-hpcflow.py +1 -0
- hpcflow/_version.py +1 -1
- hpcflow/data/scripts/bad_script.py +2 -0
- hpcflow/data/scripts/do_nothing.py +2 -0
- hpcflow/data/scripts/env_specifier_test/input_file_generator_pass_env_spec.py +4 -0
- hpcflow/data/scripts/env_specifier_test/main_script_test_pass_env_spec.py +8 -0
- hpcflow/data/scripts/env_specifier_test/output_file_parser_pass_env_spec.py +4 -0
- hpcflow/data/scripts/env_specifier_test/v1/input_file_generator_basic.py +4 -0
- hpcflow/data/scripts/env_specifier_test/v1/main_script_test_direct_in_direct_out.py +7 -0
- hpcflow/data/scripts/env_specifier_test/v1/output_file_parser_basic.py +4 -0
- hpcflow/data/scripts/env_specifier_test/v2/main_script_test_direct_in_direct_out.py +7 -0
- hpcflow/data/scripts/input_file_generator_basic.py +3 -0
- hpcflow/data/scripts/input_file_generator_basic_FAIL.py +3 -0
- hpcflow/data/scripts/input_file_generator_test_stdout_stderr.py +8 -0
- hpcflow/data/scripts/main_script_test_direct_in.py +3 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_2.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_2_fail_allowed.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_2_fail_allowed_group.py +7 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_3.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_group_direct_out_3.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_group_one_fail_direct_out_3.py +6 -0
- hpcflow/data/scripts/main_script_test_hdf5_in_obj_2.py +12 -0
- hpcflow/data/scripts/main_script_test_json_out_FAIL.py +3 -0
- hpcflow/data/scripts/main_script_test_shell_env_vars.py +12 -0
- hpcflow/data/scripts/main_script_test_std_out_std_err.py +6 -0
- hpcflow/data/scripts/output_file_parser_basic.py +3 -0
- hpcflow/data/scripts/output_file_parser_basic_FAIL.py +7 -0
- hpcflow/data/scripts/output_file_parser_test_stdout_stderr.py +8 -0
- hpcflow/data/scripts/script_exit_test.py +5 -0
- hpcflow/data/template_components/environments.yaml +1 -1
- hpcflow/sdk/__init__.py +5 -0
- hpcflow/sdk/app.py +150 -89
- hpcflow/sdk/cli.py +263 -84
- hpcflow/sdk/cli_common.py +99 -5
- hpcflow/sdk/config/callbacks.py +38 -1
- hpcflow/sdk/config/config.py +102 -13
- hpcflow/sdk/config/errors.py +19 -5
- hpcflow/sdk/config/types.py +3 -0
- hpcflow/sdk/core/__init__.py +25 -1
- hpcflow/sdk/core/actions.py +914 -262
- hpcflow/sdk/core/cache.py +76 -34
- hpcflow/sdk/core/command_files.py +14 -128
- hpcflow/sdk/core/commands.py +35 -6
- hpcflow/sdk/core/element.py +122 -50
- hpcflow/sdk/core/errors.py +58 -2
- hpcflow/sdk/core/execute.py +207 -0
- hpcflow/sdk/core/loop.py +408 -50
- hpcflow/sdk/core/loop_cache.py +4 -4
- hpcflow/sdk/core/parameters.py +382 -37
- hpcflow/sdk/core/run_dir_files.py +13 -40
- hpcflow/sdk/core/skip_reason.py +7 -0
- hpcflow/sdk/core/task.py +119 -30
- hpcflow/sdk/core/task_schema.py +68 -0
- hpcflow/sdk/core/test_utils.py +66 -27
- hpcflow/sdk/core/types.py +54 -1
- hpcflow/sdk/core/utils.py +78 -7
- hpcflow/sdk/core/workflow.py +1538 -336
- hpcflow/sdk/data/workflow_spec_schema.yaml +2 -0
- hpcflow/sdk/demo/cli.py +7 -0
- hpcflow/sdk/helper/cli.py +1 -0
- hpcflow/sdk/log.py +42 -15
- hpcflow/sdk/persistence/base.py +405 -53
- hpcflow/sdk/persistence/json.py +177 -52
- hpcflow/sdk/persistence/pending.py +237 -69
- hpcflow/sdk/persistence/store_resource.py +3 -2
- hpcflow/sdk/persistence/types.py +15 -4
- hpcflow/sdk/persistence/zarr.py +928 -81
- hpcflow/sdk/submission/jobscript.py +1408 -489
- hpcflow/sdk/submission/schedulers/__init__.py +40 -5
- hpcflow/sdk/submission/schedulers/direct.py +33 -19
- hpcflow/sdk/submission/schedulers/sge.py +51 -16
- hpcflow/sdk/submission/schedulers/slurm.py +44 -16
- hpcflow/sdk/submission/schedulers/utils.py +7 -2
- hpcflow/sdk/submission/shells/base.py +68 -20
- hpcflow/sdk/submission/shells/bash.py +222 -129
- hpcflow/sdk/submission/shells/powershell.py +200 -150
- hpcflow/sdk/submission/submission.py +852 -119
- hpcflow/sdk/submission/types.py +18 -21
- hpcflow/sdk/typing.py +24 -5
- hpcflow/sdk/utils/arrays.py +71 -0
- hpcflow/sdk/utils/deferred_file.py +55 -0
- hpcflow/sdk/utils/hashing.py +16 -0
- hpcflow/sdk/utils/patches.py +12 -0
- hpcflow/sdk/utils/strings.py +33 -0
- hpcflow/tests/api/test_api.py +32 -0
- hpcflow/tests/conftest.py +19 -0
- hpcflow/tests/data/multi_path_sequences.yaml +29 -0
- hpcflow/tests/data/workflow_test_run_abort.yaml +34 -35
- hpcflow/tests/schedulers/sge/test_sge_submission.py +36 -0
- hpcflow/tests/scripts/test_input_file_generators.py +282 -0
- hpcflow/tests/scripts/test_main_scripts.py +821 -70
- hpcflow/tests/scripts/test_non_snippet_script.py +46 -0
- hpcflow/tests/scripts/test_ouput_file_parsers.py +353 -0
- hpcflow/tests/shells/wsl/test_wsl_submission.py +6 -0
- hpcflow/tests/unit/test_action.py +176 -0
- hpcflow/tests/unit/test_app.py +20 -0
- hpcflow/tests/unit/test_cache.py +46 -0
- hpcflow/tests/unit/test_cli.py +133 -0
- hpcflow/tests/unit/test_config.py +122 -1
- hpcflow/tests/unit/test_element_iteration.py +47 -0
- hpcflow/tests/unit/test_jobscript_unit.py +757 -0
- hpcflow/tests/unit/test_loop.py +1332 -27
- hpcflow/tests/unit/test_meta_task.py +325 -0
- hpcflow/tests/unit/test_multi_path_sequences.py +229 -0
- hpcflow/tests/unit/test_parameter.py +13 -0
- hpcflow/tests/unit/test_persistence.py +190 -8
- hpcflow/tests/unit/test_run.py +109 -3
- hpcflow/tests/unit/test_run_directories.py +29 -0
- hpcflow/tests/unit/test_shell.py +20 -0
- hpcflow/tests/unit/test_submission.py +5 -76
- hpcflow/tests/unit/utils/test_arrays.py +40 -0
- hpcflow/tests/unit/utils/test_deferred_file_writer.py +34 -0
- hpcflow/tests/unit/utils/test_hashing.py +65 -0
- hpcflow/tests/unit/utils/test_patches.py +5 -0
- hpcflow/tests/unit/utils/test_redirect_std.py +50 -0
- hpcflow/tests/workflows/__init__.py +0 -0
- hpcflow/tests/workflows/test_directory_structure.py +31 -0
- hpcflow/tests/workflows/test_jobscript.py +332 -0
- hpcflow/tests/workflows/test_run_status.py +198 -0
- hpcflow/tests/workflows/test_skip_downstream.py +696 -0
- hpcflow/tests/workflows/test_submission.py +140 -0
- hpcflow/tests/workflows/test_workflows.py +142 -2
- hpcflow/tests/workflows/test_zip.py +18 -0
- hpcflow/viz_demo.ipynb +6587 -3
- {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a199.dist-info}/METADATA +7 -4
- hpcflow_new2-0.2.0a199.dist-info/RECORD +221 -0
- hpcflow_new2-0.2.0a190.dist-info/RECORD +0 -165
- {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a199.dist-info}/LICENSE +0 -0
- {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a199.dist-info}/WHEEL +0 -0
- {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a199.dist-info}/entry_points.txt +0 -0
@@ -30,6 +30,13 @@ class Scheduler(ABC, Generic[JSRefType], AppAware):
|
|
30
30
|
"""
|
31
31
|
Abstract base class for schedulers.
|
32
32
|
|
33
|
+
Note
|
34
|
+
----
|
35
|
+
Do not make immediate subclasses of this class other than
|
36
|
+
:py:class:`DirectScheduler` and :py:class:`QueuedScheduler`;
|
37
|
+
subclass those two instead. Code (e.g., in :py:class:`Jobscript`)
|
38
|
+
assumes that this model is followed and does not check it.
|
39
|
+
|
33
40
|
Parameters
|
34
41
|
----------
|
35
42
|
shell_args: str
|
@@ -119,8 +126,8 @@ class Scheduler(ABC, Generic[JSRefType], AppAware):
|
|
119
126
|
|
120
127
|
@abstractmethod
|
121
128
|
def get_job_state_info(
|
122
|
-
self, *, js_refs: Sequence[JSRefType] | None = None
|
123
|
-
) -> Mapping[str, Mapping[int
|
129
|
+
self, *, js_refs: Sequence[JSRefType] | None = None
|
130
|
+
) -> Mapping[str, JobscriptElementState | Mapping[int, JobscriptElementState]]:
|
124
131
|
"""
|
125
132
|
Get the state of one or more jobscripts.
|
126
133
|
"""
|
@@ -136,12 +143,23 @@ class Scheduler(ABC, Generic[JSRefType], AppAware):
|
|
136
143
|
self,
|
137
144
|
js_refs: list[JSRefType],
|
138
145
|
jobscripts: list[Jobscript] | None = None,
|
139
|
-
num_js_elements: int = 0, # Ignored!
|
140
146
|
) -> None:
|
141
147
|
"""
|
142
148
|
Cancel one or more jobscripts.
|
143
149
|
"""
|
144
150
|
|
151
|
+
@abstractmethod
|
152
|
+
def get_std_out_err_filename(self, js_idx: int, *args, **kwargs) -> str:
|
153
|
+
"""File name of combined standard output and error streams."""
|
154
|
+
|
155
|
+
@abstractmethod
|
156
|
+
def get_stdout_filename(self, js_idx: int, *args, **kwargs) -> str:
|
157
|
+
"""File name of the standard output stream file."""
|
158
|
+
|
159
|
+
@abstractmethod
|
160
|
+
def get_stderr_filename(self, js_idx: int, *args, **kwargs) -> str:
|
161
|
+
"""File name of the standard error stream file."""
|
162
|
+
|
145
163
|
|
146
164
|
@hydrate
|
147
165
|
class QueuedScheduler(Scheduler[str]):
|
@@ -226,7 +244,6 @@ class QueuedScheduler(Scheduler[str]):
|
|
226
244
|
"""
|
227
245
|
while js_refs:
|
228
246
|
info: Mapping[str, Any] = self.get_job_state_info(js_refs=js_refs)
|
229
|
-
print(info)
|
230
247
|
if not info:
|
231
248
|
break
|
232
249
|
js_refs = list(info)
|
@@ -234,8 +251,26 @@ class QueuedScheduler(Scheduler[str]):
|
|
234
251
|
|
235
252
|
@abstractmethod
|
236
253
|
def format_options(
|
237
|
-
self,
|
254
|
+
self,
|
255
|
+
resources: ElementResources,
|
256
|
+
num_elements: int,
|
257
|
+
is_array: bool,
|
258
|
+
sub_idx: int,
|
259
|
+
js_idx: int,
|
238
260
|
) -> str:
|
239
261
|
"""
|
240
262
|
Render options in a way that the scheduler can handle.
|
241
263
|
"""
|
264
|
+
|
265
|
+
def get_std_out_err_filename(
|
266
|
+
self, js_idx: int, job_ID: str, array_idx: int | None = None
|
267
|
+
):
|
268
|
+
"""File name of combined standard output and error streams.
|
269
|
+
|
270
|
+
Notes
|
271
|
+
-----
|
272
|
+
We use the standard output stream filename format for the combined output and
|
273
|
+
error streams file.
|
274
|
+
|
275
|
+
"""
|
276
|
+
return self.get_stdout_filename(js_idx=js_idx, job_ID=job_ID, array_idx=array_idx)
|
@@ -23,6 +23,20 @@ if TYPE_CHECKING:
|
|
23
23
|
DirectRef: TypeAlias = "tuple[int, list[str]]"
|
24
24
|
|
25
25
|
|
26
|
+
def _is_process_cmdline_equal(proc: psutil.Process, cmdline: list[str]) -> bool:
|
27
|
+
"""Check if the `cmdline` of a psutil `Process` is equal to the specified
|
28
|
+
`cmdline`."""
|
29
|
+
try:
|
30
|
+
if proc.cmdline() == cmdline:
|
31
|
+
return True
|
32
|
+
else:
|
33
|
+
return False
|
34
|
+
except (psutil.NoSuchProcess, psutil.ZombieProcess):
|
35
|
+
# process no longer exists or, on unix, process has completed but still has a
|
36
|
+
# record
|
37
|
+
return False
|
38
|
+
|
39
|
+
|
26
40
|
class DirectScheduler(Scheduler[DirectRef]):
|
27
41
|
"""
|
28
42
|
A direct scheduler, that just runs jobs immediately as direct subprocesses.
|
@@ -95,8 +109,7 @@ class DirectScheduler(Scheduler[DirectRef]):
|
|
95
109
|
except psutil.NoSuchProcess:
|
96
110
|
# process might have completed already
|
97
111
|
continue
|
98
|
-
if proc_i
|
99
|
-
# additional check this is the same process that we submitted
|
112
|
+
if _is_process_cmdline_equal(proc_i, p_cmdline):
|
100
113
|
procs.append(proc_i)
|
101
114
|
return procs
|
102
115
|
|
@@ -131,23 +144,18 @@ class DirectScheduler(Scheduler[DirectRef]):
|
|
131
144
|
|
132
145
|
@override
|
133
146
|
def get_job_state_info(
|
134
|
-
self,
|
135
|
-
|
136
|
-
js_refs: Sequence[DirectRef] | None = None,
|
137
|
-
num_js_elements: int = 0,
|
138
|
-
) -> Mapping[str, Mapping[int | None, JobscriptElementState]]:
|
147
|
+
self, *, js_refs: Sequence[DirectRef] | None = None
|
148
|
+
) -> Mapping[str, JobscriptElementState]:
|
139
149
|
"""Query the scheduler to get the states of all of this user's jobs, optionally
|
140
150
|
filtering by specified job IDs.
|
141
151
|
|
142
152
|
Jobs that are not in the scheduler's status output will not appear in the output
|
143
153
|
of this method."""
|
144
|
-
info: dict[str,
|
154
|
+
info: dict[str, JobscriptElementState] = {}
|
145
155
|
for p_id, p_cmdline in js_refs or ():
|
146
156
|
if self.is_jobscript_active(p_id, p_cmdline):
|
147
157
|
# as far as the "scheduler" is concerned, all elements are running:
|
148
|
-
info[str(p_id)] =
|
149
|
-
i: JobscriptElementState.running for i in range(num_js_elements)
|
150
|
-
}
|
158
|
+
info[str(p_id)] = JobscriptElementState.running
|
151
159
|
|
152
160
|
return info
|
153
161
|
|
@@ -156,7 +164,6 @@ class DirectScheduler(Scheduler[DirectRef]):
|
|
156
164
|
self,
|
157
165
|
js_refs: list[DirectRef],
|
158
166
|
jobscripts: list[Jobscript] | None = None,
|
159
|
-
num_js_elements: int = 0, # Ignored!
|
160
167
|
):
|
161
168
|
"""
|
162
169
|
Cancel some jobs.
|
@@ -166,18 +173,13 @@ class DirectScheduler(Scheduler[DirectRef]):
|
|
166
173
|
|
167
174
|
def callback(proc: psutil.Process):
|
168
175
|
try:
|
169
|
-
|
176
|
+
js_proc_id[proc.pid]
|
170
177
|
except KeyError:
|
171
178
|
# child process of one of the jobscripts
|
172
179
|
self._app.submission_logger.debug(
|
173
180
|
f"jobscript child process ({proc.pid}) killed"
|
174
181
|
)
|
175
182
|
return
|
176
|
-
assert hasattr(proc, "returncode")
|
177
|
-
print(
|
178
|
-
f"Jobscript {js.index} from submission {js.submission.index} "
|
179
|
-
f"terminated (user-initiated cancel) with exit code {proc.returncode}."
|
180
|
-
)
|
181
183
|
|
182
184
|
procs = self.__get_jobscript_processes(js_refs)
|
183
185
|
self._app.submission_logger.info(
|
@@ -185,6 +187,7 @@ class DirectScheduler(Scheduler[DirectRef]):
|
|
185
187
|
)
|
186
188
|
js_proc_id = {i.pid: jobscripts[idx] for idx, i in enumerate(procs) if jobscripts}
|
187
189
|
self.__kill_processes(procs, timeout=3, on_terminate=callback)
|
190
|
+
print(f"Cancelled {len(procs)} jobscript{'s' if len(procs) > 1 else ''}.")
|
188
191
|
self._app.submission_logger.info("jobscripts cancel command executed.")
|
189
192
|
|
190
193
|
def is_jobscript_active(self, process_ID: int, process_cmdline: list[str]):
|
@@ -198,8 +201,19 @@ class DirectScheduler(Scheduler[DirectRef]):
|
|
198
201
|
proc = psutil.Process(process_ID)
|
199
202
|
except psutil.NoSuchProcess:
|
200
203
|
return False
|
204
|
+
return _is_process_cmdline_equal(proc, process_cmdline)
|
205
|
+
|
206
|
+
def get_std_out_err_filename(self, js_idx: int, **kwargs) -> str:
|
207
|
+
"""File name of combined standard output and error streams."""
|
208
|
+
return f"js_{js_idx}_std.log"
|
209
|
+
|
210
|
+
def get_stdout_filename(self, js_idx: int, **kwargs) -> str:
|
211
|
+
"""File name of the standard output stream file."""
|
212
|
+
return f"js_{js_idx}_stdout.log"
|
201
213
|
|
202
|
-
|
214
|
+
def get_stderr_filename(self, js_idx: int, **kwargs) -> str:
|
215
|
+
"""File name of the standard error stream file."""
|
216
|
+
return f"js_{js_idx}_stderr.log"
|
203
217
|
|
204
218
|
|
205
219
|
@hydrate
|
@@ -5,7 +5,7 @@ An interface to SGE.
|
|
5
5
|
from __future__ import annotations
|
6
6
|
from collections.abc import Sequence
|
7
7
|
import re
|
8
|
-
from typing import TYPE_CHECKING
|
8
|
+
from typing import cast, TYPE_CHECKING
|
9
9
|
from typing_extensions import override
|
10
10
|
from hpcflow.sdk.typing import hydrate
|
11
11
|
from hpcflow.sdk.core.errors import (
|
@@ -131,7 +131,7 @@ class SGEPosix(QueuedScheduler):
|
|
131
131
|
if resources.SGE_parallel_env is not None:
|
132
132
|
# check user-specified `parallel_env` is valid and compatible with
|
133
133
|
# `num_cores`:
|
134
|
-
if resources.num_cores and resources.num_cores
|
134
|
+
if resources.num_cores and resources.num_cores == 1:
|
135
135
|
raise ValueError(
|
136
136
|
f"An SGE parallel environment should not be specified if `num_cores` "
|
137
137
|
f"is 1 (`SGE_parallel_env` was specified as "
|
@@ -174,16 +174,42 @@ class SGEPosix(QueuedScheduler):
|
|
174
174
|
def __format_array_request(self, num_elements: int) -> str:
|
175
175
|
return f"{self.js_cmd} {self.array_switch} 1-{num_elements}"
|
176
176
|
|
177
|
+
def get_stdout_filename(
|
178
|
+
self, js_idx: int, job_ID: str, array_idx: int | None = None
|
179
|
+
) -> str:
|
180
|
+
"""File name of the standard output stream file."""
|
181
|
+
# TODO: untested, might not work!
|
182
|
+
array_idx_str = f".{array_idx}" if array_idx is not None else ""
|
183
|
+
return f"js_{js_idx}.sh.o{job_ID}{array_idx_str}"
|
184
|
+
|
185
|
+
def get_stderr_filename(
|
186
|
+
self, js_idx: int, job_ID: str, array_idx: int | None = None
|
187
|
+
) -> str:
|
188
|
+
"""File name of the standard error stream file."""
|
189
|
+
# TODO: untested, might not work!
|
190
|
+
array_idx_str = f".{array_idx}" if array_idx is not None else ""
|
191
|
+
return f"js_{js_idx}.sh.e{job_ID}{array_idx_str}"
|
192
|
+
|
177
193
|
def __format_std_stream_file_option_lines(
|
178
|
-
self, is_array: bool, sub_idx: int
|
194
|
+
self, is_array: bool, sub_idx: int, js_idx: int, combine_std: bool
|
179
195
|
) -> Iterator[str]:
|
180
|
-
# note: we
|
181
|
-
|
182
|
-
|
196
|
+
# note: if we modify the file names, there is, I believe, no way to include the
|
197
|
+
# job ID; so we don't modify the file names:
|
198
|
+
base = f"./artifacts/submissions/{sub_idx}/js_std/{js_idx}"
|
199
|
+
yield f"{self.js_cmd} -o {base}"
|
200
|
+
if combine_std:
|
201
|
+
yield f"{self.js_cmd} -j y" # redirect stderr to stdout
|
202
|
+
else:
|
203
|
+
yield f"{self.js_cmd} -e {base}"
|
183
204
|
|
184
205
|
@override
|
185
206
|
def format_options(
|
186
|
-
self,
|
207
|
+
self,
|
208
|
+
resources: ElementResources,
|
209
|
+
num_elements: int,
|
210
|
+
is_array: bool,
|
211
|
+
sub_idx: int,
|
212
|
+
js_idx: int,
|
187
213
|
) -> str:
|
188
214
|
"""
|
189
215
|
Format the options to the jobscript command.
|
@@ -194,7 +220,11 @@ class SGEPosix(QueuedScheduler):
|
|
194
220
|
if is_array:
|
195
221
|
opts.append(self.__format_array_request(num_elements))
|
196
222
|
|
197
|
-
opts.extend(
|
223
|
+
opts.extend(
|
224
|
+
self.__format_std_stream_file_option_lines(
|
225
|
+
is_array, sub_idx, js_idx, resources.combine_jobscript_std
|
226
|
+
)
|
227
|
+
)
|
198
228
|
|
199
229
|
for opt_k, opt_v in self.options.items():
|
200
230
|
if opt_v is None:
|
@@ -264,9 +294,9 @@ class SGEPosix(QueuedScheduler):
|
|
264
294
|
|
265
295
|
def get_job_statuses(
|
266
296
|
self,
|
267
|
-
) -> Mapping[str, Mapping[int
|
268
|
-
"""Get information about all of this user's jobscripts that currently listed
|
269
|
-
the scheduler."""
|
297
|
+
) -> Mapping[str, JobscriptElementState | Mapping[int, JobscriptElementState]]:
|
298
|
+
"""Get information about all of this user's jobscripts that are currently listed
|
299
|
+
by the scheduler."""
|
270
300
|
cmd = [*self.show_cmd, "-u", "$USER", "-g", "d"] # "-g d": separate arrays items
|
271
301
|
stdout, stderr = run_cmd(cmd, logger=self._app.submission_logger)
|
272
302
|
if stderr:
|
@@ -277,7 +307,7 @@ class SGEPosix(QueuedScheduler):
|
|
277
307
|
elif not stdout:
|
278
308
|
return {}
|
279
309
|
|
280
|
-
info: dict[str, dict[int |
|
310
|
+
info: dict[str, dict[int, JobscriptElementState] | JobscriptElementState] = {}
|
281
311
|
lines = stdout.split("\n")
|
282
312
|
# assuming a job name with spaces means we can't split on spaces to get
|
283
313
|
# anywhere beyond the job name, so get the column index of the state heading
|
@@ -300,13 +330,19 @@ class SGEPosix(QueuedScheduler):
|
|
300
330
|
else None
|
301
331
|
)
|
302
332
|
|
303
|
-
|
333
|
+
if arr_idx is not None:
|
334
|
+
entry = cast(
|
335
|
+
dict[int, JobscriptElementState], info.setdefault(base_job_ID, {})
|
336
|
+
)
|
337
|
+
entry[arr_idx] = state
|
338
|
+
else:
|
339
|
+
info[base_job_ID] = state
|
304
340
|
return info
|
305
341
|
|
306
342
|
@override
|
307
343
|
def get_job_state_info(
|
308
|
-
self, *, js_refs: Sequence[str] | None = None
|
309
|
-
) -> Mapping[str, Mapping[int
|
344
|
+
self, *, js_refs: Sequence[str] | None = None
|
345
|
+
) -> Mapping[str, JobscriptElementState | Mapping[int, JobscriptElementState]]:
|
310
346
|
"""Query the scheduler to get the states of all of this user's jobs, optionally
|
311
347
|
filtering by specified job IDs.
|
312
348
|
|
@@ -324,7 +360,6 @@ class SGEPosix(QueuedScheduler):
|
|
324
360
|
self,
|
325
361
|
js_refs: list[str],
|
326
362
|
jobscripts: list[Jobscript] | None = None,
|
327
|
-
num_js_elements: int = 0, # Ignored!
|
328
363
|
):
|
329
364
|
"""
|
330
365
|
Cancel submitted jobs.
|
@@ -5,7 +5,7 @@ An interface to SLURM.
|
|
5
5
|
from __future__ import annotations
|
6
6
|
import subprocess
|
7
7
|
import time
|
8
|
-
from typing import TYPE_CHECKING
|
8
|
+
from typing import cast, TYPE_CHECKING
|
9
9
|
from typing_extensions import override
|
10
10
|
from hpcflow.sdk.typing import hydrate
|
11
11
|
from hpcflow.sdk.core.enums import ParallelMode
|
@@ -344,17 +344,37 @@ class SlurmPosix(QueuedScheduler):
|
|
344
344
|
max_str = f"%{resources.max_array_items}" if resources.max_array_items else ""
|
345
345
|
return f"{self.js_cmd} {self.array_switch} 1-{num_elements}{max_str}"
|
346
346
|
|
347
|
+
def get_stdout_filename(
|
348
|
+
self, js_idx: int, job_ID: str, array_idx: int | None = None
|
349
|
+
) -> str:
|
350
|
+
"""File name of the standard output stream file."""
|
351
|
+
array_idx_str = f".{array_idx}" if array_idx is not None else ""
|
352
|
+
return f"js_{js_idx}.sh_{job_ID}{array_idx_str}.out"
|
353
|
+
|
354
|
+
def get_stderr_filename(
|
355
|
+
self, js_idx: int, job_ID: str, array_idx: int | None = None
|
356
|
+
) -> str:
|
357
|
+
"""File name of the standard error stream file."""
|
358
|
+
array_idx_str = f".{array_idx}" if array_idx is not None else ""
|
359
|
+
return f"js_{js_idx}.sh_{job_ID}{array_idx_str}.err"
|
360
|
+
|
347
361
|
def __format_std_stream_file_option_lines(
|
348
|
-
self, is_array: bool, sub_idx: int
|
362
|
+
self, is_array: bool, sub_idx: int, js_idx: int, combine_std: bool
|
349
363
|
) -> Iterator[str]:
|
350
364
|
pattern = R"%x_%A.%a" if is_array else R"%x_%j"
|
351
|
-
base = f"./artifacts/submissions/{sub_idx}/{pattern}"
|
352
|
-
yield f"{self.js_cmd}
|
353
|
-
|
365
|
+
base = f"./artifacts/submissions/{sub_idx}/js_std/{js_idx}/{pattern}"
|
366
|
+
yield f"{self.js_cmd} --output {base}.out"
|
367
|
+
if not combine_std:
|
368
|
+
yield f"{self.js_cmd} --error {base}.err"
|
354
369
|
|
355
370
|
@override
|
356
371
|
def format_options(
|
357
|
-
self,
|
372
|
+
self,
|
373
|
+
resources: ElementResources,
|
374
|
+
num_elements: int,
|
375
|
+
is_array: bool,
|
376
|
+
sub_idx: int,
|
377
|
+
js_idx: int,
|
358
378
|
) -> str:
|
359
379
|
"""
|
360
380
|
Format the options to the scheduler.
|
@@ -365,7 +385,11 @@ class SlurmPosix(QueuedScheduler):
|
|
365
385
|
if is_array:
|
366
386
|
opts.append(self.__format_array_request(num_elements, resources))
|
367
387
|
|
368
|
-
opts.extend(
|
388
|
+
opts.extend(
|
389
|
+
self.__format_std_stream_file_option_lines(
|
390
|
+
is_array, sub_idx, js_idx, resources.combine_jobscript_std
|
391
|
+
)
|
392
|
+
)
|
369
393
|
|
370
394
|
for opt_k, opt_v in self.options.items():
|
371
395
|
if isinstance(opt_v, list):
|
@@ -468,9 +492,9 @@ class SlurmPosix(QueuedScheduler):
|
|
468
492
|
|
469
493
|
def __parse_job_states(
|
470
494
|
self, stdout: str
|
471
|
-
) -> dict[str, dict[int
|
495
|
+
) -> dict[str, JobscriptElementState | dict[int, JobscriptElementState]]:
|
472
496
|
"""Parse output from Slurm `squeue` command with a simple format."""
|
473
|
-
info: dict[str, dict[int
|
497
|
+
info: dict[str, JobscriptElementState | dict[int, JobscriptElementState]] = {}
|
474
498
|
for ln in stdout.split("\n"):
|
475
499
|
if not ln:
|
476
500
|
continue
|
@@ -478,9 +502,14 @@ class SlurmPosix(QueuedScheduler):
|
|
478
502
|
base_job_ID, arr_idx = self._parse_job_IDs(job_id)
|
479
503
|
state = self.state_lookup.get(job_state, JobscriptElementState.errored)
|
480
504
|
|
481
|
-
|
482
|
-
|
483
|
-
|
505
|
+
if arr_idx is not None:
|
506
|
+
entry = cast(
|
507
|
+
dict[int, JobscriptElementState], info.setdefault(base_job_ID, {})
|
508
|
+
)
|
509
|
+
for arr_idx_i in arr_idx:
|
510
|
+
entry[arr_idx_i] = state
|
511
|
+
else:
|
512
|
+
info[base_job_ID] = state
|
484
513
|
|
485
514
|
return info
|
486
515
|
|
@@ -490,7 +519,7 @@ class SlurmPosix(QueuedScheduler):
|
|
490
519
|
*self.show_cmd,
|
491
520
|
"--noheader",
|
492
521
|
"--format",
|
493
|
-
R"%
|
522
|
+
R"%200i %30T", # job ID (<base_job_id>_<index> for array job) and job state
|
494
523
|
"--jobs",
|
495
524
|
",".join(job_IDs),
|
496
525
|
]
|
@@ -515,8 +544,8 @@ class SlurmPosix(QueuedScheduler):
|
|
515
544
|
|
516
545
|
@override
|
517
546
|
def get_job_state_info(
|
518
|
-
self, *, js_refs: Sequence[str] | None = None
|
519
|
-
) -> Mapping[str, Mapping[int
|
547
|
+
self, *, js_refs: Sequence[str] | None = None
|
548
|
+
) -> Mapping[str, JobscriptElementState | Mapping[int, JobscriptElementState]]:
|
520
549
|
"""Query the scheduler to get the states of all of this user's jobs, optionally
|
521
550
|
filtering by specified job IDs.
|
522
551
|
|
@@ -555,7 +584,6 @@ class SlurmPosix(QueuedScheduler):
|
|
555
584
|
self,
|
556
585
|
js_refs: list[str],
|
557
586
|
jobscripts: list[Jobscript] | None = None,
|
558
|
-
num_js_elements: int = 0, # Ignored!
|
559
587
|
):
|
560
588
|
"""
|
561
589
|
Cancel submitted jobs.
|
@@ -1,6 +1,7 @@
|
|
1
1
|
"""
|
2
2
|
Helper for running a subprocess.
|
3
3
|
"""
|
4
|
+
|
4
5
|
from __future__ import annotations
|
5
6
|
import subprocess
|
6
7
|
from typing import TYPE_CHECKING
|
@@ -10,11 +11,15 @@ if TYPE_CHECKING:
|
|
10
11
|
from logging import Logger
|
11
12
|
|
12
13
|
|
13
|
-
def run_cmd(
|
14
|
+
def run_cmd(
|
15
|
+
cmd: str | Sequence[str], logger: Logger | None = None, **kwargs
|
16
|
+
) -> tuple[str, str]:
|
14
17
|
"""Execute a command and return stdout, stderr as strings."""
|
15
18
|
if logger:
|
16
19
|
logger.debug(f"running shell command: {cmd}")
|
17
|
-
proc = subprocess.run(
|
20
|
+
proc = subprocess.run(
|
21
|
+
args=cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs
|
22
|
+
)
|
18
23
|
stdout = proc.stdout.decode()
|
19
24
|
stderr = proc.stderr.decode()
|
20
25
|
return stdout, stderr
|
@@ -13,6 +13,8 @@ if TYPE_CHECKING:
|
|
13
13
|
from typing import Any, ClassVar
|
14
14
|
from ..types import JobscriptHeaderArgs, VersionInfo
|
15
15
|
|
16
|
+
from hpcflow.sdk.utils.hashing import get_hash
|
17
|
+
|
16
18
|
|
17
19
|
@hydrate
|
18
20
|
class Shell(ABC):
|
@@ -30,28 +32,53 @@ class Shell(ABC):
|
|
30
32
|
Arguments to pass to the shell.
|
31
33
|
"""
|
32
34
|
|
35
|
+
#: Default for executable name.
|
36
|
+
DEFAULT_EXE: ClassVar[str] = "/bin/bash"
|
33
37
|
#: File extension for jobscripts.
|
34
38
|
JS_EXT: ClassVar[str]
|
35
|
-
#:
|
36
|
-
|
39
|
+
#: Basic indent.
|
40
|
+
JS_INDENT: ClassVar[str]
|
37
41
|
#: Indent for environment setup.
|
38
42
|
JS_ENV_SETUP_INDENT: ClassVar[str]
|
39
43
|
#: Template for the jobscript shebang line.
|
40
44
|
JS_SHEBANG: ClassVar[str]
|
45
|
+
#: Template for the jobscript functions file.
|
46
|
+
JS_FUNCS: ClassVar[str]
|
41
47
|
#: Template for the common part of the jobscript header.
|
42
48
|
JS_HEADER: ClassVar[str]
|
43
49
|
#: Template for the jobscript header when scheduled.
|
44
50
|
JS_SCHEDULER_HEADER: ClassVar[str]
|
45
51
|
#: Template for the jobscript header when directly executed.
|
46
52
|
JS_DIRECT_HEADER: ClassVar[str]
|
47
|
-
#: Template for the
|
53
|
+
#: Template for enabling writing of the app log.
|
54
|
+
JS_RUN_LOG_PATH_ENABLE: ClassVar[str]
|
55
|
+
#: Template for disabling writing of the app log.
|
56
|
+
JS_RUN_LOG_PATH_DISABLE: ClassVar[str]
|
57
|
+
#: Template for the run execution command.
|
58
|
+
JS_RUN_CMD: ClassVar[str]
|
59
|
+
#: Template for the execution command for multiple combined runs.
|
60
|
+
JS_RUN_CMD_COMBINED: ClassVar[str]
|
61
|
+
#: Template for setting up run environment variables and executing the run.
|
62
|
+
JS_RUN: ClassVar[str]
|
63
|
+
#: Template for the action-run processing loop in a jobscript.
|
64
|
+
JS_ACT_MULTI: ClassVar[str]
|
65
|
+
#: Template for the single-action-run execution in a jobscript.
|
66
|
+
JS_ACT_SINGLE: ClassVar[str]
|
67
|
+
#: Template for setting up environment variables and running one or more action-runs.
|
48
68
|
JS_MAIN: ClassVar[str]
|
49
|
-
#: Template for
|
50
|
-
|
69
|
+
#: Template for a jobscript-block header.
|
70
|
+
JS_BLOCK_HEADER: ClassVar[str]
|
71
|
+
#: Template for single-element execution.
|
72
|
+
JS_ELEMENT_SINGLE: ClassVar[str]
|
51
73
|
#: Template for the element processing loop in a jobscript.
|
52
|
-
|
53
|
-
#:
|
54
|
-
|
74
|
+
JS_ELEMENT_MULTI_LOOP: ClassVar[str]
|
75
|
+
#: Template for the array handling code in a jobscript.
|
76
|
+
JS_ELEMENT_MULTI_ARRAY: ClassVar[str]
|
77
|
+
#: Template for the jobscript block loop in a jobscript.
|
78
|
+
JS_BLOCK_LOOP: ClassVar[str]
|
79
|
+
#: Template for the jobscript footer.
|
80
|
+
JS_FOOTER: ClassVar[str]
|
81
|
+
|
55
82
|
__slots__ = ("_executable", "os_args")
|
56
83
|
|
57
84
|
def __init__(
|
@@ -67,6 +94,9 @@ class Shell(ABC):
|
|
67
94
|
return False
|
68
95
|
return self._executable == other._executable and self.os_args == other.os_args
|
69
96
|
|
97
|
+
def __hash__(self):
|
98
|
+
return get_hash((self._executable, self.os_args))
|
99
|
+
|
70
100
|
@property
|
71
101
|
def executable(self) -> list[str]:
|
72
102
|
"""
|
@@ -85,6 +115,10 @@ class Shell(ABC):
|
|
85
115
|
"""Get the command for submitting a non-scheduled jobscript."""
|
86
116
|
return self.executable + [js_path]
|
87
117
|
|
118
|
+
def get_command_file_launch_command(self, cmd_file_path: str) -> list[str]:
|
119
|
+
"""Get the command for launching the commands file for a given run."""
|
120
|
+
return self.executable + [cmd_file_path]
|
121
|
+
|
88
122
|
@abstractmethod
|
89
123
|
def get_version_info(self, exclude_os: bool = False) -> VersionInfo:
|
90
124
|
"""Get shell and operating system information."""
|
@@ -141,32 +175,46 @@ class Shell(ABC):
|
|
141
175
|
workflow_app_alias: str,
|
142
176
|
param_name: str,
|
143
177
|
shell_var_name: str,
|
144
|
-
EAR_ID: int,
|
145
178
|
cmd_idx: int,
|
146
179
|
stderr: bool,
|
147
|
-
|
180
|
+
app_name: str,
|
181
|
+
) -> str:
|
182
|
+
"""
|
183
|
+
Produce code to save a parameter's value into the workflow persistent store.
|
148
184
|
"""
|
149
|
-
|
185
|
+
|
186
|
+
@abstractmethod
|
187
|
+
def format_stream_assignment(self, shell_var_name: str, command: str) -> str:
|
188
|
+
"""
|
189
|
+
Format a stream assignment.
|
150
190
|
"""
|
151
191
|
|
152
192
|
@abstractmethod
|
153
|
-
def
|
193
|
+
def format_env_var_get(self, var: str) -> str:
|
194
|
+
"""
|
195
|
+
Format retrieval of a shell environment variable.
|
154
196
|
"""
|
155
|
-
Format commands to run within a child scope.
|
156
197
|
|
157
|
-
|
198
|
+
@abstractmethod
|
199
|
+
def format_array(self, lst: list) -> str:
|
200
|
+
"""
|
201
|
+
Format construction of a shell array.
|
158
202
|
"""
|
159
203
|
|
160
204
|
@abstractmethod
|
161
|
-
def
|
162
|
-
self, workflow_app_alias: str, loop_name: str, run_ID: int
|
163
|
-
) -> str:
|
205
|
+
def format_array_get_item(self, arr_name: str, index: int | str) -> str:
|
164
206
|
"""
|
165
|
-
Format a
|
207
|
+
Format retrieval of a shell array item at a specified index.
|
166
208
|
"""
|
167
209
|
|
168
210
|
@abstractmethod
|
169
|
-
def
|
211
|
+
def format_source_functions_file(self, app_name: str, commands: str) -> str:
|
170
212
|
"""
|
171
|
-
Format
|
213
|
+
Format sourcing (i.e. invocation) of the jobscript functions file.
|
214
|
+
"""
|
215
|
+
|
216
|
+
@abstractmethod
|
217
|
+
def format_commands_file(self, app_name: str, commands: str) -> str:
|
218
|
+
"""
|
219
|
+
Format the commands file.
|
172
220
|
"""
|