hpcflow-new2 0.2.0a190__py3-none-any.whl → 0.2.0a199__py3-none-any.whl

Files changed (130)
  1. hpcflow/__pyinstaller/hook-hpcflow.py +1 -0
  2. hpcflow/_version.py +1 -1
  3. hpcflow/data/scripts/bad_script.py +2 -0
  4. hpcflow/data/scripts/do_nothing.py +2 -0
  5. hpcflow/data/scripts/env_specifier_test/input_file_generator_pass_env_spec.py +4 -0
  6. hpcflow/data/scripts/env_specifier_test/main_script_test_pass_env_spec.py +8 -0
  7. hpcflow/data/scripts/env_specifier_test/output_file_parser_pass_env_spec.py +4 -0
  8. hpcflow/data/scripts/env_specifier_test/v1/input_file_generator_basic.py +4 -0
  9. hpcflow/data/scripts/env_specifier_test/v1/main_script_test_direct_in_direct_out.py +7 -0
  10. hpcflow/data/scripts/env_specifier_test/v1/output_file_parser_basic.py +4 -0
  11. hpcflow/data/scripts/env_specifier_test/v2/main_script_test_direct_in_direct_out.py +7 -0
  12. hpcflow/data/scripts/input_file_generator_basic.py +3 -0
  13. hpcflow/data/scripts/input_file_generator_basic_FAIL.py +3 -0
  14. hpcflow/data/scripts/input_file_generator_test_stdout_stderr.py +8 -0
  15. hpcflow/data/scripts/main_script_test_direct_in.py +3 -0
  16. hpcflow/data/scripts/main_script_test_direct_in_direct_out_2.py +6 -0
  17. hpcflow/data/scripts/main_script_test_direct_in_direct_out_2_fail_allowed.py +6 -0
  18. hpcflow/data/scripts/main_script_test_direct_in_direct_out_2_fail_allowed_group.py +7 -0
  19. hpcflow/data/scripts/main_script_test_direct_in_direct_out_3.py +6 -0
  20. hpcflow/data/scripts/main_script_test_direct_in_group_direct_out_3.py +6 -0
  21. hpcflow/data/scripts/main_script_test_direct_in_group_one_fail_direct_out_3.py +6 -0
  22. hpcflow/data/scripts/main_script_test_hdf5_in_obj_2.py +12 -0
  23. hpcflow/data/scripts/main_script_test_json_out_FAIL.py +3 -0
  24. hpcflow/data/scripts/main_script_test_shell_env_vars.py +12 -0
  25. hpcflow/data/scripts/main_script_test_std_out_std_err.py +6 -0
  26. hpcflow/data/scripts/output_file_parser_basic.py +3 -0
  27. hpcflow/data/scripts/output_file_parser_basic_FAIL.py +7 -0
  28. hpcflow/data/scripts/output_file_parser_test_stdout_stderr.py +8 -0
  29. hpcflow/data/scripts/script_exit_test.py +5 -0
  30. hpcflow/data/template_components/environments.yaml +1 -1
  31. hpcflow/sdk/__init__.py +5 -0
  32. hpcflow/sdk/app.py +150 -89
  33. hpcflow/sdk/cli.py +263 -84
  34. hpcflow/sdk/cli_common.py +99 -5
  35. hpcflow/sdk/config/callbacks.py +38 -1
  36. hpcflow/sdk/config/config.py +102 -13
  37. hpcflow/sdk/config/errors.py +19 -5
  38. hpcflow/sdk/config/types.py +3 -0
  39. hpcflow/sdk/core/__init__.py +25 -1
  40. hpcflow/sdk/core/actions.py +914 -262
  41. hpcflow/sdk/core/cache.py +76 -34
  42. hpcflow/sdk/core/command_files.py +14 -128
  43. hpcflow/sdk/core/commands.py +35 -6
  44. hpcflow/sdk/core/element.py +122 -50
  45. hpcflow/sdk/core/errors.py +58 -2
  46. hpcflow/sdk/core/execute.py +207 -0
  47. hpcflow/sdk/core/loop.py +408 -50
  48. hpcflow/sdk/core/loop_cache.py +4 -4
  49. hpcflow/sdk/core/parameters.py +382 -37
  50. hpcflow/sdk/core/run_dir_files.py +13 -40
  51. hpcflow/sdk/core/skip_reason.py +7 -0
  52. hpcflow/sdk/core/task.py +119 -30
  53. hpcflow/sdk/core/task_schema.py +68 -0
  54. hpcflow/sdk/core/test_utils.py +66 -27
  55. hpcflow/sdk/core/types.py +54 -1
  56. hpcflow/sdk/core/utils.py +78 -7
  57. hpcflow/sdk/core/workflow.py +1538 -336
  58. hpcflow/sdk/data/workflow_spec_schema.yaml +2 -0
  59. hpcflow/sdk/demo/cli.py +7 -0
  60. hpcflow/sdk/helper/cli.py +1 -0
  61. hpcflow/sdk/log.py +42 -15
  62. hpcflow/sdk/persistence/base.py +405 -53
  63. hpcflow/sdk/persistence/json.py +177 -52
  64. hpcflow/sdk/persistence/pending.py +237 -69
  65. hpcflow/sdk/persistence/store_resource.py +3 -2
  66. hpcflow/sdk/persistence/types.py +15 -4
  67. hpcflow/sdk/persistence/zarr.py +928 -81
  68. hpcflow/sdk/submission/jobscript.py +1408 -489
  69. hpcflow/sdk/submission/schedulers/__init__.py +40 -5
  70. hpcflow/sdk/submission/schedulers/direct.py +33 -19
  71. hpcflow/sdk/submission/schedulers/sge.py +51 -16
  72. hpcflow/sdk/submission/schedulers/slurm.py +44 -16
  73. hpcflow/sdk/submission/schedulers/utils.py +7 -2
  74. hpcflow/sdk/submission/shells/base.py +68 -20
  75. hpcflow/sdk/submission/shells/bash.py +222 -129
  76. hpcflow/sdk/submission/shells/powershell.py +200 -150
  77. hpcflow/sdk/submission/submission.py +852 -119
  78. hpcflow/sdk/submission/types.py +18 -21
  79. hpcflow/sdk/typing.py +24 -5
  80. hpcflow/sdk/utils/arrays.py +71 -0
  81. hpcflow/sdk/utils/deferred_file.py +55 -0
  82. hpcflow/sdk/utils/hashing.py +16 -0
  83. hpcflow/sdk/utils/patches.py +12 -0
  84. hpcflow/sdk/utils/strings.py +33 -0
  85. hpcflow/tests/api/test_api.py +32 -0
  86. hpcflow/tests/conftest.py +19 -0
  87. hpcflow/tests/data/multi_path_sequences.yaml +29 -0
  88. hpcflow/tests/data/workflow_test_run_abort.yaml +34 -35
  89. hpcflow/tests/schedulers/sge/test_sge_submission.py +36 -0
  90. hpcflow/tests/scripts/test_input_file_generators.py +282 -0
  91. hpcflow/tests/scripts/test_main_scripts.py +821 -70
  92. hpcflow/tests/scripts/test_non_snippet_script.py +46 -0
  93. hpcflow/tests/scripts/test_ouput_file_parsers.py +353 -0
  94. hpcflow/tests/shells/wsl/test_wsl_submission.py +6 -0
  95. hpcflow/tests/unit/test_action.py +176 -0
  96. hpcflow/tests/unit/test_app.py +20 -0
  97. hpcflow/tests/unit/test_cache.py +46 -0
  98. hpcflow/tests/unit/test_cli.py +133 -0
  99. hpcflow/tests/unit/test_config.py +122 -1
  100. hpcflow/tests/unit/test_element_iteration.py +47 -0
  101. hpcflow/tests/unit/test_jobscript_unit.py +757 -0
  102. hpcflow/tests/unit/test_loop.py +1332 -27
  103. hpcflow/tests/unit/test_meta_task.py +325 -0
  104. hpcflow/tests/unit/test_multi_path_sequences.py +229 -0
  105. hpcflow/tests/unit/test_parameter.py +13 -0
  106. hpcflow/tests/unit/test_persistence.py +190 -8
  107. hpcflow/tests/unit/test_run.py +109 -3
  108. hpcflow/tests/unit/test_run_directories.py +29 -0
  109. hpcflow/tests/unit/test_shell.py +20 -0
  110. hpcflow/tests/unit/test_submission.py +5 -76
  111. hpcflow/tests/unit/utils/test_arrays.py +40 -0
  112. hpcflow/tests/unit/utils/test_deferred_file_writer.py +34 -0
  113. hpcflow/tests/unit/utils/test_hashing.py +65 -0
  114. hpcflow/tests/unit/utils/test_patches.py +5 -0
  115. hpcflow/tests/unit/utils/test_redirect_std.py +50 -0
  116. hpcflow/tests/workflows/__init__.py +0 -0
  117. hpcflow/tests/workflows/test_directory_structure.py +31 -0
  118. hpcflow/tests/workflows/test_jobscript.py +332 -0
  119. hpcflow/tests/workflows/test_run_status.py +198 -0
  120. hpcflow/tests/workflows/test_skip_downstream.py +696 -0
  121. hpcflow/tests/workflows/test_submission.py +140 -0
  122. hpcflow/tests/workflows/test_workflows.py +142 -2
  123. hpcflow/tests/workflows/test_zip.py +18 -0
  124. hpcflow/viz_demo.ipynb +6587 -3
  125. {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a199.dist-info}/METADATA +7 -4
  126. hpcflow_new2-0.2.0a199.dist-info/RECORD +221 -0
  127. hpcflow_new2-0.2.0a190.dist-info/RECORD +0 -165
  128. {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a199.dist-info}/LICENSE +0 -0
  129. {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a199.dist-info}/WHEEL +0 -0
  130. {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a199.dist-info}/entry_points.txt +0 -0
@@ -4,28 +4,53 @@ Main workflow model.
 
  from __future__ import annotations
  from collections import defaultdict
+ from collections.abc import Callable
  from contextlib import contextmanager, nullcontext
  import copy
  from dataclasses import dataclass, field
 
+ from functools import wraps
+ import os
  from pathlib import Path
  import random
+ import shutil
  import string
  from threading import Thread
  import time
- from typing import overload, cast, TYPE_CHECKING
+ from typing import overload, cast, TYPE_CHECKING, TypeVar
+ from typing_extensions import ParamSpec, Concatenate
+
  from uuid import uuid4
  from warnings import warn
  from fsspec.implementations.local import LocalFileSystem # type: ignore
  from fsspec.implementations.zip import ZipFileSystem # type: ignore
  import numpy as np
  from fsspec.core import url_to_fs # type: ignore
+ from rich import print as rich_print
  import rich.console
+ import rich.panel
+ import rich.table
+ import rich.text
+ import rich.box
+
 
+ from hpcflow.sdk import app
  from hpcflow.sdk.typing import hydrate
- from hpcflow.sdk.core import ALL_TEMPLATE_FORMATS, ABORT_EXIT_CODE
+ from hpcflow.sdk.config.errors import (
+     ConfigNonConfigurableError,
+     UnknownMetaTaskConstitutiveSchema,
+ )
+ from hpcflow.sdk.core import (
+     ALL_TEMPLATE_FORMATS,
+     ABORT_EXIT_CODE,
+     RUN_DIR_ARR_FILL,
+     SKIPPED_EXIT_CODE,
+     NO_COMMANDS_EXIT_CODE,
+ )
  from hpcflow.sdk.core.app_aware import AppAware
  from hpcflow.sdk.core.enums import EARStatus
+ from hpcflow.sdk.core.skip_reason import SkipReason
+ from hpcflow.sdk.core.cache import ObjectCache
  from hpcflow.sdk.core.loop_cache import LoopCache, LoopIndex
  from hpcflow.sdk.log import TimeIt
  from hpcflow.sdk.persistence import store_cls_from_str
@@ -35,18 +60,22 @@ from hpcflow.sdk.persistence.utils import ask_pw_on_auth_exc, infer_store
  from hpcflow.sdk.submission.jobscript import (
      generate_EAR_resource_map,
      group_resource_map_into_jobscripts,
-     jobscripts_to_list,
+     is_jobscript_array,
      merge_jobscripts_across_tasks,
+     resolve_jobscript_blocks,
      resolve_jobscript_dependencies,
  )
  from hpcflow.sdk.submission.enums import JobscriptElementState
  from hpcflow.sdk.submission.schedulers.direct import DirectScheduler
+ from hpcflow.sdk.submission.submission import Submission
  from hpcflow.sdk.core.json_like import ChildObjectSpec, JSONLike
+ from hpcflow.sdk.utils.strings import shorten_list_str
  from hpcflow.sdk.core.utils import (
      read_JSON_file,
      read_JSON_string,
      read_YAML_str,
      read_YAML_file,
+     redirect_std_to_file,
      replace_items,
      current_timestamp,
      normalise_timestamp,
@@ -58,6 +87,7 @@ from hpcflow.sdk.core.errors import (
      OutputFileParserNoOutputError,
      RunNotAbortableError,
      SubmissionFailure,
+     UnsetParameterDataErrorBase,
      WorkflowSubmissionFailure,
  )
 
@@ -70,7 +100,7 @@ if TYPE_CHECKING:
      import psutil
      from rich.status import Status
      from ..typing import DataIndex, ParamSource, PathLike, TemplateComponents
-     from .actions import ElementActionRun
+     from .actions import ElementActionRun, UnsetParamTracker
      from .element import Element, ElementIteration
      from .loop import Loop, WorkflowLoop
      from .object_list import ObjectList, ResourceList, WorkflowLoopList, WorkflowTaskList
@@ -82,6 +112,8 @@ if TYPE_CHECKING:
          Pending,
          Resources,
          WorkflowTemplateTaskData,
+         WorkflowTemplateElementSetData,
+         BlockActionKey,
      )
      from ..submission.submission import Submission
      from ..submission.jobscript import (
@@ -97,10 +129,15 @@ if TYPE_CHECKING:
          StoreEAR,
      )
      from ..persistence.types import TemplateMeta
+     from .json_like import JSONed
 
  #: Convenience alias
  _TemplateComponents: TypeAlias = "dict[str, ObjectList[JSONLike]]"
 
+ P = ParamSpec("P")
+ T = TypeVar("T")
+ S = TypeVar("S", bound="Workflow")
+
 
  @dataclass
  class _Pathway:
@@ -202,6 +239,7 @@ class WorkflowTemplate(JSONLike):
      workflow: Workflow | None = None
      #: Template-level resources to apply to all tasks as default values.
      resources: Resources = None
+     config: dict = field(default_factory=lambda: {})
      #: The execution environments to use.
      environments: Mapping[str, Mapping[str, Any]] | None = None
      #: The environment presets to use.
@@ -216,6 +254,34 @@ class WorkflowTemplate(JSONLike):
      merge_envs: bool = True
 
      def __post_init__(self) -> None:
+
+         # TODO: in what scenario is the reindex required? are loops initialised?
+
+         # replace metatasks with tasks
+         new_tasks: list[Task] = []
+         do_reindex = False
+         reindex = {}
+         for task_idx, i in enumerate(self.tasks):
+             if isinstance(i, app.MetaTask):
+                 do_reindex = True
+                 tasks_from_meta = copy.deepcopy(i.tasks)
+                 reindex[task_idx] = [
+                     len(new_tasks) + i for i in range(len(tasks_from_meta))
+                 ]
+                 new_tasks.extend(tasks_from_meta)
+             else:
+                 reindex[task_idx] = [len(new_tasks)]
+                 new_tasks.append(i)
+         if do_reindex:
+             if self.loops:
+                 for loop_idx, loop in enumerate(cast("list[dict[str, Any]]", self.loops)):
+                     loop["tasks"] = [j for i in loop["tasks"] for j in reindex[i]]
+                     term_task = loop.get("termination_task")
+                     if term_task is not None:
+                         loop["termination_task"] = reindex[term_task][0]
+
+             self.tasks = new_tasks
+
          resources = self._app.ResourceList.normalise(self.resources)
          self.resources = resources
          self._set_parent_refs()
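The `__post_init__` additions above expand any meta-task into its constituent tasks and build a reindex map so that loop task indices still point at the right tasks after expansion. A minimal standalone sketch of that remapping idea follows; it is illustrative only, and the names used are hypothetical rather than part of the hpcflow API.

# Each original task index maps to the list of task indices it expands to.
# Here task 1 is assumed to be a meta-task that expands into three tasks.
expansion_sizes = {0: 1, 1: 3, 2: 1}

reindex: dict[int, list[int]] = {}
count = 0
for task_idx, size in expansion_sizes.items():
    reindex[task_idx] = list(range(count, count + size))
    count += size

# A loop originally defined over tasks [1, 2] now spans the expanded indices:
loop_tasks = [j for i in [1, 2] for j in reindex[i]]
assert loop_tasks == [1, 2, 3, 4]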
@@ -235,6 +301,13 @@ class WorkflowTemplate(JSONLike):
          if self.doc and not isinstance(self.doc, list):
              self.doc = [self.doc]
 
+         if self.config:
+             # don't do a full validation (which would require loading the config file),
+             # just check all specified keys are configurable:
+             bad_keys = set(self.config) - set(self._app.config_options._configurable_keys)
+             if bad_keys:
+                 raise ConfigNonConfigurableError(name=bad_keys)
+
      @property
      def _resources(self) -> ResourceList:
          res = self.resources
@@ -324,22 +397,121 @@ class WorkflowTemplate(JSONLike):
324
397
  @classmethod
325
398
  @TimeIt.decorator
326
399
  def _from_data(cls, data: dict[str, Any]) -> WorkflowTemplate:
327
- task_dat: WorkflowTemplateTaskData
328
- # use element_sets if not already:
329
- for task_idx, task_dat in enumerate(data["tasks"]):
330
- schema = task_dat.pop("schema")
331
- schema_list: list = schema if isinstance(schema, list) else [schema]
332
- if "element_sets" in task_dat:
333
- # just update the schema to a list:
334
- data["tasks"][task_idx]["schema"] = schema_list
335
- else:
336
- # add a single element set, and update the schema to a list:
337
- out_labels = task_dat.pop("output_labels", [])
338
- data["tasks"][task_idx] = {
339
- "schema": schema_list,
340
- "element_sets": [task_dat],
341
- "output_labels": out_labels,
342
- }
400
+ def _normalise_task_parametrisation(task_lst: list[WorkflowTemplateTaskData]):
401
+ """
402
+ For each dict in a list of task parametrisations, ensure the `schema` key is
403
+ a list of values, and ensure `element_sets` are defined.
404
+
405
+ This mutates `task_lst`.
406
+
407
+ """
408
+ # use element_sets if not already:
409
+ task_dat: WorkflowTemplateTaskData
410
+ for task_idx, task_dat in enumerate(task_lst):
411
+ schema = task_dat.pop("schema")
412
+ schema_list: list = schema if isinstance(schema, list) else [schema]
413
+ if "element_sets" in task_dat:
414
+ # just update the schema to a list:
415
+ task_lst[task_idx]["schema"] = schema_list
416
+ else:
417
+ # add a single element set, and update the schema to a list:
418
+ out_labels = task_dat.pop("output_labels", [])
419
+ es_dat = cast("WorkflowTemplateElementSetData", task_dat)
420
+ new_task_dat: WorkflowTemplateTaskData = {
421
+ "schema": schema_list,
422
+ "element_sets": [es_dat],
423
+ "output_labels": out_labels,
424
+ }
425
+ task_lst[task_idx] = new_task_dat
426
+ # move sequences with `paths` (note: plural) to multi_path_sequences:
427
+ for elem_set in task_lst[task_idx]["element_sets"]:
428
+ new_mps = []
429
+ seqs = elem_set.get("sequences", [])
430
+ seqs = list(seqs) # copy
431
+ # loop in reverse so indices for pop are valid:
432
+ for seq_idx, seq_dat in zip(range(len(seqs) - 1, -1, -1), seqs[::-1]):
433
+ if "paths" in seq_dat: # (note: plural)
434
+ # move to a multi-path sequence:
435
+ new_mps.append(elem_set["sequences"].pop(seq_idx))
436
+ elem_set.setdefault("multi_path_sequences", []).extend(new_mps[::-1])
437
+
438
+ meta_tasks = data.pop("meta_tasks", {})
439
+ if meta_tasks:
440
+ for i in list(meta_tasks):
441
+ _normalise_task_parametrisation(meta_tasks[i])
442
+ new_task_dat: list[WorkflowTemplateTaskData] = []
443
+ reindex = {}
444
+ for task_idx, task_dat in enumerate(data["tasks"]):
445
+ if meta_task_dat := meta_tasks.get(task_dat["schema"]):
446
+ reindex[task_idx] = [
447
+ len(new_task_dat) + i for i in range(len(meta_task_dat))
448
+ ]
449
+
450
+ all_schema_names = [j for i in meta_task_dat for j in i["schema"]]
451
+
452
+ # update any parametrisation provided in the task list:
453
+ base_data = copy.deepcopy(meta_task_dat)
454
+
455
+ # any other keys in `task_dat` should be mappings whose keys are
456
+ # the schema name (within the meta task) optionally suffixed by
457
+ # a period and the element set index to which the updates should be
458
+ # copied (no integer suffix indicates the zeroth element set):
459
+ for k, v in task_dat.items():
460
+ if k == "schema":
461
+ continue
462
+
463
+ for elem_set_id, dat in v.items():
464
+
465
+ elem_set_id_split = elem_set_id.split(".")
466
+ try:
467
+ es_idx = int(elem_set_id_split[-1])
468
+ schema_name = ".".join(elem_set_id_split[:-1])
469
+ except ValueError:
470
+ es_idx = 0
471
+ schema_name = ".".join(elem_set_id_split)
472
+ schema_name = schema_name.strip(".")
473
+
474
+ # check valid schema name:
475
+ if schema_name not in all_schema_names:
476
+ raise UnknownMetaTaskConstitutiveSchema(
477
+ f"Task schema with objective {schema_name!r} is not "
478
+ f"part of the meta-task with objective "
479
+ f"{task_dat['schema']!r}. The constitutive schemas of"
480
+ f" this meta-task have objectives: "
481
+ f"{all_schema_names!r}."
482
+ )
483
+
484
+ # copy `dat` to the correct schema and element set in the
485
+ # meta-task:
486
+ for s_idx, s in enumerate(base_data):
487
+ if s["schema"] == [schema_name]:
488
+ if k == "inputs":
489
+ # special case; merge inputs
490
+ base_data[s_idx]["element_sets"][es_idx][
491
+ k
492
+ ].update(dat)
493
+ else:
494
+ # just overwrite
495
+ base_data[s_idx]["element_sets"][es_idx][k] = dat
496
+
497
+ new_task_dat.extend(base_data)
498
+
499
+ else:
500
+ reindex[task_idx] = [len(new_task_dat)]
501
+ new_task_dat.append(task_dat)
502
+
503
+ data["tasks"] = new_task_dat
504
+
505
+ if loops := data.get("loops"):
506
+ for loop_idx, loop in enumerate(loops):
507
+ loops[loop_idx]["tasks"] = [
508
+ j for i in loop["tasks"] for j in reindex[i]
509
+ ]
510
+ term_task = loop.get("termination_task")
511
+ if term_task is not None:
512
+ loops[loop_idx]["termination_task"] = reindex[term_task][0]
513
+
514
+ _normalise_task_parametrisation(data["tasks"])
343
515
 
344
516
  # extract out any template components:
345
517
  # TODO: TypedDict for data
@@ -368,7 +540,24 @@ class WorkflowTemplate(JSONLike):
368
540
  )
369
541
  cls._app.task_schemas.add_objects(task_schemas, skip_duplicates=True)
370
542
 
371
- return cls.from_json_like(data, shared_data=cls._app._shared_data)
543
+ if mts_dat := tcs.pop("meta_task_schemas", []):
544
+ meta_ts = [
545
+ cls._app.MetaTaskSchema.from_json_like(
546
+ i, shared_data=cls._app.template_components
547
+ )
548
+ for i in mts_dat
549
+ ]
550
+ cls._app.task_schemas.add_objects(meta_ts, skip_duplicates=True)
551
+
552
+ wkt = cls.from_json_like(data, shared_data=cls._app._shared_data)
553
+
554
+ # print(f"WorkflowTemplate._from_data: {wkt=!r}")
555
+ # TODO: what is this for!?
556
+ # for idx, task in enumerate(wkt.tasks):
557
+ # if isinstance(task.schema, cls._app.MetaTaskSchema):
558
+ # print(f"{task=!r}")
559
+ # wkt.tasks[idx] = cls._app.MetaTask(schema=task.schema, tasks=task.tasks)
560
+ return wkt
372
561
 
373
562
  @classmethod
374
563
  @TimeIt.decorator
@@ -571,6 +760,25 @@ class _IterationData:
      idx: int
 
 
+ def load_workflow_config(
+     func: Callable[Concatenate[S, P], T],
+ ) -> Callable[Concatenate[S, P], T]:
+     """Decorator to apply workflow-level config items during execution of a Workflow
+     method."""
+
+     @wraps(func)
+     def wrapped(self: S, *args: P.args, **kwargs: P.kwargs) -> T:
+
+         updates = self.template.config
+         if updates:
+             with self._app.config._with_updates(updates):
+                 return func(self, *args, **kwargs)
+         else:
+             return func(self, *args, **kwargs)
+
+     return wrapped
+
+
  class Workflow(AppAware):
      """
      A concrete workflow.
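The `load_workflow_config` decorator added above uses `ParamSpec` and `Concatenate` so that wrapped `Workflow` methods keep their signatures for type checkers while config updates are applied around the call. A minimal sketch of the same pattern follows, under assumed, hypothetical names that are not part of the hpcflow API.

from __future__ import annotations

from collections.abc import Callable
from functools import wraps
from typing import TypeVar

from typing_extensions import Concatenate, ParamSpec

P = ParamSpec("P")
T = TypeVar("T")


class Base:
    def __init__(self) -> None:
        self.config = {"verbose": True}  # stand-in for template-level config


S = TypeVar("S", bound=Base)


def with_config(func: Callable[Concatenate[S, P], T]) -> Callable[Concatenate[S, P], T]:
    # Apply per-instance config around the call; the wrapped signature is preserved.
    @wraps(func)
    def wrapped(self: S, *args: P.args, **kwargs: P.kwargs) -> T:
        if self.config.get("verbose"):
            print(f"calling {func.__name__}")
        return func(self, *args, **kwargs)

    return wrapped


class Job(Base):
    @with_config
    def run(self, n: int) -> int:
        return n * 2


print(Job().run(3))  # prints "calling run", then 6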
@@ -630,9 +838,18 @@ class Workflow(AppAware):
630
838
  self._store = store_cls(self._app, self, self.path, fs)
631
839
  self._in_batch_mode = False # flag to track when processing batch updates
632
840
 
841
+ self._use_merged_parameters_cache = False
842
+ self._merged_parameters_cache: dict[
843
+ tuple[str | None, tuple[tuple[str, tuple[int, ...] | int], ...]], Any
844
+ ] = {}
845
+
633
846
  # store indices of updates during batch update, so we can revert on failure:
634
847
  self._pending = self._get_empty_pending()
635
848
 
849
+ # reassigned within `ElementActionRun.raise_on_failure_threshold` context manager:
850
+ self._is_tracking_unset: bool = False
851
+ self._tracked_unset: dict[str, UnsetParamTracker] | None = None
852
+
636
853
  def reload(self) -> Self:
637
854
  """Reload the workflow from disk."""
638
855
  return self.__class__(self.url)
@@ -743,7 +960,12 @@ class Workflow(AppAware):
743
960
  f"{len(template.loops)} ({loop.name!r})"
744
961
  )
745
962
  wk._add_loop(loop, cache=cache, status=status)
746
- except Exception:
963
+ if status:
964
+ status.update(
965
+ f"Added {len(template.loops)} loops. "
966
+ f"Committing to store..."
967
+ )
968
+ except (Exception, NotImplementedError):
747
969
  if status:
748
970
  status.stop()
749
971
  raise
@@ -820,6 +1042,7 @@ class Workflow(AppAware):
820
1042
  ts_name_fmt: str | None = None,
821
1043
  store_kwargs: dict[str, Any] | None = None,
822
1044
  variables: dict[str, str] | None = None,
1045
+ status: Status | None = None,
823
1046
  ) -> Workflow:
824
1047
  """Generate from a YAML string.
825
1048
 
@@ -864,6 +1087,7 @@ class Workflow(AppAware):
864
1087
  ts_fmt,
865
1088
  ts_name_fmt,
866
1089
  store_kwargs,
1090
+ status,
867
1091
  )
868
1092
 
869
1093
  @classmethod
@@ -1066,6 +1290,7 @@ class Workflow(AppAware):
1066
1290
  tasks: list[Task] | None = None,
1067
1291
  loops: list[Loop] | None = None,
1068
1292
  resources: Resources = None,
1293
+ config: dict | None = None,
1069
1294
  path: PathLike | None = None,
1070
1295
  workflow_name: str | None = None,
1071
1296
  overwrite: bool = False,
@@ -1089,6 +1314,9 @@ class Workflow(AppAware):
1089
1314
  Mapping of action scopes to resource requirements, to be applied to all
1090
1315
  element sets in the workflow. `resources` specified in an element set take
1091
1316
  precedence of those defined here for the whole workflow.
1317
+ config:
1318
+ Configuration items that should be set whenever the resulting workflow is
1319
+ loaded. This includes config items that apply during workflow execution.
1092
1320
  path:
1093
1321
  The directory in which the workflow will be generated. The current directory
1094
1322
  if not specified.
@@ -1116,6 +1344,7 @@ class Workflow(AppAware):
1116
1344
  tasks=tasks or [],
1117
1345
  loops=loops or [],
1118
1346
  resources=resources,
1347
+ config=config or {},
1119
1348
  )
1120
1349
  return cls.from_template(
1121
1350
  template,
@@ -1248,6 +1477,7 @@ class Workflow(AppAware):
1248
1477
  self._store.add_loop(
1249
1478
  loop_template=cast("Mapping", loop_js),
1250
1479
  iterable_parameters=wk_loop.iterable_parameters,
1480
+ output_parameters=wk_loop.output_parameters,
1251
1481
  parents=wk_loop.parents,
1252
1482
  num_added_iterations=wk_loop.num_added_iterations,
1253
1483
  iter_IDs=iter_IDs,
@@ -1275,7 +1505,7 @@ class Workflow(AppAware):
1275
1505
  status.update(
1276
1506
  f"{status_prev}: iteration {iter_idx + 2}/{loop.num_iterations}."
1277
1507
  )
1278
- new_wk_loop.add_iteration(cache=cache_)
1508
+ new_wk_loop.add_iteration(cache=cache_, status=status)
1279
1509
 
1280
1510
  def add_loop(self, loop: Loop) -> None:
1281
1511
  """Add a loop to a subset of workflow tasks."""
@@ -1360,6 +1590,7 @@ class Workflow(AppAware):
1360
1590
  return self._template
1361
1591
 
1362
1592
  @property
1593
+ @TimeIt.decorator
1363
1594
  def tasks(self) -> WorkflowTaskList:
1364
1595
  """
1365
1596
  The tasks in this workflow.
@@ -1410,12 +1641,14 @@ class Workflow(AppAware):
1410
1641
  repack_iteration_tuples(loop_dat["num_added_iterations"])
1411
1642
  ),
1412
1643
  iterable_parameters=loop_dat["iterable_parameters"],
1644
+ output_parameters=loop_dat["output_parameters"],
1413
1645
  )
1414
1646
  for idx, loop_dat in self._store.get_loops().items()
1415
1647
  )
1416
1648
  return self._loops
1417
1649
 
1418
1650
  @property
1651
+ @TimeIt.decorator
1419
1652
  def submissions(self) -> list[Submission]:
1420
1653
  """
1421
1654
  The job submissions done by this workflow.
@@ -1587,56 +1820,70 @@ class Workflow(AppAware):
1587
1820
 
1588
1821
  @TimeIt.decorator
1589
1822
  def get_EARs_from_IDs(
1590
- self, ids: Iterable[int] | int
1591
- ) -> list[ElementActionRun] | ElementActionRun:
1823
+ self, ids: Iterable[int] | int, as_dict: bool = False
1824
+ ) -> list[ElementActionRun] | dict[int, ElementActionRun] | ElementActionRun:
1592
1825
  """Get element action run objects from a list of IDs."""
1593
1826
  id_lst = [ids] if isinstance(ids, int) else list(ids)
1594
- self._app.persistence_logger.debug(f"get_EARs_from_IDs: id_lst={id_lst!r}")
1595
1827
 
1596
- store_EARs = self.get_store_EARs(id_lst)
1597
- store_iters = self.get_store_element_iterations(
1598
- ear.elem_iter_ID for ear in store_EARs
1599
- )
1600
- store_elems = self.get_store_elements(it.element_ID for it in store_iters)
1601
- store_tasks = self.get_store_tasks(el.task_ID for el in store_elems)
1828
+ with self._store.cached_load(), self._store.cache_ctx():
1602
1829
 
1603
- # to allow for bulk retrieval of elements/iterations
1604
- element_idx_by_task: dict[int, set[int]] = defaultdict(set)
1605
- iter_idx_by_task_elem: dict[int, dict[int, set[int]]] = defaultdict(
1606
- lambda: defaultdict(set)
1607
- )
1830
+ self._app.persistence_logger.debug(
1831
+ f"get_EARs_from_IDs: {len(id_lst)} EARs: {shorten_list_str(id_lst)}."
1832
+ )
1608
1833
 
1609
- index_paths: list[Workflow._IndexPath3] = []
1610
- for rn, it, el, tk in zip(store_EARs, store_iters, store_elems, store_tasks):
1611
- act_idx = rn.action_idx
1612
- run_idx = it.EAR_IDs[act_idx].index(rn.id_) if it.EAR_IDs is not None else -1
1613
- iter_idx = el.iteration_IDs.index(it.id_)
1614
- elem_idx = tk.element_IDs.index(el.id_)
1615
- index_paths.append(
1616
- Workflow._IndexPath3(run_idx, act_idx, iter_idx, elem_idx, tk.index)
1834
+ store_EARs = self.get_store_EARs(id_lst)
1835
+ store_iters = self.get_store_element_iterations(
1836
+ ear.elem_iter_ID for ear in store_EARs
1617
1837
  )
1618
- element_idx_by_task[tk.index].add(elem_idx)
1619
- iter_idx_by_task_elem[tk.index][elem_idx].add(iter_idx)
1620
-
1621
- # retrieve elements/iterations:
1622
- iters = {
1623
- task_idx: {
1624
- elem_i.index: {
1625
- iter_idx: elem_i.iterations[iter_idx]
1626
- for iter_idx in iter_idx_by_task_elem[task_idx][elem_i.index]
1838
+ store_elems = self.get_store_elements(it.element_ID for it in store_iters)
1839
+ store_tasks = self.get_store_tasks(el.task_ID for el in store_elems)
1840
+
1841
+ # to allow for bulk retrieval of elements/iterations
1842
+ element_idx_by_task: dict[int, set[int]] = defaultdict(set)
1843
+ iter_idx_by_task_elem: dict[int, dict[int, set[int]]] = defaultdict(
1844
+ lambda: defaultdict(set)
1845
+ )
1846
+
1847
+ index_paths: list[Workflow._IndexPath3] = []
1848
+ for rn, it, el, tk in zip(store_EARs, store_iters, store_elems, store_tasks):
1849
+ act_idx = rn.action_idx
1850
+ run_idx = (
1851
+ it.EAR_IDs[act_idx].index(rn.id_) if it.EAR_IDs is not None else -1
1852
+ )
1853
+ iter_idx = el.iteration_IDs.index(it.id_)
1854
+ elem_idx = tk.element_IDs.index(el.id_)
1855
+ index_paths.append(
1856
+ Workflow._IndexPath3(run_idx, act_idx, iter_idx, elem_idx, tk.index)
1857
+ )
1858
+ element_idx_by_task[tk.index].add(elem_idx)
1859
+ iter_idx_by_task_elem[tk.index][elem_idx].add(iter_idx)
1860
+
1861
+ # retrieve elements/iterations:
1862
+ iters = {
1863
+ task_idx: {
1864
+ elem_i.index: {
1865
+ iter_idx: elem_i.iterations[iter_idx]
1866
+ for iter_idx in iter_idx_by_task_elem[task_idx][elem_i.index]
1867
+ }
1868
+ for elem_i in self.tasks[task_idx].elements[list(elem_idxes)]
1627
1869
  }
1628
- for elem_i in self.tasks[task_idx].elements[list(elem_idxes)]
1870
+ for task_idx, elem_idxes in element_idx_by_task.items()
1629
1871
  }
1630
- for task_idx, elem_idxes in element_idx_by_task.items()
1631
- }
1632
1872
 
1633
- result = [
1634
- iters[path.task][path.elem][path.iter].actions[path.act].runs[path.run]
1635
- for path in index_paths
1636
- ]
1637
- if isinstance(ids, int):
1638
- return result[0]
1639
- return result
1873
+ result = {}
1874
+ for path in index_paths:
1875
+ run = (
1876
+ iters[path.task][path.elem][path.iter]
1877
+ .actions[path.act]
1878
+ .runs[path.run]
1879
+ )
1880
+ result[run.id_] = run
1881
+
1882
+ if not as_dict:
1883
+ res_lst = list(result.values())
1884
+ return res_lst[0] if isinstance(ids, int) else res_lst
1885
+
1886
+ return result
1640
1887
 
1641
1888
  @TimeIt.decorator
1642
1889
  def get_all_elements(self) -> list[Element]:
@@ -1722,6 +1969,20 @@ class Workflow(AppAware):
1722
1969
  self._app.persistence_logger.info("exiting batch update")
1723
1970
  self._in_batch_mode = False
1724
1971
 
1972
+ @contextmanager
1973
+ def cached_merged_parameters(self):
1974
+ if self._use_merged_parameters_cache:
1975
+ yield
1976
+ else:
1977
+ try:
1978
+ self._app.logger.debug("entering merged-parameters cache.")
1979
+ self._use_merged_parameters_cache = True
1980
+ yield
1981
+ finally:
1982
+ self._app.logger.debug("exiting merged-parameters cache.")
1983
+ self._use_merged_parameters_cache = False
1984
+ self._merged_parameters_cache = {} # reset the cache
1985
+
1725
1986
  @classmethod
1726
1987
  def temporary_rename(cls, path: str, fs: AbstractFileSystem) -> str:
1727
1988
  """Rename an existing same-path workflow (directory) so we can restore it if
@@ -1883,7 +2144,7 @@ class Workflow(AppAware):
1883
2144
  if template.source_file:
1884
2145
  wk.artifacts_path.mkdir(exist_ok=False)
1885
2146
  src = Path(template.source_file)
1886
- wk.artifacts_path.joinpath(src.name).write_text(src.read_text())
2147
+ shutil.copy(src, wk.artifacts_path.joinpath(src.name))
1887
2148
 
1888
2149
  return wk
1889
2150
 
@@ -2193,7 +2454,11 @@ class Workflow(AppAware):
2193
2454
  """
2194
2455
  The total number of job submissions.
2195
2456
  """
2196
- return self._store._get_num_total_submissions()
2457
+ return (
2458
+ len(self._submissions)
2459
+ if self._submissions is not None
2460
+ else self._store._get_num_total_submissions()
2461
+ )
2197
2462
 
2198
2463
  @property
2199
2464
  def num_elements(self) -> int:
@@ -2276,22 +2541,26 @@ class Workflow(AppAware):
2276
2541
  for te in self._store.get_task_elements(task.insert_ID, idx_lst)
2277
2542
  ]
2278
2543
 
2279
- def set_EAR_submission_index(self, EAR_ID: int, sub_idx: int) -> None:
2280
- """Set the submission index of an EAR."""
2544
+ def set_EAR_start(
2545
+ self, run_id: int, run_dir: Path | None, port_number: int | None
2546
+ ) -> None:
2547
+ """Set the start time on an EAR."""
2548
+ self._app.logger.debug(f"Setting start for EAR ID {run_id!r}")
2281
2549
  with self._store.cached_load(), self.batch_update():
2282
- self._store.set_EAR_submission_index(EAR_ID, sub_idx)
2550
+ self._store.set_EAR_start(run_id, run_dir, port_number)
2283
2551
 
2284
- def set_EAR_start(self, EAR_ID: int) -> None:
2285
- """Set the start time on an EAR."""
2286
- self._app.logger.debug(f"Setting start for EAR ID {EAR_ID!r}")
2552
+ def set_multi_run_starts(
2553
+ self, run_ids: list[int], run_dirs: list[Path | None], port_number: int
2554
+ ) -> None:
2555
+ """Set the start time on multiple runs."""
2556
+ self._app.logger.debug(f"Setting start for multiple run IDs {run_ids!r}")
2287
2557
  with self._store.cached_load(), self.batch_update():
2288
- self._store.set_EAR_start(EAR_ID)
2558
+ self._store.set_multi_run_starts(run_ids, run_dirs, port_number)
2289
2559
 
2290
2560
  def set_EAR_end(
2291
2561
  self,
2292
- js_idx: int,
2293
- js_act_idx: int,
2294
- EAR_ID: int,
2562
+ block_act_key: BlockActionKey,
2563
+ run: ElementActionRun,
2295
2564
  exit_code: int,
2296
2565
  ) -> None:
2297
2566
  """Set the end time and exit code on an EAR.
@@ -2301,108 +2570,430 @@ class Workflow(AppAware):
2301
2570
 
2302
2571
  """
2303
2572
  self._app.logger.debug(
2304
- f"Setting end for EAR ID {EAR_ID!r} with exit code {exit_code!r}."
2573
+ f"Setting end for run ID {run.id_!r} with exit code {exit_code!r}."
2305
2574
  )
2306
- with self._store.cached_load():
2307
- EAR = self.get_EARs_from_IDs(EAR_ID)
2308
- with self.batch_update():
2309
- success = exit_code == 0 # TODO more sophisticated success heuristics
2310
- if EAR.action.abortable and exit_code == ABORT_EXIT_CODE:
2575
+ param_id: int | list[int] | None
2576
+ with self._store.cached_load(), self.batch_update():
2577
+ success = exit_code == 0 # TODO more sophisticated success heuristics
2578
+ if not run.skip:
2579
+
2580
+ is_aborted = False
2581
+ if run.action.abortable and exit_code == ABORT_EXIT_CODE:
2311
2582
  # the point of aborting an EAR is to continue with the workflow:
2583
+ is_aborted = True
2312
2584
  success = True
2313
2585
 
2314
- for IFG_i in EAR.action.input_file_generators:
2315
- inp_file = IFG_i.input_file
2316
- self._app.logger.debug(
2317
- f"Saving EAR input file: {inp_file.label!r} for EAR ID "
2318
- f"{EAR_ID!r}."
2319
- )
2320
- param_id = EAR.data_idx[f"input_files.{inp_file.label}"]
2321
-
2322
- file_paths = inp_file.value()
2323
- for path_i in (
2324
- file_paths if isinstance(file_paths, list) else [file_paths]
2325
- ):
2326
- self._set_file(
2327
- param_id=param_id,
2328
- store_contents=True, # TODO: make optional according to IFG
2329
- is_input=False,
2330
- path=Path(path_i).resolve(),
2586
+ run_dir = run.get_directory()
2587
+ if run_dir:
2588
+ assert isinstance(run_dir, Path)
2589
+ for IFG_i in run.action.input_file_generators:
2590
+ inp_file = IFG_i.input_file
2591
+ self._app.logger.debug(
2592
+ f"Saving EAR input file: {inp_file.label!r} for EAR ID "
2593
+ f"{run.id_!r}."
2331
2594
  )
2595
+ param_id = run.data_idx[f"input_files.{inp_file.label}"]
2332
2596
 
2333
- if EAR.action.script_data_out_has_files:
2334
- EAR._param_save(js_idx=js_idx, js_act_idx=js_act_idx)
2597
+ file_paths = inp_file.value(directory=run_dir)
2598
+ for path_i in (
2599
+ file_paths if isinstance(file_paths, list) else [file_paths]
2600
+ ):
2601
+ full_path = run_dir.joinpath(path_i)
2602
+ if not full_path.exists():
2603
+ self._app.logger.debug(
2604
+ f"expected input file {path_i!r} does not "
2605
+ f"exist, so setting run to an error state "
2606
+ f"(if not aborted)."
2607
+ )
2608
+ if not is_aborted and success is True:
2609
+ # this is unlikely to happen, but could happen
2610
+ # if the input file is deleted in between
2611
+ # the input file generator completing and this
2612
+ # code being run
2613
+ success = False
2614
+ exit_code = 1 # TODO more custom exit codes?
2615
+ else:
2616
+ self._set_file(
2617
+ param_id=param_id,
2618
+ store_contents=True, # TODO: make optional according to IFG
2619
+ is_input=False,
2620
+ path=full_path,
2621
+ )
2335
2622
 
2336
- # Save action-level files: (TODO: refactor with below for OFPs)
2337
- for save_file_j in EAR.action.save_files:
2338
- self._app.logger.debug(
2339
- f"Saving file: {save_file_j.label!r} for EAR ID " f"{EAR_ID!r}."
2340
- )
2341
- # We might be saving a file that is not a defined
2342
- # "output file"; this will avoid saving a reference in the
2343
- # parameter data in that case
2344
- param_id_j = EAR.data_idx.get(f"output_files.{save_file_j.label}")
2345
-
2346
- file_paths = save_file_j.value()
2347
- self._app.logger.debug(f"Saving output file paths: {file_paths!r}")
2348
- for path_i in (
2349
- file_paths if isinstance(file_paths, list) else [file_paths]
2350
- ):
2351
- self._set_file(
2352
- param_id=param_id_j,
2353
- store_contents=True,
2354
- is_input=False,
2355
- path=Path(path_i).resolve(),
2356
- clean_up=(save_file_j in EAR.action.clean_up),
2357
- )
2623
+ if run.action.script_data_out_has_files:
2624
+ try:
2625
+ run._param_save(block_act_key, run_dir)
2626
+ except FileNotFoundError:
2627
+ self._app.logger.debug(
2628
+ f"script did not generate an expected output parameter "
2629
+ f"file (block_act_key={block_act_key!r}), so setting run "
2630
+ f"to an error state (if not aborted)."
2631
+ )
2632
+ if not is_aborted and success is True:
2633
+ success = False
2634
+ exit_code = 1 # TODO more custom exit codes?
2358
2635
 
2359
- for OFP_i in EAR.action.output_file_parsers:
2360
- for save_file_j in OFP_i._save_files:
2636
+ # Save action-level files: (TODO: refactor with below for OFPs)
2637
+ for save_file_j in run.action.save_files:
2361
2638
  self._app.logger.debug(
2362
- f"Saving EAR output file: {save_file_j.label!r} for EAR ID "
2363
- f"{EAR_ID!r}."
2639
+ f"Saving file: {save_file_j.label!r} for EAR ID "
2640
+ f"{run.id_!r}."
2364
2641
  )
2365
- # We might be saving a file that is not a defined
2366
- # "output file"; this will avoid saving a reference in the
2367
- # parameter data in that case
2368
- param_id_j = EAR.data_idx.get(f"output_files.{save_file_j.label}")
2369
-
2370
- file_paths = save_file_j.value()
2642
+ try:
2643
+ param_id = run.data_idx[f"output_files.{save_file_j.label}"]
2644
+ except KeyError:
2645
+ # We might be saving a file that is not a defined
2646
+ # "output file"; this will avoid saving a reference in the
2647
+ # parameter data:
2648
+ param_id = None
2649
+
2650
+ file_paths = save_file_j.value(directory=run_dir)
2371
2651
  self._app.logger.debug(
2372
- f"Saving EAR output file paths: {file_paths!r}"
2652
+ f"Saving output file paths: {file_paths!r}"
2373
2653
  )
2654
+
2374
2655
  for path_i in (
2375
2656
  file_paths if isinstance(file_paths, list) else [file_paths]
2376
2657
  ):
2377
- self._set_file(
2378
- param_id=param_id_j,
2379
- store_contents=True, # TODO: make optional according to OFP
2380
- is_input=False,
2381
- path=Path(path_i).resolve(),
2382
- clean_up=(save_file_j in OFP_i.clean_up),
2658
+ full_path = run_dir.joinpath(path_i)
2659
+ if not full_path.exists():
2660
+ self._app.logger.debug(
2661
+ f"expected file to save {path_i!r} does not "
2662
+ f"exist, so setting run to an error state "
2663
+ f"(if not aborted)."
2664
+ )
2665
+ if not is_aborted and success is True:
2666
+ # this is unlikely to happen, but could happen
2667
+ # if the input file is deleted in between
2668
+ # the input file generator completing and this
2669
+ # code being run
2670
+ success = False
2671
+ exit_code = 1 # TODO more custom exit codes?
2672
+ else:
2673
+ self._set_file(
2674
+ param_id=param_id,
2675
+ store_contents=True,
2676
+ is_input=False,
2677
+ path=full_path,
2678
+ clean_up=(save_file_j in run.action.clean_up),
2679
+ )
2680
+
2681
+ for OFP_i in run.action.output_file_parsers:
2682
+ for save_file_j in OFP_i._save_files:
2683
+ self._app.logger.debug(
2684
+ f"Saving EAR output file: {save_file_j.label!r} for EAR ID "
2685
+ f"{run.id_!r}."
2686
+ )
2687
+ try:
2688
+ param_id = run.data_idx[
2689
+ f"output_files.{save_file_j.label}"
2690
+ ]
2691
+ except KeyError:
2692
+ # We might be saving a file that is not a defined
2693
+ # "output file"; this will avoid saving a reference in the
2694
+ # parameter data:
2695
+ param_id = None
2696
+
2697
+ file_paths = save_file_j.value(directory=run_dir)
2698
+ self._app.logger.debug(
2699
+ f"Saving EAR output file paths: {file_paths!r}"
2383
2700
  )
2384
2701
 
2385
- if not success:
2386
- for EAR_dep_ID in EAR.get_dependent_EARs():
2387
- # TODO: this needs to be recursive?
2388
- self._app.logger.debug(
2389
- f"Setting EAR ID {EAR_dep_ID!r} to skip because it depends on"
2390
- f" EAR ID {EAR_ID!r}, which exited with a non-zero exit code:"
2391
- f" {exit_code!r}."
2702
+ for path_i in (
2703
+ file_paths
2704
+ if isinstance(file_paths, list)
2705
+ else [file_paths]
2706
+ ):
2707
+ full_path = run_dir.joinpath(path_i)
2708
+ if not full_path.exists():
2709
+ self._app.logger.debug(
2710
+ f"expected output file parser `save_files` file "
2711
+ f"{path_i!r} does not exist, so setting run "
2712
+ f"to an error state (if not aborted)."
2713
+ )
2714
+ if not is_aborted and success is True:
2715
+ success = False
2716
+ exit_code = 1 # TODO more custom exit codes?
2717
+ else:
2718
+ self._set_file(
2719
+ param_id=param_id,
2720
+ store_contents=True, # TODO: make optional according to OFP
2721
+ is_input=False,
2722
+ path=full_path,
2723
+ clean_up=(save_file_j in OFP_i.clean_up),
2724
+ )
2725
+
2726
+ if (
2727
+ run.resources.skip_downstream_on_failure
2728
+ and not success
2729
+ and run.skip_reason is not SkipReason.LOOP_TERMINATION
2730
+ ):
2731
+ # loop termination skips are already propagated
2732
+ for EAR_dep_ID in run.get_dependent_EARs(as_objects=False):
2733
+ self._app.logger.debug(
2734
+ f"Setting EAR ID {EAR_dep_ID!r} to skip because it depends on"
2735
+ f" EAR ID {run.id_!r}, which exited with a non-zero exit code:"
2736
+ f" {exit_code!r}."
2737
+ )
2738
+ self._store.set_EAR_skip(
2739
+ {EAR_dep_ID: SkipReason.UPSTREAM_FAILURE.value}
2740
+ )
2741
+
2742
+ self._store.set_EAR_end(run.id_, exit_code, success, run.action.requires_dir)
2743
+
2744
+ def set_multi_run_ends(
2745
+ self,
2746
+ runs: dict[
2747
+ BlockActionKey,
2748
+ list[tuple[ElementActionRun, int, Path | None]],
2749
+ ],
2750
+ ) -> None:
2751
+ """Set end times and exit codes on multiple runs.
2752
+
2753
+ If the exit code is non-zero, also set all downstream dependent runs to be
2754
+ skipped. Also save any generated input/output files."""
2755
+
2756
+ self._app.logger.debug(f"Setting end for multiple run IDs.")
2757
+ param_id: int | list[int] | None
2758
+ with self._store.cached_load(), self.batch_update():
2759
+ run_ids = []
2760
+ run_dirs = []
2761
+ exit_codes = []
2762
+ successes = []
2763
+ for block_act_key, run_dat in runs.items():
2764
+ for run, exit_code, run_dir in run_dat:
2765
+
2766
+ success = (
2767
+ exit_code == 0
2768
+ ) # TODO more sophisticated success heuristics
2769
+ self._app.logger.info(
2770
+ f"setting end for run {run.id_} with exit_code={exit_code}, "
2771
+ f"success={success}, skip={run.skip!r}, and skip_reason="
2772
+ f"{run.skip_reason!r}."
2773
+ )
2774
+ if not run.skip:
2775
+ self._app.logger.info(f"run was not skipped.")
2776
+ is_aborted = False
2777
+ if run.action.abortable and exit_code == ABORT_EXIT_CODE:
2778
+ # the point of aborting an EAR is to continue with the
2779
+ # workflow:
2780
+ self._app.logger.info(
2781
+ "run was abortable and exit code was ABORT_EXIT_CODE,"
2782
+ " so setting success to True."
2783
+ )
2784
+ is_aborted = True
2785
+ success = True
2786
+
2787
+ run_dir = run.get_directory()
2788
+ if run_dir:
2789
+ assert isinstance(run_dir, Path)
2790
+ for IFG_i in run.action.input_file_generators:
2791
+ self._app.logger.info(f"setting IFG file {IFG_i!r}")
2792
+ inp_file = IFG_i.input_file
2793
+ self._app.logger.debug(
2794
+ f"Saving EAR input file: {inp_file.label!r} for EAR "
2795
+ f"ID {run.id_!r}."
2796
+ )
2797
+ param_id = run.data_idx[f"input_files.{inp_file.label}"]
2798
+
2799
+ file_paths = inp_file.value(directory=run_dir)
2800
+ for path_i in (
2801
+ file_paths
2802
+ if isinstance(file_paths, list)
2803
+ else [file_paths]
2804
+ ):
2805
+ full_path = run_dir.joinpath(path_i)
2806
+ if not full_path.exists():
2807
+ self._app.logger.debug(
2808
+ f"expected input file {path_i!r} does not "
2809
+ f"exist, so setting run to an error state "
2810
+ f"(if not aborted)."
2811
+ )
2812
+ if not is_aborted and success is True:
2813
+ # this is unlikely to happen, but could happen
2814
+ # if the input file is deleted in between
2815
+ # the input file generator completing and this
2816
+ # code being run
2817
+ success = False
2818
+ exit_code = 1 # TODO more custom exit codes?
2819
+ else:
2820
+ self._set_file(
2821
+ param_id=param_id,
2822
+ store_contents=True, # TODO: make optional according to IFG
2823
+ is_input=False,
2824
+ path=full_path,
2825
+ )
2826
+
2827
+ if run.action.script_data_out_has_files:
2828
+ self._app.logger.info(
2829
+ f"saving script-generated parameters."
2830
+ )
2831
+ try:
2832
+ run._param_save(block_act_key, run_dir)
2833
+ except FileNotFoundError:
2834
+ # script did not generate the output parameter file, so
2835
+ # set a failed exit code (if we did not abort the run):
2836
+ self._app.logger.debug(
2837
+ f"script did not generate an expected output "
2838
+ f"parameter file (block_act_key="
2839
+ f"{block_act_key!r}), so setting run to an error "
2840
+ f"state (if not aborted)."
2841
+ )
2842
+ if not is_aborted and success is True:
2843
+ success = False
2844
+ exit_code = 1 # TODO more custom exit codes?
2845
+
2846
+ # Save action-level files: (TODO: refactor with below for OFPs)
2847
+ for save_file_j in run.action.save_files:
2848
+ self._app.logger.info(
2849
+ f"saving action-level file {save_file_j!r}."
2850
+ )
2851
+ self._app.logger.debug(
2852
+ f"Saving file: {save_file_j.label!r} for EAR ID "
2853
+ f"{run.id_!r}."
2854
+ )
2855
+ try:
2856
+ param_id = run.data_idx[
2857
+ f"output_files.{save_file_j.label}"
2858
+ ]
2859
+ except KeyError:
2860
+ # We might be saving a file that is not a defined
2861
+ # "output file"; this will avoid saving a reference in
2862
+ # the parameter data:
2863
+ param_id = None
2864
+
2865
+ file_paths = save_file_j.value(directory=run_dir)
2866
+ self._app.logger.debug(
2867
+ f"Saving output file paths: {file_paths!r}"
2868
+ )
2869
+ for path_i in (
2870
+ file_paths
2871
+ if isinstance(file_paths, list)
2872
+ else [file_paths]
2873
+ ):
2874
+ full_path = run_dir.joinpath(path_i)
2875
+ if not full_path.exists():
2876
+ self._app.logger.debug(
2877
+ f"expected file to save {path_i!r} does not "
2878
+ f"exist, so setting run to an error state "
2879
+ f"(if not aborted)."
2880
+ )
2881
+ if not is_aborted and success is True:
2882
+ # this is unlikely to happen, but could happen
2883
+ # if the input file is deleted in between
2884
+ # the input file generator completing and this
2885
+ # code being run
2886
+ success = False
2887
+ exit_code = 1 # TODO more custom exit codes?
2888
+ else:
2889
+ self._set_file(
2890
+ param_id=param_id,
2891
+ store_contents=True,
2892
+ is_input=False,
2893
+ path=full_path,
2894
+ clean_up=(save_file_j in run.action.clean_up),
2895
+ )
2896
+
2897
+ for OFP_i in run.action.output_file_parsers:
2898
+ self._app.logger.info(
2899
+ f"saving files from OFP: {OFP_i!r}."
2900
+ )
2901
+ for save_file_j in OFP_i._save_files:
2902
+ self._app.logger.debug(
2903
+ f"Saving EAR output file: {save_file_j.label!r} "
2904
+ f"for EAR ID {run.id_!r}."
2905
+ )
2906
+ try:
2907
+ param_id = run.data_idx[
2908
+ f"output_files.{save_file_j.label}"
2909
+ ]
2910
+ except KeyError:
2911
+ # We might be saving a file that is not a defined
2912
+ # "output file"; this will avoid saving a
2913
+ # reference in the parameter data:
2914
+ param_id = None
2915
+
2916
+ file_paths = save_file_j.value(directory=run_dir)
2917
+ self._app.logger.debug(
2918
+ f"Saving EAR output file paths: {file_paths!r}"
2919
+ )
2920
+
2921
+ for path_i in (
2922
+ file_paths
2923
+ if isinstance(file_paths, list)
2924
+ else [file_paths]
2925
+ ):
2926
+ full_path = run_dir.joinpath(path_i)
2927
+ if not full_path.exists():
2928
+ self._app.logger.debug(
2929
+ f"expected output file parser `save_files` file "
2930
+ f"{path_i!r} does not exist, so setting run "
2931
+ f"to an error state (if not aborted)."
2932
+ )
2933
+ if not is_aborted and success is True:
2934
+ success = False
2935
+ exit_code = (
2936
+ 1 # TODO more custom exit codes?
2937
+ )
2938
+ else:
2939
+ self._set_file(
2940
+ param_id=param_id,
2941
+ store_contents=True, # TODO: make optional according to OFP
2942
+ is_input=False,
2943
+ path=full_path,
2944
+ clean_up=(save_file_j in OFP_i.clean_up),
2945
+ )
2946
+
2947
+ else:
2948
+ self._app.logger.info(
2949
+ f"run was skipped: reason: {run.skip_reason!r}."
2950
+ )
2951
+
2952
+ if (
2953
+ run.resources.skip_downstream_on_failure
2954
+ and not success
2955
+ and run.skip_reason is not SkipReason.LOOP_TERMINATION
2956
+ ):
2957
+ # run failed
2958
+ self._app.logger.info(
2959
+ "run was not succcess and skip reason was not "
2960
+ "LOOP_TERMINATION."
2961
+ )
2962
+ # loop termination skips are already propagated
2963
+ for EAR_dep_ID in run.get_dependent_EARs(as_objects=False):
2964
+ # TODO: `get_dependent_EARs` seems to be stuck in a
2965
+ # recursion for some workflows
2966
+ # TODO: this needs to be recursive?
2967
+ self._app.logger.info(
2968
+ f"Setting EAR ID {EAR_dep_ID!r} to skip because it "
2969
+ f"depends on EAR ID {run.id_!r}, which exited with a "
2970
+ f"non-zero exit code: {exit_code!r}."
2971
+ )
2972
+ self._store.set_EAR_skip(
2973
+ {EAR_dep_ID: SkipReason.UPSTREAM_FAILURE.value}
2974
+ )
2975
+ else:
2976
+ self._app.logger.info(
2977
+ "`skip_downstream_on_failure` is False, run was "
2978
+ "succcess, or skip reason was LOOP_TERMINATION."
2392
2979
  )
2393
- self._store.set_EAR_skip(EAR_dep_ID)
2394
2980
 
2395
- self._store.set_EAR_end(EAR_ID, exit_code, success)
2981
+ run_ids.append(run.id_)
2982
+ run_dirs.append(run_dir)
2983
+ exit_codes.append(exit_code)
2984
+ successes.append(success)
2985
+
2986
+ self._store.set_multi_run_ends(run_ids, run_dirs, exit_codes, successes)
2396
2987
 
2397
- def set_EAR_skip(self, EAR_ID: int) -> None:
2988
+ def set_EAR_skip(self, skip_reasons: dict[int, SkipReason]) -> None:
2398
2989
  """
2399
2990
  Record that an EAR is to be skipped due to an upstream failure or loop
2400
2991
  termination condition being met.
2401
2992
  """
2402
2993
  with self._store.cached_load(), self.batch_update():
2403
- self._store.set_EAR_skip(EAR_ID)
2994
+ self._store.set_EAR_skip({k: v.value for k, v in skip_reasons.items()})
2404
2995
 
2405
- def get_EAR_skipped(self, EAR_ID: int) -> bool:
2996
+ def get_EAR_skipped(self, EAR_ID: int) -> int:
2406
2997
  """Check if an EAR is to be skipped."""
2407
2998
  with self._store.cached_load():
2408
2999
  return self._store.get_EAR_skipped(EAR_ID)
@@ -2421,6 +3012,15 @@ class Workflow(AppAware):
2421
3012
  # force commit now:
2422
3013
  self._store._pending.commit_all()
2423
3014
 
3015
+ @TimeIt.decorator
3016
+ def set_parameter_values(self, values: dict[int, Any], commit: bool = False) -> None:
3017
+ with self._store.cached_load(), self.batch_update(), self._store.cache_ctx():
3018
+ self._store.set_parameter_values(values)
3019
+
3020
+ if commit:
3021
+ # force commit now:
3022
+ self._store._pending.commit_all()
3023
+
2424
3024
  def set_EARs_initialised(self, iter_ID: int) -> None:
2425
3025
  """
2426
3026
  Set :py:attr:`~hpcflow.app.ElementIteration.EARs_initialised` to True for the
@@ -2549,7 +3149,7 @@ class Workflow(AppAware):
2549
3149
  self,
2550
3150
  status: Status | None = None,
2551
3151
  ignore_errors: bool = False,
2552
- JS_parallelism: bool | None = None,
3152
+ JS_parallelism: bool | Literal["direct", "scheduled"] | None = None,
2553
3153
  print_stdout: bool = False,
2554
3154
  add_to_known: bool = True,
2555
3155
  tasks: Sequence[int] | None = None,
@@ -2560,16 +3160,23 @@ class Workflow(AppAware):
2560
3160
  if not (pending := [sub for sub in self.submissions if sub.needs_submit]):
2561
3161
  if status:
2562
3162
  status.update("Adding new submission...")
2563
- if not (new_sub := self._add_submission(tasks, JS_parallelism)):
3163
+ if not (
3164
+ new_sub := self._add_submission(
3165
+ tasks=tasks,
3166
+ JS_parallelism=JS_parallelism,
3167
+ status=status,
3168
+ )
3169
+ ):
3170
+ if status:
3171
+ status.stop()
2564
3172
  raise ValueError("No pending element action runs to submit!")
2565
3173
  pending = [new_sub]
2566
3174
 
2567
- self.submissions_path.mkdir(exist_ok=True, parents=True)
2568
3175
  self.execution_path.mkdir(exist_ok=True, parents=True)
2569
3176
  self.task_artifacts_path.mkdir(exist_ok=True, parents=True)
2570
3177
 
2571
- # for direct execution the submission must be persistent at submit-time, because
2572
- # it will be read by a new instance of the app:
3178
+ # the submission must be persistent at submit-time, because it will be read by a
3179
+ # new instance of the app:
2573
3180
  if status:
2574
3181
  status.update("Committing to the store...")
2575
3182
  self._store._pending.commit_all()
@@ -2598,7 +3205,7 @@ class Workflow(AppAware):
2598
3205
  self,
2599
3206
  *,
2600
3207
  ignore_errors: bool = False,
2601
- JS_parallelism: bool | None = None,
3208
+ JS_parallelism: bool | Literal["direct", "scheduled"] | None = None,
2602
3209
  print_stdout: bool = False,
2603
3210
  wait: bool = False,
2604
3211
  add_to_known: bool = True,
@@ -2614,7 +3221,7 @@ class Workflow(AppAware):
2614
3221
  self,
2615
3222
  *,
2616
3223
  ignore_errors: bool = False,
2617
- JS_parallelism: bool | None = None,
3224
+ JS_parallelism: bool | Literal["direct", "scheduled"] | None = None,
2618
3225
  print_stdout: bool = False,
2619
3226
  wait: bool = False,
2620
3227
  add_to_known: bool = True,
@@ -2629,7 +3236,7 @@ class Workflow(AppAware):
2629
3236
  self,
2630
3237
  *,
2631
3238
  ignore_errors: bool = False,
2632
- JS_parallelism: bool | None = None,
3239
+ JS_parallelism: bool | Literal["direct", "scheduled"] | None = None,
2633
3240
  print_stdout: bool = False,
2634
3241
  wait: bool = False,
2635
3242
  add_to_known: bool = True,
@@ -2646,9 +3253,12 @@ class Workflow(AppAware):
2646
3253
  If True, ignore jobscript submission errors. If False (the default) jobscript
2647
3254
  submission will halt when a jobscript fails to submit.
2648
3255
  JS_parallelism
2649
- If True, allow multiple jobscripts to execute simultaneously. Raises if set to
2650
- True but the store type does not support the `jobscript_parallelism` feature.
2651
- If not set, jobscript parallelism will be used if the store type supports it.
3256
+ If True, allow multiple jobscripts to execute simultaneously. If
3257
+ 'scheduled'/'direct', only allow simultaneous execution of scheduled/direct
3258
+ jobscripts. Raises if set to True, 'scheduled', or 'direct', but the store
3259
+ type does not support the `jobscript_parallelism` feature. If not set,
3260
+ jobscript parallelism will be used if the store type supports it, for
3261
+ scheduled jobscripts only.
2652
3262
  print_stdout
2653
3263
  If True, print any jobscript submission standard output, otherwise hide it.
2654
3264
  wait
@@ -2679,7 +3289,11 @@ class Workflow(AppAware):
2679
3289
  if not self._store.is_submittable:
2680
3290
  raise NotImplementedError("The workflow is not submittable.")
2681
3291
  # commit updates before raising exception:
2682
- with self.batch_update(), self._store.cache_ctx():
3292
+ with (
3293
+ self.batch_update(),
3294
+ self._store.parameters_metadata_cache(),
3295
+ self._store.cache_ctx(),
3296
+ ):
2683
3297
  exceptions, submitted_js = self._submit(
2684
3298
  ignore_errors=ignore_errors,
2685
3299
  JS_parallelism=JS_parallelism,
@@ -2693,7 +3307,7 @@ class Workflow(AppAware):
2693
3307
  raise WorkflowSubmissionFailure(exceptions)
2694
3308
 
2695
3309
  if cancel:
2696
- self.cancel()
3310
+ self.cancel(status=status)
2697
3311
 
2698
3312
  elif wait:
2699
3313
  self.wait(submitted_js)
@@ -2822,14 +3436,16 @@ class Workflow(AppAware):
2822
3436
  # keys are task_insert_IDs, values are element indices:
2823
3437
  active_elems: dict[int, set[int]] = defaultdict(set)
2824
3438
  sub = self.submissions[submission_idx]
2825
- for js_idx, states in sub.get_active_jobscripts().items():
3439
+ for js_idx, block_states in sub.get_active_jobscripts().items():
2826
3440
  js = sub.jobscripts[js_idx]
2827
- for js_elem_idx, state in states.items():
2828
- if state is JobscriptElementState.running:
2829
- for task_iID, elem_idx in zip(
2830
- js.task_insert_IDs, js.task_elements[js_elem_idx]
2831
- ):
2832
- active_elems[task_iID].add(elem_idx)
3441
+ for block_idx, block in enumerate(js.blocks):
3442
+ states = block_states[block_idx]
3443
+ for js_elem_idx, state in states.items():
3444
+ if state is JobscriptElementState.running:
3445
+ for task_iID, elem_idx in zip(
3446
+ block.task_insert_IDs, block.task_elements[js_elem_idx]
3447
+ ):
3448
+ active_elems[task_iID].add(elem_idx)
2833
3449
 
2834
3450
  # retrieve Element objects:
2835
3451
  out: list[Element] = []
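The rewritten loop above reflects the move to block-structured jobscripts: `get_active_jobscripts()` now maps a jobscript index to per-block state mappings rather than a flat element-state mapping. A sketch of walking the new structure, using the attribute names that appear in the diff (assumed, not verified against the full API):

    # illustrative walk of the nested mapping returned by get_active_jobscripts()
    for js_idx, block_states in sub.get_active_jobscripts().items():
        js = sub.jobscripts[js_idx]
        for block_idx, block in enumerate(js.blocks):
            for js_elem_idx, state in block_states[block_idx].items():
                print(js_idx, block_idx, js_elem_idx, state)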
@@ -2862,18 +3478,22 @@ class Workflow(AppAware):
2862
3478
  for elem in elems:
2863
3479
  if element_idx is not None and elem.index != element_idx:
2864
3480
  continue
2865
- # for a given element, only one iteration will be running (assume for now the
2866
- # this is the latest iteration, as provided by `action_runs`):
2867
- for act_run in elem.action_runs:
2868
- if act_run.status is EARStatus.running:
2869
- out.append(act_run)
2870
- break # only one element action may be running at a time
3481
+ for iter_i in elem.iterations:
3482
+ for elem_acts in iter_i.actions.values():
3483
+ for run in elem_acts.runs:
3484
+ if run.status is EARStatus.running:
3485
+ out.append(run)
3486
+ # for a given element and submission, only one run
3487
+ # may be running at a time:
3488
+ break
2871
3489
  return out
2872
3490
 
2873
- def _abort_run_ID(self, submission_idx: int, run_ID: int):
2874
- """Modify the submission abort runs text file to signal that a run should be
2875
- aborted."""
2876
- self.submissions[submission_idx]._set_run_abort(run_ID)
3491
+ def _abort_run(self, run: ElementActionRun):
3492
+ # connect to the ZeroMQ server on the worker node:
3493
+ self._app.logger.info(f"abort run: {run!r}")
3494
+ self._app.Executor.send_abort(
3495
+ hostname=run.run_hostname, port_number=run.port_number
3496
+ )
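The abort path above no longer writes to an abort file; it contacts a ZeroMQ server started by the executing run on its worker node (see `Executor.start_zmq_server()` later in this diff). The exact message protocol is internal to the app's `Executor`, but the underlying idea can be sketched with plain pyzmq (illustrative only; the message content and reply handling are assumptions):

    import zmq

    def send_abort(hostname: str, port_number: int) -> None:
        # connect to the run's ZeroMQ server and ask it to abort (sketch only)
        ctx = zmq.Context.instance()
        sock = ctx.socket(zmq.REQ)
        sock.connect(f"tcp://{hostname}:{port_number}")
        sock.send_string("abort")
        reply = sock.recv_string()  # wait for acknowledgement
        sock.close()
        print("abort acknowledged:", reply)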
2877
3497
 
2878
3498
  def abort_run(
2879
3499
  self,
@@ -2916,38 +3536,77 @@ class Workflow(AppAware):
2916
3536
  run = running[0]
2917
3537
  if not run.action.abortable:
2918
3538
  raise RunNotAbortableError()
2919
- self._abort_run_ID(submission_idx, run.id_)
3539
+ self._abort_run(run)
2920
3540
 
2921
3541
  @TimeIt.decorator
2922
- def cancel(self, hard: bool = False):
3542
+ def cancel(self, status: bool = True):
2923
3543
  """Cancel any running jobscripts."""
2924
- for sub in self.submissions:
2925
- sub.cancel()
3544
+ status_msg = f"Cancelling jobscripts of workflow {self.path!r}"
3545
+ # Type hint for mypy
3546
+ status_context: AbstractContextManager[Status] | AbstractContextManager[None] = (
3547
+ rich.console.Console().status(status_msg) if status else nullcontext()
3548
+ )
3549
+ with status_context as status_, self._store.cached_load():
3550
+ for sub in self.submissions:
3551
+ sub.cancel()
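The `status` handling added to `cancel` (and reused in `add_submission` below) follows a simple pattern: either a live `rich` status spinner or a `nullcontext` is entered, so the body is written once. The pattern in isolation (a sketch; the message text is arbitrary):

    from contextlib import AbstractContextManager, nullcontext

    import rich.console
    from rich.status import Status

    def do_work(status: bool = True) -> None:
        # choose a real spinner or a no-op context manager up front
        status_context: AbstractContextManager[Status] | AbstractContextManager[None] = (
            rich.console.Console().status("Working...") if status else nullcontext()
        )
        with status_context as status_:
            if status_:
                status_.update("Still working...")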
2926
3552
 
2927
3553
  def add_submission(
2928
- self, tasks: list[int] | None = None, JS_parallelism: bool | None = None
3554
+ self,
3555
+ tasks: list[int] | None = None,
3556
+ JS_parallelism: bool | Literal["direct", "scheduled"] | None = None,
3557
+ force_array: bool = False,
3558
+ status: bool = True,
2929
3559
  ) -> Submission | None:
2930
- """
2931
- Add a job submission to this workflow.
3560
+ """Add a new submission.
3561
+
3562
+ Parameters
3563
+ ----------
3564
+ force_array
3565
+ Used to force the use of job arrays, even if the scheduler does not support
3566
+ it. This is provided for testing purposes only.
2932
3567
  """
2933
3568
  # JS_parallelism=None means guess
2934
- with self._store.cached_load(), self.batch_update():
2935
- return self._add_submission(tasks, JS_parallelism)
3569
+ # Type hint for mypy
3570
+ status_context: AbstractContextManager[Status] | AbstractContextManager[None] = (
3571
+ rich.console.Console().status("") if status else nullcontext()
3572
+ )
3573
+ with status_context as status_, self._store.cached_load(), self.batch_update():
3574
+ return self._add_submission(tasks, JS_parallelism, force_array, status_)
2936
3575
 
2937
3576
  @TimeIt.decorator
3577
+ @load_workflow_config
2938
3578
  def _add_submission(
2939
- self, tasks: Sequence[int] | None = None, JS_parallelism: bool | None = None
3579
+ self,
3580
+ tasks: Sequence[int] | None = None,
3581
+ JS_parallelism: bool | Literal["direct", "scheduled"] | None = None,
3582
+ force_array: bool = False,
3583
+ status: Status | None = None,
2940
3584
  ) -> Submission | None:
3585
+ """Add a new submission.
3586
+
3587
+ Parameters
3588
+ ----------
3589
+ force_array
3590
+ Used to force the use of job arrays, even if the scheduler does not support
3591
+ it. This is provided for testing purposes only.
3592
+ """
2941
3593
  new_idx = self.num_submissions
2942
3594
  _ = self.submissions # TODO: just to ensure `submissions` is loaded
3595
+ if status:
3596
+ status.update("Adding new submission: resolving jobscripts...")
3597
+
3598
+ cache = ObjectCache.build(self, elements=True, iterations=True, runs=True)
3599
+
2943
3600
  sub_obj: Submission = self._app.Submission(
2944
3601
  index=new_idx,
2945
3602
  workflow=self,
2946
- jobscripts=self.resolve_jobscripts(tasks),
3603
+ jobscripts=self.resolve_jobscripts(cache, tasks, force_array),
2947
3604
  JS_parallelism=JS_parallelism,
2948
3605
  )
3606
+ if status:
3607
+ status.update("Adding new submission: setting environments...")
2949
3608
  sub_obj._set_environments()
2950
- all_EAR_ID = [i for js in sub_obj.jobscripts for i in js.EAR_ID.flatten()]
3609
+ all_EAR_ID = sub_obj.all_EAR_IDs
2951
3610
  if not all_EAR_ID:
2952
3611
  print(
2953
3612
  "There are no pending element action runs, so a new submission was not "
@@ -2955,33 +3614,97 @@ class Workflow(AppAware):
2955
3614
  )
2956
3615
  return None
2957
3616
 
3617
+ if status:
3618
+ status.update("Adding new submission: making artifact directories...")
3619
+
3620
+ # TODO: a submission should only be "submitted" once shouldn't it?
3621
+ # no; there could be an IO error (e.g. internet connectivity), so might
3622
+ # need to be able to reattempt submission of outstanding jobscripts.
3623
+ self.submissions_path.mkdir(exist_ok=True, parents=True)
3624
+ sub_obj.path.mkdir(exist_ok=True)
3625
+ sub_obj.tmp_path.mkdir(exist_ok=True)
3626
+ sub_obj.app_std_path.mkdir(exist_ok=True)
3627
+ sub_obj.js_path.mkdir(exist_ok=True) # for jobscripts
3628
+ sub_obj.js_std_path.mkdir(exist_ok=True) # for stdout/err stream files
3629
+ sub_obj.js_funcs_path.mkdir(exist_ok=True)
3630
+ sub_obj.js_run_ids_path.mkdir(exist_ok=True)
3631
+ sub_obj.scripts_path.mkdir(exist_ok=True)
3632
+ sub_obj.commands_path.mkdir(exist_ok=True)
3633
+
3634
+ if sub_obj.needs_app_log_dir:
3635
+ sub_obj.app_log_path.mkdir(exist_ok=True)
3636
+
3637
+ if sub_obj.needs_win_pids_dir:
3638
+ sub_obj.js_win_pids_path.mkdir(exist_ok=True)
3639
+
3640
+ if sub_obj.needs_script_indices_dir:
3641
+ sub_obj.js_script_indices_path.mkdir(exist_ok=True)
3642
+
3643
+ if status:
3644
+ status.update("Adding new submission: writing scripts and command files...")
3645
+
3646
+ # write scripts and command files where possible to the submission directory:
3647
+ cmd_file_IDs, run_indices, run_inp_files = sub_obj._write_scripts(cache, status)
3648
+
3649
+ sub_obj._write_execute_dirs(run_indices, run_inp_files, cache, status)
3650
+
3651
+ if status:
3652
+ status.update("Adding new submission: updating the store...")
3653
+
2958
3654
  with self._store.cached_load(), self.batch_update():
2959
3655
  for id_ in all_EAR_ID:
2960
- self._store.set_EAR_submission_index(EAR_ID=id_, sub_idx=new_idx)
3656
+ self._store.set_run_submission_data(
3657
+ EAR_ID=id_,
3658
+ cmds_ID=cmd_file_IDs[id_],
3659
+ sub_idx=new_idx,
3660
+ )
2961
3661
 
3662
+ sub_obj._ensure_JS_parallelism_set()
2962
3663
  sub_obj_js, _ = sub_obj.to_json_like()
2963
3664
  assert self._submissions is not None
2964
3665
  self._submissions.append(sub_obj)
2965
3666
  self._pending["submissions"].append(new_idx)
2966
3667
  with self._store.cached_load(), self.batch_update():
2967
- self._store.add_submission(new_idx, sub_obj_js)
3668
+ self._store.add_submission(new_idx, cast("Mapping[str, JSONed]", sub_obj_js))
2968
3669
 
2969
3670
  return self.submissions[new_idx]
2970
3671
 
2971
3672
  @TimeIt.decorator
2972
- def resolve_jobscripts(self, tasks: Sequence[int] | None = None) -> list[Jobscript]:
3673
+ def resolve_jobscripts(
3674
+ self,
3675
+ cache: ObjectCache,
3676
+ tasks: Sequence[int] | None = None,
3677
+ force_array: bool = False,
3678
+ ) -> list[Jobscript]:
2973
3679
  """
2974
- Resolve this workflow to a set of job scripts to run.
3680
+ Resolve this workflow to a set of jobscripts to run for a new submission.
3681
+
3682
+ Parameters
3683
+ ----------
3684
+ force_array
3685
+ Used to force the use of job arrays, even if the scheduler does not support
3686
+ it. This is provided for testing purposes only.
3687
+
2975
3688
  """
2976
- js, element_deps = self._resolve_singular_jobscripts(tasks)
2977
- js_deps = resolve_jobscript_dependencies(js, element_deps)
3689
+ with self._app.config.cached_config():
3690
+ with self.cached_merged_parameters():
3691
+ js, element_deps = self._resolve_singular_jobscripts(
3692
+ cache, tasks, force_array
3693
+ )
3694
+
3695
+ js_deps = resolve_jobscript_dependencies(js, element_deps)
2978
3696
 
2979
- for js_idx, jsca in js.items():
2980
- if js_idx in js_deps:
2981
- jsca["dependencies"] = js_deps[js_idx]
3697
+ for js_idx, jsca in js.items():
3698
+ if js_idx in js_deps:
3699
+ jsca["dependencies"] = js_deps[js_idx] # type: ignore
2982
3700
 
2983
- js = merge_jobscripts_across_tasks(js)
2984
- return [self._app.Jobscript(**jsca) for jsca in jobscripts_to_list(js)]
3701
+ js = merge_jobscripts_across_tasks(js)
3702
+
3703
+ # for direct or (non-array scheduled), combine into jobscripts of multiple
3704
+ # blocks for dependent jobscripts that have the same resource hashes
3705
+ js_ = resolve_jobscript_blocks(js)
3706
+
3707
+ return [self._app.Jobscript(**i, index=idx) for idx, i in enumerate(js_)]
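Per the comment above, `resolve_jobscript_blocks` combines dependent jobscripts that share a resource hash into a single multi-block jobscript. A much-simplified illustration of that grouping idea (a toy sketch, not the real implementation, which also accounts for array jobs and scheduler behaviour):

    # toy input: jobscript index -> {"resource_hash": ..., "dependencies": {...}}
    def group_into_blocks(jobscripts: dict[int, dict]) -> list[list[int]]:
        groups: list[list[int]] = []
        for js_idx, js in jobscripts.items():
            last = groups[-1] if groups else None
            if (
                last is not None
                and jobscripts[last[-1]]["resource_hash"] == js["resource_hash"]
                and set(js["dependencies"]) <= set(last)
            ):
                # same resources and all dependencies already in the group: add a block
                last.append(js_idx)
            else:
                # otherwise start a new (initially single-block) jobscript
                groups.append([js_idx])
        return groups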
2985
3708
 
2986
3709
  def __EAR_obj_map(
2987
3710
  self,
@@ -2990,7 +3713,9 @@ class Workflow(AppAware):
2990
3713
  task: WorkflowTask,
2991
3714
  task_actions: Sequence[tuple[int, int, int]],
2992
3715
  EAR_map: NDArray,
3716
+ cache: ObjectCache,
2993
3717
  ) -> Mapping[int, ElementActionRun]:
3718
+ assert cache.runs is not None
2994
3719
  all_EAR_IDs: list[int] = []
2995
3720
  for js_elem_idx, (elem_idx, act_indices) in enumerate(
2996
3721
  js_desc["elements"].items()
@@ -3000,11 +3725,14 @@ class Workflow(AppAware):
3000
3725
  all_EAR_IDs.append(EAR_ID_i)
3001
3726
  js_act_idx = task_actions.index((task.insert_ID, act_idx, 0))
3002
3727
  jsca["EAR_ID"][js_act_idx][js_elem_idx] = EAR_ID_i
3003
- return dict(zip(all_EAR_IDs, self.get_EARs_from_IDs(all_EAR_IDs)))
3728
+ return dict(zip(all_EAR_IDs, (cache.runs[i] for i in all_EAR_IDs)))
3004
3729
 
3005
3730
  @TimeIt.decorator
3006
3731
  def _resolve_singular_jobscripts(
3007
- self, tasks: Sequence[int] | None = None
3732
+ self,
3733
+ cache: ObjectCache,
3734
+ tasks: Sequence[int] | None = None,
3735
+ force_array: bool = False,
3008
3736
  ) -> tuple[
3009
3737
  Mapping[int, JobScriptCreationArguments],
3010
3738
  Mapping[int, Mapping[int, Sequence[int]]],
@@ -3013,6 +3741,12 @@ class Workflow(AppAware):
3013
3741
  We arrange EARs into `EARs` and `elements` so we can quickly look up membership
3014
3742
  by EAR idx in the `EARs` dict.
3015
3743
 
3744
+ Parameters
3745
+ ----------
3746
+ force_array
3747
+ Used to force the use of job arrays, even if the scheduler does not support
3748
+ it. This is provided for testing purposes only.
3749
+
3016
3750
  Returns
3017
3751
  -------
3018
3752
  submission_jobscripts
@@ -3025,6 +3759,7 @@ class Workflow(AppAware):
3025
3759
 
3026
3760
  if self._store.use_cache:
3027
3761
  # pre-cache parameter sources (used in `EAR.get_EAR_dependencies`):
3762
+ # note: this cache is unrelated to the `cache` argument
3028
3763
  self.get_all_parameter_sources()
3029
3764
 
3030
3765
  submission_jobscripts: dict[int, JobScriptCreationArguments] = {}
@@ -3034,7 +3769,9 @@ class Workflow(AppAware):
3034
3769
  task = self.tasks.get(insert_ID=task_iID)
3035
3770
  if task.index not in task_set:
3036
3771
  continue
3037
- res, res_hash, res_map, EAR_map = generate_EAR_resource_map(task, loop_idx_i)
3772
+ res, res_hash, res_map, EAR_map = generate_EAR_resource_map(
3773
+ task, loop_idx_i, cache
3774
+ )
3038
3775
  jobscripts, _ = group_resource_map_into_jobscripts(res_map)
3039
3776
 
3040
3777
  for js_dat in jobscripts:
@@ -3063,6 +3800,11 @@ class Workflow(AppAware):
3063
3800
 
3064
3801
  new_js_idx = len(submission_jobscripts)
3065
3802
 
3803
+ is_array = force_array or is_jobscript_array(
3804
+ res[js_dat["resources"]],
3805
+ EAR_ID_arr.shape[1],
3806
+ self._store,
3807
+ )
3066
3808
  js_i: JobScriptCreationArguments = {
3067
3809
  "task_insert_IDs": [task.insert_ID],
3068
3810
  "task_loop_idx": [loop_idx_i],
@@ -3072,10 +3814,11 @@ class Workflow(AppAware):
3072
3814
  "resources": res[js_dat["resources"]],
3073
3815
  "resource_hash": res_hash[js_dat["resources"]],
3074
3816
  "dependencies": {},
3817
+ "is_array": is_array,
3075
3818
  }
3076
3819
 
3077
3820
  all_EAR_objs = self.__EAR_obj_map(
3078
- js_dat, js_i, task, task_actions, EAR_map
3821
+ js_dat, js_i, task, task_actions, EAR_map, cache
3079
3822
  )
3080
3823
 
3081
3824
  for js_elem_idx, (elem_idx, act_indices) in enumerate(
@@ -3104,76 +3847,290 @@ class Workflow(AppAware):
3104
3847
 
3105
3848
  return submission_jobscripts, all_element_deps
3106
3849
 
3107
- def __get_commands(
3108
- self, jobscript: Jobscript, JS_action_idx: int, ear: ElementActionRun
3109
- ):
3110
- try:
3111
- commands, shell_vars = ear.compose_commands(jobscript, JS_action_idx)
3112
- except OutputFileParserNoOutputError:
3113
- # no commands to write but still need to write the file,
3114
- # the jobscript is expecting it.
3115
- return ""
3116
-
3117
- self._app.persistence_logger.debug("need to write commands")
3118
- pieces = [commands]
3119
- for cmd_idx, var_dat in shell_vars.items():
3120
- for param_name, shell_var_name, st_typ in var_dat:
3121
- pieces.append(
3122
- jobscript.shell.format_save_parameter(
3123
- workflow_app_alias=jobscript.workflow_app_alias,
3124
- param_name=param_name,
3125
- shell_var_name=shell_var_name,
3126
- EAR_ID=ear.id_,
3127
- cmd_idx=cmd_idx,
3128
- stderr=(st_typ == "stderr"),
3850
+ @load_workflow_config
3851
+ def execute_run(
3852
+ self,
3853
+ submission_idx: int,
3854
+ block_act_key: BlockActionKey,
3855
+ run_ID: int,
3856
+ ) -> None:
3857
+ """Execute commands of a run via a subprocess."""
3858
+
3859
+ # CD to submission tmp dir to ensure std streams and exceptions have somewhere
3860
+ # sensible to go:
3861
+ os.chdir(Submission.get_tmp_path(self.submissions_path, submission_idx))
3862
+
3863
+ sub_str_path = Submission.get_app_std_path(self.submissions_path, submission_idx)
3864
+ run_std_path = sub_str_path / f"{str(run_ID)}.txt" # TODO: refactor
3865
+ has_commands = False
3866
+
3867
+ # redirect (as much as possible) app-generated stdout/err to a dedicated file:
3868
+ with redirect_std_to_file(run_std_path):
3869
+ with self._store.cached_load():
3870
+ js_idx = cast("int", block_act_key[0])
3871
+ run = self.get_EARs_from_IDs([run_ID])[0]
3872
+ run_dir = None
3873
+ if run.action.requires_dir:
3874
+ run_dir = run.get_directory()
3875
+ assert run_dir
3876
+ self._app.submission_logger.debug(
3877
+ f"changing directory to run execution directory: {run_dir}."
3129
3878
  )
3130
- )
3131
- commands = jobscript.shell.wrap_in_subshell("".join(pieces), ear.action.abortable)
3132
-
3133
- # add loop-check command if this is the last action of this loop iteration
3134
- # for this element:
3135
- if self.loops:
3136
- final_runs = (
3137
- # TODO: excessive reads here
3138
- self.get_iteration_final_run_IDs(id_lst=jobscript.all_EAR_IDs)
3879
+ os.chdir(run_dir)
3880
+ self._app.submission_logger.debug(f"{run.skip=}; {run.skip_reason=}")
3881
+
3882
+ # check if we should skip:
3883
+ if not run.skip:
3884
+
3885
+ try:
3886
+ with run.raise_on_failure_threshold() as unset_params:
3887
+ if run.action.script:
3888
+ run.write_script_input_files(block_act_key)
3889
+
3890
+ # write the command file that will be executed:
3891
+ cmd_file_path = self.ensure_commands_file(
3892
+ submission_idx, js_idx, run
3893
+ )
3894
+
3895
+ except UnsetParameterDataErrorBase:
3896
+ # not all required parameter data is set, so fail this run:
3897
+ self._app.submission_logger.debug(
3898
+ f"unset parameter threshold satisfied (or any unset "
3899
+ f"parameters found when trying to write commands file), so "
3900
+ f"not attempting run. unset_params={unset_params!r}."
3901
+ )
3902
+ self.set_EAR_start(run_ID, run_dir, port_number=None)
3903
+ self._check_loop_termination(run) # not sure if this is required
3904
+ self.set_EAR_end(
3905
+ block_act_key=block_act_key,
3906
+ run=run,
3907
+ exit_code=1,
3908
+ )
3909
+ return
3910
+
3911
+ # sufficient parameter data is set so far, but need to pass `unset_params`
3912
+ # on as an environment variable so it can be appended to and failure
3913
+ # thresholds can be rechecked if necessary (i.e. in a Python script
3914
+ # where we also load input parameters "directly")
3915
+ if unset_params:
3916
+ self._app.submission_logger.debug(
3917
+ f"some unset parameters found, but no unset-thresholds met: "
3918
+ f"unset_params={unset_params!r}."
3919
+ )
3920
+
3921
+ # TODO: pass on unset_params to script as environment variable
3922
+
3923
+ if has_commands := bool(cmd_file_path):
3924
+
3925
+ assert isinstance(cmd_file_path, Path)
3926
+ if not cmd_file_path.is_file():
3927
+ raise RuntimeError(
3928
+ f"Command file {cmd_file_path!r} does not exist."
3929
+ )
3930
+ # prepare subprocess command:
3931
+ jobscript = self.submissions[submission_idx].jobscripts[js_idx]
3932
+ cmd = jobscript.shell.get_command_file_launch_command(
3933
+ str(cmd_file_path)
3934
+ )
3935
+ loop_idx_str = ";".join(
3936
+ f"{k}={v}" for k, v in run.element_iteration.loop_idx.items()
3937
+ )
3938
+ app_caps = self._app.package_name.upper()
3939
+
3940
+ # TODO: make these optionally set (more difficult to set in combine_script,
3941
+ # so have the option to turn off) [default ON]
3942
+ add_env = {
3943
+ f"{app_caps}_RUN_ID": str(run_ID),
3944
+ f"{app_caps}_RUN_IDX": str(run.index),
3945
+ f"{app_caps}_ELEMENT_IDX": str(run.element.index),
3946
+ f"{app_caps}_ELEMENT_ID": str(run.element.id_),
3947
+ f"{app_caps}_ELEMENT_ITER_IDX": str(
3948
+ run.element_iteration.index
3949
+ ),
3950
+ f"{app_caps}_ELEMENT_ITER_ID": str(run.element_iteration.id_),
3951
+ f"{app_caps}_ELEMENT_ITER_LOOP_IDX": loop_idx_str,
3952
+ }
3953
+
3954
+ if run.action.script:
3955
+ if run.is_snippet_script:
3956
+ script_artifact_name = run.get_script_artifact_name()
3957
+ script_dir = Path(
3958
+ os.environ[f"{app_caps}_SUB_SCRIPTS_DIR"]
3959
+ )
3960
+ script_name = script_artifact_name
3961
+ else:
3962
+ # not a snippet script; expect the script in the run execute
3963
+ # directory (i.e. created by a previous action)
3964
+ script_dir = Path.cwd()
3965
+ script_name = run.action.script
3966
+ script_name_no_ext = Path(script_name).stem
3967
+ add_env.update(
3968
+ {
3969
+ f"{app_caps}_RUN_SCRIPT_NAME": script_name,
3970
+ f"{app_caps}_RUN_SCRIPT_NAME_NO_EXT": script_name_no_ext,
3971
+ f"{app_caps}_RUN_SCRIPT_DIR": str(script_dir),
3972
+ f"{app_caps}_RUN_SCRIPT_PATH": str(
3973
+ script_dir / script_name
3974
+ ),
3975
+ }
3976
+ )
3977
+
3978
+ env = {**dict(os.environ), **add_env}
3979
+
3980
+ self._app.submission_logger.debug(
3981
+ f"Executing run commands via subprocess with command {cmd!r}, and "
3982
+ f"environment variables as below."
3983
+ )
3984
+ for k, v in env.items():
3985
+ if k.startswith(app_caps):
3986
+ self._app.submission_logger.debug(f"{k} = {v!r}")
3987
+ exe = self._app.Executor(cmd, env, self._app.package_name)
3988
+ port = (
3989
+ exe.start_zmq_server()
3990
+ ) # start the server so we know the port
3991
+
3992
+ try:
3993
+ self.set_EAR_start(run_ID, run_dir, port)
3994
+ except:
3995
+ self._app.submission_logger.error(f"Failed to set run start.")
3996
+ exe.stop_zmq_server()
3997
+ raise
3998
+
3999
+ # this subprocess may include commands that redirect to the std_stream file (e.g.
4000
+ # calling the app to save a parameter from a shell command output):
4001
+ if not run.skip and has_commands:
4002
+ ret_code = exe.run() # this also shuts down the server
4003
+
4004
+ # redirect (as much as possible) app-generated stdout/err to a dedicated file:
4005
+ with redirect_std_to_file(run_std_path):
4006
+ if run.skip:
4007
+ ret_code = SKIPPED_EXIT_CODE
4008
+ elif not has_commands:
4009
+ ret_code = NO_COMMANDS_EXIT_CODE
4010
+ else:
4011
+ self._check_loop_termination(run)
4012
+
4013
+ # set run end:
4014
+ self.set_EAR_end(
4015
+ block_act_key=block_act_key,
4016
+ run=run,
4017
+ exit_code=ret_code,
3139
4018
  )
3140
- self._app.persistence_logger.debug(f"final_runs: {final_runs!r}")
3141
- pieces = []
3142
- for loop_name, run_IDs in final_runs.items():
3143
- if ear.id_ in run_IDs:
3144
- loop_cmd = jobscript.shell.format_loop_check(
3145
- workflow_app_alias=jobscript.workflow_app_alias,
3146
- loop_name=loop_name,
3147
- run_ID=ear.id_,
4019
+
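`execute_run` above passes run metadata to the launched command file through environment variables prefixed with the upper-cased package name. Those values can be read back inside a user script; a sketch assuming the prefix resolves to `HPCFLOW` (the actual prefix is `self._app.package_name.upper()`):

    import os

    # illustrative: read the run metadata exported by execute_run (names from the diff)
    prefix = "HPCFLOW"  # assumption: self._app.package_name.upper()
    run_id = int(os.environ[f"{prefix}_RUN_ID"])
    elem_idx = int(os.environ[f"{prefix}_ELEMENT_IDX"])
    loop_idx = os.environ[f"{prefix}_ELEMENT_ITER_LOOP_IDX"]  # e.g. "my_loop=2"
    script_path = os.environ.get(f"{prefix}_RUN_SCRIPT_PATH")  # set only for script actions
    print(run_id, elem_idx, loop_idx, script_path)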
4020
+ def _check_loop_termination(self, run: ElementActionRun) -> set[int]:
4021
+ """Check if we need to terminate a loop if this is the last action of the loop
4022
+ iteration for this element, and set downstream iteration runs to skip."""
4023
+
4024
+ elem_iter = run.element_iteration
4025
+ task = elem_iter.task
4026
+ check_loops = []
4027
+ to_skip = set()
4028
+ for loop_name in elem_iter.loop_idx:
4029
+ self._app.logger.info(f"checking loop termination of loop {loop_name!r}.")
4030
+ loop = self.loops.get(loop_name)
4031
+ if (
4032
+ loop.template.termination
4033
+ and task.insert_ID == loop.template.termination_task_insert_ID
4034
+ and run.element_action.action_idx == max(elem_iter.actions)
4035
+ ):
4036
+ check_loops.append(loop_name)
4037
+ # TODO: test with condition actions
4038
+ if loop.test_termination(elem_iter):
4039
+ self._app.logger.info(
4040
+ f"loop {loop_name!r} termination condition met for run "
4041
+ f"ID {run.id_!r}."
3148
4042
  )
3149
- pieces.append(jobscript.shell.wrap_in_subshell(loop_cmd, False))
3150
- commands += "".join(pieces)
3151
- return commands
4043
+ to_skip.update(loop.skip_downstream_iterations(elem_iter))
4044
+ return to_skip
4045
+
4046
+ @load_workflow_config
4047
+ def execute_combined_runs(self, submission_idx: int, jobscript_idx: int) -> None:
4048
+ """Execute a combined script (multiple runs) via a subprocess."""
4049
+
4050
+ # CD to submission tmp dir to ensure std streams and exceptions have somewhere
4051
+ # sensible to go:
4052
+ os.chdir(Submission.get_tmp_path(self.submissions_path, submission_idx))
4053
+
4054
+ sub = self.submissions[submission_idx]
4055
+ js = sub.jobscripts[jobscript_idx]
4056
+
4057
+ app_caps = self._app.package_name.upper()
4058
+ script_dir = Path(os.environ[f"{app_caps}_SUB_SCRIPTS_DIR"])
4059
+ script_name = f"js_{jobscript_idx}.py" # TODO: refactor script name
4060
+ script_path = script_dir / script_name
4061
+
4062
+ add_env = {
4063
+ f"{app_caps}_RUN_SCRIPT_NAME": script_name,
4064
+ f"{app_caps}_RUN_SCRIPT_NAME_NO_EXT": script_path.stem,
4065
+ f"{app_caps}_RUN_SCRIPT_DIR": str(script_dir),
4066
+ f"{app_caps}_RUN_SCRIPT_PATH": str(script_path),
4067
+ f"{app_caps}_SCRIPT_INDICES_FILE": str(js.combined_script_indices_file_path),
4068
+ }
4069
+ env = {**dict(os.environ), **add_env}
4070
+
4071
+ # note: unlike in `Workflow.execute_run`, here we can be reasonably sure the
4072
+ # commands file already exists, because we call `Action.try_write_commands` with
4073
+ # `raise_on_unset=True` in `Workflow._add_submission` during submission.
4074
+
4075
+ # TODO: refactor cmd file name:
4076
+ cmd_file_path = sub.commands_path / f"js_{jobscript_idx}{js.shell.JS_EXT}"
4077
+ cmd = js.shell.get_command_file_launch_command(str(cmd_file_path))
3152
4078
 
3153
- def write_commands(
4079
+ self._app.submission_logger.debug(
4080
+ f"Executing combined runs via subprocess with command {cmd!r}, and "
4081
+ f"environment variables as below."
4082
+ )
4083
+ for k, v in env.items():
4084
+ if k.startswith(app_caps):
4085
+ self._app.submission_logger.debug(f"{k} = {v}")
4086
+
4087
+ exe = self._app.Executor(cmd, env, self._app.package_name)
4088
+ exe.start_zmq_server() # start the server
4089
+ exe.run() # this also shuts down the server
4090
+
4091
+ def ensure_commands_file(
3154
4092
  self,
3155
4093
  submission_idx: int,
3156
- jobscript_idx: int,
3157
- JS_action_idx: int,
3158
- EAR_ID: int,
3159
- ) -> None:
3160
- """Write run-time commands for a given EAR."""
4094
+ js_idx: int,
4095
+ run: ElementActionRun,
4096
+ ) -> Path | bool:
4097
+ """Ensure a commands file exists for the specified run."""
4098
+ self._app.persistence_logger.debug("Workflow.ensure_commands_file")
4099
+
4100
+ if run.commands_file_ID is None:
4101
+ # no commands to write
4102
+ return False
4103
+
3161
4104
  with self._store.cached_load():
3162
- self._app.persistence_logger.debug("Workflow.write_commands")
3163
- self._app.persistence_logger.debug(
3164
- f"loading jobscript (submission index: {submission_idx}; jobscript "
3165
- f"index: {jobscript_idx})"
3166
- )
3167
- jobscript = self.submissions[submission_idx].jobscripts[jobscript_idx]
3168
- self._app.persistence_logger.debug(f"loading run {EAR_ID!r}")
3169
- EAR = self.get_EARs_from_IDs(EAR_ID)
3170
- self._app.persistence_logger.debug(f"run {EAR_ID!r} loaded: {EAR!r}")
3171
- commands = self.__get_commands(jobscript, JS_action_idx, EAR)
3172
- self._app.persistence_logger.debug(f"commands to write: {commands!r}")
3173
- cmd_file_name = jobscript.get_commands_file_name(JS_action_idx)
3174
- with Path(cmd_file_name).open("wt", newline="\n") as fp:
3175
- # (assuming we have CD'd correctly to the element run directory)
3176
- fp.write(commands)
4105
+ sub = self.submissions[submission_idx]
4106
+ jobscript = sub.jobscripts[js_idx]
4107
+
4108
+ # check if a commands file already exists, first checking using the run ID:
4109
+ cmd_file_name = f"{run.id_}{jobscript.shell.JS_EXT}" # TODO: refactor
4110
+ cmd_file_path = jobscript.submission.commands_path / cmd_file_name
4111
+
4112
+ if not cmd_file_path.is_file():
4113
+ # then check for a file from the "root" run ID (the run ID of a run that
4114
+ # shares the same commands file):
4115
+
4116
+ cmd_file_name = (
4117
+ f"{run.commands_file_ID}{jobscript.shell.JS_EXT}" # TODO: refactor
4118
+ )
4119
+ cmd_file_path = jobscript.submission.commands_path / cmd_file_name
4120
+
4121
+ if not cmd_file_path.is_file():
4122
+ # no file available, so write (using the run ID):
4123
+ try:
4124
+ cmd_file_path = run.try_write_commands(
4125
+ jobscript=jobscript,
4126
+ environments=sub.environments,
4127
+ raise_on_unset=True,
4128
+ )
4129
+ except OutputFileParserNoOutputError:
4130
+ # no commands to write, might be used just for saving files
4131
+ return False
4132
+
4133
+ return cmd_file_path
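`ensure_commands_file` returns either the path of a usable commands file or `False` when the run has nothing to execute, so callers branch on the truthiness of the result (as `execute_run` does above). A sketch of handling the return value (the workflow, indices, and run objects are placeholders):

    from pathlib import Path

    # illustrative only: handle the Path-or-False return value
    cmd_file = wf.ensure_commands_file(submission_idx=0, js_idx=0, run=run)
    if cmd_file:
        assert isinstance(cmd_file, Path)
        print(f"will launch commands file: {cmd_file}")
    else:
        print("run has no commands to execute")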
3177
4134
 
3178
4135
  def process_shell_parameter_output(
3179
4136
  self, name: str, value: str, EAR_ID: int, cmd_idx: int, stderr: bool = False
@@ -3257,9 +4214,11 @@ class Workflow(AppAware):
3257
4214
  input_source.task_ref = uniq_names_cur[input_source.task_ref]
3258
4215
  except KeyError:
3259
4216
  raise InvalidInputSourceTaskReference(
3260
- input_source, input_source.task_ref
4217
+ f"Input source {input_source.to_string()!r} refers to a missing "
4218
+ f"or inaccessible task: {input_source.task_ref!r}."
3261
4219
  )
3262
4220
 
4221
+ @TimeIt.decorator
3263
4222
  def get_all_submission_run_IDs(self) -> Iterable[int]:
3264
4223
  """
3265
4224
  Get the run IDs of all submissions.
@@ -3268,68 +4227,6 @@ class Workflow(AppAware):
3268
4227
  for sub in self.submissions:
3269
4228
  yield from sub.all_EAR_IDs
3270
4229
 
3271
- def check_loop_termination(self, loop_name: str, run_ID: int) -> None:
3272
- """Check if a loop should terminate, given the specified completed run, and if so,
3273
- set downstream iteration runs to be skipped."""
3274
- loop = self.loops.get(loop_name)
3275
- elem_iter = self.get_EARs_from_IDs(run_ID).element_iteration
3276
- if loop.test_termination(elem_iter):
3277
- # run IDs of downstream iterations that can be skipped
3278
- to_skip: set[int] = set()
3279
- elem_id = elem_iter.element.id_
3280
- loop_map = self.get_loop_map() # over all jobscripts
3281
- for iter_idx, iter_dat in loop_map[loop_name][elem_id].items():
3282
- if iter_idx > elem_iter.index:
3283
- to_skip.update(itr_d.id_ for itr_d in iter_dat)
3284
- self._app.logger.info(
3285
- f"Loop {loop_name!r} termination condition met for run_ID {run_ID!r}."
3286
- )
3287
- for run_ID in to_skip:
3288
- self.set_EAR_skip(run_ID)
3289
-
3290
- def get_loop_map(
3291
- self, id_lst: Iterable[int] | None = None
3292
- ) -> Mapping[str, Mapping[int, Mapping[int, Sequence[_IterationData]]]]:
3293
- """
3294
- Get a description of what is going on with looping.
3295
- """
3296
- # TODO: test this works across multiple jobscripts
3297
- self._app.persistence_logger.debug("Workflow.get_loop_map")
3298
- if id_lst is None:
3299
- id_lst = self.get_all_submission_run_IDs()
3300
- loop_map: dict[str, dict[int, dict[int, list[_IterationData]]]] = defaultdict(
3301
- lambda: defaultdict(lambda: defaultdict(list))
3302
- )
3303
- for EAR in self.get_EARs_from_IDs(id_lst):
3304
- for loop_name, iter_idx in EAR.element_iteration.loop_idx.items():
3305
- act_idx = EAR.element_action.action_idx
3306
- loop_map[loop_name][EAR.element.id_][iter_idx].append(
3307
- _IterationData(EAR.id_, act_idx)
3308
- )
3309
- return loop_map
3310
-
3311
- def get_iteration_final_run_IDs(
3312
- self,
3313
- id_lst: Iterable[int] | None = None,
3314
- ) -> Mapping[str, Sequence[int]]:
3315
- """Retrieve the run IDs of those runs that correspond to the final action within
3316
- a named loop iteration.
3317
-
3318
- These runs represent the final action of a given element-iteration; this is used to
3319
- identify which commands file to append a loop-termination check to.
3320
- """
3321
- self._app.persistence_logger.debug("Workflow.get_iteration_final_run_IDs")
3322
-
3323
- loop_map = self.get_loop_map(id_lst)
3324
-
3325
- # find final EARs for each loop:
3326
- final_runs: dict[str, list[int]] = defaultdict(list)
3327
- for loop_name, dat in loop_map.items():
3328
- for elem_dat in dat.values():
3329
- for iter_dat in elem_dat.values():
3330
- final_runs[loop_name].append(max(iter_dat, key=lambda x: x.idx).id_)
3331
- return final_runs
3332
-
3333
4230
  def rechunk_runs(
3334
4231
  self,
3335
4232
  chunk_size: int | None = None,
@@ -3348,7 +4245,7 @@ class Workflow(AppAware):
3348
4245
  status: bool = True,
3349
4246
  ):
3350
4247
  """
3351
- Reorganise the stored data chunks for parameterss to be more efficient.
4248
+ Reorganise the stored data chunks for parameters to be more efficient.
3352
4249
  """
3353
4250
  self._store.rechunk_parameter_base(
3354
4251
  chunk_size=chunk_size, backup=backup, status=status
@@ -3366,6 +4263,311 @@ class Workflow(AppAware):
3366
4263
  self.rechunk_runs(chunk_size=chunk_size, backup=backup, status=status)
3367
4264
  self.rechunk_parameter_base(chunk_size=chunk_size, backup=backup, status=status)
3368
4265
 
4266
+ @TimeIt.decorator
4267
+ def get_run_directories(
4268
+ self,
4269
+ run_ids: list[int] | None = None,
4270
+ dir_indices_arr: np.ndarray | None = None,
4271
+ ) -> list[Path | None]:
4272
+ """"""
4273
+
4274
+ @TimeIt.decorator
4275
+ def _get_depth_dirs(
4276
+ item_idx: int,
4277
+ max_per_dir: int,
4278
+ max_depth: int,
4279
+ depth_idx_cache: dict[tuple[int, int], NDArray],
4280
+ prefix: str,
4281
+ ) -> list[str]:
4282
+ dirs = []
4283
+ max_avail_items = max_per_dir**max_depth
4284
+ for depth_i in range(1, max_depth):
4285
+ tot_items_per_level = int(max_avail_items / max_per_dir**depth_i)
4286
+ key = (max_avail_items, tot_items_per_level)
4287
+ if (depth_idx := depth_idx_cache.get(key)) is None:
4288
+ depth_idx = np.repeat(
4289
+ np.arange(max_avail_items / tot_items_per_level, dtype=int),
4290
+ tot_items_per_level,
4291
+ )
4292
+ depth_idx_cache[key] = depth_idx
4293
+ idx_i = cast("NDArray", depth_idx)[item_idx]
4294
+ start_idx = idx_i * tot_items_per_level
4295
+ end_idx = start_idx + tot_items_per_level - 1
4296
+ dirs.append(f"{prefix}_{start_idx}-{end_idx}")
4297
+ return dirs
4298
+
4299
+ if dir_indices_arr is None: # TODO: document behaviour!
4300
+ dir_indices_arr = self._store.get_dirs_array()
4301
+ if run_ids is not None:
4302
+ dir_indices_arr = dir_indices_arr[run_ids]
4303
+
4304
+ # TODO: make these configurable so easier to test!
4305
+ MAX_ELEMS_PER_DIR = 1000 # TODO: configurable (add `workflow_defaults` to Config)
4306
+ MAX_ITERS_PER_DIR = 1000
4307
+
4308
+ exec_path = self.execution_path
4309
+
4310
+ # a fill value means no sub directory should be created
4311
+ T_FILL, E_FILL, I_FILL, A_FILL, R_FILL, _, _ = RUN_DIR_ARR_FILL
4312
+
4313
+ depth_idx_cache: dict[
4314
+ tuple[int, int], NDArray
4315
+ ] = {} # keys are (max_avail, tot_elems_per_dir_level)
4316
+
4317
+ # format run directories:
4318
+ dirs = []
4319
+ for dir_data in dir_indices_arr:
4320
+
4321
+ # TODO: retrieve task,element,iteration,action,run dir formats from
4322
+ # (t_iID, act_idx) combo (cached)?
4323
+
4324
+ t_iID, e_idx, i_idx, _, r_idx, e_depth, i_depth = dir_data
4325
+ path_args = []
4326
+
4327
+ if t_iID != T_FILL:
4328
+ path_args.append(f"t_{t_iID}")
4329
+
4330
+ if e_idx != E_FILL:
4331
+ if e_depth > 1:
4332
+ path_args.extend(
4333
+ _get_depth_dirs(
4334
+ item_idx=e_idx,
4335
+ max_per_dir=MAX_ELEMS_PER_DIR,
4336
+ max_depth=e_depth,
4337
+ depth_idx_cache=depth_idx_cache,
4338
+ prefix="e",
4339
+ )
4340
+ )
4341
+ path_args.append(f"e_{e_idx}")
4342
+
4343
+ if i_idx != I_FILL:
4344
+ if i_depth > 1:
4345
+ path_args.extend(
4346
+ _get_depth_dirs(
4347
+ item_idx=i_idx,
4348
+ max_per_dir=MAX_ITERS_PER_DIR,
4349
+ max_depth=i_depth,
4350
+ depth_idx_cache=depth_idx_cache,
4351
+ prefix="i",
4352
+ )
4353
+ )
4354
+ path_args.append(f"i_{i_idx}")
4355
+
4356
+ if r_idx != R_FILL:
4357
+ path_args.append(f"r_{r_idx}")
4358
+
4359
+ if path_args:
4360
+ run_dir = exec_path.joinpath(*path_args)
4361
+ elif e_depth == 1:
4362
+ run_dir = exec_path
4363
+ else:
4364
+ run_dir = None
4365
+
4366
+ dirs.append(run_dir)
4367
+
4368
+ return dirs
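The depth-directory scheme above keeps at most 1000 elements (or iterations) per directory level. A worked example of the bucketing arithmetic, assuming the defaults shown in the diff (`MAX_ELEMS_PER_DIR = 1000`) and an element depth of 2:

    # illustrative: reproduce the bucket computed by _get_depth_dirs for one element
    max_per_dir, max_depth, elem_idx = 1000, 2, 1234

    max_avail = max_per_dir**max_depth      # 1_000_000 addressable elements
    per_level = max_avail // max_per_dir    # 1000 elements per top-level bucket
    bucket = elem_idx // per_level          # bucket index 1
    start, end = bucket * per_level, bucket * per_level + per_level - 1

    print(f"e_{start}-{end}/e_{elem_idx}")  # -> "e_1000-1999/e_1234"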
4369
+
4370
+ @TimeIt.decorator
4371
+ def get_scheduler_job_IDs(self) -> tuple[str, ...]:
4372
+ """Return jobscript scheduler job IDs from all submissions of this workflow."""
4373
+ return tuple(
4374
+ IDs_j for sub_i in self.submissions for IDs_j in sub_i.get_scheduler_job_IDs()
4375
+ )
4376
+
4377
+ @TimeIt.decorator
4378
+ def get_process_IDs(self) -> tuple[int, ...]:
4379
+ """Return jobscript process IDs from all submissions of this workflow."""
4380
+ return tuple(
4381
+ IDs_j for sub_i in self.submissions for IDs_j in sub_i.get_process_IDs()
4382
+ )
4383
+
4384
+ @TimeIt.decorator
4385
+ def list_jobscripts(
4386
+ self,
4387
+ sub_idx: int = 0,
4388
+ max_js: int | None = None,
4389
+ jobscripts: list[int] | None = None,
4390
+ width: int | None = None,
4391
+ ) -> None:
4392
+ """Print a table listing jobscripts and associated information from the specified
4393
+ submission.
4394
+
4395
+ Parameters
4396
+ ----------
4397
+ sub_idx
4398
+ The submission index whose jobscripts are to be displayed.
4399
+ max_js
4400
+ Maximum jobscript index to display. This cannot be specified with `jobscripts`.
4401
+ jobscripts
4402
+ A list of jobscripts to display. This cannot be specified with `max_js`.
4403
+ width
4404
+ Width in characters of the printed table.
4405
+ """
4406
+
4407
+ with self._store.cached_load():
4408
+
4409
+ if max_js is not None and jobscripts is not None:
4410
+ raise ValueError("Do not specify both `max_js` and `jobscripts`.")
4411
+
4412
+ loop_names = [i.name for i in self.loops][::-1]
4413
+ loop_names_panel: rich.panel.Panel | str = ""
4414
+ if loop_names:
4415
+ loop_names_panel = rich.panel.Panel(
4416
+ "\n".join(f"{idx}: {i}" for idx, i in enumerate(loop_names)),
4417
+ title="[b]Loops[/b]",
4418
+ title_align="left",
4419
+ box=rich.box.SIMPLE,
4420
+ )
4421
+
4422
+ table = rich.table.Table(width=width)
4423
+
4424
+ table.add_column("Jobscript", justify="right", style="cyan", no_wrap=True)
4425
+ table.add_column("Acts, Elms", justify="right", style="green")
4426
+ table.add_column("Deps.", style="orange3")
4427
+ table.add_column("Tasks", overflow="fold")
4428
+ table.add_column("Loops")
4429
+
4430
+ sub_js = self.submissions[sub_idx].jobscripts
4431
+ max_js = max_js if max_js is not None else len(sub_js)
4432
+ for js in sub_js:
4433
+ if jobscripts is not None and js.index not in jobscripts:
4434
+ continue
4435
+ if js.index > max_js:
4436
+ break
4437
+ for blk in js.blocks:
4438
+ blk_task_actions = blk.task_actions
4439
+ num_actions = blk_task_actions.shape[0]
4440
+
4441
+ if blk.index == 0:
4442
+ c1 = f"{js.index} - {blk.index}"
4443
+ else:
4444
+ c1 = f"{blk.index}"
4445
+ c3 = f"{num_actions}, {blk.num_elements}"
4446
+
4447
+ deps = "; ".join(f"{i[0],i[1]}" for i in blk.dependencies)
4448
+
4449
+ for blk_t_idx, t_iID in enumerate(blk.task_insert_IDs):
4450
+
4451
+ # loop indices are the same for all actions within a task, so get the
4452
+ # first `task_action` for this task insert ID:
4453
+ for i in blk_task_actions:
4454
+ if i[0] == t_iID:
4455
+ loop_idx = [
4456
+ blk.task_loop_idx[i[2]].get(loop_name_i, "-")
4457
+ for loop_name_i in loop_names
4458
+ ]
4459
+ break
4460
+
4461
+ c2 = self.tasks.get(insert_ID=t_iID).unique_name
4462
+
4463
+ if blk_t_idx > 0:
4464
+ c1 = ""
4465
+ c3 = ""
4466
+ deps = ""
4467
+
4468
+ table.add_row(
4469
+ c1, c3, deps, c2, (" | ".join(f"{i}" for i in loop_idx))
4470
+ )
4471
+
4472
+ table.add_section()
4473
+
4474
+ group = rich.console.Group(
4475
+ rich.text.Text(f"Workflow: {self.name}"),
4476
+ rich.text.Text(f"Submission: {sub_idx}" + ("\n" if loop_names_panel else "")),
4477
+ loop_names_panel,
4478
+ table,
4479
+ )
4480
+ rich_print(group)
4481
+
4482
+ def list_task_jobscripts(
4483
+ self,
4484
+ sub_idx: int = 0,
4485
+ task_names: list[str] | None = None,
4486
+ max_js: int | None = None,
4487
+ width: int | None = None,
4488
+ ):
4489
+ """Print a table listing the jobscripts associated with the specified (or all)
4490
+ tasks for the specified submission.
4491
+
4492
+ Parameters
4493
+ ----------
4494
+ sub_idx
4495
+ The submission index whose jobscripts are to be displayed.
4496
+ task_names
4497
+ List of sub-strings to match to task names. Only matching task names will be
4498
+ included.
4499
+ max_js
4500
+ Maximum jobscript index to display.
4501
+ width
4502
+ Width in characters of the printed table.
4503
+ """
4504
+
4505
+ with self._store.cached_load():
4506
+ loop_names = [i.name for i in self.loops][::-1]
4507
+ loop_names_panel: rich.panel.Panel | str = ""
4508
+ if loop_names:
4509
+ loop_names_panel = rich.panel.Panel(
4510
+ "\n".join(f"{idx}: {i}" for idx, i in enumerate(loop_names)),
4511
+ title="[b]Loops[/b]",
4512
+ title_align="left",
4513
+ box=rich.box.SIMPLE,
4514
+ )
4515
+
4516
+ sub_js = self.submissions[sub_idx].jobscripts
4517
+ all_task_names = {i.insert_ID: i.unique_name for i in self.tasks}
4518
+
4519
+ # filter task names by those matching the specified names
4520
+ matched = all_task_names
4521
+ if task_names:
4522
+ matched = {
4523
+ k: v
4524
+ for k, v in all_task_names.items()
4525
+ if any(i in v for i in task_names)
4526
+ }
4527
+
4528
+ task_jobscripts = defaultdict(list)
4529
+ for js in sub_js:
4530
+ if max_js is not None and js.index > max_js:
4531
+ break
4532
+ for blk in js.blocks:
4533
+ blk_task_actions = blk.task_actions
4534
+ for i in blk.task_insert_IDs:
4535
+ if i in matched:
4536
+ for j in blk_task_actions:
4537
+ if j[0] == i:
4538
+ loop_idx = [
4539
+ blk.task_loop_idx[j[2]].get(loop_name_i, "-")
4540
+ for loop_name_i in loop_names
4541
+ ]
4542
+ break
4543
+ task_jobscripts[i].append((js.index, blk.index, loop_idx))
4544
+
4545
+ table = rich.table.Table(width=width)
4546
+ table.add_column("Task")
4547
+ table.add_column("Jobscripts", style="cyan", no_wrap=True)
4548
+ table.add_column("Loops")
4549
+ for insert_ID_i, jobscripts_i in task_jobscripts.items():
4550
+ for idx, js_j in enumerate(jobscripts_i):
4551
+ js_idx, blk_idx, loop_idx = js_j
4552
+ table.add_row(
4553
+ matched[insert_ID_i] if idx == 0 else "",
4554
+ f"({js_idx}, {blk_idx})",
4555
+ (" | ".join(f"{i}" for i in loop_idx)),
4556
+ )
4557
+ table.add_section()
4558
+
4559
+ group = rich.console.Group(
4560
+ rich.text.Text(f"Workflow: {self.name}"),
4561
+ rich.text.Text(f"Submission: {sub_idx}" + ("\n" if loop_names_panel else "")),
4562
+ loop_names_panel,
4563
+ table,
4564
+ )
4565
+ rich_print(group)
4566
+
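Both listing helpers above print a `rich` table to the console rather than returning data. A short usage sketch (the submission index and task-name substring are placeholders):

    # illustrative: inspect the first submission's jobscripts
    wf.list_jobscripts(sub_idx=0, max_js=10)

    # only jobscripts that include tasks whose names contain "simulate"
    wf.list_task_jobscripts(sub_idx=0, task_names=["simulate"])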
4567
+ def get_text_file(self, path: str | Path) -> str:
4568
+ """Retrieve the contents of a text file stored within the workflow."""
4569
+ return self._store.get_text_file(path)
4570
+
3369
4571
 
3370
4572
  @dataclass
3371
4573
  class WorkflowBlueprint: