hpcflow-new2 0.2.0a189__py3-none-any.whl → 0.2.0a199__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in the public registry.
Files changed (176)
  1. hpcflow/__pyinstaller/hook-hpcflow.py +9 -6
  2. hpcflow/_version.py +1 -1
  3. hpcflow/app.py +1 -0
  4. hpcflow/data/scripts/bad_script.py +2 -0
  5. hpcflow/data/scripts/do_nothing.py +2 -0
  6. hpcflow/data/scripts/env_specifier_test/input_file_generator_pass_env_spec.py +4 -0
  7. hpcflow/data/scripts/env_specifier_test/main_script_test_pass_env_spec.py +8 -0
  8. hpcflow/data/scripts/env_specifier_test/output_file_parser_pass_env_spec.py +4 -0
  9. hpcflow/data/scripts/env_specifier_test/v1/input_file_generator_basic.py +4 -0
  10. hpcflow/data/scripts/env_specifier_test/v1/main_script_test_direct_in_direct_out.py +7 -0
  11. hpcflow/data/scripts/env_specifier_test/v1/output_file_parser_basic.py +4 -0
  12. hpcflow/data/scripts/env_specifier_test/v2/main_script_test_direct_in_direct_out.py +7 -0
  13. hpcflow/data/scripts/input_file_generator_basic.py +3 -0
  14. hpcflow/data/scripts/input_file_generator_basic_FAIL.py +3 -0
  15. hpcflow/data/scripts/input_file_generator_test_stdout_stderr.py +8 -0
  16. hpcflow/data/scripts/main_script_test_direct_in.py +3 -0
  17. hpcflow/data/scripts/main_script_test_direct_in_direct_out_2.py +6 -0
  18. hpcflow/data/scripts/main_script_test_direct_in_direct_out_2_fail_allowed.py +6 -0
  19. hpcflow/data/scripts/main_script_test_direct_in_direct_out_2_fail_allowed_group.py +7 -0
  20. hpcflow/data/scripts/main_script_test_direct_in_direct_out_3.py +6 -0
  21. hpcflow/data/scripts/main_script_test_direct_in_group_direct_out_3.py +6 -0
  22. hpcflow/data/scripts/main_script_test_direct_in_group_one_fail_direct_out_3.py +6 -0
  23. hpcflow/data/scripts/main_script_test_hdf5_in_obj.py +1 -1
  24. hpcflow/data/scripts/main_script_test_hdf5_in_obj_2.py +12 -0
  25. hpcflow/data/scripts/main_script_test_hdf5_out_obj.py +1 -1
  26. hpcflow/data/scripts/main_script_test_json_out_FAIL.py +3 -0
  27. hpcflow/data/scripts/main_script_test_shell_env_vars.py +12 -0
  28. hpcflow/data/scripts/main_script_test_std_out_std_err.py +6 -0
  29. hpcflow/data/scripts/output_file_parser_basic.py +3 -0
  30. hpcflow/data/scripts/output_file_parser_basic_FAIL.py +7 -0
  31. hpcflow/data/scripts/output_file_parser_test_stdout_stderr.py +8 -0
  32. hpcflow/data/scripts/script_exit_test.py +5 -0
  33. hpcflow/data/template_components/environments.yaml +1 -1
  34. hpcflow/sdk/__init__.py +26 -15
  35. hpcflow/sdk/app.py +2192 -768
  36. hpcflow/sdk/cli.py +506 -296
  37. hpcflow/sdk/cli_common.py +105 -7
  38. hpcflow/sdk/config/__init__.py +1 -1
  39. hpcflow/sdk/config/callbacks.py +115 -43
  40. hpcflow/sdk/config/cli.py +126 -103
  41. hpcflow/sdk/config/config.py +674 -318
  42. hpcflow/sdk/config/config_file.py +131 -95
  43. hpcflow/sdk/config/errors.py +125 -84
  44. hpcflow/sdk/config/types.py +148 -0
  45. hpcflow/sdk/core/__init__.py +25 -1
  46. hpcflow/sdk/core/actions.py +1771 -1059
  47. hpcflow/sdk/core/app_aware.py +24 -0
  48. hpcflow/sdk/core/cache.py +139 -79
  49. hpcflow/sdk/core/command_files.py +263 -287
  50. hpcflow/sdk/core/commands.py +145 -112
  51. hpcflow/sdk/core/element.py +828 -535
  52. hpcflow/sdk/core/enums.py +192 -0
  53. hpcflow/sdk/core/environment.py +74 -93
  54. hpcflow/sdk/core/errors.py +455 -52
  55. hpcflow/sdk/core/execute.py +207 -0
  56. hpcflow/sdk/core/json_like.py +540 -272
  57. hpcflow/sdk/core/loop.py +751 -347
  58. hpcflow/sdk/core/loop_cache.py +164 -47
  59. hpcflow/sdk/core/object_list.py +370 -207
  60. hpcflow/sdk/core/parameters.py +1100 -627
  61. hpcflow/sdk/core/rule.py +59 -41
  62. hpcflow/sdk/core/run_dir_files.py +21 -37
  63. hpcflow/sdk/core/skip_reason.py +7 -0
  64. hpcflow/sdk/core/task.py +1649 -1339
  65. hpcflow/sdk/core/task_schema.py +308 -196
  66. hpcflow/sdk/core/test_utils.py +191 -114
  67. hpcflow/sdk/core/types.py +440 -0
  68. hpcflow/sdk/core/utils.py +485 -309
  69. hpcflow/sdk/core/validation.py +82 -9
  70. hpcflow/sdk/core/workflow.py +2544 -1178
  71. hpcflow/sdk/core/zarr_io.py +98 -137
  72. hpcflow/sdk/data/workflow_spec_schema.yaml +2 -0
  73. hpcflow/sdk/demo/cli.py +53 -33
  74. hpcflow/sdk/helper/cli.py +18 -15
  75. hpcflow/sdk/helper/helper.py +75 -63
  76. hpcflow/sdk/helper/watcher.py +61 -28
  77. hpcflow/sdk/log.py +122 -71
  78. hpcflow/sdk/persistence/__init__.py +8 -31
  79. hpcflow/sdk/persistence/base.py +1360 -606
  80. hpcflow/sdk/persistence/defaults.py +6 -0
  81. hpcflow/sdk/persistence/discovery.py +38 -0
  82. hpcflow/sdk/persistence/json.py +568 -188
  83. hpcflow/sdk/persistence/pending.py +382 -179
  84. hpcflow/sdk/persistence/store_resource.py +39 -23
  85. hpcflow/sdk/persistence/types.py +318 -0
  86. hpcflow/sdk/persistence/utils.py +14 -11
  87. hpcflow/sdk/persistence/zarr.py +1337 -433
  88. hpcflow/sdk/runtime.py +44 -41
  89. hpcflow/sdk/submission/{jobscript_info.py → enums.py} +39 -12
  90. hpcflow/sdk/submission/jobscript.py +1651 -692
  91. hpcflow/sdk/submission/schedulers/__init__.py +167 -39
  92. hpcflow/sdk/submission/schedulers/direct.py +121 -81
  93. hpcflow/sdk/submission/schedulers/sge.py +170 -129
  94. hpcflow/sdk/submission/schedulers/slurm.py +291 -268
  95. hpcflow/sdk/submission/schedulers/utils.py +12 -2
  96. hpcflow/sdk/submission/shells/__init__.py +14 -15
  97. hpcflow/sdk/submission/shells/base.py +150 -29
  98. hpcflow/sdk/submission/shells/bash.py +283 -173
  99. hpcflow/sdk/submission/shells/os_version.py +31 -30
  100. hpcflow/sdk/submission/shells/powershell.py +228 -170
  101. hpcflow/sdk/submission/submission.py +1014 -335
  102. hpcflow/sdk/submission/types.py +140 -0
  103. hpcflow/sdk/typing.py +182 -12
  104. hpcflow/sdk/utils/arrays.py +71 -0
  105. hpcflow/sdk/utils/deferred_file.py +55 -0
  106. hpcflow/sdk/utils/hashing.py +16 -0
  107. hpcflow/sdk/utils/patches.py +12 -0
  108. hpcflow/sdk/utils/strings.py +33 -0
  109. hpcflow/tests/api/test_api.py +32 -0
  110. hpcflow/tests/conftest.py +27 -6
  111. hpcflow/tests/data/multi_path_sequences.yaml +29 -0
  112. hpcflow/tests/data/workflow_test_run_abort.yaml +34 -35
  113. hpcflow/tests/schedulers/sge/test_sge_submission.py +36 -0
  114. hpcflow/tests/schedulers/slurm/test_slurm_submission.py +5 -2
  115. hpcflow/tests/scripts/test_input_file_generators.py +282 -0
  116. hpcflow/tests/scripts/test_main_scripts.py +866 -85
  117. hpcflow/tests/scripts/test_non_snippet_script.py +46 -0
  118. hpcflow/tests/scripts/test_ouput_file_parsers.py +353 -0
  119. hpcflow/tests/shells/wsl/test_wsl_submission.py +12 -4
  120. hpcflow/tests/unit/test_action.py +262 -75
  121. hpcflow/tests/unit/test_action_rule.py +9 -4
  122. hpcflow/tests/unit/test_app.py +33 -6
  123. hpcflow/tests/unit/test_cache.py +46 -0
  124. hpcflow/tests/unit/test_cli.py +134 -1
  125. hpcflow/tests/unit/test_command.py +71 -54
  126. hpcflow/tests/unit/test_config.py +142 -16
  127. hpcflow/tests/unit/test_config_file.py +21 -18
  128. hpcflow/tests/unit/test_element.py +58 -62
  129. hpcflow/tests/unit/test_element_iteration.py +50 -1
  130. hpcflow/tests/unit/test_element_set.py +29 -19
  131. hpcflow/tests/unit/test_group.py +4 -2
  132. hpcflow/tests/unit/test_input_source.py +116 -93
  133. hpcflow/tests/unit/test_input_value.py +29 -24
  134. hpcflow/tests/unit/test_jobscript_unit.py +757 -0
  135. hpcflow/tests/unit/test_json_like.py +44 -35
  136. hpcflow/tests/unit/test_loop.py +1396 -84
  137. hpcflow/tests/unit/test_meta_task.py +325 -0
  138. hpcflow/tests/unit/test_multi_path_sequences.py +229 -0
  139. hpcflow/tests/unit/test_object_list.py +17 -12
  140. hpcflow/tests/unit/test_parameter.py +29 -7
  141. hpcflow/tests/unit/test_persistence.py +237 -42
  142. hpcflow/tests/unit/test_resources.py +20 -18
  143. hpcflow/tests/unit/test_run.py +117 -6
  144. hpcflow/tests/unit/test_run_directories.py +29 -0
  145. hpcflow/tests/unit/test_runtime.py +2 -1
  146. hpcflow/tests/unit/test_schema_input.py +23 -15
  147. hpcflow/tests/unit/test_shell.py +23 -2
  148. hpcflow/tests/unit/test_slurm.py +8 -7
  149. hpcflow/tests/unit/test_submission.py +38 -89
  150. hpcflow/tests/unit/test_task.py +352 -247
  151. hpcflow/tests/unit/test_task_schema.py +33 -20
  152. hpcflow/tests/unit/test_utils.py +9 -11
  153. hpcflow/tests/unit/test_value_sequence.py +15 -12
  154. hpcflow/tests/unit/test_workflow.py +114 -83
  155. hpcflow/tests/unit/test_workflow_template.py +0 -1
  156. hpcflow/tests/unit/utils/test_arrays.py +40 -0
  157. hpcflow/tests/unit/utils/test_deferred_file_writer.py +34 -0
  158. hpcflow/tests/unit/utils/test_hashing.py +65 -0
  159. hpcflow/tests/unit/utils/test_patches.py +5 -0
  160. hpcflow/tests/unit/utils/test_redirect_std.py +50 -0
  161. hpcflow/tests/workflows/__init__.py +0 -0
  162. hpcflow/tests/workflows/test_directory_structure.py +31 -0
  163. hpcflow/tests/workflows/test_jobscript.py +334 -1
  164. hpcflow/tests/workflows/test_run_status.py +198 -0
  165. hpcflow/tests/workflows/test_skip_downstream.py +696 -0
  166. hpcflow/tests/workflows/test_submission.py +140 -0
  167. hpcflow/tests/workflows/test_workflows.py +160 -15
  168. hpcflow/tests/workflows/test_zip.py +18 -0
  169. hpcflow/viz_demo.ipynb +6587 -3
  170. {hpcflow_new2-0.2.0a189.dist-info → hpcflow_new2-0.2.0a199.dist-info}/METADATA +8 -4
  171. hpcflow_new2-0.2.0a199.dist-info/RECORD +221 -0
  172. hpcflow/sdk/core/parallel.py +0 -21
  173. hpcflow_new2-0.2.0a189.dist-info/RECORD +0 -158
  174. {hpcflow_new2-0.2.0a189.dist-info → hpcflow_new2-0.2.0a199.dist-info}/LICENSE +0 -0
  175. {hpcflow_new2-0.2.0a189.dist-info → hpcflow_new2-0.2.0a199.dist-info}/WHEEL +0 -0
  176. {hpcflow_new2-0.2.0a189.dist-info → hpcflow_new2-0.2.0a199.dist-info}/entry_points.txt +0 -0
@@ -7,19 +7,24 @@ from __future__ import annotations
  import copy
  from contextlib import contextmanager
  from dataclasses import dataclass
- from datetime import datetime
  from pathlib import Path
+ from typing import Any, cast, TYPE_CHECKING
+ from typing_extensions import override
  import shutil
  import time
- from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union

  import numpy as np
- import zarr
- from fsspec.implementations.zip import ZipFileSystem
+ from numpy.ma.core import MaskedArray
+ import zarr # type: ignore
+ from zarr.errors import BoundsCheckError # type: ignore
+ from zarr.storage import DirectoryStore, FSStore # type: ignore
+ from fsspec.implementations.zip import ZipFileSystem # type: ignore
  from rich.console import Console
- from numcodecs import MsgPack, VLenArray, blosc, Blosc, Zstd
- from reretry import retry
+ from numcodecs import MsgPack, VLenArray, blosc, Blosc, Zstd # type: ignore
+ from reretry import retry # type: ignore

+ from hpcflow.sdk.typing import hydrate
+ from hpcflow.sdk.core import RUN_DIR_ARR_DTYPE, RUN_DIR_ARR_FILL
  from hpcflow.sdk.core.errors import (
  MissingParameterData,
  MissingStoreEARError,
@@ -38,18 +43,60 @@ from hpcflow.sdk.persistence.base import (
  StoreParameter,
  StoreTask,
  )
+ from hpcflow.sdk.persistence.types import (
+ LoopDescriptor,
+ StoreCreationInfo,
+ TemplateMeta,
+ ZarrAttrsDict,
+ )
  from hpcflow.sdk.persistence.store_resource import ZarrAttrsStoreResource
  from hpcflow.sdk.persistence.utils import ask_pw_on_auth_exc
  from hpcflow.sdk.persistence.pending import CommitResourceMap
  from hpcflow.sdk.persistence.base import update_param_source_dict
  from hpcflow.sdk.log import TimeIt
+ from hpcflow.sdk.submission.submission import (
+ JOBSCRIPT_SUBMIT_TIME_KEYS,
+ SUBMISSION_SUBMIT_TIME_KEYS,
+ )
+ from hpcflow.sdk.utils.arrays import get_2D_idx, split_arr
+ from hpcflow.sdk.utils.strings import shorten_list_str
+
+ if TYPE_CHECKING:
+ from collections.abc import (
+ Callable,
+ Iterable,
+ Iterator,
+ Mapping,
+ MutableMapping,
+ Sequence,
+ )
+ from datetime import datetime
+ from fsspec import AbstractFileSystem # type: ignore
+ from logging import Logger
+ from typing import ClassVar
+ from typing_extensions import Self, TypeAlias
+ from numpy.typing import NDArray
+ from zarr import Array, Group # type: ignore
+ from zarr.attrs import Attributes # type: ignore
+ from zarr.storage import Store # type: ignore
+ from ..submission.types import ResolvedJobscriptBlockDependencies
+ from .types import TypeLookup
+ from ..app import BaseApp
+ from ..core.json_like import JSONed, JSONDocument
+ from ..typing import ParamSource, PathLike, DataIndex
+
+ #: List of any (Zarr-serializable) value.
+ ListAny: TypeAlias = "list[Any]"
+ #: Zarr attribute mapping context.
+ ZarrAttrs: TypeAlias = "dict[str, Any]"
+ _JS: TypeAlias = "dict[str, list[dict[str, dict]]]"


  blosc.use_threads = False # hpcflow is a multiprocess program in general


  @TimeIt.decorator
- def _zarr_get_coord_selection(arr, selection, logger):
+ def _zarr_get_coord_selection(arr: Array, selection: Any, logger: Logger):
  @retry(
  RuntimeError,
  tries=10,
@@ -59,53 +106,84 @@ def _zarr_get_coord_selection(arr, selection, logger):
  logger=logger,
  )
  @TimeIt.decorator
- def _inner(arr, selection):
+ def _inner(arr: Array, selection: Any):
  return arr.get_coordinate_selection(selection)

  return _inner(arr, selection)


- def _encode_numpy_array(obj, type_lookup, path, root_group, arr_path):
+ def _encode_numpy_array(
+ obj: NDArray,
+ type_lookup: TypeLookup,
+ path: list[int],
+ root_group: Group,
+ arr_path: list[int],
+ ) -> int:
  # Might need to generate new group:
  param_arr_group = root_group.require_group(arr_path)
- names = [int(i.split("arr_")[1]) for i in param_arr_group.keys()]
- if not names:
- new_idx = 0
- else:
- new_idx = max(names) + 1
+ new_idx = (
+ max((int(i.removeprefix("arr_")) for i in param_arr_group.keys()), default=-1) + 1
+ )
  param_arr_group.create_dataset(name=f"arr_{new_idx}", data=obj)
  type_lookup["arrays"].append([path, new_idx])

  return len(type_lookup["arrays"]) - 1


- def _decode_numpy_arrays(obj, type_lookup, path, arr_group, dataset_copy):
- for arr_path, arr_idx in type_lookup["arrays"]:
+ def _decode_numpy_arrays(
+ obj: dict | None,
+ type_lookup: TypeLookup,
+ path: list[int],
+ arr_group: Group,
+ dataset_copy: bool,
+ ):
+ # Yuck! Type lies! Zarr's internal types are not modern Python types.
+ arrays = cast("Iterable[tuple[list[int], int]]", type_lookup.get("arrays", []))
+ obj_: dict | NDArray | None = obj
+ for arr_path, arr_idx in arrays:
  try:
  rel_path = get_relative_path(arr_path, path)
  except ValueError:
  continue

- dataset = arr_group.get(f"arr_{arr_idx}")
+ dataset: NDArray = arr_group.get(f"arr_{arr_idx}")
  if dataset_copy:
  dataset = dataset[:]

  if rel_path:
- set_in_container(obj, rel_path, dataset)
+ set_in_container(obj_, rel_path, dataset)
  else:
- obj = dataset
+ obj_ = dataset

- return obj
+ return obj_


- def _encode_masked_array(obj, type_lookup, path, root_group, arr_path):
+ def _encode_masked_array(
+ obj: MaskedArray,
+ type_lookup: TypeLookup,
+ path: list[int],
+ root_group: Group,
+ arr_path: list[int],
+ ):
  data_idx = _encode_numpy_array(obj.data, type_lookup, path, root_group, arr_path)
  mask_idx = _encode_numpy_array(obj.mask, type_lookup, path, root_group, arr_path)
  type_lookup["masked_arrays"].append([path, [data_idx, mask_idx]])


- def _decode_masked_arrays(obj, type_lookup, path, arr_group, dataset_copy):
- for arr_path, (data_idx, mask_idx) in type_lookup["masked_arrays"]:
+ def _decode_masked_arrays(
+ obj: dict,
+ type_lookup: TypeLookup,
+ path: list[int],
+ arr_group: Group,
+ dataset_copy: bool,
+ ):
+ # Yuck! Type lies! Zarr's internal types are not modern Python types.
+ masked_arrays = cast(
+ "Iterable[tuple[list[int], tuple[int, int]]]",
+ type_lookup.get("masked_arrays", []),
+ )
+ obj_: dict | MaskedArray = obj
+ for arr_path, (data_idx, mask_idx) in masked_arrays:
  try:
  rel_path = get_relative_path(arr_path, path)
  except ValueError:
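Aside: `_encode_masked_array` above stores a masked array as two plain arrays (data and mask), and `_decode_masked_arrays` (continued in the next hunk) rebuilds the `MaskedArray` from them. A minimal, self-contained illustration of that round trip, independent of the Zarr storage:

    import numpy as np
    from numpy.ma.core import MaskedArray

    value = np.ma.masked_array([1.0, 2.0, 3.0], mask=[False, True, False])
    data, mask = np.asarray(value.data), np.asarray(value.mask)  # two plain, storable arrays

    restored = MaskedArray(data=data, mask=mask)  # same reconstruction as in _decode_masked_arrays
    assert np.array_equal(restored.compressed(), value.compressed())
    assert restored.mask.tolist() == value.mask.tolist()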
@@ -113,17 +191,17 @@ def _decode_masked_arrays(obj, type_lookup, path, arr_group, dataset_copy):

  data = arr_group.get(f"arr_{data_idx}")
  mask = arr_group.get(f"arr_{mask_idx}")
- dataset = np.ma.core.MaskedArray(data=data, mask=mask)
+ dataset: MaskedArray = MaskedArray(data=data, mask=mask)

  if rel_path:
- set_in_container(obj, rel_path, dataset)
+ set_in_container(obj_, rel_path, dataset)
  else:
- obj = dataset
+ obj_ = dataset

- return obj
+ return obj_


- def append_items_to_ragged_array(arr, items):
+ def append_items_to_ragged_array(arr: Array, items: Sequence[int]):
  """Append an array to a Zarr ragged array.

  I think `arr.append([item])` should work, but does not for some reason, so we do it
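Aside: the `append_items_to_ragged_array` docstring notes that `arr.append([item])` does not behave as expected for Zarr ragged arrays, so items are appended individually. The following rough sketch shows one way to grow a variable-length ("ragged") Zarr array by resizing it and assigning a NumPy object-array block; it is an assumption about the general technique, not the function's actual body (which is not shown in this hunk):

    import numpy as np
    import zarr
    from numcodecs import VLenArray

    # 1D array whose elements are themselves variable-length integer arrays
    arr = zarr.empty(0, dtype=object, object_codec=VLenArray("<i8"))

    items = [np.array([1, 2, 3]), np.array([4]), np.array([5, 6])]
    start = arr.shape[0]
    arr.resize(start + len(items))

    block = np.empty(len(items), dtype=object)  # object wrapper keeps the items ragged
    block[:] = items
    arr[start:] = block

    print(arr[2])  # [5 6]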
@@ -135,36 +213,39 @@ def append_items_to_ragged_array(arr, items):


  @dataclass
- class ZarrStoreTask(StoreTask):
+ class ZarrStoreTask(StoreTask[dict]):
  """
  Represents a task in a Zarr persistent store.
  """

- def encode(self) -> Tuple[int, np.ndarray, Dict]:
+ @override
+ def encode(self) -> tuple[int, dict, dict[str, Any]]:
  """Prepare store task data for the persistent store."""
  wk_task = {"id_": self.id_, "element_IDs": np.array(self.element_IDs)}
- task = {"id_": self.id_, **self.task_template}
+ task = {"id_": self.id_, **(self.task_template or {})}
  return self.index, wk_task, task

+ @override
  @classmethod
- def decode(cls, task_dat: Dict) -> ZarrStoreTask:
+ def decode(cls, task_dat: dict) -> Self:
  """Initialise a `StoreTask` from persistent task data"""
  task_dat["element_IDs"] = task_dat["element_IDs"].tolist()
- return super().decode(task_dat)
+ return cls(is_pending=False, **task_dat)


  @dataclass
- class ZarrStoreElement(StoreElement):
+ class ZarrStoreElement(StoreElement[ListAny, ZarrAttrs]):
  """
  Represents an element in a Zarr persistent store.
  """

- def encode(self, attrs: Dict) -> List:
+ @override
+ def encode(self, attrs: ZarrAttrs) -> ListAny:
  """Prepare store elements data for the persistent store.

  This method mutates `attrs`.
  """
- elem_enc = [
+ return [
  self.id_,
  self.index,
  self.es_idx,
@@ -173,10 +254,10 @@ class ZarrStoreElement(StoreElement):
  self.task_ID,
  self.iteration_IDs,
  ]
- return elem_enc

+ @override
  @classmethod
- def decode(cls, elem_dat: List, attrs: Dict) -> ZarrStoreElement:
+ def decode(cls, elem_dat: ListAny, attrs: ZarrAttrs) -> Self:
  """Initialise a `StoreElement` from persistent element data"""
  obj_dat = {
  "id_": elem_dat[0],
@@ -191,21 +272,22 @@ class ZarrStoreElementIter(StoreElementIter):


  @dataclass
- class ZarrStoreElementIter(StoreElementIter):
+ class ZarrStoreElementIter(StoreElementIter[ListAny, ZarrAttrs]):
  """
  Represents an element iteration in a Zarr persistent store.
  """

- def encode(self, attrs: Dict) -> List:
+ @override
+ def encode(self, attrs: ZarrAttrs) -> ListAny:
  """Prepare store element iteration data for the persistent store.

  This method mutates `attrs`.
  """
- iter_enc = [
+ return [
  self.id_,
  self.element_ID,
  int(self.EARs_initialised),
- [[k, v] for k, v in self.EAR_IDs.items()] if self.EAR_IDs else None,
+ [[ek, ev] for ek, ev in self.EAR_IDs.items()] if self.EAR_IDs else None,
  [
  [ensure_in(dk, attrs["parameter_paths"]), dv]
  for dk, dv in self.data_idx.items()
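Aside: the iteration encoder above replaces repeated keys (parameter paths, loop names) with integer indices into lists held in the array attributes, via `ensure_in`. The sketch below uses a hypothetical `ensure_index` helper to illustrate the index-or-append pattern this appears to rely on; the real `ensure_in` is defined elsewhere in the SDK:

    def ensure_index(item, container):
        """Return the index of `item` in `container`, appending it first if absent."""
        try:
            return container.index(item)
        except ValueError:
            container.append(item)
            return len(container) - 1

    attrs = {"parameter_paths": []}
    data_idx = {"inputs.p1": 7, "outputs.p2": 9}
    encoded = [[ensure_index(k, attrs["parameter_paths"]), v] for k, v in data_idx.items()]
    # encoded == [[0, 7], [1, 9]]; attrs["parameter_paths"] == ["inputs.p1", "outputs.p2"]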
@@ -213,11 +295,11 @@ class ZarrStoreElementIter(StoreElementIter):
  [ensure_in(i, attrs["schema_parameters"]) for i in self.schema_parameters],
  [[ensure_in(dk, attrs["loops"]), dv] for dk, dv in self.loop_idx.items()],
  ]
- return iter_enc

+ @override
  @classmethod
- def decode(cls, iter_dat: List, attrs: Dict) -> StoreElementIter:
- """Initialise a `StoreElementIter` from persistent element iteration data"""
+ def decode(cls, iter_dat: ListAny, attrs: ZarrAttrs) -> Self:
+ """Initialise a `ZarrStoreElementIter` from persistent element iteration data"""
  obj_dat = {
  "id_": iter_dat[0],
  "element_ID": iter_dat[1],
@@ -231,17 +313,18 @@ class ZarrStoreElementIter(StoreElementIter):


  @dataclass
- class ZarrStoreEAR(StoreEAR):
+ class ZarrStoreEAR(StoreEAR[ListAny, ZarrAttrs]):
  """
  Represents an element action run in a Zarr persistent store.
  """

- def encode(self, attrs: Dict, ts_fmt: str) -> Tuple[List, Tuple[np.datetime64]]:
+ @override
+ def encode(self, ts_fmt: str, attrs: ZarrAttrs) -> ListAny:
  """Prepare store EAR data for the persistent store.

  This method mutates `attrs`.
  """
- EAR_enc = [
+ return [
  self.id_,
  self.elem_iter_ID,
  self.action_idx,
@@ -260,11 +343,13 @@ class ZarrStoreEAR(StoreEAR):
  self.metadata,
  self.run_hostname,
  self.commands_idx,
+ self.port_number,
+ self.commands_file_ID,
  ]
- return EAR_enc

+ @override
  @classmethod
- def decode(cls, EAR_dat: List, attrs: Dict, ts_fmt: str) -> ZarrStoreEAR:
+ def decode(cls, EAR_dat: ListAny, ts_fmt: str, attrs: ZarrAttrs) -> Self:
  """Initialise a `ZarrStoreEAR` from persistent EAR data"""
  obj_dat = {
  "id_": EAR_dat[0],
@@ -282,55 +367,44 @@ class ZarrStoreEAR(StoreEAR):
  "metadata": EAR_dat[12],
  "run_hostname": EAR_dat[13],
  "commands_idx": EAR_dat[14],
+ "port_number": EAR_dat[15],
+ "commands_file_ID": EAR_dat[16],
  }
  return cls(is_pending=False, **obj_dat)


  @dataclass
+ @hydrate
  class ZarrStoreParameter(StoreParameter):
  """
  Represents a parameter in a Zarr persistent store.
  """

- _encoders = { # keys are types
+ _encoders: ClassVar[dict[type, Callable]] = { # keys are types
  np.ndarray: _encode_numpy_array,
- np.ma.core.MaskedArray: _encode_masked_array,
+ MaskedArray: _encode_masked_array,
  }
- _decoders = { # keys are keys in type_lookup
+ _decoders: ClassVar[dict[str, Callable]] = { # keys are keys in type_lookup
  "arrays": _decode_numpy_arrays,
  "masked_arrays": _decode_masked_arrays,
  }

- def encode(self, root_group: zarr.Group, arr_path: str) -> Dict[str, Any]:
- return super().encode(root_group=root_group, arr_path=arr_path)
-
- @classmethod
- def decode(
- cls,
- id_: int,
- data: Union[None, Dict],
- source: Dict,
- arr_group: zarr.Group,
- path: Optional[List[str]] = None,
- dataset_copy: bool = False,
- ) -> Any:
- return super().decode(
- id_=id_,
- data=data,
- source=source,
- path=path,
- arr_group=arr_group,
- dataset_copy=dataset_copy,
- )
-

- class ZarrPersistentStore(PersistentStore):
+ class ZarrPersistentStore(
+ PersistentStore[
+ ZarrStoreTask,
+ ZarrStoreElement,
+ ZarrStoreElementIter,
+ ZarrStoreEAR,
+ ZarrStoreParameter,
+ ]
+ ):
  """
  A persistent store implemented using Zarr.
  """

- _name = "zarr"
- _features = PersistentStoreFeatures(
+ _name: ClassVar[str] = "zarr"
+ _features: ClassVar[PersistentStoreFeatures] = PersistentStoreFeatures(
  create=True,
  edit=True,
  jobscript_parallelism=True,
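Aside: `ZarrPersistentStore` now parameterises the generic `PersistentStore[...]` base with its concrete record classes. The standalone sketch below (not hpcflow's actual base class) shows the general `Generic`/`TypeVar` pattern this relies on, where pinning the type parameter in a subclass gives typed results from base-class methods:

    from __future__ import annotations
    from dataclasses import dataclass
    from typing import Generic, TypeVar

    @dataclass
    class TaskRecord:
        id_: int

    T = TypeVar("T", bound=TaskRecord)

    class Store(Generic[T]):
        """Base store; subclasses pin T so lookups are typed as the concrete record."""
        def __init__(self) -> None:
            self._records: dict[int, T] = {}

        def add(self, rec: T) -> None:
            self._records[rec.id_] = rec

        def get(self, id_: int) -> T:
            return self._records[id_]

    @dataclass
    class ZarrTaskRecord(TaskRecord):
        chunk: int = 0

    class ZarrStore(Store[ZarrTaskRecord]):
        pass

    store = ZarrStore()
    store.add(ZarrTaskRecord(id_=1, chunk=3))
    rec = store.get(1)  # statically known to be a ZarrTaskRecord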
@@ -339,39 +413,82 @@ class ZarrPersistentStore(PersistentStore):
  submission=True,
  )

- _store_task_cls = ZarrStoreTask
- _store_elem_cls = ZarrStoreElement
- _store_iter_cls = ZarrStoreElementIter
- _store_EAR_cls = ZarrStoreEAR
- _store_param_cls = ZarrStoreParameter
-
- _param_grp_name = "parameters"
- _param_base_arr_name = "base"
- _param_sources_arr_name = "sources"
- _param_user_arr_grp_name = "arrays"
- _param_data_arr_grp_name = lambda _, param_idx: f"param_{param_idx}"
- _task_arr_name = "tasks"
- _elem_arr_name = "elements"
- _iter_arr_name = "iters"
- _EAR_arr_name = "runs"
- _time_res = "us" # microseconds; must not be smaller than micro!
-
- _res_map = CommitResourceMap(commit_template_components=("attrs",))
-
- def __init__(self, app, workflow, path, fs) -> None:
+ @classmethod
+ def _store_task_cls(cls) -> type[ZarrStoreTask]:
+ return ZarrStoreTask
+
+ @classmethod
+ def _store_elem_cls(cls) -> type[ZarrStoreElement]:
+ return ZarrStoreElement
+
+ @classmethod
+ def _store_iter_cls(cls) -> type[ZarrStoreElementIter]:
+ return ZarrStoreElementIter
+
+ @classmethod
+ def _store_EAR_cls(cls) -> type[ZarrStoreEAR]:
+ return ZarrStoreEAR
+
+ @classmethod
+ def _store_param_cls(cls) -> type[ZarrStoreParameter]:
+ return ZarrStoreParameter
+
+ _param_grp_name: ClassVar[str] = "parameters"
+ _param_base_arr_name: ClassVar[str] = "base"
+ _param_sources_arr_name: ClassVar[str] = "sources"
+ _param_user_arr_grp_name: ClassVar[str] = "arrays"
+ _param_data_arr_grp_name: ClassVar = lambda _, param_idx: f"param_{param_idx}"
+ _subs_md_group_name: ClassVar[str] = "submissions"
+ _task_arr_name: ClassVar[str] = "tasks"
+ _elem_arr_name: ClassVar[str] = "elements"
+ _iter_arr_name: ClassVar[str] = "iters"
+ _EAR_arr_name: ClassVar[str] = "runs"
+ _run_dir_arr_name: ClassVar[str] = "run_dirs"
+ _js_at_submit_md_arr_name: ClassVar[str] = "js_at_submit_md"
+ _js_run_IDs_arr_name: ClassVar[str] = "js_run_IDs"
+ _js_task_elems_arr_name: ClassVar[str] = "js_task_elems"
+ _js_task_acts_arr_name: ClassVar[str] = "js_task_acts"
+ _js_deps_arr_name: ClassVar[str] = "js_deps"
+ _time_res: ClassVar[str] = "us" # microseconds; must not be smaller than micro!
+
+ _res_map: ClassVar[CommitResourceMap] = CommitResourceMap(
+ commit_template_components=("attrs",)
+ )
+
+ def __init__(self, app, workflow, path: str | Path, fs: AbstractFileSystem) -> None:
  self._zarr_store = None # assigned on first access to `zarr_store`
  self._resources = {
  "attrs": ZarrAttrsStoreResource(
  app, name="attrs", open_call=self._get_root_group
  ),
  }
+ self._jobscript_at_submit_metadata: dict[
+ int, dict[str, Any]
+ ] = {} # this is a cache
+
+ # these are caches; keys are submission index and then tuples of
+ # (jobscript index, jobscript-block index):
+ self._jobscript_run_ID_arrays: dict[int, dict[tuple[int, int], NDArray]] = {}
+ self._jobscript_task_element_maps: dict[
+ int, dict[tuple[int, int], dict[int, list[int]]]
+ ] = {}
+ self._jobscript_task_actions_arrays: dict[
+ int, dict[tuple[int, int], NDArray]
+ ] = {}
+ self._jobscript_dependencies: dict[
+ int,
+ dict[
+ tuple[int, int], dict[tuple[int, int], ResolvedJobscriptBlockDependencies]
+ ],
+ ] = {}
+
  super().__init__(app, workflow, path, fs)

  @contextmanager
- def cached_load(self) -> Iterator[Dict]:
+ def cached_load(self) -> Iterator[None]:
  """Context manager to cache the root attributes."""
  with self.using_resource("attrs", "read") as attrs:
- yield attrs
+ yield

  def remove_replaced_dir(self) -> None:
  """
@@ -380,8 +497,8 @@ class ZarrPersistentStore(PersistentStore):
  with self.using_resource("attrs", "update") as md:
  if "replaced_workflow" in md:
  self.logger.debug("removing temporarily renamed pre-existing workflow.")
- self.remove_path(md["replaced_workflow"], self.fs)
- md["replaced_workflow"] = None
+ self.remove_path(md["replaced_workflow"])
+ del md["replaced_workflow"]

  def reinstate_replaced_dir(self) -> None:
  """
@@ -392,32 +509,38 @@ class ZarrPersistentStore(PersistentStore):
  self.logger.debug(
  "reinstating temporarily renamed pre-existing workflow."
  )
- self.rename_path(md["replaced_workflow"], self.path, self.fs)
+ self.rename_path(
+ md["replaced_workflow"],
+ self.path,
+ )

  @staticmethod
- def _get_zarr_store(path: str, fs) -> zarr.storage.Store:
- return zarr.storage.FSStore(url=path, fs=fs)
+ def _get_zarr_store(path: str | Path, fs: AbstractFileSystem) -> Store:
+ return FSStore(url=str(path), fs=fs)
+
+ _CODEC: ClassVar = MsgPack()

  @classmethod
  def write_empty_workflow(
  cls,
- app,
- template_js: Dict,
- template_components_js: Dict,
+ app: BaseApp,
+ *,
+ template_js: TemplateMeta,
+ template_components_js: dict[str, Any],
  wk_path: str,
- fs,
+ fs: AbstractFileSystem,
  name: str,
- replaced_wk: str,
+ replaced_wk: str | None,
  ts_fmt: str,
  ts_name_fmt: str,
- creation_info: Dict,
- compressor: Optional[Union[str, None]] = "blosc",
- compressor_kwargs: Optional[Dict[str, Any]] = None,
+ creation_info: StoreCreationInfo,
+ compressor: str | None = "blosc",
+ compressor_kwargs: dict[str, Any] | None = None,
  ) -> None:
  """
  Write an empty persistent workflow.
  """
- attrs = {
+ attrs: ZarrAttrsDict = {
  "name": name,
  "ts_fmt": ts_fmt,
  "ts_name_fmt": ts_name_fmt,
@@ -436,7 +559,11 @@ class ZarrPersistentStore(PersistentStore):
  root = zarr.group(store=store, overwrite=False)
  root.attrs.update(attrs)

- md = root.create_group("metadata")
+ # use a nested directory store for the metadata group so the runs array
+ # can be stored as a 2D array in nested directories, thereby limiting the maximum
+ # number of files stored in a given directory:
+ md_store = zarr.NestedDirectoryStore(Path(root.store.path).joinpath("metadata"))
+ md = zarr.group(store=md_store)

  compressor_lookup = {
  "blosc": Blosc,
@@ -459,7 +586,7 @@ class ZarrPersistentStore(PersistentStore):
  name=cls._elem_arr_name,
  shape=0,
  dtype=object,
- object_codec=MsgPack(),
+ object_codec=cls._CODEC,
  chunks=1000,
  compressor=cmp,
  )
@@ -469,7 +596,7 @@ class ZarrPersistentStore(PersistentStore):
  name=cls._iter_arr_name,
  shape=0,
  dtype=object,
- object_codec=MsgPack(),
+ object_codec=cls._CODEC,
  chunks=1000,
  compressor=cmp,
  )
@@ -483,20 +610,31 @@ class ZarrPersistentStore(PersistentStore):

  EARs_arr = md.create_dataset(
  name=cls._EAR_arr_name,
- shape=0,
+ shape=(0, 1000),
  dtype=object,
- object_codec=MsgPack(),
+ object_codec=cls._CODEC,
  chunks=1, # single-chunk rows for multiprocess writing
  compressor=cmp,
+ dimension_separator="/",
+ )
+ EARs_arr.attrs.update({"parameter_paths": [], "num_runs": 0})
+
+ # array for storing indices that can be used to reproduce run directory paths:
+ run_dir_arr = md.create_dataset(
+ name=cls._run_dir_arr_name,
+ shape=0,
+ chunks=10_000,
+ dtype=RUN_DIR_ARR_DTYPE,
+ fill_value=RUN_DIR_ARR_FILL,
+ write_empty_chunks=False,
  )
- EARs_arr.attrs.update({"parameter_paths": []})

  parameter_data = root.create_group(name=cls._param_grp_name)
  parameter_data.create_dataset(
  name=cls._param_base_arr_name,
  shape=0,
  dtype=object,
- object_codec=MsgPack(),
+ object_codec=cls._CODEC,
  chunks=1,
  compressor=cmp,
  write_empty_chunks=False,
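Aside: with the runs ("EARs") array now 2D (1000 columns above), flat run IDs must be mapped to (row, column) positions; later hunks do this with `get_2D_idx` from `hpcflow.sdk.utils.arrays`. That helper's body is not shown here, but its usage is consistent with a row-major divmod, sketched below with a hypothetical stand-in:

    import numpy as np

    def to_2d_idx(flat_idx, num_cols):
        # row-major mapping of a flat run ID onto (row, column)
        return np.divmod(flat_idx, num_cols)

    run_ids = np.arange(998, 1003)
    rows, cols = to_2d_idx(run_ids, num_cols=1000)
    # rows -> [0 0 1 1 1], cols -> [998 999 0 1 2]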
@@ -506,15 +644,18 @@ class ZarrPersistentStore(PersistentStore):
  name=cls._param_sources_arr_name,
  shape=0,
  dtype=object,
- object_codec=MsgPack(),
+ object_codec=cls._CODEC,
  chunks=1000, # TODO: check this is a sensible size with many parameters
  compressor=cmp,
  )
  parameter_data.create_group(name=cls._param_user_arr_grp_name)

- def _append_tasks(self, tasks: List[ZarrStoreTask]):
+ # for storing submission metadata that should not be stored in the root group:
+ md.create_group(name=cls._subs_md_group_name)
+
+ def _append_tasks(self, tasks: Iterable[ZarrStoreTask]):
  elem_IDs_arr = self._get_tasks_arr(mode="r+")
- elem_IDs = []
+ elem_IDs: list[int] = []
  with self.using_resource("attrs", "update") as attrs:
  for i_idx, i in enumerate(tasks):
  idx, wk_task_i, task_i = i.encode()
@@ -529,24 +670,350 @@ class ZarrPersistentStore(PersistentStore):
  # increasing IDs.
  append_items_to_ragged_array(arr=elem_IDs_arr, items=elem_IDs)

- def _append_loops(self, loops: Dict[int, Dict]):
+ def _append_loops(self, loops: dict[int, LoopDescriptor]):
  with self.using_resource("attrs", action="update") as attrs:
- for loop_idx, loop in loops.items():
+ for loop in loops.values():
  attrs["loops"].append(
  {
  "num_added_iterations": loop["num_added_iterations"],
  "iterable_parameters": loop["iterable_parameters"],
+ "output_parameters": loop["output_parameters"],
  "parents": loop["parents"],
  }
  )
  attrs["template"]["loops"].append(loop["loop_template"])

- def _append_submissions(self, subs: Dict[int, Dict]):
+ @staticmethod
+ def _extract_submission_run_IDs_array(
+ sub_js: Mapping[str, JSONed],
+ ) -> tuple[np.ndarray, list[list[list[int]]]]:
+ """For a JSON-like representation of a Submission object, remove and combine all
+ jobscript-block run ID lists into a single array with a fill value.
+
+ Notes
+ -----
+ This mutates `sub_js`, by setting `EAR_ID` jobscript-block keys to `None`.
+
+ Parameters
+ ----------
+ sub_js
+ JSON-like representation of a `Submission` object.
+
+ Returns
+ -------
+ combined_run_IDs
+ Integer Numpy array that contains a concatenation of all 2D run ID arrays
+ from each jobscript-block. Technically a "jagged"/"ragged" array that is made
+ square with a large fill value.
+ block_shapes
+ List of length equal to the number of jobscripts in the submission. Each
+ sub-list contains a list of shapes (as a two-item list:
+ `[num_actions, num_elements]`) of the constituent blocks of that jobscript.
+
+ """
+ arrs = []
+ max_acts, max_elems = 0, 0
+
+ # a list for each jobscript, containing shapes of run ID arrays in each block:
+ block_shapes = []
+ for js in cast("Sequence[Mapping[str, JSONed]]", sub_js["jobscripts"]):
+ block_shapes_js_i = []
+ for blk in cast("Sequence[MutableMapping[str, JSONed]]", js["blocks"]):
+ run_IDs_i = np.array(blk["EAR_ID"])
+ blk["EAR_ID"] = None # TODO: how to type?
+ block_shapes_js_i.append(list(run_IDs_i.shape))
+ if run_IDs_i.shape[0] > max_acts:
+ max_acts = run_IDs_i.shape[0]
+ if run_IDs_i.shape[1] > max_elems:
+ max_elems = run_IDs_i.shape[1]
+ arrs.append(run_IDs_i)
+ block_shapes.append(block_shapes_js_i)
+
+ combined_run_IDs = np.full(
+ (len(arrs), max_acts, max_elems),
+ dtype=np.uint32,
+ fill_value=np.iinfo(np.uint32).max,
+ )
+ for arr_idx, arr in enumerate(arrs):
+ combined_run_IDs[arr_idx][: arr.shape[0], : arr.shape[1]] = arr
+
+ return combined_run_IDs, block_shapes
+
+ @staticmethod
+ def _extract_submission_task_elements_array(
+ sub_js: Mapping[str, JSONed],
+ ) -> tuple[np.ndarray, list[list[list[int]]]]:
+ """For a JSON-like representation of a Submission object, remove and combine all
+ jobscript-block task-element mappings into a single array with a fill value.
+
+ Notes
+ -----
+ This mutates `sub_js`, by setting `task_elements` jobscript-block keys to `None`.
+
+ Parameters
+ ----------
+ sub_js
+ JSON-like representation of a `Submission` object.
+
+ Returns
+ -------
+ combined_task_elems
+ Integer Numpy array that contains a concatenation of each task-element,
+ mapping, where each mapping is expressed as a 2D array whose first column
+ corresponds to the keys of the mappings, and whose remaining columns
+ correspond to the values of the mappings. Technically a "jagged"/"ragged"
+ array that is made square with a large fill value.
+ block_shapes
+ List of length equal to the number of jobscripts in the submission. Each
+ sub-list contains a list of shapes (as a two-item list:
+ `[num_actions, num_elements]`) of the constituent blocks of that jobscript.
+
+ """
+ arrs = []
+ max_x, max_y = 0, 0
+
+ # a list for each jobscript, containing shapes of run ID arrays in each block:
+ block_shapes = []
+ for js in cast("Sequence[Mapping[str, JSONed]]", sub_js["jobscripts"]):
+ block_shapes_js_i = []
+ for blk in cast("Sequence[MutableMapping[str, JSONed]]", js["blocks"]):
+
+ task_elems_lst = []
+ for k, v in cast("Mapping[int, list[int]]", blk["task_elements"]).items():
+ task_elems_lst.append([k] + v)
+ task_elems_i = np.array(task_elems_lst)
+
+ block_shape_j = [task_elems_i.shape[1] - 1, task_elems_i.shape[0]]
+ block_shapes_js_i.append(block_shape_j)
+
+ blk["task_elements"] = None # TODO: how to type?
+ if task_elems_i.shape[1] > max_x:
+ max_x = task_elems_i.shape[1]
+ if task_elems_i.shape[0] > max_y:
+ max_y = task_elems_i.shape[0]
+ arrs.append(task_elems_i)
+ block_shapes.append(block_shapes_js_i)
+
+ combined_task_elems = np.full(
+ (len(arrs), max_y, max_x),
+ dtype=np.uint32,
+ fill_value=np.iinfo(np.uint32).max,
+ )
+ for arr_idx, arr in enumerate(arrs):
+ combined_task_elems[arr_idx][: arr.shape[0], : arr.shape[1]] = arr
+
+ return combined_task_elems, block_shapes
+
+ @staticmethod
+ def _extract_submission_task_actions_array(
+ sub_js: Mapping[str, JSONed],
+ ) -> tuple[np.ndarray, list[list[int]]]:
+ """For a JSON-like representation of a Submission object, remove and concatenate
+ all jobscript-block task-action arrays into a single array.
+
+ Notes
+ -----
+ This mutates `sub_js`, by setting `task_actions` jobscript-block keys to `None`.
+
+ Parameters
+ ----------
+ sub_js
+ JSON-like representation of a `Submission` object.
+
+ Returns
+ -------
+ combined_task_acts
+ Integer 2D Numpy array which is a concatenation along the first axis of
+ task-action actions from all jobscript blocks. The second dimension is of
+ length three.
+ block_num_acts
+ List of length equal to the number of jobscripts in the submission. Each
+ sub-list contains a list of `num_actions` of the constituent blocks of that
+ jobscript.
+
+ """
+ arrs = []
+
+ # a list for each jobscript, containing shapes of run ID arrays in each block:
+
+ blk_num_acts = []
+ for js in cast("Sequence[Mapping[str, JSONed]]", sub_js["jobscripts"]):
+
+ blk_num_acts_js_i = []
+ for blk in cast("Sequence[MutableMapping[str, JSONed]]", js["blocks"]):
+
+ blk_acts = np.array(blk["task_actions"])
+ blk["task_actions"] = None # TODO: how to type?
+ blk_num_acts_js_i.append(blk_acts.shape[0])
+ arrs.append(blk_acts)
+
+ blk_num_acts.append(blk_num_acts_js_i)
+
+ combined_task_acts = np.vstack(arrs)
+
+ return combined_task_acts, blk_num_acts
+
+ @staticmethod
+ def _encode_jobscript_block_dependencies(sub_js: Mapping[str, JSONed]) -> np.ndarray:
+ """For a JSON-like representation of a Submission object, remove jobscript-block
+ dependencies for all jobscripts and transform to a single 1D integer array, that
+ can be transformed back by `_decode_jobscript_block_dependencies`.
+
+ Notes
+ -----
+ This mutates `sub_js`, by setting `depdendencies` jobscript-block keys to `None`.
+ """
+
+ # TODO: avoid this horrible mess of casts
+
+ all_deps_arr = []
+ assert sub_js["jobscripts"] is not None
+ for js in cast("Sequence[Mapping[str, JSONed]]", sub_js["jobscripts"]):
+ for blk in cast("Sequence[MutableMapping[str, JSONed]]", js["blocks"]):
+ all_deps_i: list[int] = []
+ assert blk["dependencies"] is not None
+ blk_deps = cast(
+ "list[tuple[tuple[int, int], Mapping[str, JSONed]]]",
+ blk["dependencies"],
+ )
+ for (dep_js_idx, dep_blk_idx), dep in blk_deps:
+ deps_arr: list[int] = []
+ for elem_i, elements_j in cast(
+ "Mapping[int, Sequence[int]]", dep["js_element_mapping"]
+ ).items():
+ deps_arr.extend([len(elements_j) + 1, elem_i] + list(elements_j))
+ blk_arr = [
+ dep_js_idx,
+ dep_blk_idx,
+ int(cast("bool", dep["is_array"])),
+ ] + deps_arr
+ blk_arr = [len(blk_arr)] + blk_arr
+ all_deps_i.extend(blk_arr)
+ all_deps_i = [
+ cast("int", js["index"]),
+ cast("int", blk["index"]),
+ ] + all_deps_i
+ blk["dependencies"] = None # TODO: how to type?
+ all_deps_arr.extend([len(all_deps_i)] + all_deps_i)
+
+ return np.array(all_deps_arr)
+
+ @staticmethod
+ def _decode_jobscript_block_dependencies(
+ arr: np.ndarray,
+ ) -> dict[tuple[int, int], dict[tuple[int, int], ResolvedJobscriptBlockDependencies]]:
+ """Re-generate jobscript-block dependencies that have been transformed by
+ `_encode_jobscript_block_dependencies` into a single 1D integer array.
+
+ Parameters
+ ----------
+ arr:
+ The 1D integer array to transform back to a verbose jobscript-block dependency
+ mapping.
+ """
+ # metadata is js/blk_idx for which the dependencies are stored:
+ block_arrs = split_arr(arr, metadata_size=2)
+ block_deps = {}
+ for i in block_arrs:
+
+ js_idx: int
+ blk_idx: int
+ dep_js_idx: int
+ dep_blk_idx: int
+ is_array: int
+
+ js_idx, blk_idx = i[0]
+ # metadata is js/blk_idx that this block depends on, plus whether the
+ # dependency is an array dependency:
+ deps_arrs = split_arr(i[1], metadata_size=3)
+ all_deps_ij: dict[tuple[int, int], ResolvedJobscriptBlockDependencies] = {}
+ for j in deps_arrs:
+ dep_js_idx, dep_blk_idx, is_array = j[0]
+ # no metadata:
+ elem_deps = split_arr(j[1], metadata_size=0)
+ all_deps_ij[(dep_js_idx, dep_blk_idx)] = {
+ "js_element_mapping": {},
+ "is_array": bool(is_array),
+ }
+ for k in elem_deps:
+ all_deps_ij[(dep_js_idx, dep_blk_idx)]["js_element_mapping"].update(
+ {k[1][0]: list(k[1][1:])}
+ )
+
+ block_deps[(js_idx, blk_idx)] = all_deps_ij
+ return block_deps
+
+ def _append_submissions(self, subs: dict[int, Mapping[str, JSONed]]):
+
+ for sub_idx, sub_i in subs.items():
+
+ # add a new metadata group for this submission:
+ sub_grp = self._get_all_submissions_metadata_group(mode="r+").create_group(
+ sub_idx
+ )
+
+ # add a new at-submit metadata array for jobscripts of this submission:
+ num_js = len(cast("list", sub_i["jobscripts"]))
+ sub_grp.create_dataset(
+ name=self._js_at_submit_md_arr_name,
+ shape=num_js,
+ dtype=object,
+ object_codec=MsgPack(),
+ chunks=1,
+ write_empty_chunks=False,
+ )
+
+ # add a new array to store run IDs for each jobscript:
+ combined_run_IDs, block_shapes = self._extract_submission_run_IDs_array(sub_i)
+ run_IDs_arr = sub_grp.create_dataset(
+ name=self._js_run_IDs_arr_name,
+ data=combined_run_IDs,
+ chunks=(None, None, None), # single chunk for the whole array
+ )
+ run_IDs_arr.attrs["block_shapes"] = block_shapes
+
+ # add a new array to store task-element map for each jobscript:
+ (
+ combined_task_elems,
+ block_shapes,
+ ) = self._extract_submission_task_elements_array(sub_i)
+ task_elems_arr = sub_grp.create_dataset(
+ name=self._js_task_elems_arr_name,
+ data=combined_task_elems,
+ chunks=(None, None, None),
+ )
+ task_elems_arr.attrs["block_shapes"] = block_shapes
+
+ # add a new array to store task-actions for each jobscript:
+ (
+ combined_task_acts,
+ block_num_acts,
+ ) = self._extract_submission_task_actions_array(sub_i)
+ task_acts_arr = sub_grp.create_dataset(
+ name=self._js_task_acts_arr_name,
+ data=combined_task_acts,
+ chunks=(None, None),
+ )
+ task_acts_arr.attrs["block_num_acts"] = block_num_acts
+
+ # add a new array to store jobscript-block dependencies for this submission:
+ sub_grp.create_dataset(
+ name=self._js_deps_arr_name,
+ data=self._encode_jobscript_block_dependencies(sub_i),
+ chunks=(None,),
+ )
+
+ # TODO: store block shapes in `grp.attrs` since it is defined at the
+ # submission level
+
+ # add attributes for at-submit-time submission metadata:
+ grp = self._get_submission_metadata_group(sub_idx, mode="r+")
+ grp.attrs["submission_parts"] = {}
+
  with self.using_resource("attrs", action="update") as attrs:
- for sub_idx, sub_i in subs.items():
- attrs["submissions"].append(sub_i)
+ attrs["submissions"].extend(subs.values())

- def _append_task_element_IDs(self, task_ID: int, elem_IDs: List[int]):
+ def _append_task_element_IDs(self, task_ID: int, elem_IDs: list[int]):
  # I don't think there's a way to "append" to an existing array in a zarr ragged
  # array? So we have to build a new array from existing + new.
  arr = self._get_tasks_arr(mode="r+")
@@ -554,169 +1021,262 @@ class ZarrPersistentStore(PersistentStore):
554
1021
  elem_IDs_new = np.concatenate((elem_IDs_cur, elem_IDs))
555
1022
  arr[task_ID] = elem_IDs_new
556
1023
 
557
- def _append_elements(self, elems: List[ZarrStoreElement]):
558
- arr = self._get_elements_arr(mode="r+")
559
- attrs_orig = arr.attrs.asdict()
1024
+ @staticmethod
1025
+ def __as_dict(attrs: Attributes) -> ZarrAttrs:
1026
+ """
1027
+ Type thunk to work around incomplete typing in zarr.
1028
+ """
1029
+ return cast("ZarrAttrs", attrs.asdict())
1030
+
1031
+ @contextmanager
1032
+ def __mutate_attrs(self, arr: Array) -> Iterator[ZarrAttrs]:
1033
+ attrs_orig = self.__as_dict(arr.attrs)
560
1034
  attrs = copy.deepcopy(attrs_orig)
561
- arr_add = np.empty((len(elems)), dtype=object)
562
- arr_add[:] = [i.encode(attrs) for i in elems]
563
- arr.append(arr_add)
1035
+ yield attrs
564
1036
  if attrs != attrs_orig:
565
1037
  arr.attrs.put(attrs)
566
1038
 
567
- def _append_element_sets(self, task_id: int, es_js: List[Dict]):
1039
+ def _append_elements(self, elems: Sequence[ZarrStoreElement]):
1040
+ arr = self._get_elements_arr(mode="r+")
1041
+ with self.__mutate_attrs(arr) as attrs:
1042
+ arr_add = np.empty((len(elems)), dtype=object)
1043
+ arr_add[:] = [elem.encode(attrs) for elem in elems]
1044
+ arr.append(arr_add)
1045
+
1046
+ def _append_element_sets(self, task_id: int, es_js: Sequence[Mapping]):
568
1047
  task_idx = task_idx = self._get_task_id_to_idx_map()[task_id]
569
1048
  with self.using_resource("attrs", "update") as attrs:
570
1049
  attrs["template"]["tasks"][task_idx]["element_sets"].extend(es_js)
571
1050
 
572
- def _append_elem_iter_IDs(self, elem_ID: int, iter_IDs: List[int]):
1051
+ def _append_elem_iter_IDs(self, elem_ID: int, iter_IDs: Iterable[int]):
573
1052
  arr = self._get_elements_arr(mode="r+")
574
- attrs = arr.attrs.asdict()
575
- elem_dat = arr[elem_ID]
1053
+ attrs = self.__as_dict(arr.attrs)
1054
+ elem_dat = cast("list", arr[elem_ID])
576
1055
  store_elem = ZarrStoreElement.decode(elem_dat, attrs)
577
1056
  store_elem = store_elem.append_iteration_IDs(iter_IDs)
578
- arr[elem_ID] = store_elem.encode(
579
- attrs
580
- ) # attrs shouldn't be mutated (TODO: test!)
1057
+ arr[elem_ID] = store_elem.encode(attrs)
1058
+ # attrs shouldn't be mutated (TODO: test!)
581
1059
 
582
- def _append_elem_iters(self, iters: List[ZarrStoreElementIter]):
1060
+ def _append_elem_iters(self, iters: Sequence[ZarrStoreElementIter]):
583
1061
  arr = self._get_iters_arr(mode="r+")
584
- attrs_orig = arr.attrs.asdict()
585
- attrs = copy.deepcopy(attrs_orig)
586
- arr_add = np.empty((len(iters)), dtype=object)
587
- arr_add[:] = [i.encode(attrs) for i in iters]
588
- arr.append(arr_add)
589
- if attrs != attrs_orig:
590
- arr.attrs.put(attrs)
1062
+ with self.__mutate_attrs(arr) as attrs:
1063
+ arr_add = np.empty((len(iters)), dtype=object)
1064
+ arr_add[:] = [i.encode(attrs) for i in iters]
1065
+ arr.append(arr_add)
591
1066
 
592
- def _append_elem_iter_EAR_IDs(self, iter_ID: int, act_idx: int, EAR_IDs: List[int]):
1067
+ def _append_elem_iter_EAR_IDs(
1068
+ self, iter_ID: int, act_idx: int, EAR_IDs: Sequence[int]
1069
+ ):
593
1070
  arr = self._get_iters_arr(mode="r+")
594
- attrs = arr.attrs.asdict()
595
- iter_dat = arr[iter_ID]
1071
+ attrs = self.__as_dict(arr.attrs)
1072
+ iter_dat = cast("list", arr[iter_ID])
596
1073
  store_iter = ZarrStoreElementIter.decode(iter_dat, attrs)
597
1074
  store_iter = store_iter.append_EAR_IDs(pend_IDs={act_idx: EAR_IDs})
598
- arr[iter_ID] = store_iter.encode(
599
- attrs
600
- ) # attrs shouldn't be mutated (TODO: test!)
1075
+ arr[iter_ID] = store_iter.encode(attrs)
1076
+ # attrs shouldn't be mutated (TODO: test!)
601
1077
 
602
1078
  def _update_elem_iter_EARs_initialised(self, iter_ID: int):
603
1079
  arr = self._get_iters_arr(mode="r+")
604
- attrs = arr.attrs.asdict()
605
- iter_dat = arr[iter_ID]
1080
+ attrs = self.__as_dict(arr.attrs)
1081
+ iter_dat = cast("list", arr[iter_ID])
606
1082
  store_iter = ZarrStoreElementIter.decode(iter_dat, attrs)
607
1083
  store_iter = store_iter.set_EARs_initialised()
608
- arr[iter_ID] = store_iter.encode(
609
- attrs
610
- ) # attrs shouldn't be mutated (TODO: test!)
1084
+ arr[iter_ID] = store_iter.encode(attrs)
1085
+ # attrs shouldn't be mutated (TODO: test!)
611
1086
 
612
- def _append_submission_parts(self, sub_parts: Dict[int, Dict[str, List[int]]]):
613
- with self.using_resource("attrs", action="update") as attrs:
614
- for sub_idx, sub_i_parts in sub_parts.items():
615
- for dt_str, parts_j in sub_i_parts.items():
616
- attrs["submissions"][sub_idx]["submission_parts"][dt_str] = parts_j
1087
+ def _update_at_submit_metadata(
1088
+ self,
1089
+ at_submit_metadata: dict[int, dict[str, Any]],
1090
+ ):
1091
+ for sub_idx, metadata_i in at_submit_metadata.items():
1092
+ grp = self._get_submission_metadata_group(sub_idx, mode="r+")
1093
+ attrs = self.__as_dict(grp.attrs)
1094
+ attrs["submission_parts"].update(metadata_i["submission_parts"])
1095
+ grp.attrs.put(attrs)
1096
+
1097
+ def _update_loop_index(self, loop_indices: dict[int, dict[str, int]]):
617
1098
 
618
- def _update_loop_index(self, iter_ID: int, loop_idx: Dict):
619
1099
  arr = self._get_iters_arr(mode="r+")
620
- attrs = arr.attrs.asdict()
621
- iter_dat = arr[iter_ID]
622
- store_iter = ZarrStoreElementIter.decode(iter_dat, attrs)
623
- store_iter = store_iter.update_loop_idx(loop_idx)
624
- arr[iter_ID] = store_iter.encode(attrs)
1100
+ attrs = self.__as_dict(arr.attrs)
1101
+ iter_IDs = list(loop_indices.keys())
1102
+ iter_dat = arr.get_coordinate_selection(iter_IDs)
1103
+ store_iters = [ZarrStoreElementIter.decode(i, attrs) for i in iter_dat]
625
1104
 
626
- def _update_loop_num_iters(self, index: int, num_iters: int):
1105
+ for idx, iter_ID_i in enumerate(iter_IDs):
1106
+ new_iter_i = store_iters[idx].update_loop_idx(loop_indices[iter_ID_i])
1107
+ # seems to be a Zarr bug that prevents `set_coordinate_selection` with an
1108
+ # object array, so set one-by-one:
1109
+ arr[iter_ID_i] = new_iter_i.encode(attrs)
1110
+
1111
+ def _update_loop_num_iters(self, index: int, num_iters: list[list[list[int] | int]]):
627
1112
  with self.using_resource("attrs", action="update") as attrs:
628
1113
  attrs["loops"][index]["num_added_iterations"] = num_iters
629
1114
 
630
- def _update_loop_parents(self, index: int, parents: List[str]):
1115
+ def _update_loop_parents(self, index: int, parents: list[str]):
631
1116
  with self.using_resource("attrs", action="update") as attrs:
632
1117
  attrs["loops"][index]["parents"] = parents
633
1118
 
634
- def _append_EARs(self, EARs: List[ZarrStoreEAR]):
635
- arr = self._get_EARs_arr(mode="r+")
636
- attrs_orig = arr.attrs.asdict()
637
- attrs = copy.deepcopy(attrs_orig)
638
- arr_add = np.empty((len(EARs)), dtype=object)
639
- arr_add[:] = [i.encode(attrs, self.ts_fmt) for i in EARs]
640
- arr.append(arr_add)
1119
+ def _update_iter_data_indices(self, iter_data_indices: dict[int, DataIndex]):
641
1120
 
642
- if attrs != attrs_orig:
643
- arr.attrs.put(attrs)
1121
+ arr = self._get_iters_arr(mode="r+")
1122
+ attrs = self.__as_dict(arr.attrs)
1123
+ iter_IDs = list(iter_data_indices.keys())
1124
+ iter_dat = arr.get_coordinate_selection(iter_IDs)
1125
+ store_iters = [ZarrStoreElementIter.decode(i, attrs) for i in iter_dat]
644
1126
 
645
- @TimeIt.decorator
646
- def _update_EAR_submission_indices(self, sub_indices: Dict[int:int]):
647
- EAR_IDs = list(sub_indices.keys())
648
- EARs = self._get_persistent_EARs(EAR_IDs)
1127
+ for idx, iter_ID_i in enumerate(iter_IDs):
1128
+ new_iter_i = store_iters[idx].update_data_idx(iter_data_indices[iter_ID_i])
1129
+ # seems to be a Zarr bug that prevents `set_coordinate_selection` with an
1130
+ # object array, so set one-by-one:
1131
+ arr[iter_ID_i] = new_iter_i.encode(attrs)
1132
+
1133
+ def _update_run_data_indices(self, run_data_indices: dict[int, DataIndex]):
1134
+ self._update_runs(
1135
+ updates={k: {"data_idx": v} for k, v in run_data_indices.items()}
1136
+ )
649
1137
 
1138
+ def _append_EARs(self, EARs: Sequence[ZarrStoreEAR]):
650
1139
  arr = self._get_EARs_arr(mode="r+")
651
- attrs_orig = arr.attrs.asdict()
652
- attrs = copy.deepcopy(attrs_orig)
1140
+ with self.__mutate_attrs(arr) as attrs:
1141
+ num_existing = attrs["num_runs"]
1142
+ num_add = len(EARs)
1143
+ num_tot = num_existing + num_add
1144
+ arr_add = np.empty(num_add, dtype=object)
1145
+ arr_add[:] = [i.encode(self.ts_fmt, attrs) for i in EARs]
653
1146
 
654
- encoded_EARs = []
655
- for EAR_ID_i, sub_idx_i in sub_indices.items():
656
- new_EAR_i = EARs[EAR_ID_i].update(submission_idx=sub_idx_i)
657
- # seems to be a Zarr bug that prevents `set_coordinate_selection` with an
658
- # object array, so set one-by-one:
659
- arr[EAR_ID_i] = new_EAR_i.encode(attrs, self.ts_fmt)
1147
+ # get new 1D indices:
1148
+ new_idx: NDArray = np.arange(num_existing, num_tot)
660
1149
 
661
- if attrs != attrs_orig:
662
- arr.attrs.put(attrs)
1150
+ # transform to 2D indices:
1151
+ r_idx, c_idx = get_2D_idx(new_idx, num_cols=arr.shape[1])
1152
+
1153
+ # add rows to accommodate new runs:
1154
+ max_r_idx = np.max(r_idx)
1155
+ if max_r_idx + 1 > arr.shape[0]:
1156
+ arr.resize(max_r_idx + 1, arr.shape[1])
1157
+
1158
+ # fill in new data:
1159
+ for arr_add_idx_i, (r_idx_i, c_idx_i) in enumerate(zip(r_idx, c_idx)):
1160
+ # seems to be a Zarr bug that prevents `set_coordinate_selection` with an
1161
+ # object array, so set one-by-one:
1162
+ arr[r_idx_i, c_idx_i] = arr_add[arr_add_idx_i]
1163
+
1164
+ attrs["num_runs"] = num_tot
1165
+
1166
+ # add more rows to run dirs array:
1167
+ dirs_arr = self._get_dirs_arr(mode="r+")
1168
+ dirs_arr.resize(num_tot)
1169
+
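Note on the appended-runs indexing above: the runs array is now two-dimensional and chunked by row, so new runs must be mapped from their flat IDs to (row, column) positions before being written. A minimal sketch of that mapping, assuming `get_2D_idx` is essentially a divmod over the column count (the helper itself is not shown in this hunk):

    import numpy as np
    from numpy.typing import NDArray

    def get_2D_idx_sketch(idx: NDArray, num_cols: int) -> tuple[NDArray, NDArray]:
        # hypothetical stand-in for `get_2D_idx`: flat run ID -> (row, column)
        return np.divmod(idx, num_cols)

    # appending runs 1100..1104 to an array with 1000 columns:
    new_idx = np.arange(1100, 1105)
    r_idx, c_idx = get_2D_idx_sketch(new_idx, num_cols=1000)
    assert r_idx.tolist() == [1, 1, 1, 1, 1]
    assert c_idx.tolist() == [100, 101, 102, 103, 104]
    # the array is then resized to `max(r_idx) + 1` rows before the one-by-one writes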
1170
+ def _set_run_dirs(self, run_dir_arr: np.ndarray, run_idx: np.ndarray):
1171
+ dirs_arr = self._get_dirs_arr(mode="r+")
1172
+ dirs_arr[run_idx] = run_dir_arr
1173
+
1174
+ @TimeIt.decorator
1175
+ def _update_runs(self, updates: dict[int, dict[str, Any]]):
1176
+ """Update the provided EAR attribute values in the specified existing runs."""
1177
+ run_IDs = list(updates.keys())
1178
+ runs = self._get_persistent_EARs(run_IDs)
663
1179
 
664
- def _update_EAR_start(self, EAR_id: int, s_time: datetime, s_snap: Dict, s_hn: str):
665
1180
  arr = self._get_EARs_arr(mode="r+")
666
- attrs_orig = arr.attrs.asdict()
667
- attrs = copy.deepcopy(attrs_orig)
1181
+ with self.__mutate_attrs(arr) as attrs:
1182
+ # convert to 2D array indices:
1183
+ r_idx, c_idx = get_2D_idx(
1184
+ np.array(list(updates.keys())), num_cols=arr.shape[1]
1185
+ )
1186
+ for ri, ci, rID_i, upd_i in zip(
1187
+ r_idx, c_idx, updates.keys(), updates.values()
1188
+ ):
1189
+ new_run_i = runs[rID_i].update(**upd_i)
1190
+ # seems to be a Zarr bug that prevents `set_coordinate_selection` with an
1191
+ # object array, so set one-by-one:
1192
+ arr[ri, ci] = new_run_i.encode(self.ts_fmt, attrs)
668
1193
 
669
- EAR_i = self._get_persistent_EARs([EAR_id])[EAR_id]
670
- EAR_i = EAR_i.update(
671
- start_time=s_time,
672
- snapshot_start=s_snap,
673
- run_hostname=s_hn,
1194
+ @TimeIt.decorator
1195
+ def _update_EAR_submission_data(self, sub_data: Mapping[int, tuple[int, int | None]]):
1196
+ self._update_runs(
1197
+ updates={
1198
+ k: {"submission_idx": v[0], "commands_file_ID": v[1]}
1199
+ for k, v in sub_data.items()
1200
+ }
674
1201
  )
675
- arr[EAR_id] = EAR_i.encode(attrs, self.ts_fmt)
676
1202
 
677
- if attrs != attrs_orig:
678
- arr.attrs.put(attrs)
1203
+ def _update_EAR_start(
1204
+ self,
1205
+ run_starts: dict[int, tuple[datetime, dict[str, Any] | None, str, int | None]],
1206
+ ):
1207
+ self._update_runs(
1208
+ updates={
1209
+ k: {
1210
+ "start_time": v[0],
1211
+ "snapshot_start": v[1],
1212
+ "run_hostname": v[2],
1213
+ "port_number": v[3],
1214
+ }
1215
+ for k, v in run_starts.items()
1216
+ }
1217
+ )
679
1218
 
680
1219
  def _update_EAR_end(
681
- self, EAR_id: int, e_time: datetime, e_snap: Dict, ext_code: int, success: bool
1220
+ self, run_ends: dict[int, tuple[datetime, dict[str, Any] | None, int, bool]]
682
1221
  ):
683
- arr = self._get_EARs_arr(mode="r+")
684
- attrs_orig = arr.attrs.asdict()
685
- attrs = copy.deepcopy(attrs_orig)
686
-
687
- EAR_i = self._get_persistent_EARs([EAR_id])[EAR_id]
688
- EAR_i = EAR_i.update(
689
- end_time=e_time,
690
- snapshot_end=e_snap,
691
- exit_code=ext_code,
692
- success=success,
1222
+ self._update_runs(
1223
+ updates={
1224
+ k: {
1225
+ "end_time": v[0],
1226
+ "snapshot_end": v[1],
1227
+ "exit_code": v[2],
1228
+ "success": v[3],
1229
+ }
1230
+ for k, v in run_ends.items()
1231
+ }
693
1232
  )
694
- arr[EAR_id] = EAR_i.encode(attrs, self.ts_fmt)
695
1233
 
696
- if attrs != attrs_orig:
697
- arr.attrs.put(attrs)
1234
+ def _update_EAR_skip(self, skips: dict[int, int]):
1235
+ self._update_runs(updates={k: {"skip": v} for k, v in skips.items()})
698
1236
 
699
- def _update_EAR_skip(self, EAR_id: int):
700
- arr = self._get_EARs_arr(mode="r+")
701
- attrs_orig = arr.attrs.asdict()
702
- attrs = copy.deepcopy(attrs_orig)
1237
+ def _update_js_metadata(self, js_meta: dict[int, dict[int, dict[str, Any]]]):
703
1238
 
704
- EAR_i = self._get_persistent_EARs([EAR_id])[EAR_id]
705
- EAR_i = EAR_i.update(skip=True)
706
- arr[EAR_id] = EAR_i.encode(attrs, self.ts_fmt)
1239
+ arr_keys = JOBSCRIPT_SUBMIT_TIME_KEYS # these items go to the Zarr array
707
1240
 
708
- if attrs != attrs_orig:
709
- arr.attrs.put(attrs)
1241
+ # split into attributes to save to the root group metadata, and those to save to
1242
+ # the submit-time jobscript metadata array
710
1243
 
711
- def _update_js_metadata(self, js_meta: Dict):
712
- with self.using_resource("attrs", action="update") as attrs:
713
- for sub_idx, all_js_md in js_meta.items():
714
- for js_idx, js_meta_i in all_js_md.items():
715
- attrs["submissions"][sub_idx]["jobscripts"][js_idx].update(
716
- **js_meta_i
1244
+ grp_dat = {} # keys are tuples of (sub_idx, js_idx), values are metadata dicts
1245
+
1246
+ for sub_idx, all_js_md in js_meta.items():
1247
+ js_arr = None
1248
+ for js_idx, js_meta_i in all_js_md.items():
1249
+
1250
+ grp_dat_i = {k: v for k, v in js_meta_i.items() if k not in arr_keys}
1251
+ if grp_dat_i:
1252
+ grp_dat[(sub_idx, js_idx)] = grp_dat_i
1253
+ arr_dat = [js_meta_i.get(k) for k in arr_keys]
1254
+
1255
+ if any(arr_dat):
1256
+ # we are updating the at-submit metadata, so clear the cache:
1257
+ self.clear_jobscript_at_submit_metadata_cache()
1258
+
1259
+ js_arr = js_arr or self._get_jobscripts_at_submit_metadata_arr(
1260
+ mode="r+", sub_idx=sub_idx
717
1261
  )
1262
+ self.logger.info(
1263
+ f"updating submit-time jobscript metadata array: {arr_dat!r}."
1264
+ )
1265
+ js_arr[js_idx] = arr_dat
1266
+
1267
+ if grp_dat:
1268
+ with self.using_resource("attrs", action="update") as attrs:
1269
+ for (sub_idx, js_idx), js_meta_i in grp_dat.items():
1270
+ self.logger.info(
1271
+ f"updating jobscript metadata in the root group for "
1272
+ f"(sub={sub_idx}, js={js_idx}): {js_meta_i!r}."
1273
+ )
1274
+ sub = cast(
1275
+ "dict[str, list[dict[str, Any]]]", attrs["submissions"][sub_idx]
1276
+ )
1277
+ sub["jobscripts"][js_idx].update(js_meta_i)
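The rewritten `_update_js_metadata` routes each jobscript's metadata to one of two places: keys listed in `JOBSCRIPT_SUBMIT_TIME_KEYS` go to the per-submission submit-time array, everything else to the root-group attributes. A small illustration of the split, using a hypothetical key tuple in place of the real constant:

    from typing import Any

    SUBMIT_TIME_KEYS = ("submit_time", "scheduler_job_ID", "process_ID")  # hypothetical

    def split_js_metadata(js_meta_i: dict[str, Any]) -> tuple[dict[str, Any], list[Any]]:
        # keys not in the submit-time set are stored as root-group attributes
        grp_dat_i = {k: v for k, v in js_meta_i.items() if k not in SUBMIT_TIME_KEYS}
        # submit-time keys become one row of the jobscript metadata array
        arr_dat = [js_meta_i.get(k) for k in SUBMIT_TIME_KEYS]
        return grp_dat_i, arr_dat

    grp, row = split_js_metadata({"submit_time": "2025-02-01T12:00:00", "version_info": {}})
    assert grp == {"version_info": {}}
    assert row == ["2025-02-01T12:00:00", None, None]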
718
1278
 
719
- def _append_parameters(self, params: List[ZarrStoreParameter]):
1279
+ def _append_parameters(self, params: Sequence[StoreParameter]):
720
1280
  """Add new persistent parameters."""
721
1281
  base_arr = self._get_parameter_base_array(mode="r+", write_empty_chunks=False)
722
1282
  src_arr = self._get_parameter_sources_array(mode="r+")
@@ -725,8 +1285,8 @@ class ZarrPersistentStore(PersistentStore):
725
1285
  )
726
1286
 
727
1287
  param_encode_root_group = self._get_parameter_user_array_group(mode="r+")
728
- param_enc = []
729
- src_enc = []
1288
+ param_enc: list[dict[str, Any] | int] = []
1289
+ src_enc: list[dict] = []
730
1290
  for param_i in params:
731
1291
  dat_i = param_i.encode(
732
1292
  root_group=param_encode_root_group,
@@ -741,16 +1301,15 @@ class ZarrPersistentStore(PersistentStore):
741
1301
  f"PersistentStore._append_parameters: finished adding {len(params)} parameters."
742
1302
  )
743
1303
 
744
- def _set_parameter_values(self, set_parameters: Dict[int, Tuple[Any, bool]]):
1304
+ def _set_parameter_values(self, set_parameters: dict[int, tuple[Any, bool]]):
745
1305
  """Set multiple unset persistent parameters."""
746
1306
 
747
- param_ids = list(set_parameters.keys())
1307
+ param_ids = list(set_parameters)
748
1308
  # the `decode` call in `_get_persistent_parameters` should be quick:
749
1309
  params = self._get_persistent_parameters(param_ids)
750
- new_data = []
1310
+ new_data: list[dict[str, Any] | int] = []
751
1311
  param_encode_root_group = self._get_parameter_user_array_group(mode="r+")
752
1312
  for param_id, (value, is_file) in set_parameters.items():
753
-
754
1313
  param_i = params[param_id]
755
1314
  if is_file:
756
1315
  param_i = param_i.set_file(value)
@@ -768,19 +1327,19 @@ class ZarrPersistentStore(PersistentStore):
768
1327
  base_arr = self._get_parameter_base_array(mode="r+")
769
1328
  base_arr.set_coordinate_selection(param_ids, new_data)
770
1329
 
771
- def _update_parameter_sources(self, sources: Dict[int, Dict]):
1330
+ def _update_parameter_sources(self, sources: Mapping[int, ParamSource]):
772
1331
  """Update the sources of multiple persistent parameters."""
773
1332
 
774
- param_ids = list(sources.keys())
1333
+ param_ids = list(sources)
775
1334
  src_arr = self._get_parameter_sources_array(mode="r+")
776
1335
  existing_sources = src_arr.get_coordinate_selection(param_ids)
777
- new_sources = []
778
- for idx, source_i in enumerate(sources.values()):
779
- new_src_i = update_param_source_dict(existing_sources[idx], source_i)
780
- new_sources.append(new_src_i)
1336
+ new_sources = [
1337
+ update_param_source_dict(cast("ParamSource", existing_sources[idx]), source_i)
1338
+ for idx, source_i in enumerate(sources.values())
1339
+ ]
781
1340
  src_arr.set_coordinate_selection(param_ids, new_sources)
782
1341
 
783
- def _update_template_components(self, tc: Dict):
1342
+ def _update_template_components(self, tc: dict[str, Any]):
784
1343
  with self.using_resource("attrs", "update") as md:
785
1344
  md["template_components"] = tc
786
1345
 
@@ -819,7 +1378,7 @@ class ZarrPersistentStore(PersistentStore):
819
1378
  if self.use_cache and self.num_EARs_cache is not None:
820
1379
  num = self.num_EARs_cache
821
1380
  else:
822
- num = len(self._get_EARs_arr())
1381
+ num = self._get_EARs_arr().attrs["num_runs"]
823
1382
  if self.use_cache and self.num_EARs_cache is None:
824
1383
  self.num_EARs_cache = num
825
1384
  return num
@@ -832,46 +1391,55 @@ class ZarrPersistentStore(PersistentStore):
832
1391
  return attrs["num_added_tasks"]
833
1392
 
834
1393
  @property
835
- def zarr_store(self) -> zarr.storage.Store:
1394
+ def zarr_store(self) -> Store:
836
1395
  """
837
1396
  The underlying store object.
838
1397
  """
839
1398
  if self._zarr_store is None:
1399
+ assert self.fs is not None
840
1400
  self._zarr_store = self._get_zarr_store(self.path, self.fs)
841
1401
  return self._zarr_store
842
1402
 
843
- def _get_root_group(self, mode: str = "r", **kwargs) -> zarr.Group:
1403
+ def _get_root_group(self, mode: str = "r", **kwargs) -> Group:
1404
+ # TODO: investigate if there are inefficiencies in how we retrieve zarr groups
1405
+ # and arrays, e.g. opening sub groups sequentially would open the root group
1406
+ # multiple times, and so read the root group attrs file multiple times?
1407
+ # it might make sense to define a ZarrAttrsStoreResource for each zarr group and
1408
+ # array (or at least non-parameter groups/arrays?), there could be some built-in
1409
+ # understanding of the hierarchy (e.g. via a `path` attribute) which would then
1410
+ # avoid reading parent groups multiple times --- if that is happening currently.
844
1411
  return zarr.open(self.zarr_store, mode=mode, **kwargs)
845
1412
 
846
- def _get_parameter_group(self, mode: str = "r", **kwargs) -> zarr.Group:
1413
+ def _get_parameter_group(self, mode: str = "r", **kwargs) -> Group:
847
1414
  return self._get_root_group(mode=mode, **kwargs).get(self._param_grp_name)
848
1415
 
849
- def _get_parameter_base_array(self, mode: str = "r", **kwargs) -> zarr.Array:
1416
+ def _get_parameter_base_array(self, mode: str = "r", **kwargs) -> Array:
850
1417
  path = f"{self._param_grp_name}/{self._param_base_arr_name}"
851
1418
  return zarr.open(self.zarr_store, mode=mode, path=path, **kwargs)
852
1419
 
853
- def _get_parameter_sources_array(self, mode: str = "r") -> zarr.Array:
1420
+ def _get_parameter_sources_array(self, mode: str = "r") -> Array:
854
1421
  return self._get_parameter_group(mode=mode).get(self._param_sources_arr_name)
855
1422
 
856
- def _get_parameter_user_array_group(self, mode: str = "r") -> zarr.Group:
1423
+ def _get_parameter_user_array_group(self, mode: str = "r") -> Group:
857
1424
  return self._get_parameter_group(mode=mode).get(self._param_user_arr_grp_name)
858
1425
 
859
1426
  def _get_parameter_data_array_group(
860
1427
  self,
861
1428
  parameter_idx: int,
862
1429
  mode: str = "r",
863
- ) -> zarr.Group:
1430
+ ) -> Group:
864
1431
  return self._get_parameter_user_array_group(mode=mode).get(
865
1432
  self._param_data_arr_grp_name(parameter_idx)
866
1433
  )
867
1434
 
868
- def _get_array_group_and_dataset(self, mode: str, param_id: int, data_path):
1435
+ def _get_array_group_and_dataset(
1436
+ self, mode: str, param_id: int, data_path: list[int]
1437
+ ):
869
1438
  base_dat = self._get_parameter_base_array(mode="r")[param_id]
870
- arr_idx = None
871
1439
  for arr_dat_path, arr_idx in base_dat["type_lookup"]["arrays"]:
872
1440
  if arr_dat_path == data_path:
873
1441
  break
874
- if arr_idx is None:
1442
+ else:
875
1443
  raise ValueError(
876
1444
  f"Could not find array path {data_path} in the base data for parameter "
877
1445
  f"ID {param_id}."
@@ -881,21 +1449,72 @@ class ZarrPersistentStore(PersistentStore):
881
1449
  )
882
1450
  return group, f"arr_{arr_idx}"
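The sentinel-based lookup in `_get_array_group_and_dataset` is replaced here by Python's `for`/`else`: the `else` clause runs only when the loop finishes without hitting `break`, so a missing path raises immediately and no `arr_idx = None` initialisation is needed. A minimal standalone illustration of the same pattern:

    def find_array_index(type_lookup_arrays: list[tuple[list[int], int]], data_path: list[int]) -> int:
        for arr_dat_path, arr_idx in type_lookup_arrays:
            if arr_dat_path == data_path:
                break  # found: the else clause is skipped
        else:
            raise ValueError(f"Could not find array path {data_path}.")
        return arr_idx

    assert find_array_index([([0], 0), ([1, 2], 1)], [1, 2]) == 1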
883
1451
 
884
- def _get_metadata_group(self, mode: str = "r") -> zarr.Group:
885
- return self._get_root_group(mode=mode).get("metadata")
1452
+ def _get_metadata_group(self, mode: str = "r") -> Group:
1453
+ try:
1454
+ path = Path(self.workflow.url).joinpath("metadata")
1455
+ md_store = zarr.NestedDirectoryStore(path)
1456
+ return zarr.open_group(store=md_store, mode=mode)
1457
+ except (FileNotFoundError, zarr.errors.GroupNotFoundError):
1458
+ # zip store?
1459
+ return zarr.open_group(self.zarr_store, path="metadata", mode=mode)
1460
+
1461
+ def _get_all_submissions_metadata_group(self, mode: str = "r") -> Group:
1462
+ return self._get_metadata_group(mode=mode).get(self._subs_md_group_name)
1463
+
1464
+ def _get_submission_metadata_group(self, sub_idx: int, mode: str = "r") -> Group:
1465
+ return self._get_all_submissions_metadata_group(mode=mode).get(sub_idx)
1466
+
1467
+ def _get_submission_metadata_group_path(self, sub_idx: int) -> Path:
1468
+ grp = self._get_submission_metadata_group(sub_idx)
1469
+ return Path(grp.store.path).joinpath(grp.path)
1470
+
1471
+ def _get_jobscripts_at_submit_metadata_arr(
1472
+ self, sub_idx: int, mode: str = "r"
1473
+ ) -> Array:
1474
+ return self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode).get(
1475
+ self._js_at_submit_md_arr_name
1476
+ )
1477
+
1478
+ def _get_jobscripts_at_submit_metadata_arr_path(self, sub_idx: int) -> Path:
1479
+ arr = self._get_jobscripts_at_submit_metadata_arr(sub_idx)
1480
+ return Path(arr.store.path).joinpath(arr.path)
1481
+
1482
+ @TimeIt.decorator
1483
+ def _get_jobscripts_run_ID_arr(self, sub_idx: int, mode: str = "r") -> Array:
1484
+ return self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode).get(
1485
+ self._js_run_IDs_arr_name
1486
+ )
1487
+
1488
+ def _get_jobscripts_task_elements_arr(self, sub_idx: int, mode: str = "r") -> Array:
1489
+ return self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode).get(
1490
+ self._js_task_elems_arr_name
1491
+ )
1492
+
1493
+ def _get_jobscripts_task_actions_arr(self, sub_idx: int, mode: str = "r") -> Array:
1494
+ return self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode).get(
1495
+ self._js_task_acts_arr_name
1496
+ )
1497
+
1498
+ def _get_jobscripts_dependencies_arr(self, sub_idx: int, mode: str = "r") -> Array:
1499
+ return self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode).get(
1500
+ self._js_deps_arr_name
1501
+ )
886
1502
 
887
- def _get_tasks_arr(self, mode: str = "r") -> zarr.Array:
1503
+ def _get_tasks_arr(self, mode: str = "r") -> Array:
888
1504
  return self._get_metadata_group(mode=mode).get(self._task_arr_name)
889
1505
 
890
- def _get_elements_arr(self, mode: str = "r") -> zarr.Array:
1506
+ def _get_elements_arr(self, mode: str = "r") -> Array:
891
1507
  return self._get_metadata_group(mode=mode).get(self._elem_arr_name)
892
1508
 
893
- def _get_iters_arr(self, mode: str = "r") -> zarr.Array:
1509
+ def _get_iters_arr(self, mode: str = "r") -> Array:
894
1510
  return self._get_metadata_group(mode=mode).get(self._iter_arr_name)
895
1511
 
896
- def _get_EARs_arr(self, mode: str = "r") -> zarr.Array:
1512
+ def _get_EARs_arr(self, mode: str = "r") -> Array:
897
1513
  return self._get_metadata_group(mode=mode).get(self._EAR_arr_name)
898
1514
 
1515
+ def _get_dirs_arr(self, mode: str = "r") -> zarr.Array:
1516
+ return self._get_metadata_group(mode=mode).get(self._run_dir_arr_name)
1517
+
899
1518
  @classmethod
900
1519
  def make_test_store_from_spec(
901
1520
  cls,
@@ -905,10 +1524,10 @@ class ZarrPersistentStore(PersistentStore):
905
1524
  overwrite=False,
906
1525
  ):
907
1526
  """Generate an store for testing purposes."""
1527
+ ts_fmt = "FIXME"
908
1528
 
909
1529
  path = Path(dir or "", path)
910
- store = zarr.DirectoryStore(path)
911
- root = zarr.group(store=store, overwrite=overwrite)
1530
+ root = zarr.group(store=DirectoryStore(path), overwrite=overwrite)
912
1531
  md = root.create_group("metadata")
913
1532
 
914
1533
  tasks_arr = md.create_dataset(
@@ -922,7 +1541,7 @@ class ZarrPersistentStore(PersistentStore):
922
1541
  name=cls._elem_arr_name,
923
1542
  shape=0,
924
1543
  dtype=object,
925
- object_codec=MsgPack(),
1544
+ object_codec=cls._CODEC,
926
1545
  chunks=1000,
927
1546
  )
928
1547
  elems_arr.attrs.update({"seq_idx": [], "src_idx": []})
@@ -931,7 +1550,7 @@ class ZarrPersistentStore(PersistentStore):
931
1550
  name=cls._iter_arr_name,
932
1551
  shape=0,
933
1552
  dtype=object,
934
- object_codec=MsgPack(),
1553
+ object_codec=cls._CODEC,
935
1554
  chunks=1000,
936
1555
  )
937
1556
  elem_iters_arr.attrs.update(
@@ -946,12 +1565,12 @@ class ZarrPersistentStore(PersistentStore):
946
1565
  name=cls._EAR_arr_name,
947
1566
  shape=0,
948
1567
  dtype=object,
949
- object_codec=MsgPack(),
1568
+ object_codec=cls._CODEC,
950
1569
  chunks=1000,
951
1570
  )
952
- EARs_arr.attrs.update({"parameter_paths": []})
1571
+ EARs_arr.attrs["parameter_paths"] = []
953
1572
 
954
- tasks, elems, elem_iters, EARs = super().prepare_test_store_from_spec(spec)
1573
+ tasks, elems, elem_iters, EARs_ = super().prepare_test_store_from_spec(spec)
955
1574
 
956
1575
  path = Path(path).resolve()
957
1576
  tasks = [ZarrStoreTask(**i).encode() for i in tasks]
@@ -960,21 +1579,13 @@ class ZarrPersistentStore(PersistentStore):
960
1579
  ZarrStoreElementIter(**i).encode(elem_iters_arr.attrs.asdict())
961
1580
  for i in elem_iters
962
1581
  ]
963
- EARs = [ZarrStoreEAR(**i).encode(EARs_arr.attrs.asdict()) for i in EARs]
1582
+ EARs = [ZarrStoreEAR(**i).encode(ts_fmt, EARs_arr.attrs.asdict()) for i in EARs_]
964
1583
 
965
1584
  append_items_to_ragged_array(tasks_arr, tasks)
966
1585
 
967
- elem_arr_add = np.empty((len(elements)), dtype=object)
968
- elem_arr_add[:] = elements
969
- elems_arr.append(elem_arr_add)
970
-
971
- iter_arr_add = np.empty((len(elem_iters)), dtype=object)
972
- iter_arr_add[:] = elem_iters
973
- elem_iters_arr.append(iter_arr_add)
974
-
975
- EAR_arr_add = np.empty((len(EARs)), dtype=object)
976
- EAR_arr_add[:] = EARs
977
- EARs_arr.append(EAR_arr_add)
1586
+ elems_arr.append(np.fromiter(elements, dtype=object))
1587
+ elem_iters_arr.append(np.fromiter(elem_iters, dtype=object))
1588
+ EARs_arr.append(np.fromiter(EARs, dtype=object))
978
1589
 
979
1590
  return cls(path)
980
1591
 
@@ -982,17 +1593,18 @@ class ZarrPersistentStore(PersistentStore):
982
1593
  with self.using_resource("attrs", "read") as attrs:
983
1594
  return attrs["template_components"]
984
1595
 
985
- def _get_persistent_template(self):
1596
+ def _get_persistent_template(self) -> dict[str, JSONed]:
986
1597
  with self.using_resource("attrs", "read") as attrs:
987
- return attrs["template"]
1598
+ return cast("dict[str, JSONed]", attrs["template"])
988
1599
 
989
1600
  @TimeIt.decorator
990
- def _get_persistent_tasks(self, id_lst: Iterable[int]) -> Dict[int, ZarrStoreTask]:
1601
+ def _get_persistent_tasks(self, id_lst: Iterable[int]) -> dict[int, ZarrStoreTask]:
991
1602
  tasks, id_lst = self._get_cached_persistent_tasks(id_lst)
992
1603
  if id_lst:
993
1604
  with self.using_resource("attrs", action="read") as attrs:
994
- task_dat = {}
995
- elem_IDs = []
1605
+ task_dat: dict[int, dict[str, Any]] = {}
1606
+ elem_IDs: list[int] = []
1607
+ i: dict[str, Any]
996
1608
  for idx, i in enumerate(attrs["tasks"]):
997
1609
  i = copy.deepcopy(i)
998
1610
  elem_IDs.append(i.pop("element_IDs_idx"))
@@ -1003,65 +1615,62 @@ class ZarrPersistentStore(PersistentStore):
1003
1615
  elem_IDs_arr_dat = self._get_tasks_arr().get_coordinate_selection(
1004
1616
  elem_IDs
1005
1617
  )
1006
- except zarr.errors.BoundsCheckError:
1618
+ except BoundsCheckError:
1007
1619
  raise MissingStoreTaskError(
1008
1620
  elem_IDs
1009
1621
  ) from None # TODO: not an ID list
1010
1622
 
1011
1623
  new_tasks = {
1012
1624
  id_: ZarrStoreTask.decode({**i, "element_IDs": elem_IDs_arr_dat[id_]})
1013
- for idx, (id_, i) in enumerate(task_dat.items())
1625
+ for id_, i in task_dat.items()
1014
1626
  }
1015
- else:
1016
- new_tasks = {}
1017
- self.task_cache.update(new_tasks)
1018
- tasks.update(new_tasks)
1627
+ self.task_cache.update(new_tasks)
1628
+ tasks.update(new_tasks)
1019
1629
  return tasks
1020
1630
 
1021
1631
  @TimeIt.decorator
1022
- def _get_persistent_loops(self, id_lst: Optional[Iterable[int]] = None):
1632
+ def _get_persistent_loops(
1633
+ self, id_lst: Iterable[int] | None = None
1634
+ ) -> dict[int, LoopDescriptor]:
1023
1635
  with self.using_resource("attrs", "read") as attrs:
1024
- loop_dat = {
1025
- idx: i
1636
+ return {
1637
+ idx: cast("LoopDescriptor", i)
1026
1638
  for idx, i in enumerate(attrs["loops"])
1027
1639
  if id_lst is None or idx in id_lst
1028
1640
  }
1029
- return loop_dat
1030
1641
 
1031
1642
  @TimeIt.decorator
1032
- def _get_persistent_submissions(self, id_lst: Optional[Iterable[int]] = None):
1643
+ def _get_persistent_submissions(
1644
+ self, id_lst: Iterable[int] | None = None
1645
+ ) -> dict[int, Mapping[str, JSONed]]:
1033
1646
  self.logger.debug("loading persistent submissions from the zarr store")
1647
+ ids = set(id_lst or ())
1034
1648
  with self.using_resource("attrs", "read") as attrs:
1035
1649
  subs_dat = copy.deepcopy(
1036
1650
  {
1037
1651
  idx: i
1038
1652
  for idx, i in enumerate(attrs["submissions"])
1039
- if id_lst is None or idx in id_lst
1653
+ if id_lst is None or idx in ids
1040
1654
  }
1041
1655
  )
1042
- # cast jobscript submit-times and jobscript `task_elements` keys:
1043
- for sub_idx, sub in subs_dat.items():
1044
- for js_idx, js in enumerate(sub["jobscripts"]):
1045
- for key in list(js["task_elements"].keys()):
1046
- subs_dat[sub_idx]["jobscripts"][js_idx]["task_elements"][
1047
- int(key)
1048
- ] = subs_dat[sub_idx]["jobscripts"][js_idx]["task_elements"].pop(
1049
- key
1050
- )
1051
1656
 
1052
1657
  return subs_dat
1053
1658
 
1054
1659
  @TimeIt.decorator
1055
1660
  def _get_persistent_elements(
1056
1661
  self, id_lst: Iterable[int]
1057
- ) -> Dict[int, ZarrStoreElement]:
1662
+ ) -> dict[int, ZarrStoreElement]:
1058
1663
  elems, id_lst = self._get_cached_persistent_elements(id_lst)
1059
1664
  if id_lst:
1665
+ self.logger.debug(
1666
+ f"loading {len(id_lst)} persistent element(s) from disk: "
1667
+ f"{shorten_list_str(id_lst)}."
1668
+ )
1060
1669
  arr = self._get_elements_arr()
1061
1670
  attrs = arr.attrs.asdict()
1062
1671
  try:
1063
1672
  elem_arr_dat = arr.get_coordinate_selection(id_lst)
1064
- except zarr.errors.BoundsCheckError:
1673
+ except BoundsCheckError:
1065
1674
  raise MissingStoreElementError(id_lst) from None
1066
1675
  elem_dat = dict(zip(id_lst, elem_arr_dat))
1067
1676
  new_elems = {
@@ -1074,14 +1683,18 @@ class ZarrPersistentStore(PersistentStore):
1074
1683
  @TimeIt.decorator
1075
1684
  def _get_persistent_element_iters(
1076
1685
  self, id_lst: Iterable[int]
1077
- ) -> Dict[int, ZarrStoreElementIter]:
1686
+ ) -> dict[int, ZarrStoreElementIter]:
1078
1687
  iters, id_lst = self._get_cached_persistent_element_iters(id_lst)
1079
1688
  if id_lst:
1689
+ self.logger.debug(
1690
+ f"loading {len(id_lst)} persistent element iteration(s) from disk: "
1691
+ f"{shorten_list_str(id_lst)}."
1692
+ )
1080
1693
  arr = self._get_iters_arr()
1081
1694
  attrs = arr.attrs.asdict()
1082
1695
  try:
1083
1696
  iter_arr_dat = arr.get_coordinate_selection(id_lst)
1084
- except zarr.errors.BoundsCheckError:
1697
+ except BoundsCheckError:
1085
1698
  raise MissingStoreElementIterationError(id_lst) from None
1086
1699
  iter_dat = dict(zip(id_lst, iter_arr_dat))
1087
1700
  new_iters = {
@@ -1092,19 +1705,29 @@ class ZarrPersistentStore(PersistentStore):
1092
1705
  return iters
1093
1706
 
1094
1707
  @TimeIt.decorator
1095
- def _get_persistent_EARs(self, id_lst: Iterable[int]) -> Dict[int, ZarrStoreEAR]:
1708
+ def _get_persistent_EARs(self, id_lst: Iterable[int]) -> dict[int, ZarrStoreEAR]:
1096
1709
  runs, id_lst = self._get_cached_persistent_EARs(id_lst)
1097
1710
  if id_lst:
1711
+ self.logger.debug(
1712
+ f"loading {len(id_lst)} persistent EAR(s) from disk: "
1713
+ f"{shorten_list_str(id_lst)}."
1714
+ )
1098
1715
  arr = self._get_EARs_arr()
1099
1716
  attrs = arr.attrs.asdict()
1717
+ sel: tuple[NDArray, NDArray] | list[int]
1100
1718
  try:
1101
- self.logger.debug(f"_get_persistent_EARs: {id_lst=}")
1102
- EAR_arr_dat = _zarr_get_coord_selection(arr, id_lst, self.logger)
1103
- except zarr.errors.BoundsCheckError:
1719
+ # convert to 2D array indices:
1720
+ sel = get_2D_idx(np.array(id_lst), num_cols=arr.shape[1])
1721
+ except IndexError:
1722
+ # 1D runs array from before update to 2D in Feb 2025 refactor/jobscript:
1723
+ sel = id_lst
1724
+ try:
1725
+ EAR_arr_dat = _zarr_get_coord_selection(arr, sel, self.logger)
1726
+ except BoundsCheckError:
1104
1727
  raise MissingStoreEARError(id_lst) from None
1105
1728
  EAR_dat = dict(zip(id_lst, EAR_arr_dat))
1106
1729
  new_runs = {
1107
- k: ZarrStoreEAR.decode(EAR_dat=v, attrs=attrs, ts_fmt=self.ts_fmt)
1730
+ k: ZarrStoreEAR.decode(EAR_dat=v, ts_fmt=self.ts_fmt, attrs=attrs)
1108
1731
  for k, v in EAR_dat.items()
1109
1732
  }
1110
1733
  self.EAR_cache.update(new_runs)
@@ -1114,20 +1737,25 @@ class ZarrPersistentStore(PersistentStore):
1114
1737
 
1115
1738
  @TimeIt.decorator
1116
1739
  def _get_persistent_parameters(
1117
- self,
1118
- id_lst: Iterable[int],
1119
- dataset_copy: Optional[bool] = False,
1120
- ) -> Dict[int, ZarrStoreParameter]:
1121
-
1740
+ self, id_lst: Iterable[int], *, dataset_copy: bool = False, **kwargs
1741
+ ) -> dict[int, ZarrStoreParameter]:
1122
1742
  params, id_lst = self._get_cached_persistent_parameters(id_lst)
1123
1743
  if id_lst:
1744
+
1745
+ self.logger.debug(
1746
+ f"loading {len(id_lst)} persistent parameter(s) from disk: "
1747
+ f"{shorten_list_str(id_lst)}."
1748
+ )
1749
+
1750
+ # TODO: implement the "parameter_metadata_cache" for zarr stores, which would
1751
+ # keep the base_arr and src_arr open
1124
1752
  base_arr = self._get_parameter_base_array(mode="r")
1125
1753
  src_arr = self._get_parameter_sources_array(mode="r")
1126
1754
 
1127
1755
  try:
1128
1756
  param_arr_dat = base_arr.get_coordinate_selection(list(id_lst))
1129
1757
  src_arr_dat = src_arr.get_coordinate_selection(list(id_lst))
1130
- except zarr.errors.BoundsCheckError:
1758
+ except BoundsCheckError:
1131
1759
  raise MissingParameterData(id_lst) from None
1132
1760
 
1133
1761
  param_dat = dict(zip(id_lst, param_arr_dat))
@@ -1149,13 +1777,15 @@ class ZarrPersistentStore(PersistentStore):
1149
1777
  return params
1150
1778
 
1151
1779
  @TimeIt.decorator
1152
- def _get_persistent_param_sources(self, id_lst: Iterable[int]) -> Dict[int, Dict]:
1780
+ def _get_persistent_param_sources(
1781
+ self, id_lst: Iterable[int]
1782
+ ) -> dict[int, ParamSource]:
1153
1783
  sources, id_lst = self._get_cached_persistent_param_sources(id_lst)
1154
1784
  if id_lst:
1155
1785
  src_arr = self._get_parameter_sources_array(mode="r")
1156
1786
  try:
1157
1787
  src_arr_dat = src_arr.get_coordinate_selection(list(id_lst))
1158
- except zarr.errors.BoundsCheckError:
1788
+ except BoundsCheckError:
1159
1789
  raise MissingParameterData(id_lst) from None
1160
1790
  new_sources = dict(zip(id_lst, src_arr_dat))
1161
1791
  self.param_sources_cache.update(new_sources)
@@ -1164,20 +1794,267 @@ class ZarrPersistentStore(PersistentStore):
1164
1794
 
1165
1795
  def _get_persistent_parameter_set_status(
1166
1796
  self, id_lst: Iterable[int]
1167
- ) -> Dict[int, bool]:
1797
+ ) -> dict[int, bool]:
1168
1798
  base_arr = self._get_parameter_base_array(mode="r")
1169
1799
  try:
1170
1800
  param_arr_dat = base_arr.get_coordinate_selection(list(id_lst))
1171
- except zarr.errors.BoundsCheckError:
1801
+ except BoundsCheckError:
1172
1802
  raise MissingParameterData(id_lst) from None
1173
1803
 
1174
1804
  return dict(zip(id_lst, [i is not None for i in param_arr_dat]))
1175
1805
 
1176
- def _get_persistent_parameter_IDs(self) -> List[int]:
1806
+ def _get_persistent_parameter_IDs(self) -> list[int]:
1177
1807
  # we assume the row index is equivalent to ID, might need to revisit in future
1178
1808
  base_arr = self._get_parameter_base_array(mode="r")
1179
1809
  return list(range(len(base_arr)))
1180
1810
 
1811
+ def get_submission_at_submit_metadata(
1812
+ self, sub_idx: int, metadata_attr: dict | None
1813
+ ) -> dict[str, Any]:
1814
+ """Retrieve the values of submission attributes that are stored at submit-time."""
1815
+ grp = self._get_submission_metadata_group(sub_idx)
1816
+ attrs = grp.attrs.asdict()
1817
+ return {k: attrs[k] for k in SUBMISSION_SUBMIT_TIME_KEYS}
1818
+
1819
+ def clear_jobscript_at_submit_metadata_cache(self):
1820
+ """Clear the cache of at-submit-time jobscript metadata."""
1821
+ self._jobscript_at_submit_metadata = {}
1822
+
1823
+ def get_jobscript_at_submit_metadata(
1824
+ self,
1825
+ sub_idx: int,
1826
+ js_idx: int,
1827
+ metadata_attr: dict | None,
1828
+ ) -> dict[str, Any]:
1829
+ """For the specified jobscript, retrieve the values of jobscript-submit-time
1830
+ attributes.
1831
+
1832
+ Notes
1833
+ -----
1834
+ If the cache does not exist, this method will retrieve and cache metadata for
1835
+ all jobscripts for which metadata has been set. If the cache does exist, but not
1836
+ for the requested jobscript, then this method will retrieve and cache metadata for
1837
+ all non-cached jobscripts for which metadata has been set. If metadata has not
1838
+ yet been set for the specified jobscript, and dict with all `None` values will be
1839
+ returned.
1840
+
1841
+ The cache can be cleared using the method
1842
+ `clear_jobscript_at_submit_metadata_cache`.
1843
+
1844
+ """
1845
+ if self._jobscript_at_submit_metadata:
1846
+ # cache exists, but might not include data for the requested jobscript:
1847
+ if js_idx in self._jobscript_at_submit_metadata:
1848
+ return self._jobscript_at_submit_metadata[js_idx]
1849
+
1850
+ arr = self._get_jobscripts_at_submit_metadata_arr(sub_idx)
1851
+ non_cached = set(range(len(arr))) - set(self._jobscript_at_submit_metadata.keys())
1852
+
1853
+ # populate cache:
1854
+ arr_non_cached = arr.get_coordinate_selection((list(non_cached),))
1855
+ for js_idx_i, arr_item in zip(non_cached, arr_non_cached):
1856
+ try:
1857
+ self._jobscript_at_submit_metadata[js_idx_i] = {
1858
+ i: arr_item[i_idx]
1859
+ for i_idx, i in enumerate(JOBSCRIPT_SUBMIT_TIME_KEYS)
1860
+ }
1861
+ except TypeError:
1862
+ # data for this jobscript is not set
1863
+ pass
1864
+
1865
+ if js_idx not in self._jobscript_at_submit_metadata:
1866
+ return {i: None for i in JOBSCRIPT_SUBMIT_TIME_KEYS}
1867
+
1868
+ return self._jobscript_at_submit_metadata[js_idx]
1869
+
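On a cache miss, every not-yet-cached row of the submit-time array is decoded by pairing it with the key names; a row that is still `None` raises `TypeError` and is simply skipped. A hedged sketch of that row-to-dict step (the key names here are illustrative, the real ones come from `JOBSCRIPT_SUBMIT_TIME_KEYS`):

    KEYS = ("submit_time", "scheduler_job_ID", "process_ID")  # illustrative only

    def decode_js_row(arr_item) -> dict | None:
        try:
            return {key: arr_item[idx] for idx, key in enumerate(KEYS)}
        except TypeError:
            # row is still None: metadata not yet set for this jobscript
            return None

    assert decode_js_row(None) is None
    assert decode_js_row(["2025-02-01T12:00:00", "1234", 999]) == {
        "submit_time": "2025-02-01T12:00:00",
        "scheduler_job_ID": "1234",
        "process_ID": 999,
    }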
1870
+ @TimeIt.decorator
1871
+ def get_jobscript_block_run_ID_array(
1872
+ self,
1873
+ sub_idx: int,
1874
+ js_idx: int,
1875
+ blk_idx: int,
1876
+ run_ID_arr: NDArray | None,
1877
+ ) -> NDArray:
1878
+ """For the specified jobscript-block, retrieve the run ID array."""
1879
+
1880
+ if run_ID_arr is not None:
1881
+ self.logger.debug("jobscript-block run IDs are still in memory.")
1882
+ # in the special case when the Submission object has just been created, the
1883
+ # run ID arrays will not yet be persistent.
1884
+ return np.asarray(run_ID_arr)
1885
+
1886
+ # otherwise, `append_submissions` has been called, the run IDs have been
1887
+ # removed from the JSON-representation of the submission object, and have been
1888
+ # saved in separate zarr arrays:
1889
+ if sub_idx not in self._jobscript_run_ID_arrays:
1890
+
1891
+ self.logger.debug(
1892
+ f"retrieving jobscript-block run IDs for submission {sub_idx} from disk,"
1893
+ f" and caching."
1894
+ )
1895
+
1896
+ # for a given submission, run IDs are stored for all jobscript-blocks in the
1897
+ # same array (and chunk), so retrieve all of them and cache:
1898
+
1899
+ arr = self._get_jobscripts_run_ID_arr(sub_idx)
1900
+ arr_dat = arr[:]
1901
+ block_shapes = arr.attrs["block_shapes"]
1902
+
1903
+ self._jobscript_run_ID_arrays[sub_idx] = {} # keyed by (js_idx, blk_idx)
1904
+ arr_idx = 0
1905
+ for js_idx_i, js_blk_shapes in enumerate(block_shapes):
1906
+ for blk_idx_j, blk_shape_j in enumerate(js_blk_shapes):
1907
+ self._jobscript_run_ID_arrays[sub_idx][
1908
+ (js_idx_i, blk_idx_j)
1909
+ ] = arr_dat[arr_idx, : blk_shape_j[0], : blk_shape_j[1]]
1910
+ arr_idx += 1
1911
+
1912
+ else:
1913
+ self.logger.debug(
1914
+ f"retrieving jobscript-block run IDs for submission {sub_idx} from cache."
1915
+ )
1916
+
1917
+ return self._jobscript_run_ID_arrays[sub_idx][(js_idx, blk_idx)]
1918
+
1919
+ def get_jobscript_block_task_elements_map(
1920
+ self,
1921
+ sub_idx: int,
1922
+ js_idx: int,
1923
+ blk_idx: int,
1924
+ task_elems_map: dict[int, list[int]] | None,
1925
+ ) -> dict[int, list[int]]:
1926
+ """For the specified jobscript-block, retrieve the task-elements mapping."""
1927
+
1928
+ if task_elems_map is not None:
1929
+ self.logger.debug("jobscript-block task elements are still in memory.")
1930
+ # in the special case when the Submission object has just been created, the
1931
+ # task elements arrays will not yet be persistent.
1932
+ return task_elems_map
1933
+
1934
+ # otherwise, `append_submissions` has been called, the task elements have been
1935
+ # removed from the JSON-representation of the submission object, and have been
1936
+ # saved in separate zarr arrays:
1937
+ if sub_idx not in self._jobscript_task_element_maps:
1938
+
1939
+ self.logger.debug(
1940
+ f"retrieving jobscript-block task elements for submission {sub_idx} from "
1941
+ f"disk, and caching."
1942
+ )
1943
+
1944
+ # for a given submission, task elements are stored for all jobscript-blocks in
1945
+ # the same array (and chunk), so retrieve all of them and cache:
1946
+
1947
+ arr = self._get_jobscripts_task_elements_arr(sub_idx)
1948
+ arr_dat = arr[:]
1949
+ block_shapes = arr.attrs["block_shapes"]
1950
+
1951
+ self._jobscript_task_element_maps[sub_idx] = {} # keys: (js_idx, blk_idx)
1952
+ arr_idx = 0
1953
+ for js_idx_i, js_blk_shapes in enumerate(block_shapes):
1954
+ for blk_idx_j, blk_shape_j in enumerate(js_blk_shapes):
1955
+ arr_i = arr_dat[arr_idx, : blk_shape_j[1], : blk_shape_j[0] + 1]
1956
+ self._jobscript_task_element_maps[sub_idx][(js_idx_i, blk_idx_j)] = {
1957
+ k[0]: list(k[1:]) for k in arr_i
1958
+ }
1959
+ arr_idx += 1
1960
+
1961
+ else:
1962
+ self.logger.debug(
1963
+ f"retrieving jobscript-block task elements for submission {sub_idx} from "
1964
+ "cache."
1965
+ )
1966
+
1967
+ return self._jobscript_task_element_maps[sub_idx][(js_idx, blk_idx)]
1968
+
1969
+ @TimeIt.decorator
1970
+ def get_jobscript_block_task_actions_array(
1971
+ self,
1972
+ sub_idx: int,
1973
+ js_idx: int,
1974
+ blk_idx: int,
1975
+ task_actions_arr: NDArray | list[tuple[int, int, int]] | None,
1976
+ ) -> NDArray:
1977
+ """For the specified jobscript-block, retrieve the task-actions array."""
1978
+
1979
+ if task_actions_arr is not None:
1980
+ self.logger.debug("jobscript-block task actions are still in memory.")
1981
+ # in the special case when the Submission object has just been created, the
1982
+ # task actions arrays will not yet be persistent.
1983
+ return np.asarray(task_actions_arr)
1984
+
1985
+ # otherwise, `append_submissions` has been called, the task actions have been
1986
+ # removed from the JSON-representation of the submission object, and have been
1987
+ # saved in separate zarr arrays:
1988
+ if sub_idx not in self._jobscript_task_actions_arrays:
1989
+
1990
+ self.logger.debug(
1991
+ f"retrieving jobscript-block task actions for submission {sub_idx} from "
1992
+ f"disk, and caching."
1993
+ )
1994
+
1995
+ # for a given submission, task actions are stored for all jobscript-blocks in
1996
+ # the same array (and chunk), so retrieve all of them and cache:
1997
+
1998
+ arr = self._get_jobscripts_task_actions_arr(sub_idx)
1999
+ arr_dat = arr[:]
2000
+ block_num_acts = arr.attrs["block_num_acts"]
2001
+
2002
+ num_acts_count = 0
2003
+ self._jobscript_task_actions_arrays[sub_idx] = {} # keys: (js_idx, blk_idx)
2004
+ for js_idx_i, js_blk_num_acts in enumerate(block_num_acts):
2005
+ for blk_idx_j, blk_num_acts_j in enumerate(js_blk_num_acts):
2006
+ arr_i = arr_dat[num_acts_count : num_acts_count + blk_num_acts_j]
2007
+ num_acts_count += blk_num_acts_j
2008
+ self._jobscript_task_actions_arrays[sub_idx][
2009
+ (js_idx_i, blk_idx_j)
2010
+ ] = arr_i
2011
+
2012
+ else:
2013
+ self.logger.debug(
2014
+ f"retrieving jobscript-block task actions for submission {sub_idx} from "
2015
+ "cache."
2016
+ )
2017
+
2018
+ return self._jobscript_task_actions_arrays[sub_idx][(js_idx, blk_idx)]
2019
+
2020
+ @TimeIt.decorator
2021
+ def get_jobscript_block_dependencies(
2022
+ self,
2023
+ sub_idx: int,
2024
+ js_idx: int,
2025
+ blk_idx: int,
2026
+ js_dependencies: dict[tuple[int, int], ResolvedJobscriptBlockDependencies] | None,
2027
+ ) -> dict[tuple[int, int], ResolvedJobscriptBlockDependencies]:
2028
+ """For the specified jobscript-block, retrieve the dependencies."""
2029
+
2030
+ if js_dependencies is not None:
2031
+ self.logger.debug("jobscript-block dependencies are still in memory.")
2032
+ # in the special case when the Submission object has just been created, the
2033
+ # dependencies will not yet be persistent.
2034
+ return js_dependencies
2035
+
2036
+ # otherwise, `append_submissions` has been called, the dependencies have been
2037
+ # removed from the JSON-representation of the submission object, and have been
2038
+ # saved in separate zarr arrays:
2039
+ if sub_idx not in self._jobscript_dependencies:
2040
+ self.logger.debug(
2041
+ f"retrieving jobscript-block dependencies for submission {sub_idx} from "
2042
+ f"disk, and caching."
2043
+ )
2044
+ # for a given submission, dependencies are stored for all jobscript-blocks in
2045
+ # the same array (and chunk), so retrieve all of them and cache:
2046
+ arr = self._get_jobscripts_dependencies_arr(sub_idx)
2047
+ self._jobscript_dependencies[
2048
+ sub_idx
2049
+ ] = self._decode_jobscript_block_dependencies(arr)
2050
+ else:
2051
+ self.logger.debug(
2052
+ f"retrieving jobscript-block dependencies for submission {sub_idx} from "
2053
+ "cache."
2054
+ )
2055
+
2056
+ return self._jobscript_dependencies[sub_idx][(js_idx, blk_idx)]
2057
+
1181
2058
  def get_ts_fmt(self):
1182
2059
  """
1183
2060
  Get the format for timestamps.
@@ -1208,11 +2085,11 @@ class ZarrPersistentStore(PersistentStore):
1208
2085
 
1209
2086
  def zip(
1210
2087
  self,
1211
- path=".",
1212
- log=None,
1213
- overwrite=False,
1214
- include_execute=False,
1215
- include_rechunk_backups=False,
2088
+ path: str = ".",
2089
+ log: str | None = None,
2090
+ overwrite: bool = False,
2091
+ include_execute: bool = False,
2092
+ include_rechunk_backups: bool = False,
1216
2093
  ):
1217
2094
  """
1218
2095
  Convert the persistent store to zipped form.
@@ -1224,69 +2101,66 @@ class ZarrPersistentStore(PersistentStore):
1224
2101
  directory, the zip file will be created within this directory. Otherwise,
1225
2102
  this path is assumed to be the full file path to the new zip file.
1226
2103
  """
1227
- console = Console()
1228
- status = console.status(f"Zipping workflow {self.workflow.name!r}...")
1229
- status.start()
1230
-
1231
- # TODO: this won't work for remote file systems
1232
- dst_path = Path(path).resolve()
1233
- if dst_path.is_dir():
1234
- dst_path = dst_path.joinpath(self.workflow.name).with_suffix(".zip")
1235
-
1236
- if not overwrite and dst_path.exists():
1237
- status.stop()
1238
- raise FileExistsError(
1239
- f"File at path already exists: {dst_path!r}. Pass `overwrite=True` to "
1240
- f"overwrite the existing file."
1241
- )
2104
+ with Console().status(f"Zipping workflow {self.workflow.name!r}..."):
2105
+ # TODO: this won't work for remote file systems
2106
+ dst_path = Path(path).resolve()
2107
+ if dst_path.is_dir():
2108
+ dst_path = dst_path.joinpath(self.workflow.name).with_suffix(".zip")
2109
+
2110
+ if not overwrite and dst_path.exists():
2111
+ raise FileExistsError(
2112
+ f"File at path already exists: {dst_path!r}. Pass `overwrite=True` to "
2113
+ f"overwrite the existing file."
2114
+ )
1242
2115
 
1243
- dst_path = str(dst_path)
2116
+ dst_path_s = str(dst_path)
1244
2117
 
1245
- src_zarr_store = self.zarr_store
1246
- zfs, _ = ask_pw_on_auth_exc(
1247
- ZipFileSystem,
1248
- fo=dst_path,
1249
- mode="w",
1250
- target_options={},
1251
- add_pw_to="target_options",
1252
- )
1253
- dst_zarr_store = zarr.storage.FSStore(url="", fs=zfs)
1254
- excludes = []
1255
- if not include_execute:
1256
- excludes.append("execute")
1257
- if not include_rechunk_backups:
1258
- excludes.append("runs.bak")
1259
- excludes.append("base.bak")
1260
-
1261
- zarr.convenience.copy_store(
1262
- src_zarr_store,
1263
- dst_zarr_store,
1264
- excludes=excludes or None,
1265
- log=log,
1266
- )
1267
- del zfs # ZipFileSystem remains open for instance lifetime
1268
- status.stop()
1269
- return dst_path
2118
+ src_zarr_store = self.zarr_store
2119
+ zfs, _ = ask_pw_on_auth_exc(
2120
+ ZipFileSystem,
2121
+ fo=dst_path_s,
2122
+ mode="w",
2123
+ target_options={},
2124
+ add_pw_to="target_options",
2125
+ )
2126
+ dst_zarr_store = FSStore(url="", fs=zfs)
2127
+ excludes = []
2128
+ if not include_execute:
2129
+ excludes.append("execute")
2130
+ if not include_rechunk_backups:
2131
+ excludes.append("runs.bak")
2132
+ excludes.append("base.bak")
2133
+
2134
+ zarr.copy_store(
2135
+ src_zarr_store,
2136
+ dst_zarr_store,
2137
+ excludes=excludes or None,
2138
+ log=log,
2139
+ )
2140
+ del zfs # ZipFileSystem remains open for instance lifetime
2141
+ return dst_path_s
2142
+
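The zip conversion above writes the whole Zarr hierarchy into an archive by pointing an `FSStore` at an fsspec `ZipFileSystem` and copying store-to-store. A minimal standalone sketch of the same idea with plain zarr and fsspec (paths are illustrative, and the password-prompt wrapper is omitted):

    import zarr
    from fsspec.implementations.zip import ZipFileSystem

    src = zarr.DirectoryStore("my_workflow")           # assumed existing workflow store
    zfs = ZipFileSystem("my_workflow.zip", mode="w")
    dst = zarr.storage.FSStore(url="", fs=zfs)

    # exclude the `execute` directory and rechunk backups, as in the method above:
    zarr.copy_store(src, dst, excludes=["execute", "runs.bak", "base.bak"], log=None)
    del zfs  # the ZipFileSystem keeps the archive open for its lifetime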
2143
+ def unzip(self, path: str = ".", log: str | None = None):
2144
+ raise ValueError("Not a zip store!")
1270
2145
 
1271
2146
  def _rechunk_arr(
1272
2147
  self,
1273
- arr,
1274
- chunk_size: Optional[int] = None,
1275
- backup: Optional[bool] = True,
1276
- status: Optional[bool] = True,
1277
- ):
1278
- arr_path = Path(self.workflow.path) / arr.path
2148
+ arr: Array,
2149
+ chunk_size: int | None = None,
2150
+ backup: bool = True,
2151
+ status: bool = True,
2152
+ ) -> Array:
2153
+ arr_path = Path(arr.store.path) / arr.path
1279
2154
  arr_name = arr.path.split("/")[-1]
1280
2155
 
1281
2156
  if status:
1282
- console = Console()
1283
- status = console.status("Rechunking...")
1284
- status.start()
2157
+ s = Console().status("Rechunking...")
2158
+ s.start()
1285
2159
  backup_time = None
1286
2160
 
1287
2161
  if backup:
1288
2162
  if status:
1289
- status.update("Backing up...")
2163
+ s.update("Backing up...")
1290
2164
  backup_path = arr_path.with_suffix(".bak")
1291
2165
  if backup_path.is_dir():
1292
2166
  pass
@@ -1298,18 +2172,26 @@ class ZarrPersistentStore(PersistentStore):
1298
2172
 
1299
2173
  tic = time.perf_counter()
1300
2174
  arr_rc_path = arr_path.with_suffix(".rechunked")
1301
- arr = zarr.open(arr_path)
1302
2175
  if status:
1303
- status.update("Creating new array...")
2176
+ s.update("Creating new array...")
2177
+
2178
+ # use the same store:
2179
+ try:
2180
+ arr_rc_store = arr.store.__class__(path=arr_rc_path)
2181
+ except TypeError:
2182
+ # FSStore
2183
+ arr_rc_store = arr.store.__class__(url=str(arr_rc_path))
2184
+
1304
2185
  arr_rc = zarr.create(
1305
- store=arr_rc_path,
2186
+ store=arr_rc_store,
1306
2187
  shape=arr.shape,
1307
2188
  chunks=arr.shape if chunk_size is None else chunk_size,
1308
2189
  dtype=object,
1309
- object_codec=MsgPack(),
2190
+ object_codec=self._CODEC,
1310
2191
  )
2192
+
1311
2193
  if status:
1312
- status.update("Copying data...")
2194
+ s.update("Copying data...")
1313
2195
  data = np.empty(shape=arr.shape, dtype=object)
1314
2196
  bad_data = []
1315
2197
  for idx in range(len(arr)):
@@ -1318,24 +2200,23 @@ class ZarrPersistentStore(PersistentStore):
1318
2200
  except RuntimeError:
1319
2201
  # blosc decompression errors
1320
2202
  bad_data.append(idx)
1321
- pass
1322
2203
  arr_rc[:] = data
1323
2204
 
1324
2205
  arr_rc.attrs.put(arr.attrs.asdict())
1325
2206
 
1326
2207
  if status:
1327
- status.update("Deleting old array...")
2208
+ s.update("Deleting old array...")
1328
2209
  shutil.rmtree(arr_path)
1329
2210
 
1330
2211
  if status:
1331
- status.update("Moving new array into place...")
2212
+ s.update("Moving new array into place...")
1332
2213
  shutil.move(arr_rc_path, arr_path)
1333
2214
 
1334
2215
  toc = time.perf_counter()
1335
2216
  rechunk_time = toc - tic
1336
2217
 
1337
2218
  if status:
1338
- status.stop()
2219
+ s.stop()
1339
2220
 
1340
2221
  if backup_time:
1341
2222
  print(f"Time to backup {arr_name}: {backup_time:.1f} s")
@@ -1349,10 +2230,10 @@ class ZarrPersistentStore(PersistentStore):
1349
2230
 
1350
2231
  def rechunk_parameter_base(
1351
2232
  self,
1352
- chunk_size: Optional[int] = None,
1353
- backup: Optional[bool] = True,
1354
- status: Optional[bool] = True,
1355
- ):
2233
+ chunk_size: int | None = None,
2234
+ backup: bool = True,
2235
+ status: bool = True,
2236
+ ) -> Array:
1356
2237
  """
1357
2238
  Rechunk the parameter data to be stored more efficiently.
1358
2239
  """
@@ -1361,16 +2242,22 @@ class ZarrPersistentStore(PersistentStore):
1361
2242
 
1362
2243
  def rechunk_runs(
1363
2244
  self,
1364
- chunk_size: Optional[int] = None,
1365
- backup: Optional[bool] = True,
1366
- status: Optional[bool] = True,
1367
- ):
2245
+ chunk_size: int | None = None,
2246
+ backup: bool = True,
2247
+ status: bool = True,
2248
+ ) -> Array:
1368
2249
  """
1369
2250
  Rechunk the run data to be stored more efficiently.
1370
2251
  """
1371
2252
  arr = self._get_EARs_arr()
1372
2253
  return self._rechunk_arr(arr, chunk_size, backup, status)
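`_rechunk_arr`, rewritten earlier in this diff and called by both rechunk methods, backs the array up, creates a replacement with the requested chunking, copies element-by-element, then swaps the new directory into place. A standalone sketch of the copy-and-swap core with plain zarr (paths are illustrative; backups and error handling omitted):

    import shutil
    import numpy as np
    import zarr
    from numcodecs import MsgPack

    old = zarr.open("runs", mode="r")                  # assumed existing object array
    new = zarr.create(
        store="runs.rechunked",
        shape=old.shape,
        chunks=old.shape,                              # a single chunk, as when chunk_size is None
        dtype=object,
        object_codec=MsgPack(),
    )
    data = np.empty(shape=old.shape, dtype=object)
    for idx in range(len(old)):
        data[idx] = old[idx]                           # element-wise copy, as in the method
    new[:] = data
    new.attrs.put(old.attrs.asdict())

    shutil.rmtree("runs")                              # delete the old array...
    shutil.move("runs.rechunked", "runs")              # ...and move the rechunked one into place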
1373
2254
 
2255
+ def get_dirs_array(self) -> NDArray:
2256
+ """
2257
+ Retrieve the run directories array.
2258
+ """
2259
+ return self._get_dirs_arr()[:]
2260
+
1374
2261
 
1375
2262
  class ZarrZipPersistentStore(ZarrPersistentStore):
1376
2263
  """A store designed mainly as an archive format that can be uploaded to data
@@ -1381,8 +2268,8 @@ class ZarrZipPersistentStore(ZarrPersistentStore):
1381
2268
  Archive format persistent stores cannot be updated without being unzipped first.
1382
2269
  """
1383
2270
 
1384
- _name = "zip"
1385
- _features = PersistentStoreFeatures(
2271
+ _name: ClassVar[str] = "zip"
2272
+ _features: ClassVar[PersistentStoreFeatures] = PersistentStoreFeatures(
1386
2273
  create=False,
1387
2274
  edit=False,
1388
2275
  jobscript_parallelism=False,
@@ -1393,10 +2280,17 @@ class ZarrZipPersistentStore(ZarrPersistentStore):
1393
2280
 
1394
2281
  # TODO: enforce read-only nature
1395
2282
 
1396
- def zip(self):
2283
+ def zip(
2284
+ self,
2285
+ path: str = ".",
2286
+ log: str | None = None,
2287
+ overwrite: bool = False,
2288
+ include_execute: bool = False,
2289
+ include_rechunk_backups: bool = False,
2290
+ ):
1397
2291
  raise ValueError("Already a zip store!")
1398
2292
 
1399
- def unzip(self, path=".", log=None):
2293
+ def unzip(self, path: str = ".", log: str | None = None) -> str:
1400
2294
  """
1401
2295
  Expand the persistent store.
1402
2296
 
@@ -1409,28 +2303,23 @@ class ZarrZipPersistentStore(ZarrPersistentStore):
1409
2303
 
1410
2304
  """
1411
2305
 
1412
- console = Console()
1413
- status = console.status(f"Unzipping workflow {self.workflow.name!r}...")
1414
- status.start()
2306
+ with Console().status(f"Unzipping workflow {self.workflow.name!r}..."):
2307
+ # TODO: this won't work for remote file systems
2308
+ dst_path = Path(path).resolve()
2309
+ if dst_path.is_dir():
2310
+ dst_path = dst_path.joinpath(self.workflow.name)
1415
2311
 
1416
- # TODO: this won't work for remote file systems
1417
- dst_path = Path(path).resolve()
1418
- if dst_path.is_dir():
1419
- dst_path = dst_path.joinpath(self.workflow.name)
2312
+ if dst_path.exists():
2313
+ raise FileExistsError(f"Directory at path already exists: {dst_path!r}.")
1420
2314
 
1421
- if dst_path.exists():
1422
- status.stop()
1423
- raise FileExistsError(f"Directory at path already exists: {dst_path!r}.")
2315
+ dst_path_s = str(dst_path)
1424
2316
 
1425
- dst_path = str(dst_path)
2317
+ src_zarr_store = self.zarr_store
2318
+ dst_zarr_store = FSStore(url=dst_path_s)
2319
+ zarr.copy_store(src_zarr_store, dst_zarr_store, log=log)
2320
+ return dst_path_s
1426
2321
 
1427
- src_zarr_store = self.zarr_store
1428
- dst_zarr_store = zarr.storage.FSStore(url=dst_path)
1429
- zarr.convenience.copy_store(src_zarr_store, dst_zarr_store, log=log)
1430
- status.stop()
1431
- return dst_path
1432
-
1433
- def copy(self, path=None) -> str:
2322
+ def copy(self, path: PathLike = None) -> Path:
1434
2323
  # not sure how to do this.
1435
2324
  raise NotImplementedError()
1436
2325
 
@@ -1441,8 +2330,23 @@ class ZarrZipPersistentStore(ZarrPersistentStore):
1441
2330
  def _rechunk_arr(
1442
2331
  self,
1443
2332
  arr,
1444
- chunk_size: Optional[int] = None,
1445
- backup: Optional[bool] = True,
1446
- status: Optional[bool] = True,
1447
- ):
2333
+ chunk_size: int | None = None,
2334
+ backup: bool = True,
2335
+ status: bool = True,
2336
+ ) -> Array:
1448
2337
  raise NotImplementedError
2338
+
2339
+ def get_text_file(self, path: str | Path) -> str:
2340
+ """Retrieve the contents of a text file stored within the workflow."""
2341
+ path = Path(path)
2342
+ if path.is_absolute():
2343
+ path = path.relative_to(self.workflow.url)
2344
+ path = str(path.as_posix())
2345
+ assert self.fs
2346
+ try:
2347
+ with self.fs.open(path, mode="rt") as fp:
2348
+ return fp.read()
2349
+ except KeyError:
2350
+ raise FileNotFoundError(
2351
+ f"File within zip at location {path!r} does not exist."
2352
+ ) from None
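A brief hypothetical usage of `get_text_file`, assuming `store` is a `ZarrZipPersistentStore` for an archived workflow and the path (illustrative) names a file that may or may not be in the archive:

    try:
        text = store.get_text_file("submissions/0/js_0.sh")
    except FileNotFoundError as err:
        print(err)
    else:
        print(text[:80])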