hpcflow-new2 0.2.0a189__py3-none-any.whl → 0.2.0a199__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hpcflow/__pyinstaller/hook-hpcflow.py +9 -6
- hpcflow/_version.py +1 -1
- hpcflow/app.py +1 -0
- hpcflow/data/scripts/bad_script.py +2 -0
- hpcflow/data/scripts/do_nothing.py +2 -0
- hpcflow/data/scripts/env_specifier_test/input_file_generator_pass_env_spec.py +4 -0
- hpcflow/data/scripts/env_specifier_test/main_script_test_pass_env_spec.py +8 -0
- hpcflow/data/scripts/env_specifier_test/output_file_parser_pass_env_spec.py +4 -0
- hpcflow/data/scripts/env_specifier_test/v1/input_file_generator_basic.py +4 -0
- hpcflow/data/scripts/env_specifier_test/v1/main_script_test_direct_in_direct_out.py +7 -0
- hpcflow/data/scripts/env_specifier_test/v1/output_file_parser_basic.py +4 -0
- hpcflow/data/scripts/env_specifier_test/v2/main_script_test_direct_in_direct_out.py +7 -0
- hpcflow/data/scripts/input_file_generator_basic.py +3 -0
- hpcflow/data/scripts/input_file_generator_basic_FAIL.py +3 -0
- hpcflow/data/scripts/input_file_generator_test_stdout_stderr.py +8 -0
- hpcflow/data/scripts/main_script_test_direct_in.py +3 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_2.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_2_fail_allowed.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_2_fail_allowed_group.py +7 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_3.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_group_direct_out_3.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_group_one_fail_direct_out_3.py +6 -0
- hpcflow/data/scripts/main_script_test_hdf5_in_obj.py +1 -1
- hpcflow/data/scripts/main_script_test_hdf5_in_obj_2.py +12 -0
- hpcflow/data/scripts/main_script_test_hdf5_out_obj.py +1 -1
- hpcflow/data/scripts/main_script_test_json_out_FAIL.py +3 -0
- hpcflow/data/scripts/main_script_test_shell_env_vars.py +12 -0
- hpcflow/data/scripts/main_script_test_std_out_std_err.py +6 -0
- hpcflow/data/scripts/output_file_parser_basic.py +3 -0
- hpcflow/data/scripts/output_file_parser_basic_FAIL.py +7 -0
- hpcflow/data/scripts/output_file_parser_test_stdout_stderr.py +8 -0
- hpcflow/data/scripts/script_exit_test.py +5 -0
- hpcflow/data/template_components/environments.yaml +1 -1
- hpcflow/sdk/__init__.py +26 -15
- hpcflow/sdk/app.py +2192 -768
- hpcflow/sdk/cli.py +506 -296
- hpcflow/sdk/cli_common.py +105 -7
- hpcflow/sdk/config/__init__.py +1 -1
- hpcflow/sdk/config/callbacks.py +115 -43
- hpcflow/sdk/config/cli.py +126 -103
- hpcflow/sdk/config/config.py +674 -318
- hpcflow/sdk/config/config_file.py +131 -95
- hpcflow/sdk/config/errors.py +125 -84
- hpcflow/sdk/config/types.py +148 -0
- hpcflow/sdk/core/__init__.py +25 -1
- hpcflow/sdk/core/actions.py +1771 -1059
- hpcflow/sdk/core/app_aware.py +24 -0
- hpcflow/sdk/core/cache.py +139 -79
- hpcflow/sdk/core/command_files.py +263 -287
- hpcflow/sdk/core/commands.py +145 -112
- hpcflow/sdk/core/element.py +828 -535
- hpcflow/sdk/core/enums.py +192 -0
- hpcflow/sdk/core/environment.py +74 -93
- hpcflow/sdk/core/errors.py +455 -52
- hpcflow/sdk/core/execute.py +207 -0
- hpcflow/sdk/core/json_like.py +540 -272
- hpcflow/sdk/core/loop.py +751 -347
- hpcflow/sdk/core/loop_cache.py +164 -47
- hpcflow/sdk/core/object_list.py +370 -207
- hpcflow/sdk/core/parameters.py +1100 -627
- hpcflow/sdk/core/rule.py +59 -41
- hpcflow/sdk/core/run_dir_files.py +21 -37
- hpcflow/sdk/core/skip_reason.py +7 -0
- hpcflow/sdk/core/task.py +1649 -1339
- hpcflow/sdk/core/task_schema.py +308 -196
- hpcflow/sdk/core/test_utils.py +191 -114
- hpcflow/sdk/core/types.py +440 -0
- hpcflow/sdk/core/utils.py +485 -309
- hpcflow/sdk/core/validation.py +82 -9
- hpcflow/sdk/core/workflow.py +2544 -1178
- hpcflow/sdk/core/zarr_io.py +98 -137
- hpcflow/sdk/data/workflow_spec_schema.yaml +2 -0
- hpcflow/sdk/demo/cli.py +53 -33
- hpcflow/sdk/helper/cli.py +18 -15
- hpcflow/sdk/helper/helper.py +75 -63
- hpcflow/sdk/helper/watcher.py +61 -28
- hpcflow/sdk/log.py +122 -71
- hpcflow/sdk/persistence/__init__.py +8 -31
- hpcflow/sdk/persistence/base.py +1360 -606
- hpcflow/sdk/persistence/defaults.py +6 -0
- hpcflow/sdk/persistence/discovery.py +38 -0
- hpcflow/sdk/persistence/json.py +568 -188
- hpcflow/sdk/persistence/pending.py +382 -179
- hpcflow/sdk/persistence/store_resource.py +39 -23
- hpcflow/sdk/persistence/types.py +318 -0
- hpcflow/sdk/persistence/utils.py +14 -11
- hpcflow/sdk/persistence/zarr.py +1337 -433
- hpcflow/sdk/runtime.py +44 -41
- hpcflow/sdk/submission/{jobscript_info.py → enums.py} +39 -12
- hpcflow/sdk/submission/jobscript.py +1651 -692
- hpcflow/sdk/submission/schedulers/__init__.py +167 -39
- hpcflow/sdk/submission/schedulers/direct.py +121 -81
- hpcflow/sdk/submission/schedulers/sge.py +170 -129
- hpcflow/sdk/submission/schedulers/slurm.py +291 -268
- hpcflow/sdk/submission/schedulers/utils.py +12 -2
- hpcflow/sdk/submission/shells/__init__.py +14 -15
- hpcflow/sdk/submission/shells/base.py +150 -29
- hpcflow/sdk/submission/shells/bash.py +283 -173
- hpcflow/sdk/submission/shells/os_version.py +31 -30
- hpcflow/sdk/submission/shells/powershell.py +228 -170
- hpcflow/sdk/submission/submission.py +1014 -335
- hpcflow/sdk/submission/types.py +140 -0
- hpcflow/sdk/typing.py +182 -12
- hpcflow/sdk/utils/arrays.py +71 -0
- hpcflow/sdk/utils/deferred_file.py +55 -0
- hpcflow/sdk/utils/hashing.py +16 -0
- hpcflow/sdk/utils/patches.py +12 -0
- hpcflow/sdk/utils/strings.py +33 -0
- hpcflow/tests/api/test_api.py +32 -0
- hpcflow/tests/conftest.py +27 -6
- hpcflow/tests/data/multi_path_sequences.yaml +29 -0
- hpcflow/tests/data/workflow_test_run_abort.yaml +34 -35
- hpcflow/tests/schedulers/sge/test_sge_submission.py +36 -0
- hpcflow/tests/schedulers/slurm/test_slurm_submission.py +5 -2
- hpcflow/tests/scripts/test_input_file_generators.py +282 -0
- hpcflow/tests/scripts/test_main_scripts.py +866 -85
- hpcflow/tests/scripts/test_non_snippet_script.py +46 -0
- hpcflow/tests/scripts/test_ouput_file_parsers.py +353 -0
- hpcflow/tests/shells/wsl/test_wsl_submission.py +12 -4
- hpcflow/tests/unit/test_action.py +262 -75
- hpcflow/tests/unit/test_action_rule.py +9 -4
- hpcflow/tests/unit/test_app.py +33 -6
- hpcflow/tests/unit/test_cache.py +46 -0
- hpcflow/tests/unit/test_cli.py +134 -1
- hpcflow/tests/unit/test_command.py +71 -54
- hpcflow/tests/unit/test_config.py +142 -16
- hpcflow/tests/unit/test_config_file.py +21 -18
- hpcflow/tests/unit/test_element.py +58 -62
- hpcflow/tests/unit/test_element_iteration.py +50 -1
- hpcflow/tests/unit/test_element_set.py +29 -19
- hpcflow/tests/unit/test_group.py +4 -2
- hpcflow/tests/unit/test_input_source.py +116 -93
- hpcflow/tests/unit/test_input_value.py +29 -24
- hpcflow/tests/unit/test_jobscript_unit.py +757 -0
- hpcflow/tests/unit/test_json_like.py +44 -35
- hpcflow/tests/unit/test_loop.py +1396 -84
- hpcflow/tests/unit/test_meta_task.py +325 -0
- hpcflow/tests/unit/test_multi_path_sequences.py +229 -0
- hpcflow/tests/unit/test_object_list.py +17 -12
- hpcflow/tests/unit/test_parameter.py +29 -7
- hpcflow/tests/unit/test_persistence.py +237 -42
- hpcflow/tests/unit/test_resources.py +20 -18
- hpcflow/tests/unit/test_run.py +117 -6
- hpcflow/tests/unit/test_run_directories.py +29 -0
- hpcflow/tests/unit/test_runtime.py +2 -1
- hpcflow/tests/unit/test_schema_input.py +23 -15
- hpcflow/tests/unit/test_shell.py +23 -2
- hpcflow/tests/unit/test_slurm.py +8 -7
- hpcflow/tests/unit/test_submission.py +38 -89
- hpcflow/tests/unit/test_task.py +352 -247
- hpcflow/tests/unit/test_task_schema.py +33 -20
- hpcflow/tests/unit/test_utils.py +9 -11
- hpcflow/tests/unit/test_value_sequence.py +15 -12
- hpcflow/tests/unit/test_workflow.py +114 -83
- hpcflow/tests/unit/test_workflow_template.py +0 -1
- hpcflow/tests/unit/utils/test_arrays.py +40 -0
- hpcflow/tests/unit/utils/test_deferred_file_writer.py +34 -0
- hpcflow/tests/unit/utils/test_hashing.py +65 -0
- hpcflow/tests/unit/utils/test_patches.py +5 -0
- hpcflow/tests/unit/utils/test_redirect_std.py +50 -0
- hpcflow/tests/workflows/__init__.py +0 -0
- hpcflow/tests/workflows/test_directory_structure.py +31 -0
- hpcflow/tests/workflows/test_jobscript.py +334 -1
- hpcflow/tests/workflows/test_run_status.py +198 -0
- hpcflow/tests/workflows/test_skip_downstream.py +696 -0
- hpcflow/tests/workflows/test_submission.py +140 -0
- hpcflow/tests/workflows/test_workflows.py +160 -15
- hpcflow/tests/workflows/test_zip.py +18 -0
- hpcflow/viz_demo.ipynb +6587 -3
- {hpcflow_new2-0.2.0a189.dist-info → hpcflow_new2-0.2.0a199.dist-info}/METADATA +8 -4
- hpcflow_new2-0.2.0a199.dist-info/RECORD +221 -0
- hpcflow/sdk/core/parallel.py +0 -21
- hpcflow_new2-0.2.0a189.dist-info/RECORD +0 -158
- {hpcflow_new2-0.2.0a189.dist-info → hpcflow_new2-0.2.0a199.dist-info}/LICENSE +0 -0
- {hpcflow_new2-0.2.0a189.dist-info → hpcflow_new2-0.2.0a199.dist-info}/WHEEL +0 -0
- {hpcflow_new2-0.2.0a189.dist-info → hpcflow_new2-0.2.0a199.dist-info}/entry_points.txt +0 -0
hpcflow/sdk/persistence/zarr.py
CHANGED
@@ -7,19 +7,24 @@ from __future__ import annotations
|
|
7
7
|
import copy
|
8
8
|
from contextlib import contextmanager
|
9
9
|
from dataclasses import dataclass
|
10
|
-
from datetime import datetime
|
11
10
|
from pathlib import Path
|
11
|
+
from typing import Any, cast, TYPE_CHECKING
|
12
|
+
from typing_extensions import override
|
12
13
|
import shutil
|
13
14
|
import time
|
14
|
-
from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union
|
15
15
|
|
16
16
|
import numpy as np
|
17
|
-
import
|
18
|
-
|
17
|
+
from numpy.ma.core import MaskedArray
|
18
|
+
import zarr # type: ignore
|
19
|
+
from zarr.errors import BoundsCheckError # type: ignore
|
20
|
+
from zarr.storage import DirectoryStore, FSStore # type: ignore
|
21
|
+
from fsspec.implementations.zip import ZipFileSystem # type: ignore
|
19
22
|
from rich.console import Console
|
20
|
-
from numcodecs import MsgPack, VLenArray, blosc, Blosc, Zstd
|
21
|
-
from reretry import retry
|
23
|
+
from numcodecs import MsgPack, VLenArray, blosc, Blosc, Zstd # type: ignore
|
24
|
+
from reretry import retry # type: ignore
|
22
25
|
|
26
|
+
from hpcflow.sdk.typing import hydrate
|
27
|
+
from hpcflow.sdk.core import RUN_DIR_ARR_DTYPE, RUN_DIR_ARR_FILL
|
23
28
|
from hpcflow.sdk.core.errors import (
|
24
29
|
MissingParameterData,
|
25
30
|
MissingStoreEARError,
|
@@ -38,18 +43,60 @@ from hpcflow.sdk.persistence.base import (
|
|
38
43
|
StoreParameter,
|
39
44
|
StoreTask,
|
40
45
|
)
|
46
|
+
from hpcflow.sdk.persistence.types import (
|
47
|
+
LoopDescriptor,
|
48
|
+
StoreCreationInfo,
|
49
|
+
TemplateMeta,
|
50
|
+
ZarrAttrsDict,
|
51
|
+
)
|
41
52
|
from hpcflow.sdk.persistence.store_resource import ZarrAttrsStoreResource
|
42
53
|
from hpcflow.sdk.persistence.utils import ask_pw_on_auth_exc
|
43
54
|
from hpcflow.sdk.persistence.pending import CommitResourceMap
|
44
55
|
from hpcflow.sdk.persistence.base import update_param_source_dict
|
45
56
|
from hpcflow.sdk.log import TimeIt
|
57
|
+
from hpcflow.sdk.submission.submission import (
|
58
|
+
JOBSCRIPT_SUBMIT_TIME_KEYS,
|
59
|
+
SUBMISSION_SUBMIT_TIME_KEYS,
|
60
|
+
)
|
61
|
+
from hpcflow.sdk.utils.arrays import get_2D_idx, split_arr
|
62
|
+
from hpcflow.sdk.utils.strings import shorten_list_str
|
63
|
+
|
64
|
+
if TYPE_CHECKING:
|
65
|
+
from collections.abc import (
|
66
|
+
Callable,
|
67
|
+
Iterable,
|
68
|
+
Iterator,
|
69
|
+
Mapping,
|
70
|
+
MutableMapping,
|
71
|
+
Sequence,
|
72
|
+
)
|
73
|
+
from datetime import datetime
|
74
|
+
from fsspec import AbstractFileSystem # type: ignore
|
75
|
+
from logging import Logger
|
76
|
+
from typing import ClassVar
|
77
|
+
from typing_extensions import Self, TypeAlias
|
78
|
+
from numpy.typing import NDArray
|
79
|
+
from zarr import Array, Group # type: ignore
|
80
|
+
from zarr.attrs import Attributes # type: ignore
|
81
|
+
from zarr.storage import Store # type: ignore
|
82
|
+
from ..submission.types import ResolvedJobscriptBlockDependencies
|
83
|
+
from .types import TypeLookup
|
84
|
+
from ..app import BaseApp
|
85
|
+
from ..core.json_like import JSONed, JSONDocument
|
86
|
+
from ..typing import ParamSource, PathLike, DataIndex
|
87
|
+
|
88
|
+
#: List of any (Zarr-serializable) value.
|
89
|
+
ListAny: TypeAlias = "list[Any]"
|
90
|
+
#: Zarr attribute mapping context.
|
91
|
+
ZarrAttrs: TypeAlias = "dict[str, Any]"
|
92
|
+
_JS: TypeAlias = "dict[str, list[dict[str, dict]]]"
|
46
93
|
|
47
94
|
|
48
95
|
blosc.use_threads = False # hpcflow is a multiprocess program in general
|
49
96
|
|
50
97
|
|
51
98
|
@TimeIt.decorator
|
52
|
-
def _zarr_get_coord_selection(arr, selection, logger):
|
99
|
+
def _zarr_get_coord_selection(arr: Array, selection: Any, logger: Logger):
|
53
100
|
@retry(
|
54
101
|
RuntimeError,
|
55
102
|
tries=10,
|
@@ -59,53 +106,84 @@ def _zarr_get_coord_selection(arr, selection, logger):
|
|
59
106
|
logger=logger,
|
60
107
|
)
|
61
108
|
@TimeIt.decorator
|
62
|
-
def _inner(arr, selection):
|
109
|
+
def _inner(arr: Array, selection: Any):
|
63
110
|
return arr.get_coordinate_selection(selection)
|
64
111
|
|
65
112
|
return _inner(arr, selection)
|
66
113
|
|
67
114
|
|
68
|
-
def _encode_numpy_array(
|
115
|
+
def _encode_numpy_array(
|
116
|
+
obj: NDArray,
|
117
|
+
type_lookup: TypeLookup,
|
118
|
+
path: list[int],
|
119
|
+
root_group: Group,
|
120
|
+
arr_path: list[int],
|
121
|
+
) -> int:
|
69
122
|
# Might need to generate new group:
|
70
123
|
param_arr_group = root_group.require_group(arr_path)
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
else:
|
75
|
-
new_idx = max(names) + 1
|
124
|
+
new_idx = (
|
125
|
+
max((int(i.removeprefix("arr_")) for i in param_arr_group.keys()), default=-1) + 1
|
126
|
+
)
|
76
127
|
param_arr_group.create_dataset(name=f"arr_{new_idx}", data=obj)
|
77
128
|
type_lookup["arrays"].append([path, new_idx])
|
78
129
|
|
79
130
|
return len(type_lookup["arrays"]) - 1
|
80
131
|
|
81
132
|
|
82
|
-
def _decode_numpy_arrays(
|
83
|
-
|
133
|
+
def _decode_numpy_arrays(
|
134
|
+
obj: dict | None,
|
135
|
+
type_lookup: TypeLookup,
|
136
|
+
path: list[int],
|
137
|
+
arr_group: Group,
|
138
|
+
dataset_copy: bool,
|
139
|
+
):
|
140
|
+
# Yuck! Type lies! Zarr's internal types are not modern Python types.
|
141
|
+
arrays = cast("Iterable[tuple[list[int], int]]", type_lookup.get("arrays", []))
|
142
|
+
obj_: dict | NDArray | None = obj
|
143
|
+
for arr_path, arr_idx in arrays:
|
84
144
|
try:
|
85
145
|
rel_path = get_relative_path(arr_path, path)
|
86
146
|
except ValueError:
|
87
147
|
continue
|
88
148
|
|
89
|
-
dataset = arr_group.get(f"arr_{arr_idx}")
|
149
|
+
dataset: NDArray = arr_group.get(f"arr_{arr_idx}")
|
90
150
|
if dataset_copy:
|
91
151
|
dataset = dataset[:]
|
92
152
|
|
93
153
|
if rel_path:
|
94
|
-
set_in_container(
|
154
|
+
set_in_container(obj_, rel_path, dataset)
|
95
155
|
else:
|
96
|
-
|
156
|
+
obj_ = dataset
|
97
157
|
|
98
|
-
return
|
158
|
+
return obj_
|
99
159
|
|
100
160
|
|
101
|
-
def _encode_masked_array(
|
161
|
+
def _encode_masked_array(
|
162
|
+
obj: MaskedArray,
|
163
|
+
type_lookup: TypeLookup,
|
164
|
+
path: list[int],
|
165
|
+
root_group: Group,
|
166
|
+
arr_path: list[int],
|
167
|
+
):
|
102
168
|
data_idx = _encode_numpy_array(obj.data, type_lookup, path, root_group, arr_path)
|
103
169
|
mask_idx = _encode_numpy_array(obj.mask, type_lookup, path, root_group, arr_path)
|
104
170
|
type_lookup["masked_arrays"].append([path, [data_idx, mask_idx]])
|
105
171
|
|
106
172
|
|
107
|
-
def _decode_masked_arrays(
|
108
|
-
|
173
|
+
def _decode_masked_arrays(
|
174
|
+
obj: dict,
|
175
|
+
type_lookup: TypeLookup,
|
176
|
+
path: list[int],
|
177
|
+
arr_group: Group,
|
178
|
+
dataset_copy: bool,
|
179
|
+
):
|
180
|
+
# Yuck! Type lies! Zarr's internal types are not modern Python types.
|
181
|
+
masked_arrays = cast(
|
182
|
+
"Iterable[tuple[list[int], tuple[int, int]]]",
|
183
|
+
type_lookup.get("masked_arrays", []),
|
184
|
+
)
|
185
|
+
obj_: dict | MaskedArray = obj
|
186
|
+
for arr_path, (data_idx, mask_idx) in masked_arrays:
|
109
187
|
try:
|
110
188
|
rel_path = get_relative_path(arr_path, path)
|
111
189
|
except ValueError:
|
@@ -113,17 +191,17 @@ def _decode_masked_arrays(obj, type_lookup, path, arr_group, dataset_copy):
|
|
113
191
|
|
114
192
|
data = arr_group.get(f"arr_{data_idx}")
|
115
193
|
mask = arr_group.get(f"arr_{mask_idx}")
|
116
|
-
dataset =
|
194
|
+
dataset: MaskedArray = MaskedArray(data=data, mask=mask)
|
117
195
|
|
118
196
|
if rel_path:
|
119
|
-
set_in_container(
|
197
|
+
set_in_container(obj_, rel_path, dataset)
|
120
198
|
else:
|
121
|
-
|
199
|
+
obj_ = dataset
|
122
200
|
|
123
|
-
return
|
201
|
+
return obj_
|
124
202
|
|
125
203
|
|
126
|
-
def append_items_to_ragged_array(arr, items):
|
204
|
+
def append_items_to_ragged_array(arr: Array, items: Sequence[int]):
|
127
205
|
"""Append an array to a Zarr ragged array.
|
128
206
|
|
129
207
|
I think `arr.append([item])` should work, but does not for some reason, so we do it
|
@@ -135,36 +213,39 @@ def append_items_to_ragged_array(arr, items):
|
|
135
213
|
|
136
214
|
|
137
215
|
@dataclass
|
138
|
-
class ZarrStoreTask(StoreTask):
|
216
|
+
class ZarrStoreTask(StoreTask[dict]):
|
139
217
|
"""
|
140
218
|
Represents a task in a Zarr persistent store.
|
141
219
|
"""
|
142
220
|
|
143
|
-
|
221
|
+
@override
|
222
|
+
def encode(self) -> tuple[int, dict, dict[str, Any]]:
|
144
223
|
"""Prepare store task data for the persistent store."""
|
145
224
|
wk_task = {"id_": self.id_, "element_IDs": np.array(self.element_IDs)}
|
146
|
-
task = {"id_": self.id_, **self.task_template}
|
225
|
+
task = {"id_": self.id_, **(self.task_template or {})}
|
147
226
|
return self.index, wk_task, task
|
148
227
|
|
228
|
+
@override
|
149
229
|
@classmethod
|
150
|
-
def decode(cls, task_dat:
|
230
|
+
def decode(cls, task_dat: dict) -> Self:
|
151
231
|
"""Initialise a `StoreTask` from persistent task data"""
|
152
232
|
task_dat["element_IDs"] = task_dat["element_IDs"].tolist()
|
153
|
-
return
|
233
|
+
return cls(is_pending=False, **task_dat)
|
154
234
|
|
155
235
|
|
156
236
|
@dataclass
|
157
|
-
class ZarrStoreElement(StoreElement):
|
237
|
+
class ZarrStoreElement(StoreElement[ListAny, ZarrAttrs]):
|
158
238
|
"""
|
159
239
|
Represents an element in a Zarr persistent store.
|
160
240
|
"""
|
161
241
|
|
162
|
-
|
242
|
+
@override
|
243
|
+
def encode(self, attrs: ZarrAttrs) -> ListAny:
|
163
244
|
"""Prepare store elements data for the persistent store.
|
164
245
|
|
165
246
|
This method mutates `attrs`.
|
166
247
|
"""
|
167
|
-
|
248
|
+
return [
|
168
249
|
self.id_,
|
169
250
|
self.index,
|
170
251
|
self.es_idx,
|
@@ -173,10 +254,10 @@ class ZarrStoreElement(StoreElement):
|
|
173
254
|
self.task_ID,
|
174
255
|
self.iteration_IDs,
|
175
256
|
]
|
176
|
-
return elem_enc
|
177
257
|
|
258
|
+
@override
|
178
259
|
@classmethod
|
179
|
-
def decode(cls, elem_dat:
|
260
|
+
def decode(cls, elem_dat: ListAny, attrs: ZarrAttrs) -> Self:
|
180
261
|
"""Initialise a `StoreElement` from persistent element data"""
|
181
262
|
obj_dat = {
|
182
263
|
"id_": elem_dat[0],
|
@@ -191,21 +272,22 @@ class ZarrStoreElement(StoreElement):
|
|
191
272
|
|
192
273
|
|
193
274
|
@dataclass
|
194
|
-
class ZarrStoreElementIter(StoreElementIter):
|
275
|
+
class ZarrStoreElementIter(StoreElementIter[ListAny, ZarrAttrs]):
|
195
276
|
"""
|
196
277
|
Represents an element iteration in a Zarr persistent store.
|
197
278
|
"""
|
198
279
|
|
199
|
-
|
280
|
+
@override
|
281
|
+
def encode(self, attrs: ZarrAttrs) -> ListAny:
|
200
282
|
"""Prepare store element iteration data for the persistent store.
|
201
283
|
|
202
284
|
This method mutates `attrs`.
|
203
285
|
"""
|
204
|
-
|
286
|
+
return [
|
205
287
|
self.id_,
|
206
288
|
self.element_ID,
|
207
289
|
int(self.EARs_initialised),
|
208
|
-
[[
|
290
|
+
[[ek, ev] for ek, ev in self.EAR_IDs.items()] if self.EAR_IDs else None,
|
209
291
|
[
|
210
292
|
[ensure_in(dk, attrs["parameter_paths"]), dv]
|
211
293
|
for dk, dv in self.data_idx.items()
|
@@ -213,11 +295,11 @@ class ZarrStoreElementIter(StoreElementIter):
|
|
213
295
|
[ensure_in(i, attrs["schema_parameters"]) for i in self.schema_parameters],
|
214
296
|
[[ensure_in(dk, attrs["loops"]), dv] for dk, dv in self.loop_idx.items()],
|
215
297
|
]
|
216
|
-
return iter_enc
|
217
298
|
|
299
|
+
@override
|
218
300
|
@classmethod
|
219
|
-
def decode(cls, iter_dat:
|
220
|
-
"""Initialise a `
|
301
|
+
def decode(cls, iter_dat: ListAny, attrs: ZarrAttrs) -> Self:
|
302
|
+
"""Initialise a `ZarrStoreElementIter` from persistent element iteration data"""
|
221
303
|
obj_dat = {
|
222
304
|
"id_": iter_dat[0],
|
223
305
|
"element_ID": iter_dat[1],
|
@@ -231,17 +313,18 @@ class ZarrStoreElementIter(StoreElementIter):
|
|
231
313
|
|
232
314
|
|
233
315
|
@dataclass
|
234
|
-
class ZarrStoreEAR(StoreEAR):
|
316
|
+
class ZarrStoreEAR(StoreEAR[ListAny, ZarrAttrs]):
|
235
317
|
"""
|
236
318
|
Represents an element action run in a Zarr persistent store.
|
237
319
|
"""
|
238
320
|
|
239
|
-
|
321
|
+
@override
|
322
|
+
def encode(self, ts_fmt: str, attrs: ZarrAttrs) -> ListAny:
|
240
323
|
"""Prepare store EAR data for the persistent store.
|
241
324
|
|
242
325
|
This method mutates `attrs`.
|
243
326
|
"""
|
244
|
-
|
327
|
+
return [
|
245
328
|
self.id_,
|
246
329
|
self.elem_iter_ID,
|
247
330
|
self.action_idx,
|
@@ -260,11 +343,13 @@ class ZarrStoreEAR(StoreEAR):
|
|
260
343
|
self.metadata,
|
261
344
|
self.run_hostname,
|
262
345
|
self.commands_idx,
|
346
|
+
self.port_number,
|
347
|
+
self.commands_file_ID,
|
263
348
|
]
|
264
|
-
return EAR_enc
|
265
349
|
|
350
|
+
@override
|
266
351
|
@classmethod
|
267
|
-
def decode(cls, EAR_dat:
|
352
|
+
def decode(cls, EAR_dat: ListAny, ts_fmt: str, attrs: ZarrAttrs) -> Self:
|
268
353
|
"""Initialise a `ZarrStoreEAR` from persistent EAR data"""
|
269
354
|
obj_dat = {
|
270
355
|
"id_": EAR_dat[0],
|
@@ -282,55 +367,44 @@ class ZarrStoreEAR(StoreEAR):
|
|
282
367
|
"metadata": EAR_dat[12],
|
283
368
|
"run_hostname": EAR_dat[13],
|
284
369
|
"commands_idx": EAR_dat[14],
|
370
|
+
"port_number": EAR_dat[15],
|
371
|
+
"commands_file_ID": EAR_dat[16],
|
285
372
|
}
|
286
373
|
return cls(is_pending=False, **obj_dat)
|
287
374
|
|
288
375
|
|
289
376
|
@dataclass
|
377
|
+
@hydrate
|
290
378
|
class ZarrStoreParameter(StoreParameter):
|
291
379
|
"""
|
292
380
|
Represents a parameter in a Zarr persistent store.
|
293
381
|
"""
|
294
382
|
|
295
|
-
_encoders = { # keys are types
|
383
|
+
_encoders: ClassVar[dict[type, Callable]] = { # keys are types
|
296
384
|
np.ndarray: _encode_numpy_array,
|
297
|
-
|
385
|
+
MaskedArray: _encode_masked_array,
|
298
386
|
}
|
299
|
-
_decoders = { # keys are keys in type_lookup
|
387
|
+
_decoders: ClassVar[dict[str, Callable]] = { # keys are keys in type_lookup
|
300
388
|
"arrays": _decode_numpy_arrays,
|
301
389
|
"masked_arrays": _decode_masked_arrays,
|
302
390
|
}
|
303
391
|
|
304
|
-
def encode(self, root_group: zarr.Group, arr_path: str) -> Dict[str, Any]:
|
305
|
-
return super().encode(root_group=root_group, arr_path=arr_path)
|
306
|
-
|
307
|
-
@classmethod
|
308
|
-
def decode(
|
309
|
-
cls,
|
310
|
-
id_: int,
|
311
|
-
data: Union[None, Dict],
|
312
|
-
source: Dict,
|
313
|
-
arr_group: zarr.Group,
|
314
|
-
path: Optional[List[str]] = None,
|
315
|
-
dataset_copy: bool = False,
|
316
|
-
) -> Any:
|
317
|
-
return super().decode(
|
318
|
-
id_=id_,
|
319
|
-
data=data,
|
320
|
-
source=source,
|
321
|
-
path=path,
|
322
|
-
arr_group=arr_group,
|
323
|
-
dataset_copy=dataset_copy,
|
324
|
-
)
|
325
|
-
|
326
392
|
|
327
|
-
class ZarrPersistentStore(
|
393
|
+
class ZarrPersistentStore(
|
394
|
+
PersistentStore[
|
395
|
+
ZarrStoreTask,
|
396
|
+
ZarrStoreElement,
|
397
|
+
ZarrStoreElementIter,
|
398
|
+
ZarrStoreEAR,
|
399
|
+
ZarrStoreParameter,
|
400
|
+
]
|
401
|
+
):
|
328
402
|
"""
|
329
403
|
A persistent store implemented using Zarr.
|
330
404
|
"""
|
331
405
|
|
332
|
-
_name = "zarr"
|
333
|
-
_features = PersistentStoreFeatures(
|
406
|
+
_name: ClassVar[str] = "zarr"
|
407
|
+
_features: ClassVar[PersistentStoreFeatures] = PersistentStoreFeatures(
|
334
408
|
create=True,
|
335
409
|
edit=True,
|
336
410
|
jobscript_parallelism=True,
|
@@ -339,39 +413,82 @@ class ZarrPersistentStore(PersistentStore):
|
|
339
413
|
submission=True,
|
340
414
|
)
|
341
415
|
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
416
|
+
@classmethod
|
417
|
+
def _store_task_cls(cls) -> type[ZarrStoreTask]:
|
418
|
+
return ZarrStoreTask
|
419
|
+
|
420
|
+
@classmethod
|
421
|
+
def _store_elem_cls(cls) -> type[ZarrStoreElement]:
|
422
|
+
return ZarrStoreElement
|
423
|
+
|
424
|
+
@classmethod
|
425
|
+
def _store_iter_cls(cls) -> type[ZarrStoreElementIter]:
|
426
|
+
return ZarrStoreElementIter
|
427
|
+
|
428
|
+
@classmethod
|
429
|
+
def _store_EAR_cls(cls) -> type[ZarrStoreEAR]:
|
430
|
+
return ZarrStoreEAR
|
431
|
+
|
432
|
+
@classmethod
|
433
|
+
def _store_param_cls(cls) -> type[ZarrStoreParameter]:
|
434
|
+
return ZarrStoreParameter
|
435
|
+
|
436
|
+
_param_grp_name: ClassVar[str] = "parameters"
|
437
|
+
_param_base_arr_name: ClassVar[str] = "base"
|
438
|
+
_param_sources_arr_name: ClassVar[str] = "sources"
|
439
|
+
_param_user_arr_grp_name: ClassVar[str] = "arrays"
|
440
|
+
_param_data_arr_grp_name: ClassVar = lambda _, param_idx: f"param_{param_idx}"
|
441
|
+
_subs_md_group_name: ClassVar[str] = "submissions"
|
442
|
+
_task_arr_name: ClassVar[str] = "tasks"
|
443
|
+
_elem_arr_name: ClassVar[str] = "elements"
|
444
|
+
_iter_arr_name: ClassVar[str] = "iters"
|
445
|
+
_EAR_arr_name: ClassVar[str] = "runs"
|
446
|
+
_run_dir_arr_name: ClassVar[str] = "run_dirs"
|
447
|
+
_js_at_submit_md_arr_name: ClassVar[str] = "js_at_submit_md"
|
448
|
+
_js_run_IDs_arr_name: ClassVar[str] = "js_run_IDs"
|
449
|
+
_js_task_elems_arr_name: ClassVar[str] = "js_task_elems"
|
450
|
+
_js_task_acts_arr_name: ClassVar[str] = "js_task_acts"
|
451
|
+
_js_deps_arr_name: ClassVar[str] = "js_deps"
|
452
|
+
_time_res: ClassVar[str] = "us" # microseconds; must not be smaller than micro!
|
453
|
+
|
454
|
+
_res_map: ClassVar[CommitResourceMap] = CommitResourceMap(
|
455
|
+
commit_template_components=("attrs",)
|
456
|
+
)
|
457
|
+
|
458
|
+
def __init__(self, app, workflow, path: str | Path, fs: AbstractFileSystem) -> None:
|
362
459
|
self._zarr_store = None # assigned on first access to `zarr_store`
|
363
460
|
self._resources = {
|
364
461
|
"attrs": ZarrAttrsStoreResource(
|
365
462
|
app, name="attrs", open_call=self._get_root_group
|
366
463
|
),
|
367
464
|
}
|
465
|
+
self._jobscript_at_submit_metadata: dict[
|
466
|
+
int, dict[str, Any]
|
467
|
+
] = {} # this is a cache
|
468
|
+
|
469
|
+
# these are caches; keys are submission index and then tuples of
|
470
|
+
# (jobscript index, jobscript-block index):
|
471
|
+
self._jobscript_run_ID_arrays: dict[int, dict[tuple[int, int], NDArray]] = {}
|
472
|
+
self._jobscript_task_element_maps: dict[
|
473
|
+
int, dict[tuple[int, int], dict[int, list[int]]]
|
474
|
+
] = {}
|
475
|
+
self._jobscript_task_actions_arrays: dict[
|
476
|
+
int, dict[tuple[int, int], NDArray]
|
477
|
+
] = {}
|
478
|
+
self._jobscript_dependencies: dict[
|
479
|
+
int,
|
480
|
+
dict[
|
481
|
+
tuple[int, int], dict[tuple[int, int], ResolvedJobscriptBlockDependencies]
|
482
|
+
],
|
483
|
+
] = {}
|
484
|
+
|
368
485
|
super().__init__(app, workflow, path, fs)
|
369
486
|
|
370
487
|
@contextmanager
|
371
|
-
def cached_load(self) -> Iterator[
|
488
|
+
def cached_load(self) -> Iterator[None]:
|
372
489
|
"""Context manager to cache the root attributes."""
|
373
490
|
with self.using_resource("attrs", "read") as attrs:
|
374
|
-
yield
|
491
|
+
yield
|
375
492
|
|
376
493
|
def remove_replaced_dir(self) -> None:
|
377
494
|
"""
|
@@ -380,8 +497,8 @@ class ZarrPersistentStore(PersistentStore):
|
|
380
497
|
with self.using_resource("attrs", "update") as md:
|
381
498
|
if "replaced_workflow" in md:
|
382
499
|
self.logger.debug("removing temporarily renamed pre-existing workflow.")
|
383
|
-
self.remove_path(md["replaced_workflow"]
|
384
|
-
md["replaced_workflow"]
|
500
|
+
self.remove_path(md["replaced_workflow"])
|
501
|
+
del md["replaced_workflow"]
|
385
502
|
|
386
503
|
def reinstate_replaced_dir(self) -> None:
|
387
504
|
"""
|
@@ -392,32 +509,38 @@ class ZarrPersistentStore(PersistentStore):
|
|
392
509
|
self.logger.debug(
|
393
510
|
"reinstating temporarily renamed pre-existing workflow."
|
394
511
|
)
|
395
|
-
self.rename_path(
|
512
|
+
self.rename_path(
|
513
|
+
md["replaced_workflow"],
|
514
|
+
self.path,
|
515
|
+
)
|
396
516
|
|
397
517
|
@staticmethod
|
398
|
-
def _get_zarr_store(path: str, fs) ->
|
399
|
-
return
|
518
|
+
def _get_zarr_store(path: str | Path, fs: AbstractFileSystem) -> Store:
|
519
|
+
return FSStore(url=str(path), fs=fs)
|
520
|
+
|
521
|
+
_CODEC: ClassVar = MsgPack()
|
400
522
|
|
401
523
|
@classmethod
|
402
524
|
def write_empty_workflow(
|
403
525
|
cls,
|
404
|
-
app,
|
405
|
-
|
406
|
-
|
526
|
+
app: BaseApp,
|
527
|
+
*,
|
528
|
+
template_js: TemplateMeta,
|
529
|
+
template_components_js: dict[str, Any],
|
407
530
|
wk_path: str,
|
408
|
-
fs,
|
531
|
+
fs: AbstractFileSystem,
|
409
532
|
name: str,
|
410
|
-
replaced_wk: str,
|
533
|
+
replaced_wk: str | None,
|
411
534
|
ts_fmt: str,
|
412
535
|
ts_name_fmt: str,
|
413
|
-
creation_info:
|
414
|
-
compressor:
|
415
|
-
compressor_kwargs:
|
536
|
+
creation_info: StoreCreationInfo,
|
537
|
+
compressor: str | None = "blosc",
|
538
|
+
compressor_kwargs: dict[str, Any] | None = None,
|
416
539
|
) -> None:
|
417
540
|
"""
|
418
541
|
Write an empty persistent workflow.
|
419
542
|
"""
|
420
|
-
attrs = {
|
543
|
+
attrs: ZarrAttrsDict = {
|
421
544
|
"name": name,
|
422
545
|
"ts_fmt": ts_fmt,
|
423
546
|
"ts_name_fmt": ts_name_fmt,
|
@@ -436,7 +559,11 @@ class ZarrPersistentStore(PersistentStore):
|
|
436
559
|
root = zarr.group(store=store, overwrite=False)
|
437
560
|
root.attrs.update(attrs)
|
438
561
|
|
439
|
-
|
562
|
+
# use a nested directory store for the metadata group so the runs array
|
563
|
+
# can be stored as a 2D array in nested directories, thereby limiting the maximum
|
564
|
+
# number of files stored in a given directory:
|
565
|
+
md_store = zarr.NestedDirectoryStore(Path(root.store.path).joinpath("metadata"))
|
566
|
+
md = zarr.group(store=md_store)
|
440
567
|
|
441
568
|
compressor_lookup = {
|
442
569
|
"blosc": Blosc,
|
@@ -459,7 +586,7 @@ class ZarrPersistentStore(PersistentStore):
|
|
459
586
|
name=cls._elem_arr_name,
|
460
587
|
shape=0,
|
461
588
|
dtype=object,
|
462
|
-
object_codec=
|
589
|
+
object_codec=cls._CODEC,
|
463
590
|
chunks=1000,
|
464
591
|
compressor=cmp,
|
465
592
|
)
|
@@ -469,7 +596,7 @@ class ZarrPersistentStore(PersistentStore):
|
|
469
596
|
name=cls._iter_arr_name,
|
470
597
|
shape=0,
|
471
598
|
dtype=object,
|
472
|
-
object_codec=
|
599
|
+
object_codec=cls._CODEC,
|
473
600
|
chunks=1000,
|
474
601
|
compressor=cmp,
|
475
602
|
)
|
@@ -483,20 +610,31 @@ class ZarrPersistentStore(PersistentStore):
|
|
483
610
|
|
484
611
|
EARs_arr = md.create_dataset(
|
485
612
|
name=cls._EAR_arr_name,
|
486
|
-
shape=0,
|
613
|
+
shape=(0, 1000),
|
487
614
|
dtype=object,
|
488
|
-
object_codec=
|
615
|
+
object_codec=cls._CODEC,
|
489
616
|
chunks=1, # single-chunk rows for multiprocess writing
|
490
617
|
compressor=cmp,
|
618
|
+
dimension_separator="/",
|
619
|
+
)
|
620
|
+
EARs_arr.attrs.update({"parameter_paths": [], "num_runs": 0})
|
621
|
+
|
622
|
+
# array for storing indices that can be used to reproduce run directory paths:
|
623
|
+
run_dir_arr = md.create_dataset(
|
624
|
+
name=cls._run_dir_arr_name,
|
625
|
+
shape=0,
|
626
|
+
chunks=10_000,
|
627
|
+
dtype=RUN_DIR_ARR_DTYPE,
|
628
|
+
fill_value=RUN_DIR_ARR_FILL,
|
629
|
+
write_empty_chunks=False,
|
491
630
|
)
|
492
|
-
EARs_arr.attrs.update({"parameter_paths": []})
|
493
631
|
|
494
632
|
parameter_data = root.create_group(name=cls._param_grp_name)
|
495
633
|
parameter_data.create_dataset(
|
496
634
|
name=cls._param_base_arr_name,
|
497
635
|
shape=0,
|
498
636
|
dtype=object,
|
499
|
-
object_codec=
|
637
|
+
object_codec=cls._CODEC,
|
500
638
|
chunks=1,
|
501
639
|
compressor=cmp,
|
502
640
|
write_empty_chunks=False,
|
@@ -506,15 +644,18 @@ class ZarrPersistentStore(PersistentStore):
|
|
506
644
|
name=cls._param_sources_arr_name,
|
507
645
|
shape=0,
|
508
646
|
dtype=object,
|
509
|
-
object_codec=
|
647
|
+
object_codec=cls._CODEC,
|
510
648
|
chunks=1000, # TODO: check this is a sensible size with many parameters
|
511
649
|
compressor=cmp,
|
512
650
|
)
|
513
651
|
parameter_data.create_group(name=cls._param_user_arr_grp_name)
|
514
652
|
|
515
|
-
|
653
|
+
# for storing submission metadata that should not be stored in the root group:
|
654
|
+
md.create_group(name=cls._subs_md_group_name)
|
655
|
+
|
656
|
+
def _append_tasks(self, tasks: Iterable[ZarrStoreTask]):
|
516
657
|
elem_IDs_arr = self._get_tasks_arr(mode="r+")
|
517
|
-
elem_IDs = []
|
658
|
+
elem_IDs: list[int] = []
|
518
659
|
with self.using_resource("attrs", "update") as attrs:
|
519
660
|
for i_idx, i in enumerate(tasks):
|
520
661
|
idx, wk_task_i, task_i = i.encode()
|
@@ -529,24 +670,350 @@ class ZarrPersistentStore(PersistentStore):
|
|
529
670
|
# increasing IDs.
|
530
671
|
append_items_to_ragged_array(arr=elem_IDs_arr, items=elem_IDs)
|
531
672
|
|
532
|
-
def _append_loops(self, loops:
|
673
|
+
def _append_loops(self, loops: dict[int, LoopDescriptor]):
|
533
674
|
with self.using_resource("attrs", action="update") as attrs:
|
534
|
-
for
|
675
|
+
for loop in loops.values():
|
535
676
|
attrs["loops"].append(
|
536
677
|
{
|
537
678
|
"num_added_iterations": loop["num_added_iterations"],
|
538
679
|
"iterable_parameters": loop["iterable_parameters"],
|
680
|
+
"output_parameters": loop["output_parameters"],
|
539
681
|
"parents": loop["parents"],
|
540
682
|
}
|
541
683
|
)
|
542
684
|
attrs["template"]["loops"].append(loop["loop_template"])
|
543
685
|
|
544
|
-
|
686
|
+
@staticmethod
|
687
|
+
def _extract_submission_run_IDs_array(
|
688
|
+
sub_js: Mapping[str, JSONed],
|
689
|
+
) -> tuple[np.ndarray, list[list[list[int]]]]:
|
690
|
+
"""For a JSON-like representation of a Submission object, remove and combine all
|
691
|
+
jobscript-block run ID lists into a single array with a fill value.
|
692
|
+
|
693
|
+
Notes
|
694
|
+
-----
|
695
|
+
This mutates `sub_js`, by setting `EAR_ID` jobscript-block keys to `None`.
|
696
|
+
|
697
|
+
Parameters
|
698
|
+
----------
|
699
|
+
sub_js
|
700
|
+
JSON-like representation of a `Submission` object.
|
701
|
+
|
702
|
+
Returns
|
703
|
+
-------
|
704
|
+
combined_run_IDs
|
705
|
+
Integer Numpy array that contains a concatenation of all 2D run ID arrays
|
706
|
+
from each jobscript-block. Technically a "jagged"/"ragged" array that is made
|
707
|
+
square with a large fill value.
|
708
|
+
block_shapes
|
709
|
+
List of length equal to the number of jobscripts in the submission. Each
|
710
|
+
sub-list contains a list of shapes (as a two-item list:
|
711
|
+
`[num_actions, num_elements]`) of the constituent blocks of that jobscript.
|
712
|
+
|
713
|
+
"""
|
714
|
+
arrs = []
|
715
|
+
max_acts, max_elems = 0, 0
|
716
|
+
|
717
|
+
# a list for each jobscript, containing shapes of run ID arrays in each block:
|
718
|
+
block_shapes = []
|
719
|
+
for js in cast("Sequence[Mapping[str, JSONed]]", sub_js["jobscripts"]):
|
720
|
+
block_shapes_js_i = []
|
721
|
+
for blk in cast("Sequence[MutableMapping[str, JSONed]]", js["blocks"]):
|
722
|
+
run_IDs_i = np.array(blk["EAR_ID"])
|
723
|
+
blk["EAR_ID"] = None # TODO: how to type?
|
724
|
+
block_shapes_js_i.append(list(run_IDs_i.shape))
|
725
|
+
if run_IDs_i.shape[0] > max_acts:
|
726
|
+
max_acts = run_IDs_i.shape[0]
|
727
|
+
if run_IDs_i.shape[1] > max_elems:
|
728
|
+
max_elems = run_IDs_i.shape[1]
|
729
|
+
arrs.append(run_IDs_i)
|
730
|
+
block_shapes.append(block_shapes_js_i)
|
731
|
+
|
732
|
+
combined_run_IDs = np.full(
|
733
|
+
(len(arrs), max_acts, max_elems),
|
734
|
+
dtype=np.uint32,
|
735
|
+
fill_value=np.iinfo(np.uint32).max,
|
736
|
+
)
|
737
|
+
for arr_idx, arr in enumerate(arrs):
|
738
|
+
combined_run_IDs[arr_idx][: arr.shape[0], : arr.shape[1]] = arr
|
739
|
+
|
740
|
+
return combined_run_IDs, block_shapes
|
741
|
+
|
742
|
+
@staticmethod
|
743
|
+
def _extract_submission_task_elements_array(
|
744
|
+
sub_js: Mapping[str, JSONed],
|
745
|
+
) -> tuple[np.ndarray, list[list[list[int]]]]:
|
746
|
+
"""For a JSON-like representation of a Submission object, remove and combine all
|
747
|
+
jobscript-block task-element mappings into a single array with a fill value.
|
748
|
+
|
749
|
+
Notes
|
750
|
+
-----
|
751
|
+
This mutates `sub_js`, by setting `task_elements` jobscript-block keys to `None`.
|
752
|
+
|
753
|
+
Parameters
|
754
|
+
----------
|
755
|
+
sub_js
|
756
|
+
JSON-like representation of a `Submission` object.
|
757
|
+
|
758
|
+
Returns
|
759
|
+
-------
|
760
|
+
combined_task_elems
|
761
|
+
Integer Numpy array that contains a concatenation of each task-element
|
762
|
+
mapping, where each mapping is expressed as a 2D array whose first column
|
763
|
+
corresponds to the keys of the mappings, and whose remaining columns
|
764
|
+
correspond to the values of the mappings. Technically a "jagged"/"ragged"
|
765
|
+
array that is made square with a large fill value.
|
766
|
+
block_shapes
|
767
|
+
List of length equal to the number of jobscripts in the submission. Each
|
768
|
+
sub-list contains a list of shapes (as a two-item list:
|
769
|
+
`[num_actions, num_elements]`) of the constituent blocks of that jobscript.
|
770
|
+
|
771
|
+
"""
|
772
|
+
arrs = []
|
773
|
+
max_x, max_y = 0, 0
|
774
|
+
|
775
|
+
# a list for each jobscript, containing the shape of the task-element mapping in each block:
|
776
|
+
block_shapes = []
|
777
|
+
for js in cast("Sequence[Mapping[str, JSONed]]", sub_js["jobscripts"]):
|
778
|
+
block_shapes_js_i = []
|
779
|
+
for blk in cast("Sequence[MutableMapping[str, JSONed]]", js["blocks"]):
|
780
|
+
|
781
|
+
task_elems_lst = []
|
782
|
+
for k, v in cast("Mapping[int, list[int]]", blk["task_elements"]).items():
|
783
|
+
task_elems_lst.append([k] + v)
|
784
|
+
task_elems_i = np.array(task_elems_lst)
|
785
|
+
|
786
|
+
block_shape_j = [task_elems_i.shape[1] - 1, task_elems_i.shape[0]]
|
787
|
+
block_shapes_js_i.append(block_shape_j)
|
788
|
+
|
789
|
+
blk["task_elements"] = None # TODO: how to type?
|
790
|
+
if task_elems_i.shape[1] > max_x:
|
791
|
+
max_x = task_elems_i.shape[1]
|
792
|
+
if task_elems_i.shape[0] > max_y:
|
793
|
+
max_y = task_elems_i.shape[0]
|
794
|
+
arrs.append(task_elems_i)
|
795
|
+
block_shapes.append(block_shapes_js_i)
|
796
|
+
|
797
|
+
combined_task_elems = np.full(
|
798
|
+
(len(arrs), max_y, max_x),
|
799
|
+
dtype=np.uint32,
|
800
|
+
fill_value=np.iinfo(np.uint32).max,
|
801
|
+
)
|
802
|
+
for arr_idx, arr in enumerate(arrs):
|
803
|
+
combined_task_elems[arr_idx][: arr.shape[0], : arr.shape[1]] = arr
|
804
|
+
|
805
|
+
return combined_task_elems, block_shapes
|
806
|
+
|
807
|
+
@staticmethod
|
808
|
+
def _extract_submission_task_actions_array(
|
809
|
+
sub_js: Mapping[str, JSONed],
|
810
|
+
) -> tuple[np.ndarray, list[list[int]]]:
|
811
|
+
"""For a JSON-like representation of a Submission object, remove and concatenate
|
812
|
+
all jobscript-block task-action arrays into a single array.
|
813
|
+
|
814
|
+
Notes
|
815
|
+
-----
|
816
|
+
This mutates `sub_js`, by setting `task_actions` jobscript-block keys to `None`.
|
817
|
+
|
818
|
+
Parameters
|
819
|
+
----------
|
820
|
+
sub_js
|
821
|
+
JSON-like representation of a `Submission` object.
|
822
|
+
|
823
|
+
Returns
|
824
|
+
-------
|
825
|
+
combined_task_acts
|
826
|
+
Integer 2D Numpy array which is a concatenation along the first axis of
|
827
|
+
task-action arrays from all jobscript blocks. The second dimension is of
|
828
|
+
length three.
|
829
|
+
block_num_acts
|
830
|
+
List of length equal to the number of jobscripts in the submission. Each
|
831
|
+
sub-list contains a list of `num_actions` of the constituent blocks of that
|
832
|
+
jobscript.
|
833
|
+
|
834
|
+
"""
|
835
|
+
arrs = []
|
836
|
+
|
837
|
+
# a list for each jobscript, containing the number of task actions in each block:
|
838
|
+
|
839
|
+
blk_num_acts = []
|
840
|
+
for js in cast("Sequence[Mapping[str, JSONed]]", sub_js["jobscripts"]):
|
841
|
+
|
842
|
+
blk_num_acts_js_i = []
|
843
|
+
for blk in cast("Sequence[MutableMapping[str, JSONed]]", js["blocks"]):
|
844
|
+
|
845
|
+
blk_acts = np.array(blk["task_actions"])
|
846
|
+
blk["task_actions"] = None # TODO: how to type?
|
847
|
+
blk_num_acts_js_i.append(blk_acts.shape[0])
|
848
|
+
arrs.append(blk_acts)
|
849
|
+
|
850
|
+
blk_num_acts.append(blk_num_acts_js_i)
|
851
|
+
|
852
|
+
combined_task_acts = np.vstack(arrs)
|
853
|
+
|
854
|
+
return combined_task_acts, blk_num_acts
|
855
|
+
|
856
|
+
@staticmethod
|
857
|
+
def _encode_jobscript_block_dependencies(sub_js: Mapping[str, JSONed]) -> np.ndarray:
|
858
|
+
"""For a JSON-like representation of a Submission object, remove jobscript-block
|
859
|
+
dependencies for all jobscripts and transform to a single 1D integer array, that
|
860
|
+
can be transformed back by `_decode_jobscript_block_dependencies`.
|
861
|
+
|
862
|
+
Notes
|
863
|
+
-----
|
864
|
+
This mutates `sub_js`, by setting `dependencies` jobscript-block keys to `None`.
|
865
|
+
"""
|
866
|
+
|
867
|
+
# TODO: avoid this horrible mess of casts
|
868
|
+
|
869
|
+
all_deps_arr = []
|
870
|
+
assert sub_js["jobscripts"] is not None
|
871
|
+
for js in cast("Sequence[Mapping[str, JSONed]]", sub_js["jobscripts"]):
|
872
|
+
for blk in cast("Sequence[MutableMapping[str, JSONed]]", js["blocks"]):
|
873
|
+
all_deps_i: list[int] = []
|
874
|
+
assert blk["dependencies"] is not None
|
875
|
+
blk_deps = cast(
|
876
|
+
"list[tuple[tuple[int, int], Mapping[str, JSONed]]]",
|
877
|
+
blk["dependencies"],
|
878
|
+
)
|
879
|
+
for (dep_js_idx, dep_blk_idx), dep in blk_deps:
|
880
|
+
deps_arr: list[int] = []
|
881
|
+
for elem_i, elements_j in cast(
|
882
|
+
"Mapping[int, Sequence[int]]", dep["js_element_mapping"]
|
883
|
+
).items():
|
884
|
+
deps_arr.extend([len(elements_j) + 1, elem_i] + list(elements_j))
|
885
|
+
blk_arr = [
|
886
|
+
dep_js_idx,
|
887
|
+
dep_blk_idx,
|
888
|
+
int(cast("bool", dep["is_array"])),
|
889
|
+
] + deps_arr
|
890
|
+
blk_arr = [len(blk_arr)] + blk_arr
|
891
|
+
all_deps_i.extend(blk_arr)
|
892
|
+
all_deps_i = [
|
893
|
+
cast("int", js["index"]),
|
894
|
+
cast("int", blk["index"]),
|
895
|
+
] + all_deps_i
|
896
|
+
blk["dependencies"] = None # TODO: how to type?
|
897
|
+
all_deps_arr.extend([len(all_deps_i)] + all_deps_i)
|
898
|
+
|
899
|
+
return np.array(all_deps_arr)
|
900
|
+
|
901
|
+
@staticmethod
|
902
|
+
def _decode_jobscript_block_dependencies(
|
903
|
+
arr: np.ndarray,
|
904
|
+
) -> dict[tuple[int, int], dict[tuple[int, int], ResolvedJobscriptBlockDependencies]]:
|
905
|
+
"""Re-generate jobscript-block dependencies that have been transformed by
|
906
|
+
`_encode_jobscript_block_dependencies` into a single 1D integer array.
|
907
|
+
|
908
|
+
Parameters
|
909
|
+
----------
|
910
|
+
arr:
|
911
|
+
The 1D integer array to transform back to a verbose jobscript-block dependency
|
912
|
+
mapping.
|
913
|
+
"""
|
914
|
+
# metadata is js/blk_idx for which the dependencies are stored:
|
915
|
+
block_arrs = split_arr(arr, metadata_size=2)
|
916
|
+
block_deps = {}
|
917
|
+
for i in block_arrs:
|
918
|
+
|
919
|
+
js_idx: int
|
920
|
+
blk_idx: int
|
921
|
+
dep_js_idx: int
|
922
|
+
dep_blk_idx: int
|
923
|
+
is_array: int
|
924
|
+
|
925
|
+
js_idx, blk_idx = i[0]
|
926
|
+
# metadata is js/blk_idx that this block depends on, plus whether the
|
927
|
+
# dependency is an array dependency:
|
928
|
+
deps_arrs = split_arr(i[1], metadata_size=3)
|
929
|
+
all_deps_ij: dict[tuple[int, int], ResolvedJobscriptBlockDependencies] = {}
|
930
|
+
for j in deps_arrs:
|
931
|
+
dep_js_idx, dep_blk_idx, is_array = j[0]
|
932
|
+
# no metadata:
|
933
|
+
elem_deps = split_arr(j[1], metadata_size=0)
|
934
|
+
all_deps_ij[(dep_js_idx, dep_blk_idx)] = {
|
935
|
+
"js_element_mapping": {},
|
936
|
+
"is_array": bool(is_array),
|
937
|
+
}
|
938
|
+
for k in elem_deps:
|
939
|
+
all_deps_ij[(dep_js_idx, dep_blk_idx)]["js_element_mapping"].update(
|
940
|
+
{k[1][0]: list(k[1][1:])}
|
941
|
+
)
|
942
|
+
|
943
|
+
block_deps[(js_idx, blk_idx)] = all_deps_ij
|
944
|
+
return block_deps
|
945
|
+
|
946
|
+
def _append_submissions(self, subs: dict[int, Mapping[str, JSONed]]):
|
947
|
+
|
948
|
+
for sub_idx, sub_i in subs.items():
|
949
|
+
|
950
|
+
# add a new metadata group for this submission:
|
951
|
+
sub_grp = self._get_all_submissions_metadata_group(mode="r+").create_group(
|
952
|
+
sub_idx
|
953
|
+
)
|
954
|
+
|
955
|
+
# add a new at-submit metadata array for jobscripts of this submission:
|
956
|
+
num_js = len(cast("list", sub_i["jobscripts"]))
|
957
|
+
sub_grp.create_dataset(
|
958
|
+
name=self._js_at_submit_md_arr_name,
|
959
|
+
shape=num_js,
|
960
|
+
dtype=object,
|
961
|
+
object_codec=MsgPack(),
|
962
|
+
chunks=1,
|
963
|
+
write_empty_chunks=False,
|
964
|
+
)
|
965
|
+
|
966
|
+
# add a new array to store run IDs for each jobscript:
|
967
|
+
combined_run_IDs, block_shapes = self._extract_submission_run_IDs_array(sub_i)
|
968
|
+
run_IDs_arr = sub_grp.create_dataset(
|
969
|
+
name=self._js_run_IDs_arr_name,
|
970
|
+
data=combined_run_IDs,
|
971
|
+
chunks=(None, None, None), # single chunk for the whole array
|
972
|
+
)
|
973
|
+
run_IDs_arr.attrs["block_shapes"] = block_shapes
|
974
|
+
|
975
|
+
# add a new array to store task-element map for each jobscript:
|
976
|
+
(
|
977
|
+
combined_task_elems,
|
978
|
+
block_shapes,
|
979
|
+
) = self._extract_submission_task_elements_array(sub_i)
|
980
|
+
task_elems_arr = sub_grp.create_dataset(
|
981
|
+
name=self._js_task_elems_arr_name,
|
982
|
+
data=combined_task_elems,
|
983
|
+
chunks=(None, None, None),
|
984
|
+
)
|
985
|
+
task_elems_arr.attrs["block_shapes"] = block_shapes
|
986
|
+
|
987
|
+
# add a new array to store task-actions for each jobscript:
|
988
|
+
(
|
989
|
+
combined_task_acts,
|
990
|
+
block_num_acts,
|
991
|
+
) = self._extract_submission_task_actions_array(sub_i)
|
992
|
+
task_acts_arr = sub_grp.create_dataset(
|
993
|
+
name=self._js_task_acts_arr_name,
|
994
|
+
data=combined_task_acts,
|
995
|
+
chunks=(None, None),
|
996
|
+
)
|
997
|
+
task_acts_arr.attrs["block_num_acts"] = block_num_acts
|
998
|
+
|
999
|
+
# add a new array to store jobscript-block dependencies for this submission:
|
1000
|
+
sub_grp.create_dataset(
|
1001
|
+
name=self._js_deps_arr_name,
|
1002
|
+
data=self._encode_jobscript_block_dependencies(sub_i),
|
1003
|
+
chunks=(None,),
|
1004
|
+
)
|
1005
|
+
|
1006
|
+
# TODO: store block shapes in `grp.attrs` since it is defined at the
|
1007
|
+
# submission level
|
1008
|
+
|
1009
|
+
# add attributes for at-submit-time submission metadata:
|
1010
|
+
grp = self._get_submission_metadata_group(sub_idx, mode="r+")
|
1011
|
+
grp.attrs["submission_parts"] = {}
|
1012
|
+
|
545
1013
|
with self.using_resource("attrs", action="update") as attrs:
|
546
|
-
|
547
|
-
attrs["submissions"].append(sub_i)
|
1014
|
+
attrs["submissions"].extend(subs.values())
|
548
1015
|
|
549
|
-
def _append_task_element_IDs(self, task_ID: int, elem_IDs:
|
1016
|
+
def _append_task_element_IDs(self, task_ID: int, elem_IDs: list[int]):
|
550
1017
|
# I don't think there's a way to "append" to an existing array in a zarr ragged
|
551
1018
|
# array? So we have to build a new array from existing + new.
|
552
1019
|
arr = self._get_tasks_arr(mode="r+")
|
@@ -554,169 +1021,262 @@ class ZarrPersistentStore(PersistentStore):
|
|
554
1021
|
elem_IDs_new = np.concatenate((elem_IDs_cur, elem_IDs))
|
555
1022
|
arr[task_ID] = elem_IDs_new
|
556
1023
|
|
557
|
-
|
558
|
-
|
559
|
-
|
1024
|
+
@staticmethod
|
1025
|
+
def __as_dict(attrs: Attributes) -> ZarrAttrs:
|
1026
|
+
"""
|
1027
|
+
Type thunk to work around incomplete typing in zarr.
|
1028
|
+
"""
|
1029
|
+
return cast("ZarrAttrs", attrs.asdict())
|
1030
|
+
|
1031
|
+
@contextmanager
|
1032
|
+
def __mutate_attrs(self, arr: Array) -> Iterator[ZarrAttrs]:
|
1033
|
+
attrs_orig = self.__as_dict(arr.attrs)
|
560
1034
|
attrs = copy.deepcopy(attrs_orig)
|
561
|
-
|
562
|
-
arr_add[:] = [i.encode(attrs) for i in elems]
|
563
|
-
arr.append(arr_add)
|
1035
|
+
yield attrs
|
564
1036
|
if attrs != attrs_orig:
|
565
1037
|
arr.attrs.put(attrs)
|
566
1038
|
|
567
|
-
def
|
1039
|
+
def _append_elements(self, elems: Sequence[ZarrStoreElement]):
|
1040
|
+
arr = self._get_elements_arr(mode="r+")
|
1041
|
+
with self.__mutate_attrs(arr) as attrs:
|
1042
|
+
arr_add = np.empty((len(elems)), dtype=object)
|
1043
|
+
arr_add[:] = [elem.encode(attrs) for elem in elems]
|
1044
|
+
arr.append(arr_add)
|
1045
|
+
|
1046
|
+
def _append_element_sets(self, task_id: int, es_js: Sequence[Mapping]):
|
568
1047
|
task_idx = task_idx = self._get_task_id_to_idx_map()[task_id]
|
569
1048
|
with self.using_resource("attrs", "update") as attrs:
|
570
1049
|
attrs["template"]["tasks"][task_idx]["element_sets"].extend(es_js)
|
571
1050
|
|
572
|
-
def _append_elem_iter_IDs(self, elem_ID: int, iter_IDs:
|
1051
|
+
def _append_elem_iter_IDs(self, elem_ID: int, iter_IDs: Iterable[int]):
|
573
1052
|
arr = self._get_elements_arr(mode="r+")
|
574
|
-
attrs = arr.attrs
|
575
|
-
elem_dat = arr[elem_ID]
|
1053
|
+
attrs = self.__as_dict(arr.attrs)
|
1054
|
+
elem_dat = cast("list", arr[elem_ID])
|
576
1055
|
store_elem = ZarrStoreElement.decode(elem_dat, attrs)
|
577
1056
|
store_elem = store_elem.append_iteration_IDs(iter_IDs)
|
578
|
-
arr[elem_ID] = store_elem.encode(
|
579
|
-
|
580
|
-
) # attrs shouldn't be mutated (TODO: test!)
|
1057
|
+
arr[elem_ID] = store_elem.encode(attrs)
|
1058
|
+
# attrs shouldn't be mutated (TODO: test!)
|
581
1059
|
|
582
|
-
def _append_elem_iters(self, iters:
|
1060
|
+
def _append_elem_iters(self, iters: Sequence[ZarrStoreElementIter]):
|
583
1061
|
arr = self._get_iters_arr(mode="r+")
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
arr.append(arr_add)
|
589
|
-
if attrs != attrs_orig:
|
590
|
-
arr.attrs.put(attrs)
|
1062
|
+
with self.__mutate_attrs(arr) as attrs:
|
1063
|
+
arr_add = np.empty((len(iters)), dtype=object)
|
1064
|
+
arr_add[:] = [i.encode(attrs) for i in iters]
|
1065
|
+
arr.append(arr_add)
|
591
1066
|
|
592
|
-
def _append_elem_iter_EAR_IDs(
|
1067
|
+
def _append_elem_iter_EAR_IDs(
|
1068
|
+
self, iter_ID: int, act_idx: int, EAR_IDs: Sequence[int]
|
1069
|
+
):
|
593
1070
|
arr = self._get_iters_arr(mode="r+")
|
594
|
-
attrs = arr.attrs
|
595
|
-
iter_dat = arr[iter_ID]
|
1071
|
+
attrs = self.__as_dict(arr.attrs)
|
1072
|
+
iter_dat = cast("list", arr[iter_ID])
|
596
1073
|
store_iter = ZarrStoreElementIter.decode(iter_dat, attrs)
|
597
1074
|
store_iter = store_iter.append_EAR_IDs(pend_IDs={act_idx: EAR_IDs})
|
598
|
-
arr[iter_ID] = store_iter.encode(
|
599
|
-
|
600
|
-
) # attrs shouldn't be mutated (TODO: test!)
|
1075
|
+
arr[iter_ID] = store_iter.encode(attrs)
|
1076
|
+
# attrs shouldn't be mutated (TODO: test!)
|
601
1077
|
|
602
1078
|
def _update_elem_iter_EARs_initialised(self, iter_ID: int):
|
603
1079
|
arr = self._get_iters_arr(mode="r+")
|
604
|
-
attrs = arr.attrs
|
605
|
-
iter_dat = arr[iter_ID]
|
1080
|
+
attrs = self.__as_dict(arr.attrs)
|
1081
|
+
iter_dat = cast("list", arr[iter_ID])
|
606
1082
|
store_iter = ZarrStoreElementIter.decode(iter_dat, attrs)
|
607
1083
|
store_iter = store_iter.set_EARs_initialised()
|
608
|
-
arr[iter_ID] = store_iter.encode(
|
609
|
-
|
610
|
-
) # attrs shouldn't be mutated (TODO: test!)
|
1084
|
+
arr[iter_ID] = store_iter.encode(attrs)
|
1085
|
+
# attrs shouldn't be mutated (TODO: test!)
|
611
1086
|
|
612
|
-
def
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
1087
|
+
def _update_at_submit_metadata(
|
1088
|
+
self,
|
1089
|
+
at_submit_metadata: dict[int, dict[str, Any]],
|
1090
|
+
):
|
1091
|
+
for sub_idx, metadata_i in at_submit_metadata.items():
|
1092
|
+
grp = self._get_submission_metadata_group(sub_idx, mode="r+")
|
1093
|
+
attrs = self.__as_dict(grp.attrs)
|
1094
|
+
attrs["submission_parts"].update(metadata_i["submission_parts"])
|
1095
|
+
grp.attrs.put(attrs)
|
1096
|
+
|
1097
|
+
def _update_loop_index(self, loop_indices: dict[int, dict[str, int]]):
|
617
1098
|
|
618
|
-
def _update_loop_index(self, iter_ID: int, loop_idx: Dict):
|
619
1099
|
arr = self._get_iters_arr(mode="r+")
|
620
|
-
attrs = arr.attrs
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
arr[iter_ID] = store_iter.encode(attrs)
|
1100
|
+
attrs = self.__as_dict(arr.attrs)
|
1101
|
+
iter_IDs = list(loop_indices.keys())
|
1102
|
+
iter_dat = arr.get_coordinate_selection(iter_IDs)
|
1103
|
+
store_iters = [ZarrStoreElementIter.decode(i, attrs) for i in iter_dat]
|
625
1104
|
|
626
|
-
|
1105
|
+
for idx, iter_ID_i in enumerate(iter_IDs):
|
1106
|
+
new_iter_i = store_iters[idx].update_loop_idx(loop_indices[iter_ID_i])
|
1107
|
+
# seems to be a Zarr bug that prevents `set_coordinate_selection` with an
|
1108
|
+
# object array, so set one-by-one:
|
1109
|
+
arr[iter_ID_i] = new_iter_i.encode(attrs)
|
1110
|
+
|
1111
|
+
def _update_loop_num_iters(self, index: int, num_iters: list[list[list[int] | int]]):
|
627
1112
|
with self.using_resource("attrs", action="update") as attrs:
|
628
1113
|
attrs["loops"][index]["num_added_iterations"] = num_iters
|
629
1114
|
|
630
|
-
def _update_loop_parents(self, index: int, parents:
|
1115
|
+
def _update_loop_parents(self, index: int, parents: list[str]):
|
631
1116
|
with self.using_resource("attrs", action="update") as attrs:
|
632
1117
|
attrs["loops"][index]["parents"] = parents
|
633
1118
|
|
634
|
-
def
|
635
|
-
arr = self._get_EARs_arr(mode="r+")
|
636
|
-
attrs_orig = arr.attrs.asdict()
|
637
|
-
attrs = copy.deepcopy(attrs_orig)
|
638
|
-
arr_add = np.empty((len(EARs)), dtype=object)
|
639
|
-
arr_add[:] = [i.encode(attrs, self.ts_fmt) for i in EARs]
|
640
|
-
arr.append(arr_add)
|
1119
|
+
def _update_iter_data_indices(self, iter_data_indices: dict[int, DataIndex]):
|
641
1120
|
|
642
|
-
|
643
|
-
|
1121
|
+
arr = self._get_iters_arr(mode="r+")
|
1122
|
+
attrs = self.__as_dict(arr.attrs)
|
1123
|
+
iter_IDs = list(iter_data_indices.keys())
|
1124
|
+
iter_dat = arr.get_coordinate_selection(iter_IDs)
|
1125
|
+
store_iters = [ZarrStoreElementIter.decode(i, attrs) for i in iter_dat]
|
644
1126
|
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
1127
|
+
for idx, iter_ID_i in enumerate(iter_IDs):
|
1128
|
+
new_iter_i = store_iters[idx].update_data_idx(iter_data_indices[iter_ID_i])
|
1129
|
+
# seems to be a Zarr bug that prevents `set_coordinate_selection` with an
|
1130
|
+
# object array, so set one-by-one:
|
1131
|
+
arr[iter_ID_i] = new_iter_i.encode(attrs)
|
1132
|
+
|
1133
|
+
def _update_run_data_indices(self, run_data_indices: dict[int, DataIndex]):
|
1134
|
+
self._update_runs(
|
1135
|
+
updates={k: {"data_idx": v} for k, v in run_data_indices.items()}
|
1136
|
+
)
|
649
1137
|
|
1138
|
+
def _append_EARs(self, EARs: Sequence[ZarrStoreEAR]):
|
650
1139
|
arr = self._get_EARs_arr(mode="r+")
|
651
|
-
|
652
|
-
|
1140
|
+
with self.__mutate_attrs(arr) as attrs:
|
1141
|
+
num_existing = attrs["num_runs"]
|
1142
|
+
num_add = len(EARs)
|
1143
|
+
num_tot = num_existing + num_add
|
1144
|
+
arr_add = np.empty(num_add, dtype=object)
|
1145
|
+
arr_add[:] = [i.encode(self.ts_fmt, attrs) for i in EARs]
|
653
1146
|
|
654
|
-
|
655
|
-
|
656
|
-
new_EAR_i = EARs[EAR_ID_i].update(submission_idx=sub_idx_i)
|
657
|
-
# seems to be a Zarr bug that prevents `set_coordinate_selection` with an
|
658
|
-
# object array, so set one-by-one:
|
659
|
-
arr[EAR_ID_i] = new_EAR_i.encode(attrs, self.ts_fmt)
|
1147
|
+
# get new 1D indices:
|
1148
|
+
new_idx: NDArray = np.arange(num_existing, num_tot)
|
660
1149
|
|
661
|
-
|
662
|
-
arr.
|
1150
|
+
# transform to 2D indices:
|
1151
|
+
r_idx, c_idx = get_2D_idx(new_idx, num_cols=arr.shape[1])
|
1152
|
+
|
1153
|
+
# add rows to accommodate new runs:
|
1154
|
+
max_r_idx = np.max(r_idx)
|
1155
|
+
if max_r_idx + 1 > arr.shape[0]:
|
1156
|
+
arr.resize(max_r_idx + 1, arr.shape[1])
|
1157
|
+
|
1158
|
+
# fill in new data:
|
1159
|
+
for arr_add_idx_i, (r_idx_i, c_idx_i) in enumerate(zip(r_idx, c_idx)):
|
1160
|
+
# seems to be a Zarr bug that prevents `set_coordinate_selection` with an
|
1161
|
+
# object array, so set one-by-one:
|
1162
|
+
arr[r_idx_i, c_idx_i] = arr_add[arr_add_idx_i]
|
1163
|
+
|
1164
|
+
attrs["num_runs"] = num_tot
|
1165
|
+
|
1166
|
+
# add more rows to run dirs array:
|
1167
|
+
dirs_arr = self._get_dirs_arr(mode="r+")
|
1168
|
+
dirs_arr.resize(num_tot)
|
1169
|
+
|
1170
|
+
def _set_run_dirs(self, run_dir_arr: np.ndarray, run_idx: np.ndarray):
|
1171
|
+
dirs_arr = self._get_dirs_arr(mode="r+")
|
1172
|
+
dirs_arr[run_idx] = run_dir_arr
|
1173
|
+
|
1174
|
+
@TimeIt.decorator
|
1175
|
+
def _update_runs(self, updates: dict[int, dict[str, Any]]):
|
1176
|
+
"""Update the provided EAR attribute values in the specified existing runs."""
|
1177
|
+
run_IDs = list(updates.keys())
|
1178
|
+
runs = self._get_persistent_EARs(run_IDs)
|
663
1179
|
|
664
|
-
def _update_EAR_start(self, EAR_id: int, s_time: datetime, s_snap: Dict, s_hn: str):
|
665
1180
|
arr = self._get_EARs_arr(mode="r+")
|
666
|
-
|
667
|
-
|
1181
|
+
with self.__mutate_attrs(arr) as attrs:
|
1182
|
+
# convert to 2D array indices:
|
1183
|
+
r_idx, c_idx = get_2D_idx(
|
1184
|
+
np.array(list(updates.keys())), num_cols=arr.shape[1]
|
1185
|
+
)
|
1186
|
+
for ri, ci, rID_i, upd_i in zip(
|
1187
|
+
r_idx, c_idx, updates.keys(), updates.values()
|
1188
|
+
):
|
1189
|
+
new_run_i = runs[rID_i].update(**upd_i)
|
1190
|
+
# seems to be a Zarr bug that prevents `set_coordinate_selection` with an
|
1191
|
+
# object array, so set one-by-one:
|
1192
|
+
arr[ri, ci] = new_run_i.encode(self.ts_fmt, attrs)
|
668
1193
|
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
1194
|
+
@TimeIt.decorator
|
1195
|
+
def _update_EAR_submission_data(self, sub_data: Mapping[int, tuple[int, int | None]]):
|
1196
|
+
self._update_runs(
|
1197
|
+
updates={
|
1198
|
+
k: {"submission_idx": v[0], "commands_file_ID": v[1]}
|
1199
|
+
for k, v in sub_data.items()
|
1200
|
+
}
|
674
1201
|
)
|
675
|
-
arr[EAR_id] = EAR_i.encode(attrs, self.ts_fmt)
|
676
1202
|
|
677
|
-
|
678
|
-
|
1203
|
+
def _update_EAR_start(
|
1204
|
+
self,
|
1205
|
+
run_starts: dict[int, tuple[datetime, dict[str, Any] | None, str, int | None]],
|
1206
|
+
):
|
1207
|
+
self._update_runs(
|
1208
|
+
updates={
|
1209
|
+
k: {
|
1210
|
+
"start_time": v[0],
|
1211
|
+
"snapshot_start": v[1],
|
1212
|
+
"run_hostname": v[2],
|
1213
|
+
"port_number": v[3],
|
1214
|
+
}
|
1215
|
+
for k, v in run_starts.items()
|
1216
|
+
}
|
1217
|
+
)
|
679
1218
|
|
680
1219
|
def _update_EAR_end(
|
681
|
-
self,
|
1220
|
+
self, run_ends: dict[int, tuple[datetime, dict[str, Any] | None, int, bool]]
|
682
1221
|
):
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
|
690
|
-
|
691
|
-
|
692
|
-
|
1222
|
+
self._update_runs(
|
1223
|
+
updates={
|
1224
|
+
k: {
|
1225
|
+
"end_time": v[0],
|
1226
|
+
"snapshot_end": v[1],
|
1227
|
+
"exit_code": v[2],
|
1228
|
+
"success": v[3],
|
1229
|
+
}
|
1230
|
+
for k, v in run_ends.items()
|
1231
|
+
}
|
693
1232
|
)
|
694
|
-
arr[EAR_id] = EAR_i.encode(attrs, self.ts_fmt)
|
695
1233
|
|
696
|
-
|
697
|
-
|
1234
|
+
def _update_EAR_skip(self, skips: dict[int, int]):
|
1235
|
+
self._update_runs(updates={k: {"skip": v} for k, v in skips.items()})
|
698
1236
|
|
699
|
-
def
|
700
|
-
arr = self._get_EARs_arr(mode="r+")
|
701
|
-
attrs_orig = arr.attrs.asdict()
|
702
|
-
attrs = copy.deepcopy(attrs_orig)
|
1237
|
+
def _update_js_metadata(self, js_meta: dict[int, dict[int, dict[str, Any]]]):
|
703
1238
|
|
704
|
-
|
705
|
-
EAR_i = EAR_i.update(skip=True)
|
706
|
-
arr[EAR_id] = EAR_i.encode(attrs, self.ts_fmt)
|
1239
|
+
arr_keys = JOBSCRIPT_SUBMIT_TIME_KEYS # these items go to the Zarr array
|
707
1240
|
|
708
|
-
|
709
|
-
|
1241
|
+
# split into attributes to save to the root group metadata, and those to save to
|
1242
|
+
# the submit-time jobscript metadata array
|
710
1243
|
|
711
|
-
|
712
|
-
|
713
|
-
|
714
|
-
|
715
|
-
|
716
|
-
|
1244
|
+
grp_dat = {} # keys are tuples of (sub_idx, js_idx), values are metadata dicts
|
1245
|
+
|
1246
|
+
for sub_idx, all_js_md in js_meta.items():
|
1247
|
+
js_arr = None
|
1248
|
+
for js_idx, js_meta_i in all_js_md.items():
|
1249
|
+
|
1250
|
+
grp_dat_i = {k: v for k, v in js_meta_i.items() if k not in arr_keys}
|
1251
|
+
if grp_dat_i:
|
1252
|
+
grp_dat[(sub_idx, js_idx)] = grp_dat_i
|
1253
|
+
arr_dat = [js_meta_i.get(k) for k in arr_keys]
|
1254
|
+
|
1255
|
+
if any(arr_dat):
|
1256
|
+
# we are updating the at-submit metadata, so clear the cache:
|
1257
|
+
self.clear_jobscript_at_submit_metadata_cache()
|
1258
|
+
|
1259
|
+
js_arr = js_arr or self._get_jobscripts_at_submit_metadata_arr(
|
1260
|
+
mode="r+", sub_idx=sub_idx
|
717
1261
|
)
|
1262
|
+
self.logger.info(
|
1263
|
+
f"updating submit-time jobscript metadata array: {arr_dat!r}."
|
1264
|
+
)
|
1265
|
+
js_arr[js_idx] = arr_dat
|
1266
|
+
|
1267
|
+
if grp_dat:
|
1268
|
+
with self.using_resource("attrs", action="update") as attrs:
|
1269
|
+
for (sub_idx, js_idx), js_meta_i in grp_dat.items():
|
1270
|
+
self.logger.info(
|
1271
|
+
f"updating jobscript metadata in the root group for "
|
1272
|
+
f"(sub={sub_idx}, js={js_idx}): {js_meta_i!r}."
|
1273
|
+
)
|
1274
|
+
sub = cast(
|
1275
|
+
"dict[str, list[dict[str, Any]]]", attrs["submissions"][sub_idx]
|
1276
|
+
)
|
1277
|
+
sub["jobscripts"][js_idx].update(js_meta_i)
|
718
1278
|
|
719
|
-
def _append_parameters(self, params:
|
1279
|
+
def _append_parameters(self, params: Sequence[StoreParameter]):
|
720
1280
|
"""Add new persistent parameters."""
|
721
1281
|
base_arr = self._get_parameter_base_array(mode="r+", write_empty_chunks=False)
|
722
1282
|
src_arr = self._get_parameter_sources_array(mode="r+")
|
@@ -725,8 +1285,8 @@ class ZarrPersistentStore(PersistentStore):
|
|
725
1285
|
)
|
726
1286
|
|
727
1287
|
param_encode_root_group = self._get_parameter_user_array_group(mode="r+")
|
728
|
-
param_enc = []
|
729
|
-
src_enc = []
|
1288
|
+
param_enc: list[dict[str, Any] | int] = []
|
1289
|
+
src_enc: list[dict] = []
|
730
1290
|
for param_i in params:
|
731
1291
|
dat_i = param_i.encode(
|
732
1292
|
root_group=param_encode_root_group,
|
@@ -741,16 +1301,15 @@ class ZarrPersistentStore(PersistentStore):
|
|
741
1301
|
f"PersistentStore._append_parameters: finished adding {len(params)} parameters."
|
742
1302
|
)
|
743
1303
|
|
744
|
-
def _set_parameter_values(self, set_parameters:
|
1304
|
+
def _set_parameter_values(self, set_parameters: dict[int, tuple[Any, bool]]):
|
745
1305
|
"""Set multiple unset persistent parameters."""
|
746
1306
|
|
747
|
-
param_ids = list(set_parameters
|
1307
|
+
param_ids = list(set_parameters)
|
748
1308
|
# the `decode` call in `_get_persistent_parameters` should be quick:
|
749
1309
|
params = self._get_persistent_parameters(param_ids)
|
750
|
-
new_data = []
|
1310
|
+
new_data: list[dict[str, Any] | int] = []
|
751
1311
|
param_encode_root_group = self._get_parameter_user_array_group(mode="r+")
|
752
1312
|
for param_id, (value, is_file) in set_parameters.items():
|
753
|
-
|
754
1313
|
param_i = params[param_id]
|
755
1314
|
if is_file:
|
756
1315
|
param_i = param_i.set_file(value)
|
@@ -768,19 +1327,19 @@ class ZarrPersistentStore(PersistentStore):
|
|
768
1327
|
base_arr = self._get_parameter_base_array(mode="r+")
|
769
1328
|
base_arr.set_coordinate_selection(param_ids, new_data)
|
770
1329
|
|
771
|
-
def _update_parameter_sources(self, sources:
|
1330
|
+
def _update_parameter_sources(self, sources: Mapping[int, ParamSource]):
|
772
1331
|
"""Update the sources of multiple persistent parameters."""
|
773
1332
|
|
774
|
-
param_ids = list(sources
|
1333
|
+
param_ids = list(sources)
|
775
1334
|
src_arr = self._get_parameter_sources_array(mode="r+")
|
776
1335
|
existing_sources = src_arr.get_coordinate_selection(param_ids)
|
777
|
-
new_sources = [
|
778
|
-
|
779
|
-
|
780
|
-
|
1336
|
+
new_sources = [
|
1337
|
+
update_param_source_dict(cast("ParamSource", existing_sources[idx]), source_i)
|
1338
|
+
for idx, source_i in enumerate(sources.values())
|
1339
|
+
]
|
781
1340
|
src_arr.set_coordinate_selection(param_ids, new_sources)
|
782
1341
|
|
783
|
-
def _update_template_components(self, tc:
|
1342
|
+
def _update_template_components(self, tc: dict[str, Any]):
|
784
1343
|
with self.using_resource("attrs", "update") as md:
|
785
1344
|
md["template_components"] = tc
|
786
1345
|
|
@@ -819,7 +1378,7 @@ class ZarrPersistentStore(PersistentStore):
|
|
819
1378
|
if self.use_cache and self.num_EARs_cache is not None:
|
820
1379
|
num = self.num_EARs_cache
|
821
1380
|
else:
|
822
|
-
num =
|
1381
|
+
num = self._get_EARs_arr().attrs["num_runs"]
|
823
1382
|
if self.use_cache and self.num_EARs_cache is None:
|
824
1383
|
self.num_EARs_cache = num
|
825
1384
|
return num
|
@@ -832,46 +1391,55 @@ class ZarrPersistentStore(PersistentStore):
|
|
832
1391
|
return attrs["num_added_tasks"]
|
833
1392
|
|
834
1393
|
@property
|
835
|
-
def zarr_store(self) ->
|
1394
|
+
def zarr_store(self) -> Store:
|
836
1395
|
"""
|
837
1396
|
The underlying store object.
|
838
1397
|
"""
|
839
1398
|
if self._zarr_store is None:
|
1399
|
+
assert self.fs is not None
|
840
1400
|
self._zarr_store = self._get_zarr_store(self.path, self.fs)
|
841
1401
|
return self._zarr_store
|
842
1402
|
|
843
|
-
def _get_root_group(self, mode: str = "r", **kwargs) ->
|
1403
|
+
def _get_root_group(self, mode: str = "r", **kwargs) -> Group:
|
1404
|
+
# TODO: investigate if there are inefficiencies in how we retrieve zarr groups
|
1405
|
+
# and arrays, e.g. opening sub groups sequentially would open the root group
|
1406
|
+
# multiple times, and so read the root group attrs file multiple times?
|
1407
|
+
# it might make sense to define a ZarrAttrsStoreResource for each zarr group and
|
1408
|
+
# array (or at least non-parameter groups/arrays?), there could be some built-in
|
1409
|
+
# understanding of the hierarchy (e.g. via a `path` attribute) which would then
|
1410
|
+
# avoid reading parent groups multiple times --- if that is happening currently.
|
844
1411
|
return zarr.open(self.zarr_store, mode=mode, **kwargs)
|
845
1412
|
|
846
|
-
def _get_parameter_group(self, mode: str = "r", **kwargs) ->
|
1413
|
+
def _get_parameter_group(self, mode: str = "r", **kwargs) -> Group:
|
847
1414
|
return self._get_root_group(mode=mode, **kwargs).get(self._param_grp_name)
|
848
1415
|
|
849
|
-
def _get_parameter_base_array(self, mode: str = "r", **kwargs) ->
|
1416
|
+
def _get_parameter_base_array(self, mode: str = "r", **kwargs) -> Array:
|
850
1417
|
path = f"{self._param_grp_name}/{self._param_base_arr_name}"
|
851
1418
|
return zarr.open(self.zarr_store, mode=mode, path=path, **kwargs)
|
852
1419
|
|
853
|
-
def _get_parameter_sources_array(self, mode: str = "r") ->
|
1420
|
+
def _get_parameter_sources_array(self, mode: str = "r") -> Array:
|
854
1421
|
return self._get_parameter_group(mode=mode).get(self._param_sources_arr_name)
|
855
1422
|
|
856
|
-
def _get_parameter_user_array_group(self, mode: str = "r") ->
|
1423
|
+
def _get_parameter_user_array_group(self, mode: str = "r") -> Group:
|
857
1424
|
return self._get_parameter_group(mode=mode).get(self._param_user_arr_grp_name)
|
858
1425
|
|
859
1426
|
def _get_parameter_data_array_group(
|
860
1427
|
self,
|
861
1428
|
parameter_idx: int,
|
862
1429
|
mode: str = "r",
|
863
|
-
) ->
|
1430
|
+
) -> Group:
|
864
1431
|
return self._get_parameter_user_array_group(mode=mode).get(
|
865
1432
|
self._param_data_arr_grp_name(parameter_idx)
|
866
1433
|
)
|
867
1434
|
|
868
|
-
def _get_array_group_and_dataset(
|
1435
|
+
def _get_array_group_and_dataset(
|
1436
|
+
self, mode: str, param_id: int, data_path: list[int]
|
1437
|
+
):
|
869
1438
|
base_dat = self._get_parameter_base_array(mode="r")[param_id]
|
870
|
-
arr_idx = None
|
871
1439
|
for arr_dat_path, arr_idx in base_dat["type_lookup"]["arrays"]:
|
872
1440
|
if arr_dat_path == data_path:
|
873
1441
|
break
|
874
|
-
|
1442
|
+
else:
|
875
1443
|
raise ValueError(
|
876
1444
|
f"Could not find array path {data_path} in the base data for parameter "
|
877
1445
|
f"ID {param_id}."
|
@@ -881,21 +1449,72 @@ class ZarrPersistentStore(PersistentStore):
|
|
881
1449
|
)
|
882
1450
|
return group, f"arr_{arr_idx}"
|
883
1451
|
|
884
|
-
def _get_metadata_group(self, mode: str = "r") ->
|
885
|
-
|
1452
|
+
def _get_metadata_group(self, mode: str = "r") -> Group:
|
1453
|
+
try:
|
1454
|
+
path = Path(self.workflow.url).joinpath("metadata")
|
1455
|
+
md_store = zarr.NestedDirectoryStore(path)
|
1456
|
+
return zarr.open_group(store=md_store, mode=mode)
|
1457
|
+
except (FileNotFoundError, zarr.errors.GroupNotFoundError):
|
1458
|
+
# zip store?
|
1459
|
+
return zarr.open_group(self.zarr_store, path="metadata", mode=mode)
|
1460
|
+
|
1461
|
+
def _get_all_submissions_metadata_group(self, mode: str = "r") -> Group:
|
1462
|
+
return self._get_metadata_group(mode=mode).get(self._subs_md_group_name)
|
1463
|
+
|
1464
|
+
def _get_submission_metadata_group(self, sub_idx: int, mode: str = "r") -> Group:
|
1465
|
+
return self._get_all_submissions_metadata_group(mode=mode).get(sub_idx)
|
1466
|
+
|
1467
|
+
def _get_submission_metadata_group_path(self, sub_idx: int) -> Path:
|
1468
|
+
grp = self._get_submission_metadata_group(sub_idx)
|
1469
|
+
return Path(grp.store.path).joinpath(grp.path)
|
1470
|
+
|
1471
|
+
def _get_jobscripts_at_submit_metadata_arr(
|
1472
|
+
self, sub_idx: int, mode: str = "r"
|
1473
|
+
) -> Array:
|
1474
|
+
return self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode).get(
|
1475
|
+
self._js_at_submit_md_arr_name
|
1476
|
+
)
|
1477
|
+
|
1478
|
+
def _get_jobscripts_at_submit_metadata_arr_path(self, sub_idx: int) -> Path:
|
1479
|
+
arr = self._get_jobscripts_at_submit_metadata_arr(sub_idx)
|
1480
|
+
return Path(arr.store.path).joinpath(arr.path)
|
1481
|
+
|
1482
|
+
@TimeIt.decorator
|
1483
|
+
def _get_jobscripts_run_ID_arr(self, sub_idx: int, mode: str = "r") -> Array:
|
1484
|
+
return self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode).get(
|
1485
|
+
self._js_run_IDs_arr_name
|
1486
|
+
)
|
1487
|
+
|
1488
|
+
def _get_jobscripts_task_elements_arr(self, sub_idx: int, mode: str = "r") -> Array:
|
1489
|
+
return self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode).get(
|
1490
|
+
self._js_task_elems_arr_name
|
1491
|
+
)
|
1492
|
+
|
1493
|
+
def _get_jobscripts_task_actions_arr(self, sub_idx: int, mode: str = "r") -> Array:
|
1494
|
+
return self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode).get(
|
1495
|
+
self._js_task_acts_arr_name
|
1496
|
+
)
|
1497
|
+
|
1498
|
+
def _get_jobscripts_dependencies_arr(self, sub_idx: int, mode: str = "r") -> Array:
|
1499
|
+
return self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode).get(
|
1500
|
+
self._js_deps_arr_name
|
1501
|
+
)
|
886
1502
|
|
887
|
-
def _get_tasks_arr(self, mode: str = "r") ->
|
1503
|
+
def _get_tasks_arr(self, mode: str = "r") -> Array:
|
888
1504
|
return self._get_metadata_group(mode=mode).get(self._task_arr_name)
|
889
1505
|
|
890
|
-
def _get_elements_arr(self, mode: str = "r") ->
|
1506
|
+
def _get_elements_arr(self, mode: str = "r") -> Array:
|
891
1507
|
return self._get_metadata_group(mode=mode).get(self._elem_arr_name)
|
892
1508
|
|
893
|
-
def _get_iters_arr(self, mode: str = "r") ->
|
1509
|
+
def _get_iters_arr(self, mode: str = "r") -> Array:
|
894
1510
|
return self._get_metadata_group(mode=mode).get(self._iter_arr_name)
|
895
1511
|
|
896
|
-
def _get_EARs_arr(self, mode: str = "r") ->
|
1512
|
+
def _get_EARs_arr(self, mode: str = "r") -> Array:
|
897
1513
|
return self._get_metadata_group(mode=mode).get(self._EAR_arr_name)
|
898
1514
|
|
1515
|
+
def _get_dirs_arr(self, mode: str = "r") -> zarr.Array:
|
1516
|
+
return self._get_metadata_group(mode=mode).get(self._run_dir_arr_name)
|
1517
|
+
|
899
1518
|
@classmethod
|
900
1519
|
def make_test_store_from_spec(
|
901
1520
|
cls,
|
@@ -905,10 +1524,10 @@ class ZarrPersistentStore(PersistentStore):
|
|
905
1524
|
overwrite=False,
|
906
1525
|
):
|
907
1526
|
"""Generate a store for testing purposes."""
|
1527
|
+
ts_fmt = "FIXME"
|
908
1528
|
|
909
1529
|
path = Path(dir or "", path)
|
910
|
-
|
911
|
-
root = zarr.group(store=store, overwrite=overwrite)
|
1530
|
+
root = zarr.group(store=DirectoryStore(path), overwrite=overwrite)
|
912
1531
|
md = root.create_group("metadata")
|
913
1532
|
|
914
1533
|
tasks_arr = md.create_dataset(
|
@@ -922,7 +1541,7 @@ class ZarrPersistentStore(PersistentStore):
|
|
922
1541
|
name=cls._elem_arr_name,
|
923
1542
|
shape=0,
|
924
1543
|
dtype=object,
|
925
|
-
object_codec=
|
1544
|
+
object_codec=cls._CODEC,
|
926
1545
|
chunks=1000,
|
927
1546
|
)
|
928
1547
|
elems_arr.attrs.update({"seq_idx": [], "src_idx": []})
|
@@ -931,7 +1550,7 @@ class ZarrPersistentStore(PersistentStore):
|
|
931
1550
|
name=cls._iter_arr_name,
|
932
1551
|
shape=0,
|
933
1552
|
dtype=object,
|
934
|
-
object_codec=
|
1553
|
+
object_codec=cls._CODEC,
|
935
1554
|
chunks=1000,
|
936
1555
|
)
|
937
1556
|
elem_iters_arr.attrs.update(
|
@@ -946,12 +1565,12 @@ class ZarrPersistentStore(PersistentStore):
|
|
946
1565
|
name=cls._EAR_arr_name,
|
947
1566
|
shape=0,
|
948
1567
|
dtype=object,
|
949
|
-
object_codec=
|
1568
|
+
object_codec=cls._CODEC,
|
950
1569
|
chunks=1000,
|
951
1570
|
)
|
952
|
-
EARs_arr.attrs
|
1571
|
+
EARs_arr.attrs["parameter_paths"] = []
|
953
1572
|
|
954
|
-
tasks, elems, elem_iters,
|
1573
|
+
tasks, elems, elem_iters, EARs_ = super().prepare_test_store_from_spec(spec)
|
955
1574
|
|
956
1575
|
path = Path(path).resolve()
|
957
1576
|
tasks = [ZarrStoreTask(**i).encode() for i in tasks]
|
@@ -960,21 +1579,13 @@ class ZarrPersistentStore(PersistentStore):
|
|
960
1579
|
ZarrStoreElementIter(**i).encode(elem_iters_arr.attrs.asdict())
|
961
1580
|
for i in elem_iters
|
962
1581
|
]
|
963
|
-
EARs = [ZarrStoreEAR(**i).encode(EARs_arr.attrs.asdict()) for i in
|
1582
|
+
EARs = [ZarrStoreEAR(**i).encode(ts_fmt, EARs_arr.attrs.asdict()) for i in EARs_]
|
964
1583
|
|
965
1584
|
append_items_to_ragged_array(tasks_arr, tasks)
|
966
1585
|
|
967
|
-
|
968
|
-
|
969
|
-
|
970
|
-
|
971
|
-
iter_arr_add = np.empty((len(elem_iters)), dtype=object)
|
972
|
-
iter_arr_add[:] = elem_iters
|
973
|
-
elem_iters_arr.append(iter_arr_add)
|
974
|
-
|
975
|
-
EAR_arr_add = np.empty((len(EARs)), dtype=object)
|
976
|
-
EAR_arr_add[:] = EARs
|
977
|
-
EARs_arr.append(EAR_arr_add)
|
1586
|
+
elems_arr.append(np.fromiter(elements, dtype=object))
|
1587
|
+
elem_iters_arr.append(np.fromiter(elem_iters, dtype=object))
|
1588
|
+
EARs_arr.append(np.fromiter(EARs, dtype=object))
|
978
1589
|
|
979
1590
|
return cls(path)
|
980
1591
|
|
@@ -982,17 +1593,18 @@ class ZarrPersistentStore(PersistentStore):
|
|
982
1593
|
with self.using_resource("attrs", "read") as attrs:
|
983
1594
|
return attrs["template_components"]
|
984
1595
|
|
985
|
-
def _get_persistent_template(self):
|
1596
|
+
def _get_persistent_template(self) -> dict[str, JSONed]:
|
986
1597
|
with self.using_resource("attrs", "read") as attrs:
|
987
|
-
return attrs["template"]
|
1598
|
+
return cast("dict[str, JSONed]", attrs["template"])
|
988
1599
|
|
989
1600
|
@TimeIt.decorator
|
990
|
-
def _get_persistent_tasks(self, id_lst: Iterable[int]) ->
|
1601
|
+
def _get_persistent_tasks(self, id_lst: Iterable[int]) -> dict[int, ZarrStoreTask]:
|
991
1602
|
tasks, id_lst = self._get_cached_persistent_tasks(id_lst)
|
992
1603
|
if id_lst:
|
993
1604
|
with self.using_resource("attrs", action="read") as attrs:
|
994
|
-
task_dat = {}
|
995
|
-
elem_IDs = []
|
1605
|
+
task_dat: dict[int, dict[str, Any]] = {}
|
1606
|
+
elem_IDs: list[int] = []
|
1607
|
+
i: dict[str, Any]
|
996
1608
|
for idx, i in enumerate(attrs["tasks"]):
|
997
1609
|
i = copy.deepcopy(i)
|
998
1610
|
elem_IDs.append(i.pop("element_IDs_idx"))
|
@@ -1003,65 +1615,62 @@ class ZarrPersistentStore(PersistentStore):
|
|
1003
1615
|
elem_IDs_arr_dat = self._get_tasks_arr().get_coordinate_selection(
|
1004
1616
|
elem_IDs
|
1005
1617
|
)
|
1006
|
-
except
|
1618
|
+
except BoundsCheckError:
|
1007
1619
|
raise MissingStoreTaskError(
|
1008
1620
|
elem_IDs
|
1009
1621
|
) from None # TODO: not an ID list
|
1010
1622
|
|
1011
1623
|
new_tasks = {
|
1012
1624
|
id_: ZarrStoreTask.decode({**i, "element_IDs": elem_IDs_arr_dat[id_]})
|
1013
|
-
for
|
1625
|
+
for id_, i in task_dat.items()
|
1014
1626
|
}
|
1015
|
-
|
1016
|
-
new_tasks
|
1017
|
-
self.task_cache.update(new_tasks)
|
1018
|
-
tasks.update(new_tasks)
|
1627
|
+
self.task_cache.update(new_tasks)
|
1628
|
+
tasks.update(new_tasks)
|
1019
1629
|
return tasks
|
1020
1630
|
|
1021
1631
|
@TimeIt.decorator
|
1022
|
-
def _get_persistent_loops(
|
1632
|
+
def _get_persistent_loops(
|
1633
|
+
self, id_lst: Iterable[int] | None = None
|
1634
|
+
) -> dict[int, LoopDescriptor]:
|
1023
1635
|
with self.using_resource("attrs", "read") as attrs:
|
1024
|
-
|
1025
|
-
idx: i
|
1636
|
+
return {
|
1637
|
+
idx: cast("LoopDescriptor", i)
|
1026
1638
|
for idx, i in enumerate(attrs["loops"])
|
1027
1639
|
if id_lst is None or idx in id_lst
|
1028
1640
|
}
|
1029
|
-
return loop_dat
|
1030
1641
|
|
1031
1642
|
@TimeIt.decorator
|
1032
|
-
def _get_persistent_submissions(
|
1643
|
+
def _get_persistent_submissions(
|
1644
|
+
self, id_lst: Iterable[int] | None = None
|
1645
|
+
) -> dict[int, Mapping[str, JSONed]]:
|
1033
1646
|
self.logger.debug("loading persistent submissions from the zarr store")
|
1647
|
+
ids = set(id_lst or ())
|
1034
1648
|
with self.using_resource("attrs", "read") as attrs:
|
1035
1649
|
subs_dat = copy.deepcopy(
|
1036
1650
|
{
|
1037
1651
|
idx: i
|
1038
1652
|
for idx, i in enumerate(attrs["submissions"])
|
1039
|
-
if id_lst is None or idx in
|
1653
|
+
if id_lst is None or idx in ids
|
1040
1654
|
}
|
1041
1655
|
)
|
1042
|
-
# cast jobscript submit-times and jobscript `task_elements` keys:
|
1043
|
-
for sub_idx, sub in subs_dat.items():
|
1044
|
-
for js_idx, js in enumerate(sub["jobscripts"]):
|
1045
|
-
for key in list(js["task_elements"].keys()):
|
1046
|
-
subs_dat[sub_idx]["jobscripts"][js_idx]["task_elements"][
|
1047
|
-
int(key)
|
1048
|
-
] = subs_dat[sub_idx]["jobscripts"][js_idx]["task_elements"].pop(
|
1049
|
-
key
|
1050
|
-
)
|
1051
1656
|
|
1052
1657
|
return subs_dat
|
1053
1658
|
|
1054
1659
|
@TimeIt.decorator
|
1055
1660
|
def _get_persistent_elements(
|
1056
1661
|
self, id_lst: Iterable[int]
|
1057
|
-
) ->
|
1662
|
+
) -> dict[int, ZarrStoreElement]:
|
1058
1663
|
elems, id_lst = self._get_cached_persistent_elements(id_lst)
|
1059
1664
|
if id_lst:
|
1665
|
+
self.logger.debug(
|
1666
|
+
f"loading {len(id_lst)} persistent element(s) from disk: "
|
1667
|
+
f"{shorten_list_str(id_lst)}."
|
1668
|
+
)
|
1060
1669
|
arr = self._get_elements_arr()
|
1061
1670
|
attrs = arr.attrs.asdict()
|
1062
1671
|
try:
|
1063
1672
|
elem_arr_dat = arr.get_coordinate_selection(id_lst)
|
1064
|
-
except
|
1673
|
+
except BoundsCheckError:
|
1065
1674
|
raise MissingStoreElementError(id_lst) from None
|
1066
1675
|
elem_dat = dict(zip(id_lst, elem_arr_dat))
|
1067
1676
|
new_elems = {
|
@@ -1074,14 +1683,18 @@ class ZarrPersistentStore(PersistentStore):
|
|
1074
1683
|
@TimeIt.decorator
|
1075
1684
|
def _get_persistent_element_iters(
|
1076
1685
|
self, id_lst: Iterable[int]
|
1077
|
-
) ->
|
1686
|
+
) -> dict[int, ZarrStoreElementIter]:
|
1078
1687
|
iters, id_lst = self._get_cached_persistent_element_iters(id_lst)
|
1079
1688
|
if id_lst:
|
1689
|
+
self.logger.debug(
|
1690
|
+
f"loading {len(id_lst)} persistent element iteration(s) from disk: "
|
1691
|
+
f"{shorten_list_str(id_lst)}."
|
1692
|
+
)
|
1080
1693
|
arr = self._get_iters_arr()
|
1081
1694
|
attrs = arr.attrs.asdict()
|
1082
1695
|
try:
|
1083
1696
|
iter_arr_dat = arr.get_coordinate_selection(id_lst)
|
1084
|
-
except
|
1697
|
+
except BoundsCheckError:
|
1085
1698
|
raise MissingStoreElementIterationError(id_lst) from None
|
1086
1699
|
iter_dat = dict(zip(id_lst, iter_arr_dat))
|
1087
1700
|
new_iters = {
|
@@ -1092,19 +1705,29 @@ class ZarrPersistentStore(PersistentStore):
|
|
1092
1705
|
return iters
|
1093
1706
|
|
1094
1707
|
@TimeIt.decorator
|
1095
|
-
def _get_persistent_EARs(self, id_lst: Iterable[int]) ->
|
1708
|
+
def _get_persistent_EARs(self, id_lst: Iterable[int]) -> dict[int, ZarrStoreEAR]:
|
1096
1709
|
runs, id_lst = self._get_cached_persistent_EARs(id_lst)
|
1097
1710
|
if id_lst:
|
1711
|
+
self.logger.debug(
|
1712
|
+
f"loading {len(id_lst)} persistent EAR(s) from disk: "
|
1713
|
+
f"{shorten_list_str(id_lst)}."
|
1714
|
+
)
|
1098
1715
|
arr = self._get_EARs_arr()
|
1099
1716
|
attrs = arr.attrs.asdict()
|
1717
|
+
sel: tuple[NDArray, NDArray] | list[int]
|
1100
1718
|
try:
|
1101
|
-
|
1102
|
-
|
1103
|
-
except
|
1719
|
+
# convert to 2D array indices:
|
1720
|
+
sel = get_2D_idx(np.array(id_lst), num_cols=arr.shape[1])
|
1721
|
+
except IndexError:
|
1722
|
+
# 1D runs array from before update to 2D in Feb 2025 refactor/jobscript:
|
1723
|
+
sel = id_lst
|
1724
|
+
try:
|
1725
|
+
EAR_arr_dat = _zarr_get_coord_selection(arr, sel, self.logger)
|
1726
|
+
except BoundsCheckError:
|
1104
1727
|
raise MissingStoreEARError(id_lst) from None
|
1105
1728
|
EAR_dat = dict(zip(id_lst, EAR_arr_dat))
|
1106
1729
|
new_runs = {
|
1107
|
-
k: ZarrStoreEAR.decode(EAR_dat=v,
|
1730
|
+
k: ZarrStoreEAR.decode(EAR_dat=v, ts_fmt=self.ts_fmt, attrs=attrs)
|
1108
1731
|
for k, v in EAR_dat.items()
|
1109
1732
|
}
|
1110
1733
|
self.EAR_cache.update(new_runs)
|
@@ -1114,20 +1737,25 @@ class ZarrPersistentStore(PersistentStore):
|
|
1114
1737
|
|
1115
1738
|
@TimeIt.decorator
|
1116
1739
|
def _get_persistent_parameters(
|
1117
|
-
self,
|
1118
|
-
|
1119
|
-
dataset_copy: Optional[bool] = False,
|
1120
|
-
) -> Dict[int, ZarrStoreParameter]:
|
1121
|
-
|
1740
|
+
self, id_lst: Iterable[int], *, dataset_copy: bool = False, **kwargs
|
1741
|
+
) -> dict[int, ZarrStoreParameter]:
|
1122
1742
|
params, id_lst = self._get_cached_persistent_parameters(id_lst)
|
1123
1743
|
if id_lst:
|
1744
|
+
|
1745
|
+
self.logger.debug(
|
1746
|
+
f"loading {len(id_lst)} persistent parameter(s) from disk: "
|
1747
|
+
f"{shorten_list_str(id_lst)}."
|
1748
|
+
)
|
1749
|
+
|
1750
|
+
# TODO: implement the "parameter_metadata_cache" for zarr stores, which would
|
1751
|
+
# keep the base_arr and src_arr open
|
1124
1752
|
base_arr = self._get_parameter_base_array(mode="r")
|
1125
1753
|
src_arr = self._get_parameter_sources_array(mode="r")
|
1126
1754
|
|
1127
1755
|
try:
|
1128
1756
|
param_arr_dat = base_arr.get_coordinate_selection(list(id_lst))
|
1129
1757
|
src_arr_dat = src_arr.get_coordinate_selection(list(id_lst))
|
1130
|
-
except
|
1758
|
+
except BoundsCheckError:
|
1131
1759
|
raise MissingParameterData(id_lst) from None
|
1132
1760
|
|
1133
1761
|
param_dat = dict(zip(id_lst, param_arr_dat))
|
@@ -1149,13 +1777,15 @@ class ZarrPersistentStore(PersistentStore):
|
|
1149
1777
|
return params
|
1150
1778
|
|
1151
1779
|
@TimeIt.decorator
|
1152
|
-
def _get_persistent_param_sources(
|
1780
|
+
def _get_persistent_param_sources(
|
1781
|
+
self, id_lst: Iterable[int]
|
1782
|
+
) -> dict[int, ParamSource]:
|
1153
1783
|
sources, id_lst = self._get_cached_persistent_param_sources(id_lst)
|
1154
1784
|
if id_lst:
|
1155
1785
|
src_arr = self._get_parameter_sources_array(mode="r")
|
1156
1786
|
try:
|
1157
1787
|
src_arr_dat = src_arr.get_coordinate_selection(list(id_lst))
|
1158
|
-
except
|
1788
|
+
except BoundsCheckError:
|
1159
1789
|
raise MissingParameterData(id_lst) from None
|
1160
1790
|
new_sources = dict(zip(id_lst, src_arr_dat))
|
1161
1791
|
self.param_sources_cache.update(new_sources)
|
@@ -1164,20 +1794,267 @@ class ZarrPersistentStore(PersistentStore):
|
|
1164
1794
|
|
1165
1795
|
def _get_persistent_parameter_set_status(
|
1166
1796
|
self, id_lst: Iterable[int]
|
1167
|
-
) ->
|
1797
|
+
) -> dict[int, bool]:
|
1168
1798
|
base_arr = self._get_parameter_base_array(mode="r")
|
1169
1799
|
try:
|
1170
1800
|
param_arr_dat = base_arr.get_coordinate_selection(list(id_lst))
|
1171
|
-
except
|
1801
|
+
except BoundsCheckError:
|
1172
1802
|
raise MissingParameterData(id_lst) from None
|
1173
1803
|
|
1174
1804
|
return dict(zip(id_lst, [i is not None for i in param_arr_dat]))
|
1175
1805
|
|
1176
|
-
def _get_persistent_parameter_IDs(self) ->
|
1806
|
+
def _get_persistent_parameter_IDs(self) -> list[int]:
|
1177
1807
|
# we assume the row index is equivalent to ID, might need to revisit in future
|
1178
1808
|
base_arr = self._get_parameter_base_array(mode="r")
|
1179
1809
|
return list(range(len(base_arr)))
|
1180
1810
|
|
1811
|
+
def get_submission_at_submit_metadata(
|
1812
|
+
self, sub_idx: int, metadata_attr: dict | None
|
1813
|
+
) -> dict[str, Any]:
|
1814
|
+
"""Retrieve the values of submission attributes that are stored at submit-time."""
|
1815
|
+
grp = self._get_submission_metadata_group(sub_idx)
|
1816
|
+
attrs = grp.attrs.asdict()
|
1817
|
+
return {k: attrs[k] for k in SUBMISSION_SUBMIT_TIME_KEYS}
|
1818
|
+
|
1819
|
+
def clear_jobscript_at_submit_metadata_cache(self):
|
1820
|
+
"""Clear the cache of at-submit-time jobscript metadata."""
|
1821
|
+
self._jobscript_at_submit_metadata = {}
|
1822
|
+
|
1823
|
+
def get_jobscript_at_submit_metadata(
|
1824
|
+
self,
|
1825
|
+
sub_idx: int,
|
1826
|
+
js_idx: int,
|
1827
|
+
metadata_attr: dict | None,
|
1828
|
+
) -> dict[str, Any]:
|
1829
|
+
"""For the specified jobscript, retrieve the values of jobscript-submit-time
|
1830
|
+
attributes.
|
1831
|
+
|
1832
|
+
Notes
|
1833
|
+
-----
|
1834
|
+
If the cache does not exist, this method will retrieve and cache metadata for
|
1835
|
+
all jobscripts for which metadata has been set. If the cache does exist, but not
|
1836
|
+
for the requested jobscript, then this method will retrieve and cache metadata for
|
1837
|
+
all non-cached jobscripts for which metadata has been set. If metadata has not
|
1838
|
+
yet been set for the specified jobscript, a dict with all `None` values will be
|
1839
|
+
returned.
|
1840
|
+
|
1841
|
+
The cache can be cleared using the method
|
1842
|
+
`clear_jobscript_at_submit_metadata_cache`.
|
1843
|
+
|
1844
|
+
"""
|
1845
|
+
if self._jobscript_at_submit_metadata:
|
1846
|
+
# cache exists, but might not include data for the requested jobscript:
|
1847
|
+
if js_idx in self._jobscript_at_submit_metadata:
|
1848
|
+
return self._jobscript_at_submit_metadata[js_idx]
|
1849
|
+
|
1850
|
+
arr = self._get_jobscripts_at_submit_metadata_arr(sub_idx)
|
1851
|
+
non_cached = set(range(len(arr))) - set(self._jobscript_at_submit_metadata.keys())
|
1852
|
+
|
1853
|
+
# populate cache:
|
1854
|
+
arr_non_cached = arr.get_coordinate_selection((list(non_cached),))
|
1855
|
+
for js_idx_i, arr_item in zip(non_cached, arr_non_cached):
|
1856
|
+
try:
|
1857
|
+
self._jobscript_at_submit_metadata[js_idx_i] = {
|
1858
|
+
i: arr_item[i_idx]
|
1859
|
+
for i_idx, i in enumerate(JOBSCRIPT_SUBMIT_TIME_KEYS)
|
1860
|
+
}
|
1861
|
+
except TypeError:
|
1862
|
+
# data for this jobscript is not set
|
1863
|
+
pass
|
1864
|
+
|
1865
|
+
if js_idx not in self._jobscript_at_submit_metadata:
|
1866
|
+
return {i: None for i in JOBSCRIPT_SUBMIT_TIME_KEYS}
|
1867
|
+
|
1868
|
+
return self._jobscript_at_submit_metadata[js_idx]
|
1869
|
+
|
1870
|
+
@TimeIt.decorator
|
1871
|
+
def get_jobscript_block_run_ID_array(
|
1872
|
+
self,
|
1873
|
+
sub_idx: int,
|
1874
|
+
js_idx: int,
|
1875
|
+
blk_idx: int,
|
1876
|
+
run_ID_arr: NDArray | None,
|
1877
|
+
) -> NDArray:
|
1878
|
+
"""For the specified jobscript-block, retrieve the run ID array."""
|
1879
|
+
|
1880
|
+
if run_ID_arr is not None:
|
1881
|
+
self.logger.debug("jobscript-block run IDs are still in memory.")
|
1882
|
+
# in the special case when the Submission object has just been created, the
|
1883
|
+
# run ID arrays will not yet be persistent.
|
1884
|
+
return np.asarray(run_ID_arr)
|
1885
|
+
|
1886
|
+
# otherwise, `append_submissions` has been called, the run IDs have been
|
1887
|
+
# removed from the JSON-representation of the submission object, and have been
|
1888
|
+
# saved in separate zarr arrays:
|
1889
|
+
if sub_idx not in self._jobscript_run_ID_arrays:
|
1890
|
+
|
1891
|
+
self.logger.debug(
|
1892
|
+
f"retrieving jobscript-block run IDs for submission {sub_idx} from disk,"
|
1893
|
+
f" and caching."
|
1894
|
+
)
|
1895
|
+
|
1896
|
+
# for a given submission, run IDs are stored for all jobscript-blocks in the
|
1897
|
+
# same array (and chunk), so retrieve all of them and cache:
|
1898
|
+
|
1899
|
+
arr = self._get_jobscripts_run_ID_arr(sub_idx)
|
1900
|
+
arr_dat = arr[:]
|
1901
|
+
block_shapes = arr.attrs["block_shapes"]
|
1902
|
+
|
1903
|
+
self._jobscript_run_ID_arrays[sub_idx] = {} # keyed by (js_idx, blk_idx)
|
1904
|
+
arr_idx = 0
|
1905
|
+
for js_idx_i, js_blk_shapes in enumerate(block_shapes):
|
1906
|
+
for blk_idx_j, blk_shape_j in enumerate(js_blk_shapes):
|
1907
|
+
self._jobscript_run_ID_arrays[sub_idx][
|
1908
|
+
(js_idx_i, blk_idx_j)
|
1909
|
+
] = arr_dat[arr_idx, : blk_shape_j[0], : blk_shape_j[1]]
|
1910
|
+
arr_idx += 1
|
1911
|
+
|
1912
|
+
else:
|
1913
|
+
self.logger.debug(
|
1914
|
+
f"retrieving jobscript-block run IDs for submission {sub_idx} from cache."
|
1915
|
+
)
|
1916
|
+
|
1917
|
+
return self._jobscript_run_ID_arrays[sub_idx][(js_idx, blk_idx)]
|
1918
|
+
|
1919
|
+
def get_jobscript_block_task_elements_map(
|
1920
|
+
self,
|
1921
|
+
sub_idx: int,
|
1922
|
+
js_idx: int,
|
1923
|
+
blk_idx: int,
|
1924
|
+
task_elems_map: dict[int, list[int]] | None,
|
1925
|
+
) -> dict[int, list[int]]:
|
1926
|
+
"""For the specified jobscript-block, retrieve the task-elements mapping."""
|
1927
|
+
|
1928
|
+
if task_elems_map is not None:
|
1929
|
+
self.logger.debug("jobscript-block task elements are still in memory.")
|
1930
|
+
# in the special case when the Submission object has just been created, the
|
1931
|
+
# task elements arrays will not yet be persistent.
|
1932
|
+
return task_elems_map
|
1933
|
+
|
1934
|
+
# otherwise, `append_submissions` has been called, the task elements have been
|
1935
|
+
# removed from the JSON-representation of the submission object, and have been
|
1936
|
+
# saved in separate zarr arrays:
|
1937
|
+
if sub_idx not in self._jobscript_task_element_maps:
|
1938
|
+
|
1939
|
+
self.logger.debug(
|
1940
|
+
f"retrieving jobscript-block task elements for submission {sub_idx} from "
|
1941
|
+
f"disk, and caching."
|
1942
|
+
)
|
1943
|
+
|
1944
|
+
# for a given submission, task elements are stored for all jobscript-blocks in
|
1945
|
+
# the same array (and chunk), so retrieve all of them and cache:
|
1946
|
+
|
1947
|
+
arr = self._get_jobscripts_task_elements_arr(sub_idx)
|
1948
|
+
arr_dat = arr[:]
|
1949
|
+
block_shapes = arr.attrs["block_shapes"]
|
1950
|
+
|
1951
|
+
self._jobscript_task_element_maps[sub_idx] = {} # keys: (js_idx, blk_idx)
|
1952
|
+
arr_idx = 0
|
1953
|
+
for js_idx_i, js_blk_shapes in enumerate(block_shapes):
|
1954
|
+
for blk_idx_j, blk_shape_j in enumerate(js_blk_shapes):
|
1955
|
+
arr_i = arr_dat[arr_idx, : blk_shape_j[1], : blk_shape_j[0] + 1]
|
1956
|
+
self._jobscript_task_element_maps[sub_idx][(js_idx_i, blk_idx_j)] = {
|
1957
|
+
k[0]: list(k[1:]) for k in arr_i
|
1958
|
+
}
|
1959
|
+
arr_idx += 1
|
1960
|
+
|
1961
|
+
else:
|
1962
|
+
self.logger.debug(
|
1963
|
+
f"retrieving jobscript-block task elements for submission {sub_idx} from "
|
1964
|
+
"cache."
|
1965
|
+
)
|
1966
|
+
|
1967
|
+
return self._jobscript_task_element_maps[sub_idx][(js_idx, blk_idx)]
|
1968
|
+
|
1969
|
+
@TimeIt.decorator
|
1970
|
+
def get_jobscript_block_task_actions_array(
|
1971
|
+
self,
|
1972
|
+
sub_idx: int,
|
1973
|
+
js_idx: int,
|
1974
|
+
blk_idx: int,
|
1975
|
+
task_actions_arr: NDArray | list[tuple[int, int, int]] | None,
|
1976
|
+
) -> NDArray:
|
1977
|
+
"""For the specified jobscript-block, retrieve the task-actions array."""
|
1978
|
+
|
1979
|
+
if task_actions_arr is not None:
|
1980
|
+
self.logger.debug("jobscript-block task actions are still in memory.")
|
1981
|
+
# in the special case when the Submission object has just been created, the
|
1982
|
+
# task actions arrays will not yet be persistent.
|
1983
|
+
return np.asarray(task_actions_arr)
|
1984
|
+
|
1985
|
+
# otherwise, `append_submissions` has been called, the task actions have been
|
1986
|
+
# removed from the JSON-representation of the submission object, and have been
|
1987
|
+
# saved in separate zarr arrays:
|
1988
|
+
if sub_idx not in self._jobscript_task_actions_arrays:
|
1989
|
+
|
1990
|
+
self.logger.debug(
|
1991
|
+
f"retrieving jobscript-block task actions for submission {sub_idx} from "
|
1992
|
+
f"disk, and caching."
|
1993
|
+
)
|
1994
|
+
|
1995
|
+
# for a given submission, task actions are stored for all jobscript-blocks in
|
1996
|
+
# the same array (and chunk), so retrieve all of them and cache:
|
1997
|
+
|
1998
|
+
arr = self._get_jobscripts_task_actions_arr(sub_idx)
|
1999
|
+
arr_dat = arr[:]
|
2000
|
+
block_num_acts = arr.attrs["block_num_acts"]
|
2001
|
+
|
2002
|
+
num_acts_count = 0
|
2003
|
+
self._jobscript_task_actions_arrays[sub_idx] = {} # keys: (js_idx, blk_idx)
|
2004
|
+
for js_idx_i, js_blk_num_acts in enumerate(block_num_acts):
|
2005
|
+
for blk_idx_j, blk_num_acts_j in enumerate(js_blk_num_acts):
|
2006
|
+
arr_i = arr_dat[num_acts_count : num_acts_count + blk_num_acts_j]
|
2007
|
+
num_acts_count += blk_num_acts_j
|
2008
|
+
self._jobscript_task_actions_arrays[sub_idx][
|
2009
|
+
(js_idx_i, blk_idx_j)
|
2010
|
+
] = arr_i
|
2011
|
+
|
2012
|
+
else:
|
2013
|
+
self.logger.debug(
|
2014
|
+
f"retrieving jobscript-block task actions for submission {sub_idx} from "
|
2015
|
+
"cache."
|
2016
|
+
)
|
2017
|
+
|
2018
|
+
return self._jobscript_task_actions_arrays[sub_idx][(js_idx, blk_idx)]
|
2019
|
+
|
2020
|
+
@TimeIt.decorator
|
2021
|
+
def get_jobscript_block_dependencies(
|
2022
|
+
self,
|
2023
|
+
sub_idx: int,
|
2024
|
+
js_idx: int,
|
2025
|
+
blk_idx: int,
|
2026
|
+
js_dependencies: dict[tuple[int, int], ResolvedJobscriptBlockDependencies] | None,
|
2027
|
+
) -> dict[tuple[int, int], ResolvedJobscriptBlockDependencies]:
|
2028
|
+
"""For the specified jobscript-block, retrieve the dependencies."""
|
2029
|
+
|
2030
|
+
if js_dependencies is not None:
|
2031
|
+
self.logger.debug("jobscript-block dependencies are still in memory.")
|
2032
|
+
# in the special case when the Submission object has just been created, the
|
2033
|
+
# dependencies will not yet be persistent.
|
2034
|
+
return js_dependencies
|
2035
|
+
|
2036
|
+
# otherwise, `append_submissions` has been called, the dependencies have been
|
2037
|
+
# removed from the JSON-representation of the submission object, and have been
|
2038
|
+
# saved in separate zarr arrays:
|
2039
|
+
if sub_idx not in self._jobscript_dependencies:
|
2040
|
+
self.logger.debug(
|
2041
|
+
f"retrieving jobscript-block dependencies for submission {sub_idx} from "
|
2042
|
+
f"disk, and caching."
|
2043
|
+
)
|
2044
|
+
# for a given submission, dependencies are stored for all jobscript-blocks in
|
2045
|
+
# the same array (and chunk), so retrieve all of them and cache:
|
2046
|
+
arr = self._get_jobscripts_dependencies_arr(sub_idx)
|
2047
|
+
self._jobscript_dependencies[
|
2048
|
+
sub_idx
|
2049
|
+
] = self._decode_jobscript_block_dependencies(arr)
|
2050
|
+
else:
|
2051
|
+
self.logger.debug(
|
2052
|
+
f"retrieving jobscript-block dependencies for submission {sub_idx} from "
|
2053
|
+
"cache."
|
2054
|
+
)
|
2055
|
+
|
2056
|
+
return self._jobscript_dependencies[sub_idx][(js_idx, blk_idx)]
|
2057
|
+
|
1181
2058
|
def get_ts_fmt(self):
|
1182
2059
|
"""
|
1183
2060
|
Get the format for timestamps.
|
@@ -1208,11 +2085,11 @@ class ZarrPersistentStore(PersistentStore):
|
|
1208
2085
|
|
1209
2086
|
def zip(
|
1210
2087
|
self,
|
1211
|
-
path=".",
|
1212
|
-
log=None,
|
1213
|
-
overwrite=False,
|
1214
|
-
include_execute=False,
|
1215
|
-
include_rechunk_backups=False,
|
2088
|
+
path: str = ".",
|
2089
|
+
log: str | None = None,
|
2090
|
+
overwrite: bool = False,
|
2091
|
+
include_execute: bool = False,
|
2092
|
+
include_rechunk_backups: bool = False,
|
1216
2093
|
):
|
1217
2094
|
"""
|
1218
2095
|
Convert the persistent store to zipped form.
|
@@ -1224,69 +2101,66 @@ class ZarrPersistentStore(PersistentStore):
|
|
1224
2101
|
directory, the zip file will be created within this directory. Otherwise,
|
1225
2102
|
this path is assumed to be the full file path to the new zip file.
|
1226
2103
|
"""
|
1227
|
-
|
1228
|
-
|
1229
|
-
|
1230
|
-
|
1231
|
-
|
1232
|
-
|
1233
|
-
|
1234
|
-
|
1235
|
-
|
1236
|
-
|
1237
|
-
|
1238
|
-
raise FileExistsError(
|
1239
|
-
f"File at path already exists: {dst_path!r}. Pass `overwrite=True` to "
|
1240
|
-
f"overwrite the existing file."
|
1241
|
-
)
|
2104
|
+
with Console().status(f"Zipping workflow {self.workflow.name!r}..."):
|
2105
|
+
# TODO: this won't work for remote file systems
|
2106
|
+
dst_path = Path(path).resolve()
|
2107
|
+
if dst_path.is_dir():
|
2108
|
+
dst_path = dst_path.joinpath(self.workflow.name).with_suffix(".zip")
|
2109
|
+
|
2110
|
+
if not overwrite and dst_path.exists():
|
2111
|
+
raise FileExistsError(
|
2112
|
+
f"File at path already exists: {dst_path!r}. Pass `overwrite=True` to "
|
2113
|
+
f"overwrite the existing file."
|
2114
|
+
)
|
1242
2115
|
|
1243
|
-
|
2116
|
+
dst_path_s = str(dst_path)
|
1244
2117
|
|
1245
|
-
|
1246
|
-
|
1247
|
-
|
1248
|
-
|
1249
|
-
|
1250
|
-
|
1251
|
-
|
1252
|
-
|
1253
|
-
|
1254
|
-
|
1255
|
-
|
1256
|
-
|
1257
|
-
|
1258
|
-
|
1259
|
-
|
1260
|
-
|
1261
|
-
|
1262
|
-
|
1263
|
-
|
1264
|
-
|
1265
|
-
|
1266
|
-
|
1267
|
-
|
1268
|
-
|
1269
|
-
|
2118
|
+
src_zarr_store = self.zarr_store
|
2119
|
+
zfs, _ = ask_pw_on_auth_exc(
|
2120
|
+
ZipFileSystem,
|
2121
|
+
fo=dst_path_s,
|
2122
|
+
mode="w",
|
2123
|
+
target_options={},
|
2124
|
+
add_pw_to="target_options",
|
2125
|
+
)
|
2126
|
+
dst_zarr_store = FSStore(url="", fs=zfs)
|
2127
|
+
excludes = []
|
2128
|
+
if not include_execute:
|
2129
|
+
excludes.append("execute")
|
2130
|
+
if not include_rechunk_backups:
|
2131
|
+
excludes.append("runs.bak")
|
2132
|
+
excludes.append("base.bak")
|
2133
|
+
|
2134
|
+
zarr.copy_store(
|
2135
|
+
src_zarr_store,
|
2136
|
+
dst_zarr_store,
|
2137
|
+
excludes=excludes or None,
|
2138
|
+
log=log,
|
2139
|
+
)
|
2140
|
+
del zfs # ZipFileSystem remains open for instance lifetime
|
2141
|
+
return dst_path_s
|
2142
|
+
|
2143
|
+
def unzip(self, path: str = ".", log: str | None = None):
|
2144
|
+
raise ValueError("Not a zip store!")
|
1270
2145
|
|
1271
2146
|
def _rechunk_arr(
|
1272
2147
|
self,
|
1273
|
-
arr,
|
1274
|
-
chunk_size:
|
1275
|
-
backup:
|
1276
|
-
status:
|
1277
|
-
):
|
1278
|
-
arr_path = Path(
|
2148
|
+
arr: Array,
|
2149
|
+
chunk_size: int | None = None,
|
2150
|
+
backup: bool = True,
|
2151
|
+
status: bool = True,
|
2152
|
+
) -> Array:
|
2153
|
+
arr_path = Path(arr.store.path) / arr.path
|
1279
2154
|
arr_name = arr.path.split("/")[-1]
|
1280
2155
|
|
1281
2156
|
if status:
|
1282
|
-
|
1283
|
-
|
1284
|
-
status.start()
|
2157
|
+
s = Console().status("Rechunking...")
|
2158
|
+
s.start()
|
1285
2159
|
backup_time = None
|
1286
2160
|
|
1287
2161
|
if backup:
|
1288
2162
|
if status:
|
1289
|
-
|
2163
|
+
s.update("Backing up...")
|
1290
2164
|
backup_path = arr_path.with_suffix(".bak")
|
1291
2165
|
if backup_path.is_dir():
|
1292
2166
|
pass
|
@@ -1298,18 +2172,26 @@ class ZarrPersistentStore(PersistentStore):
|
|
1298
2172
|
|
1299
2173
|
tic = time.perf_counter()
|
1300
2174
|
arr_rc_path = arr_path.with_suffix(".rechunked")
|
1301
|
-
arr = zarr.open(arr_path)
|
1302
2175
|
if status:
|
1303
|
-
|
2176
|
+
s.update("Creating new array...")
|
2177
|
+
|
2178
|
+
# use the same store:
|
2179
|
+
try:
|
2180
|
+
arr_rc_store = arr.store.__class__(path=arr_rc_path)
|
2181
|
+
except TypeError:
|
2182
|
+
# FSStore
|
2183
|
+
arr_rc_store = arr.store.__class__(url=str(arr_rc_path))
|
2184
|
+
|
1304
2185
|
arr_rc = zarr.create(
|
1305
|
-
store=
|
2186
|
+
store=arr_rc_store,
|
1306
2187
|
shape=arr.shape,
|
1307
2188
|
chunks=arr.shape if chunk_size is None else chunk_size,
|
1308
2189
|
dtype=object,
|
1309
|
-
object_codec=
|
2190
|
+
object_codec=self._CODEC,
|
1310
2191
|
)
|
2192
|
+
|
1311
2193
|
if status:
|
1312
|
-
|
2194
|
+
s.update("Copying data...")
|
1313
2195
|
data = np.empty(shape=arr.shape, dtype=object)
|
1314
2196
|
bad_data = []
|
1315
2197
|
for idx in range(len(arr)):
|
@@ -1318,24 +2200,23 @@ class ZarrPersistentStore(PersistentStore):
|
|
1318
2200
|
except RuntimeError:
|
1319
2201
|
# blosc decompression errors
|
1320
2202
|
bad_data.append(idx)
|
1321
|
-
pass
|
1322
2203
|
arr_rc[:] = data
|
1323
2204
|
|
1324
2205
|
arr_rc.attrs.put(arr.attrs.asdict())
|
1325
2206
|
|
1326
2207
|
if status:
|
1327
|
-
|
2208
|
+
s.update("Deleting old array...")
|
1328
2209
|
shutil.rmtree(arr_path)
|
1329
2210
|
|
1330
2211
|
if status:
|
1331
|
-
|
2212
|
+
s.update("Moving new array into place...")
|
1332
2213
|
shutil.move(arr_rc_path, arr_path)
|
1333
2214
|
|
1334
2215
|
toc = time.perf_counter()
|
1335
2216
|
rechunk_time = toc - tic
|
1336
2217
|
|
1337
2218
|
if status:
|
1338
|
-
|
2219
|
+
s.stop()
|
1339
2220
|
|
1340
2221
|
if backup_time:
|
1341
2222
|
print(f"Time to backup {arr_name}: {backup_time:.1f} s")
|
@@ -1349,10 +2230,10 @@ class ZarrPersistentStore(PersistentStore):
|
|
1349
2230
|
|
1350
2231
|
def rechunk_parameter_base(
|
1351
2232
|
self,
|
1352
|
-
chunk_size:
|
1353
|
-
backup:
|
1354
|
-
status:
|
1355
|
-
):
|
2233
|
+
chunk_size: int | None = None,
|
2234
|
+
backup: bool = True,
|
2235
|
+
status: bool = True,
|
2236
|
+
) -> Array:
|
1356
2237
|
"""
|
1357
2238
|
Rechunk the parameter data to be stored more efficiently.
|
1358
2239
|
"""
|
@@ -1361,16 +2242,22 @@ class ZarrPersistentStore(PersistentStore):
|
|
1361
2242
|
|
1362
2243
|
def rechunk_runs(
|
1363
2244
|
self,
|
1364
|
-
chunk_size:
|
1365
|
-
backup:
|
1366
|
-
status:
|
1367
|
-
):
|
2245
|
+
chunk_size: int | None = None,
|
2246
|
+
backup: bool = True,
|
2247
|
+
status: bool = True,
|
2248
|
+
) -> Array:
|
1368
2249
|
"""
|
1369
2250
|
Rechunk the run data to be stored more efficiently.
|
1370
2251
|
"""
|
1371
2252
|
arr = self._get_EARs_arr()
|
1372
2253
|
return self._rechunk_arr(arr, chunk_size, backup, status)
|
1373
2254
|
|
2255
|
+
def get_dirs_array(self) -> NDArray:
|
2256
|
+
"""
|
2257
|
+
Retrieve the run directories array.
|
2258
|
+
"""
|
2259
|
+
return self._get_dirs_arr()[:]
|
2260
|
+
|
1374
2261
|
|
1375
2262
|
class ZarrZipPersistentStore(ZarrPersistentStore):
|
1376
2263
|
"""A store designed mainly as an archive format that can be uploaded to data
|
@@ -1381,8 +2268,8 @@ class ZarrZipPersistentStore(ZarrPersistentStore):
|
|
1381
2268
|
Archive format persistent stores cannot be updated without being unzipped first.
|
1382
2269
|
"""
|
1383
2270
|
|
1384
|
-
_name = "zip"
|
1385
|
-
_features = PersistentStoreFeatures(
|
2271
|
+
_name: ClassVar[str] = "zip"
|
2272
|
+
_features: ClassVar[PersistentStoreFeatures] = PersistentStoreFeatures(
|
1386
2273
|
create=False,
|
1387
2274
|
edit=False,
|
1388
2275
|
jobscript_parallelism=False,
|
@@ -1393,10 +2280,17 @@ class ZarrZipPersistentStore(ZarrPersistentStore):
|
|
1393
2280
|
|
1394
2281
|
# TODO: enforce read-only nature
|
1395
2282
|
|
1396
|
-
def zip(
|
2283
|
+
def zip(
|
2284
|
+
self,
|
2285
|
+
path: str = ".",
|
2286
|
+
log: str | None = None,
|
2287
|
+
overwrite: bool = False,
|
2288
|
+
include_execute: bool = False,
|
2289
|
+
include_rechunk_backups: bool = False,
|
2290
|
+
):
|
1397
2291
|
raise ValueError("Already a zip store!")
|
1398
2292
|
|
1399
|
-
def unzip(self, path=".", log=None):
|
2293
|
+
def unzip(self, path: str = ".", log: str | None = None) -> str:
|
1400
2294
|
"""
|
1401
2295
|
Expand the persistent store.
|
1402
2296
|
|
@@ -1409,28 +2303,23 @@ class ZarrZipPersistentStore(ZarrPersistentStore):
|
|
1409
2303
|
|
1410
2304
|
"""
|
1411
2305
|
|
1412
|
-
|
1413
|
-
|
1414
|
-
|
2306
|
+
with Console().status(f"Unzipping workflow {self.workflow.name!r}..."):
|
2307
|
+
# TODO: this won't work for remote file systems
|
2308
|
+
dst_path = Path(path).resolve()
|
2309
|
+
if dst_path.is_dir():
|
2310
|
+
dst_path = dst_path.joinpath(self.workflow.name)
|
1415
2311
|
|
1416
|
-
|
1417
|
-
|
1418
|
-
if dst_path.is_dir():
|
1419
|
-
dst_path = dst_path.joinpath(self.workflow.name)
|
2312
|
+
if dst_path.exists():
|
2313
|
+
raise FileExistsError(f"Directory at path already exists: {dst_path!r}.")
|
1420
2314
|
|
1421
|
-
|
1422
|
-
status.stop()
|
1423
|
-
raise FileExistsError(f"Directory at path already exists: {dst_path!r}.")
|
2315
|
+
dst_path_s = str(dst_path)
|
1424
2316
|
|
1425
|
-
|
2317
|
+
src_zarr_store = self.zarr_store
|
2318
|
+
dst_zarr_store = FSStore(url=dst_path_s)
|
2319
|
+
zarr.copy_store(src_zarr_store, dst_zarr_store, log=log)
|
2320
|
+
return dst_path_s
|
1426
2321
|
|
1427
|
-
|
1428
|
-
dst_zarr_store = zarr.storage.FSStore(url=dst_path)
|
1429
|
-
zarr.convenience.copy_store(src_zarr_store, dst_zarr_store, log=log)
|
1430
|
-
status.stop()
|
1431
|
-
return dst_path
|
1432
|
-
|
1433
|
-
def copy(self, path=None) -> str:
|
2322
|
+
def copy(self, path: PathLike = None) -> Path:
|
1434
2323
|
# not sure how to do this.
|
1435
2324
|
raise NotImplementedError()
|
1436
2325
|
|
@@ -1441,8 +2330,23 @@ class ZarrZipPersistentStore(ZarrPersistentStore):
|
|
1441
2330
|
def _rechunk_arr(
|
1442
2331
|
self,
|
1443
2332
|
arr,
|
1444
|
-
chunk_size:
|
1445
|
-
backup:
|
1446
|
-
status:
|
1447
|
-
):
|
2333
|
+
chunk_size: int | None = None,
|
2334
|
+
backup: bool = True,
|
2335
|
+
status: bool = True,
|
2336
|
+
) -> Array:
|
1448
2337
|
raise NotImplementedError
|
2338
|
+
|
2339
|
+
def get_text_file(self, path: str | Path) -> str:
|
2340
|
+
"""Retrieve the contents of a text file stored within the workflow."""
|
2341
|
+
path = Path(path)
|
2342
|
+
if path.is_absolute():
|
2343
|
+
path = path.relative_to(self.workflow.url)
|
2344
|
+
path = str(path.as_posix())
|
2345
|
+
assert self.fs
|
2346
|
+
try:
|
2347
|
+
with self.fs.open(path, mode="rt") as fp:
|
2348
|
+
return fp.read()
|
2349
|
+
except KeyError:
|
2350
|
+
raise FileNotFoundError(
|
2351
|
+
f"File within zip at location {path!r} does not exist."
|
2352
|
+
) from None
|