hpcflow 0.1.9__py3-none-any.whl → 0.2.0a271__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hpcflow/__init__.py +2 -11
- hpcflow/__pyinstaller/__init__.py +5 -0
- hpcflow/__pyinstaller/hook-hpcflow.py +40 -0
- hpcflow/_version.py +1 -1
- hpcflow/app.py +43 -0
- hpcflow/cli.py +2 -462
- hpcflow/data/demo_data_manifest/__init__.py +3 -0
- hpcflow/data/demo_data_manifest/demo_data_manifest.json +6 -0
- hpcflow/data/jinja_templates/test/test_template.txt +8 -0
- hpcflow/data/programs/hello_world/README.md +1 -0
- hpcflow/data/programs/hello_world/hello_world.c +87 -0
- hpcflow/data/programs/hello_world/linux/hello_world +0 -0
- hpcflow/data/programs/hello_world/macos/hello_world +0 -0
- hpcflow/data/programs/hello_world/win/hello_world.exe +0 -0
- hpcflow/data/scripts/__init__.py +1 -0
- hpcflow/data/scripts/bad_script.py +2 -0
- hpcflow/data/scripts/demo_task_1_generate_t1_infile_1.py +8 -0
- hpcflow/data/scripts/demo_task_1_generate_t1_infile_2.py +8 -0
- hpcflow/data/scripts/demo_task_1_parse_p3.py +7 -0
- hpcflow/data/scripts/do_nothing.py +2 -0
- hpcflow/data/scripts/env_specifier_test/input_file_generator_pass_env_spec.py +4 -0
- hpcflow/data/scripts/env_specifier_test/main_script_test_pass_env_spec.py +8 -0
- hpcflow/data/scripts/env_specifier_test/output_file_parser_pass_env_spec.py +4 -0
- hpcflow/data/scripts/env_specifier_test/v1/input_file_generator_basic.py +4 -0
- hpcflow/data/scripts/env_specifier_test/v1/main_script_test_direct_in_direct_out.py +7 -0
- hpcflow/data/scripts/env_specifier_test/v1/output_file_parser_basic.py +4 -0
- hpcflow/data/scripts/env_specifier_test/v2/main_script_test_direct_in_direct_out.py +7 -0
- hpcflow/data/scripts/generate_t1_file_01.py +7 -0
- hpcflow/data/scripts/import_future_script.py +7 -0
- hpcflow/data/scripts/input_file_generator_basic.py +3 -0
- hpcflow/data/scripts/input_file_generator_basic_FAIL.py +3 -0
- hpcflow/data/scripts/input_file_generator_test_stdout_stderr.py +8 -0
- hpcflow/data/scripts/main_script_test_direct_in.py +3 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_2.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_2_fail_allowed.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_2_fail_allowed_group.py +7 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_3.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_all_iters_test.py +15 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_env_spec.py +7 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_labels.py +8 -0
- hpcflow/data/scripts/main_script_test_direct_in_group_direct_out_3.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_group_one_fail_direct_out_3.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_sub_param_in_direct_out.py +6 -0
- hpcflow/data/scripts/main_script_test_hdf5_in_obj.py +12 -0
- hpcflow/data/scripts/main_script_test_hdf5_in_obj_2.py +12 -0
- hpcflow/data/scripts/main_script_test_hdf5_in_obj_group.py +12 -0
- hpcflow/data/scripts/main_script_test_hdf5_out_obj.py +11 -0
- hpcflow/data/scripts/main_script_test_json_and_direct_in_json_out.py +14 -0
- hpcflow/data/scripts/main_script_test_json_in_json_and_direct_out.py +17 -0
- hpcflow/data/scripts/main_script_test_json_in_json_out.py +14 -0
- hpcflow/data/scripts/main_script_test_json_in_json_out_labels.py +16 -0
- hpcflow/data/scripts/main_script_test_json_in_obj.py +12 -0
- hpcflow/data/scripts/main_script_test_json_out_FAIL.py +3 -0
- hpcflow/data/scripts/main_script_test_json_out_obj.py +10 -0
- hpcflow/data/scripts/main_script_test_json_sub_param_in_json_out_labels.py +16 -0
- hpcflow/data/scripts/main_script_test_shell_env_vars.py +12 -0
- hpcflow/data/scripts/main_script_test_std_out_std_err.py +6 -0
- hpcflow/data/scripts/output_file_parser_basic.py +3 -0
- hpcflow/data/scripts/output_file_parser_basic_FAIL.py +7 -0
- hpcflow/data/scripts/output_file_parser_test_stdout_stderr.py +8 -0
- hpcflow/data/scripts/parse_t1_file_01.py +4 -0
- hpcflow/data/scripts/script_exit_test.py +5 -0
- hpcflow/data/template_components/__init__.py +1 -0
- hpcflow/data/template_components/command_files.yaml +26 -0
- hpcflow/data/template_components/environments.yaml +13 -0
- hpcflow/data/template_components/parameters.yaml +14 -0
- hpcflow/data/template_components/task_schemas.yaml +139 -0
- hpcflow/data/workflows/workflow_1.yaml +5 -0
- hpcflow/examples.ipynb +1037 -0
- hpcflow/sdk/__init__.py +149 -0
- hpcflow/sdk/app.py +4266 -0
- hpcflow/sdk/cli.py +1479 -0
- hpcflow/sdk/cli_common.py +385 -0
- hpcflow/sdk/config/__init__.py +5 -0
- hpcflow/sdk/config/callbacks.py +246 -0
- hpcflow/sdk/config/cli.py +388 -0
- hpcflow/sdk/config/config.py +1410 -0
- hpcflow/sdk/config/config_file.py +501 -0
- hpcflow/sdk/config/errors.py +272 -0
- hpcflow/sdk/config/types.py +150 -0
- hpcflow/sdk/core/__init__.py +38 -0
- hpcflow/sdk/core/actions.py +3857 -0
- hpcflow/sdk/core/app_aware.py +25 -0
- hpcflow/sdk/core/cache.py +224 -0
- hpcflow/sdk/core/command_files.py +814 -0
- hpcflow/sdk/core/commands.py +424 -0
- hpcflow/sdk/core/element.py +2071 -0
- hpcflow/sdk/core/enums.py +221 -0
- hpcflow/sdk/core/environment.py +256 -0
- hpcflow/sdk/core/errors.py +1043 -0
- hpcflow/sdk/core/execute.py +207 -0
- hpcflow/sdk/core/json_like.py +809 -0
- hpcflow/sdk/core/loop.py +1320 -0
- hpcflow/sdk/core/loop_cache.py +282 -0
- hpcflow/sdk/core/object_list.py +933 -0
- hpcflow/sdk/core/parameters.py +3371 -0
- hpcflow/sdk/core/rule.py +196 -0
- hpcflow/sdk/core/run_dir_files.py +57 -0
- hpcflow/sdk/core/skip_reason.py +7 -0
- hpcflow/sdk/core/task.py +3792 -0
- hpcflow/sdk/core/task_schema.py +993 -0
- hpcflow/sdk/core/test_utils.py +538 -0
- hpcflow/sdk/core/types.py +447 -0
- hpcflow/sdk/core/utils.py +1207 -0
- hpcflow/sdk/core/validation.py +87 -0
- hpcflow/sdk/core/values.py +477 -0
- hpcflow/sdk/core/workflow.py +4820 -0
- hpcflow/sdk/core/zarr_io.py +206 -0
- hpcflow/sdk/data/__init__.py +13 -0
- hpcflow/sdk/data/config_file_schema.yaml +34 -0
- hpcflow/sdk/data/config_schema.yaml +260 -0
- hpcflow/sdk/data/environments_spec_schema.yaml +21 -0
- hpcflow/sdk/data/files_spec_schema.yaml +5 -0
- hpcflow/sdk/data/parameters_spec_schema.yaml +7 -0
- hpcflow/sdk/data/task_schema_spec_schema.yaml +3 -0
- hpcflow/sdk/data/workflow_spec_schema.yaml +22 -0
- hpcflow/sdk/demo/__init__.py +3 -0
- hpcflow/sdk/demo/cli.py +242 -0
- hpcflow/sdk/helper/__init__.py +3 -0
- hpcflow/sdk/helper/cli.py +137 -0
- hpcflow/sdk/helper/helper.py +300 -0
- hpcflow/sdk/helper/watcher.py +192 -0
- hpcflow/sdk/log.py +288 -0
- hpcflow/sdk/persistence/__init__.py +18 -0
- hpcflow/sdk/persistence/base.py +2817 -0
- hpcflow/sdk/persistence/defaults.py +6 -0
- hpcflow/sdk/persistence/discovery.py +39 -0
- hpcflow/sdk/persistence/json.py +954 -0
- hpcflow/sdk/persistence/pending.py +948 -0
- hpcflow/sdk/persistence/store_resource.py +203 -0
- hpcflow/sdk/persistence/types.py +309 -0
- hpcflow/sdk/persistence/utils.py +73 -0
- hpcflow/sdk/persistence/zarr.py +2388 -0
- hpcflow/sdk/runtime.py +320 -0
- hpcflow/sdk/submission/__init__.py +3 -0
- hpcflow/sdk/submission/enums.py +70 -0
- hpcflow/sdk/submission/jobscript.py +2379 -0
- hpcflow/sdk/submission/schedulers/__init__.py +281 -0
- hpcflow/sdk/submission/schedulers/direct.py +233 -0
- hpcflow/sdk/submission/schedulers/sge.py +376 -0
- hpcflow/sdk/submission/schedulers/slurm.py +598 -0
- hpcflow/sdk/submission/schedulers/utils.py +25 -0
- hpcflow/sdk/submission/shells/__init__.py +52 -0
- hpcflow/sdk/submission/shells/base.py +229 -0
- hpcflow/sdk/submission/shells/bash.py +504 -0
- hpcflow/sdk/submission/shells/os_version.py +115 -0
- hpcflow/sdk/submission/shells/powershell.py +352 -0
- hpcflow/sdk/submission/submission.py +1402 -0
- hpcflow/sdk/submission/types.py +140 -0
- hpcflow/sdk/typing.py +194 -0
- hpcflow/sdk/utils/arrays.py +69 -0
- hpcflow/sdk/utils/deferred_file.py +55 -0
- hpcflow/sdk/utils/hashing.py +16 -0
- hpcflow/sdk/utils/patches.py +31 -0
- hpcflow/sdk/utils/strings.py +69 -0
- hpcflow/tests/api/test_api.py +32 -0
- hpcflow/tests/conftest.py +123 -0
- hpcflow/tests/data/__init__.py +0 -0
- hpcflow/tests/data/benchmark_N_elements.yaml +6 -0
- hpcflow/tests/data/benchmark_script_runner.yaml +26 -0
- hpcflow/tests/data/multi_path_sequences.yaml +29 -0
- hpcflow/tests/data/workflow_1.json +10 -0
- hpcflow/tests/data/workflow_1.yaml +5 -0
- hpcflow/tests/data/workflow_1_slurm.yaml +8 -0
- hpcflow/tests/data/workflow_1_wsl.yaml +8 -0
- hpcflow/tests/data/workflow_test_run_abort.yaml +42 -0
- hpcflow/tests/jinja_templates/test_jinja_templates.py +161 -0
- hpcflow/tests/programs/test_programs.py +180 -0
- hpcflow/tests/schedulers/direct_linux/test_direct_linux_submission.py +12 -0
- hpcflow/tests/schedulers/sge/test_sge_submission.py +36 -0
- hpcflow/tests/schedulers/slurm/test_slurm_submission.py +14 -0
- hpcflow/tests/scripts/test_input_file_generators.py +282 -0
- hpcflow/tests/scripts/test_main_scripts.py +1361 -0
- hpcflow/tests/scripts/test_non_snippet_script.py +46 -0
- hpcflow/tests/scripts/test_ouput_file_parsers.py +353 -0
- hpcflow/tests/shells/wsl/test_wsl_submission.py +14 -0
- hpcflow/tests/unit/test_action.py +1066 -0
- hpcflow/tests/unit/test_action_rule.py +24 -0
- hpcflow/tests/unit/test_app.py +132 -0
- hpcflow/tests/unit/test_cache.py +46 -0
- hpcflow/tests/unit/test_cli.py +172 -0
- hpcflow/tests/unit/test_command.py +377 -0
- hpcflow/tests/unit/test_config.py +195 -0
- hpcflow/tests/unit/test_config_file.py +162 -0
- hpcflow/tests/unit/test_element.py +666 -0
- hpcflow/tests/unit/test_element_iteration.py +88 -0
- hpcflow/tests/unit/test_element_set.py +158 -0
- hpcflow/tests/unit/test_group.py +115 -0
- hpcflow/tests/unit/test_input_source.py +1479 -0
- hpcflow/tests/unit/test_input_value.py +398 -0
- hpcflow/tests/unit/test_jobscript_unit.py +757 -0
- hpcflow/tests/unit/test_json_like.py +1247 -0
- hpcflow/tests/unit/test_loop.py +2674 -0
- hpcflow/tests/unit/test_meta_task.py +325 -0
- hpcflow/tests/unit/test_multi_path_sequences.py +259 -0
- hpcflow/tests/unit/test_object_list.py +116 -0
- hpcflow/tests/unit/test_parameter.py +243 -0
- hpcflow/tests/unit/test_persistence.py +664 -0
- hpcflow/tests/unit/test_resources.py +243 -0
- hpcflow/tests/unit/test_run.py +286 -0
- hpcflow/tests/unit/test_run_directories.py +29 -0
- hpcflow/tests/unit/test_runtime.py +9 -0
- hpcflow/tests/unit/test_schema_input.py +372 -0
- hpcflow/tests/unit/test_shell.py +129 -0
- hpcflow/tests/unit/test_slurm.py +39 -0
- hpcflow/tests/unit/test_submission.py +502 -0
- hpcflow/tests/unit/test_task.py +2560 -0
- hpcflow/tests/unit/test_task_schema.py +182 -0
- hpcflow/tests/unit/test_utils.py +616 -0
- hpcflow/tests/unit/test_value_sequence.py +549 -0
- hpcflow/tests/unit/test_values.py +91 -0
- hpcflow/tests/unit/test_workflow.py +827 -0
- hpcflow/tests/unit/test_workflow_template.py +186 -0
- hpcflow/tests/unit/utils/test_arrays.py +40 -0
- hpcflow/tests/unit/utils/test_deferred_file_writer.py +34 -0
- hpcflow/tests/unit/utils/test_hashing.py +65 -0
- hpcflow/tests/unit/utils/test_patches.py +5 -0
- hpcflow/tests/unit/utils/test_redirect_std.py +50 -0
- hpcflow/tests/unit/utils/test_strings.py +97 -0
- hpcflow/tests/workflows/__init__.py +0 -0
- hpcflow/tests/workflows/test_directory_structure.py +31 -0
- hpcflow/tests/workflows/test_jobscript.py +355 -0
- hpcflow/tests/workflows/test_run_status.py +198 -0
- hpcflow/tests/workflows/test_skip_downstream.py +696 -0
- hpcflow/tests/workflows/test_submission.py +140 -0
- hpcflow/tests/workflows/test_workflows.py +564 -0
- hpcflow/tests/workflows/test_zip.py +18 -0
- hpcflow/viz_demo.ipynb +6794 -0
- hpcflow-0.2.0a271.dist-info/LICENSE +375 -0
- hpcflow-0.2.0a271.dist-info/METADATA +65 -0
- hpcflow-0.2.0a271.dist-info/RECORD +237 -0
- {hpcflow-0.1.9.dist-info → hpcflow-0.2.0a271.dist-info}/WHEEL +4 -5
- hpcflow-0.2.0a271.dist-info/entry_points.txt +6 -0
- hpcflow/api.py +0 -458
- hpcflow/archive/archive.py +0 -308
- hpcflow/archive/cloud/cloud.py +0 -47
- hpcflow/archive/cloud/errors.py +0 -9
- hpcflow/archive/cloud/providers/dropbox.py +0 -432
- hpcflow/archive/errors.py +0 -5
- hpcflow/base_db.py +0 -4
- hpcflow/config.py +0 -232
- hpcflow/copytree.py +0 -66
- hpcflow/data/examples/_config.yml +0 -14
- hpcflow/data/examples/damask/demo/1.run.yml +0 -4
- hpcflow/data/examples/damask/demo/2.process.yml +0 -29
- hpcflow/data/examples/damask/demo/geom.geom +0 -2052
- hpcflow/data/examples/damask/demo/load.load +0 -1
- hpcflow/data/examples/damask/demo/material.config +0 -185
- hpcflow/data/examples/damask/inputs/geom.geom +0 -2052
- hpcflow/data/examples/damask/inputs/load.load +0 -1
- hpcflow/data/examples/damask/inputs/material.config +0 -185
- hpcflow/data/examples/damask/profiles/_variable_lookup.yml +0 -21
- hpcflow/data/examples/damask/profiles/damask.yml +0 -4
- hpcflow/data/examples/damask/profiles/damask_process.yml +0 -8
- hpcflow/data/examples/damask/profiles/damask_run.yml +0 -5
- hpcflow/data/examples/damask/profiles/default.yml +0 -6
- hpcflow/data/examples/thinking.yml +0 -177
- hpcflow/errors.py +0 -2
- hpcflow/init_db.py +0 -37
- hpcflow/models.py +0 -2549
- hpcflow/nesting.py +0 -9
- hpcflow/profiles.py +0 -455
- hpcflow/project.py +0 -81
- hpcflow/scheduler.py +0 -323
- hpcflow/utils.py +0 -103
- hpcflow/validation.py +0 -167
- hpcflow/variables.py +0 -544
- hpcflow-0.1.9.dist-info/METADATA +0 -168
- hpcflow-0.1.9.dist-info/RECORD +0 -45
- hpcflow-0.1.9.dist-info/entry_points.txt +0 -8
- hpcflow-0.1.9.dist-info/top_level.txt +0 -1
- /hpcflow/{archive → data/jinja_templates}/__init__.py +0 -0
- /hpcflow/{archive/cloud → data/programs}/__init__.py +0 -0
- /hpcflow/{archive/cloud/providers → data/workflows}/__init__.py +0 -0
|
@@ -0,0 +1,2388 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Persistence model based on writing Zarr arrays.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import copy
|
|
8
|
+
from contextlib import contextmanager
|
|
9
|
+
from collections import defaultdict
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any, cast, TYPE_CHECKING
|
|
13
|
+
from typing_extensions import override
|
|
14
|
+
import shutil
|
|
15
|
+
import time
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
from numpy.ma.core import MaskedArray
|
|
19
|
+
import zarr # type: ignore
|
|
20
|
+
from zarr.errors import BoundsCheckError # type: ignore
|
|
21
|
+
from zarr.storage import DirectoryStore, FSStore # type: ignore
|
|
22
|
+
from zarr.util import guess_chunks # type: ignore
|
|
23
|
+
from fsspec.implementations.zip import ZipFileSystem # type: ignore
|
|
24
|
+
from rich.console import Console
|
|
25
|
+
from numcodecs import MsgPack, VLenArray, blosc, Blosc, Zstd # type: ignore
|
|
26
|
+
from reretry import retry # type: ignore
|
|
27
|
+
|
|
28
|
+
from hpcflow.sdk.typing import hydrate
|
|
29
|
+
from hpcflow.sdk.core import RUN_DIR_ARR_DTYPE, RUN_DIR_ARR_FILL
|
|
30
|
+
from hpcflow.sdk.core.errors import (
|
|
31
|
+
MissingParameterData,
|
|
32
|
+
MissingStoreEARError,
|
|
33
|
+
MissingStoreElementError,
|
|
34
|
+
MissingStoreElementIterationError,
|
|
35
|
+
MissingStoreTaskError,
|
|
36
|
+
)
|
|
37
|
+
from hpcflow.sdk.core.utils import (
|
|
38
|
+
ensure_in,
|
|
39
|
+
get_relative_path,
|
|
40
|
+
set_in_container,
|
|
41
|
+
get_in_container,
|
|
42
|
+
)
|
|
43
|
+
from hpcflow.sdk.persistence.base import (
|
|
44
|
+
PARAM_DATA_NOT_SET,
|
|
45
|
+
PersistentStoreFeatures,
|
|
46
|
+
PersistentStore,
|
|
47
|
+
StoreEAR,
|
|
48
|
+
StoreElement,
|
|
49
|
+
StoreElementIter,
|
|
50
|
+
StoreParameter,
|
|
51
|
+
StoreTask,
|
|
52
|
+
)
|
|
53
|
+
from hpcflow.sdk.persistence.types import (
|
|
54
|
+
LoopDescriptor,
|
|
55
|
+
StoreCreationInfo,
|
|
56
|
+
TemplateMeta,
|
|
57
|
+
ZarrAttrsDict,
|
|
58
|
+
)
|
|
59
|
+
from hpcflow.sdk.persistence.store_resource import ZarrAttrsStoreResource
|
|
60
|
+
from hpcflow.sdk.persistence.utils import ask_pw_on_auth_exc
|
|
61
|
+
from hpcflow.sdk.persistence.pending import CommitResourceMap
|
|
62
|
+
from hpcflow.sdk.persistence.base import update_param_source_dict
|
|
63
|
+
from hpcflow.sdk.log import TimeIt
|
|
64
|
+
from hpcflow.sdk.submission.submission import (
|
|
65
|
+
JOBSCRIPT_SUBMIT_TIME_KEYS,
|
|
66
|
+
SUBMISSION_SUBMIT_TIME_KEYS,
|
|
67
|
+
)
|
|
68
|
+
from hpcflow.sdk.utils.arrays import get_2D_idx, split_arr
|
|
69
|
+
from hpcflow.sdk.utils.patches import override_module_attrs
|
|
70
|
+
from hpcflow.sdk.utils.strings import shorten_list_str
|
|
71
|
+
|
|
72
|
+
if TYPE_CHECKING:
|
|
73
|
+
from collections.abc import (
|
|
74
|
+
Callable,
|
|
75
|
+
Iterable,
|
|
76
|
+
Iterator,
|
|
77
|
+
Mapping,
|
|
78
|
+
MutableMapping,
|
|
79
|
+
Sequence,
|
|
80
|
+
)
|
|
81
|
+
from datetime import datetime
|
|
82
|
+
from fsspec import AbstractFileSystem # type: ignore
|
|
83
|
+
from logging import Logger
|
|
84
|
+
from typing import ClassVar
|
|
85
|
+
from typing_extensions import Self, TypeAlias
|
|
86
|
+
from numpy.typing import NDArray
|
|
87
|
+
from zarr import Array, Group # type: ignore
|
|
88
|
+
from zarr.attrs import Attributes # type: ignore
|
|
89
|
+
from zarr.storage import Store # type: ignore
|
|
90
|
+
from ..submission.types import ResolvedJobscriptBlockDependencies
|
|
91
|
+
from .types import TypeLookup
|
|
92
|
+
from ..app import BaseApp
|
|
93
|
+
from ..core.json_like import JSONed, JSONDocument
|
|
94
|
+
from ..typing import ParamSource, PathLike, DataIndex
|
|
95
|
+
|
|
96
|
+
#: List of any (Zarr-serializable) value.
ListAny: TypeAlias = "list[Any]"
#: Zarr attribute mapping context.
ZarrAttrs: TypeAlias = "dict[str, Any]"
#: Soft lower limit for the number of bytes in an array chunk
_ARRAY_CHUNK_MIN: int = 500 * 1024 * 1024  # 500 MiB
#: Hard upper limit for the number of bytes in an array chunk. Should be lower than the
#: maximum buffer size of the blosc encoder, if we're using it (2 GiB)
_ARRAY_CHUNK_MAX: int = 1024 * 1024 * 1024  # 1 GiB
#: Nested jobscript metadata mapping — presumably keyed by submission metadata
#: field names; TODO confirm exact schema against usage later in this module.
_JS: TypeAlias = "dict[str, list[dict[str, dict]]]"


# Disable blosc's internal thread pool: hpcflow is a multiprocess program in
# general, and blosc threading does not mix well with forked processes.
blosc.use_threads = False  # hpcflow is a multiprocess program in general
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@TimeIt.decorator
def _zarr_get_coord_selection(arr: Array, selection: Any, logger: Logger):
    """Read a coordinate selection from a Zarr array, retrying on failure.

    Zarr reads can fail transiently (e.g. on contended shared filesystems), so
    the actual read is wrapped with a retry policy: up to 10 attempts on
    ``RuntimeError``, starting at a 1 s delay with backoff factor 1.5 and a
    random 0-5 s jitter, logging each failed attempt to `logger`.
    """

    # The retry decorator needs `logger`, which is only known at call time,
    # so the decorated function is built inside this wrapper.
    @retry(
        RuntimeError,
        tries=10,
        delay=1,
        backoff=1.5,
        jitter=(0, 5),
        logger=logger,
    )
    @TimeIt.decorator
    def _inner(arr: Array, selection: Any):
        return arr.get_coordinate_selection(selection)

    return _inner(arr, selection)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _encode_numpy_array(
    obj: NDArray,
    type_lookup: TypeLookup,
    path: list[int],
    root_group: Group,
    arr_path: list[int],
    root_encoder: Callable,
) -> int:
    """Persist a numpy array as a new Zarr dataset and record it for decoding.

    The array is written as ``arr_<idx>`` inside the group at ``arr_path``
    (created on demand), where ``<idx>`` is one greater than the highest
    existing dataset index. A ``[path, idx]`` entry is appended to
    ``type_lookup["arrays"]`` so `_decode_numpy_arrays` can later restore the
    array at the same position within the decoded object.

    Returns
    -------
    int
        Index of the new entry within ``type_lookup["arrays"]``.
    """
    # Might need to generate new group:
    param_arr_group = root_group.require_group(arr_path)
    # Next free dataset index: one more than the largest existing "arr_<N>"
    # key (or 0 if the group is empty).
    new_idx = (
        max((int(i.removeprefix("arr_")) for i in param_arr_group.keys()), default=-1) + 1
    )
    # Temporarily widen zarr's chunk-size heuristics so that chunks land
    # between _ARRAY_CHUNK_MIN and _ARRAY_CHUNK_MAX bytes:
    with override_module_attrs(
        "zarr.util", {"CHUNK_MIN": _ARRAY_CHUNK_MIN, "CHUNK_MAX": _ARRAY_CHUNK_MAX}
    ):
        # `guess_chunks` also ensures chunk shape is at least 1 in each dimension:
        chunk_shape = guess_chunks(obj.shape, obj.dtype.itemsize)

    param_arr_group.create_dataset(name=f"arr_{new_idx}", data=obj, chunks=chunk_shape)
    type_lookup["arrays"].append([path, new_idx])

    return len(type_lookup["arrays"]) - 1
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _decode_numpy_arrays(
    obj: dict | None,
    type_lookup: TypeLookup,
    path: list[int],
    arr_group: Group,
    dataset_copy: bool,
):
    """Restore numpy arrays recorded by `_encode_numpy_array` into ``obj``.

    Each ``(stored_path, dataset_index)`` entry in ``type_lookup["arrays"]``
    that lies under `path` is fetched from ``arr_group`` and placed at the
    corresponding position inside ``obj``. When the entry's path equals `path`
    exactly, the array replaces ``obj`` entirely. Entries outside `path` are
    ignored. If `dataset_copy` is true, an in-memory copy is taken rather than
    returning the live dataset.
    """
    # Yuck! Type lies! Zarr's internal types are not modern Python types.
    entries = cast("Iterable[tuple[list[int], int]]", type_lookup.get("arrays", []))
    result: dict | NDArray | None = obj
    for stored_path, dataset_idx in entries:
        try:
            sub_path = get_relative_path(stored_path, path)
        except ValueError:
            # This entry does not live under the requested path.
            continue

        data: NDArray = arr_group.get(f"arr_{dataset_idx}")
        if dataset_copy:
            # Full slice materialises an in-memory copy of the dataset.
            data = data[:]

        if not sub_path:
            # The array *is* the requested object.
            result = data
        else:
            set_in_container(result, sub_path, data)

    return result
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _encode_masked_array(
    obj: MaskedArray,
    type_lookup: TypeLookup,
    path: list[int],
    root_group: Group,
    arr_path: list[int],
    root_encoder: Callable,
):
    """Encode a masked array as two normal arrays, and return the fill value."""
    # Use a throwaway lookup so the data/mask arrays are not registered as
    # plain "arrays" entries; only the combined "masked_arrays" entry below
    # ends up in the caller's `type_lookup`.
    scratch: TypeLookup = defaultdict(list)
    data_idx = _encode_numpy_array(
        obj.data, scratch, path, root_group, arr_path, root_encoder
    )
    mask_idx = _encode_numpy_array(
        cast("NDArray", obj.mask), scratch, path, root_group, arr_path, root_encoder
    )
    type_lookup["masked_arrays"].append([path, [data_idx, mask_idx]])
    # The scalar fill value takes the array's place in the encoded object.
    return obj.fill_value.item()
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _decode_masked_arrays(
    obj: dict,
    type_lookup: TypeLookup,
    path: list[int],
    arr_group: Group,
    dataset_copy: bool,
):
    """Reassemble ``MaskedArray``s recorded by `_encode_masked_array`.

    Each entry pairs a stored path with the dataset indices of the data and
    mask arrays. The encoded object holds the scalar fill value at that path;
    it is read back and combined with the two datasets into a ``MaskedArray``,
    which then replaces the fill value in place (or replaces ``obj`` entirely
    when the path matches exactly). Note: `dataset_copy` is currently unused
    here — presumably intentional, but differs from `_decode_numpy_arrays`.
    """
    # Yuck! Type lies! Zarr's internal types are not modern Python types.
    entries = cast(
        "Iterable[tuple[list[int], tuple[int, int]]]",
        type_lookup.get("masked_arrays", []),
    )
    result: dict | MaskedArray = obj
    for stored_path, (data_idx, mask_idx) in entries:
        try:
            sub_path = get_relative_path(stored_path, path)
        except ValueError:
            # Entry is outside the requested path; skip it.
            continue

        # The encoded object stores the scalar fill value at this position.
        fill = get_in_container(result, sub_path)
        rebuilt: MaskedArray = MaskedArray(
            data=arr_group.get(f"arr_{data_idx}"),
            mask=arr_group.get(f"arr_{mask_idx}"),
            fill_value=fill,
        )

        if not sub_path:
            result = rebuilt
        else:
            set_in_container(result, sub_path, rebuilt)
    return result
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _encode_bytes(obj: dict, **kwargs):
|
|
234
|
+
return obj # msgpack can handle bytes
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def append_items_to_ragged_array(arr: Array, items: Sequence[int]):
|
|
238
|
+
"""Append an array to a Zarr ragged array.
|
|
239
|
+
|
|
240
|
+
I think `arr.append([item])` should work, but does not for some reason, so we do it
|
|
241
|
+
here by resizing and assignment."""
|
|
242
|
+
num = len(items)
|
|
243
|
+
arr.resize((len(arr) + num))
|
|
244
|
+
for idx, i in enumerate(items):
|
|
245
|
+
arr[-(num - idx)] = i
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
@dataclass
class ZarrStoreTask(StoreTask[dict]):
    """
    Represents a task in a Zarr persistent store.
    """

    @override
    def encode(self) -> tuple[int, dict, dict[str, Any]]:
        """Prepare store task data for the persistent store."""
        # Element IDs are stored as a numpy array; the template (if any) is
        # merged into the task dict alongside the ID.
        workflow_task = {
            "id_": self.id_,
            "element_IDs": np.array(self.element_IDs),
        }
        template = self.task_template or {}
        task = {"id_": self.id_, **template}
        return self.index, workflow_task, task

    @override
    @classmethod
    def decode(cls, task_dat: dict) -> Self:
        """Initialise a `StoreTask` from persistent task data"""
        # Stored as a numpy array; convert back to a plain Python list.
        task_dat["element_IDs"] = task_dat["element_IDs"].tolist()
        return cls(is_pending=False, **task_dat)
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
@dataclass
class ZarrStoreElement(StoreElement[ListAny, ZarrAttrs]):
    """
    Represents an element in a Zarr persistent store.

    Elements are serialised as a positional list (see `encode`); the item
    order is part of the on-disk format and must match between `encode` and
    `decode`. Repeated string keys are interned into the shared `attrs`
    mapping to keep the stored data compact.
    """

    @override
    def encode(self, attrs: ZarrAttrs) -> ListAny:
        """Prepare store elements data for the persistent store.

        This method mutates `attrs`.
        """
        # Positional format: [id_, index, es_idx, seq_idx, src_idx, task_ID,
        # iteration_IDs]. Mapping keys are replaced by their index within the
        # corresponding `attrs` list (`ensure_in` appends new keys as needed).
        return [
            self.id_,
            self.index,
            self.es_idx,
            [[ensure_in(k, attrs["seq_idx"]), v] for k, v in self.seq_idx.items()],
            [[ensure_in(k, attrs["src_idx"]), v] for k, v in self.src_idx.items()],
            self.task_ID,
            self.iteration_IDs,
        ]

    @override
    @classmethod
    def decode(cls, elem_dat: ListAny, attrs: ZarrAttrs) -> Self:
        """Initialise a `StoreElement` from persistent element data"""
        # Inverse of `encode`: interned key indices are resolved via `attrs`.
        obj_dat = {
            "id_": elem_dat[0],
            "index": elem_dat[1],
            "es_idx": elem_dat[2],
            "seq_idx": {attrs["seq_idx"][k]: v for (k, v) in elem_dat[3]},
            "src_idx": {attrs["src_idx"][k]: v for (k, v) in elem_dat[4]},
            "task_ID": elem_dat[5],
            "iteration_IDs": elem_dat[6],
        }
        return cls(is_pending=False, **obj_dat)
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
@dataclass
class ZarrStoreElementIter(StoreElementIter[ListAny, ZarrAttrs]):
    """
    Represents an element iteration in a Zarr persistent store.

    Iterations are serialised as a positional list (see `encode`); the item
    order is part of the on-disk format and must match between `encode` and
    `decode`. String keys (parameter paths, schema parameters, loop names)
    are interned into the shared `attrs` mapping.
    """

    @override
    def encode(self, attrs: ZarrAttrs) -> ListAny:
        """Prepare store element iteration data for the persistent store.

        This method mutates `attrs`.
        """
        # Positional format: [id_, element_ID, EARs_initialised (as int),
        # EAR_IDs (or None), data_idx, schema_parameters, loop_idx].
        return [
            self.id_,
            self.element_ID,
            int(self.EARs_initialised),
            # EAR_IDs dict becomes a list of [key, value] pairs; None is kept
            # distinct from an empty mapping:
            [[ek, ev] for ek, ev in self.EAR_IDs.items()] if self.EAR_IDs else None,
            [
                [ensure_in(dk, attrs["parameter_paths"]), dv]
                for dk, dv in self.data_idx.items()
            ],
            [ensure_in(i, attrs["schema_parameters"]) for i in self.schema_parameters],
            [[ensure_in(dk, attrs["loops"]), dv] for dk, dv in self.loop_idx.items()],
        ]

    @override
    @classmethod
    def decode(cls, iter_dat: ListAny, attrs: ZarrAttrs) -> Self:
        """Initialise a `ZarrStoreElementIter` from persistent element iteration data"""
        # Inverse of `encode`: positions must match the list built there, and
        # interned indices are resolved via `attrs`.
        obj_dat = {
            "id_": iter_dat[0],
            "element_ID": iter_dat[1],
            "EARs_initialised": bool(iter_dat[2]),
            "EAR_IDs": {i[0]: i[1] for i in iter_dat[3]} if iter_dat[3] else None,
            "data_idx": {attrs["parameter_paths"][i[0]]: i[1] for i in iter_dat[4]},
            "schema_parameters": [attrs["schema_parameters"][i] for i in iter_dat[5]],
            "loop_idx": {attrs["loops"][i[0]]: i[1] for i in iter_dat[6]},
        }
        return cls(is_pending=False, **obj_dat)
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
@dataclass
class ZarrStoreEAR(StoreEAR[ListAny, ZarrAttrs]):
    """
    Represents an element action run in a Zarr persistent store.

    EARs are serialised as a positional list of 17 items (see `encode`); the
    item order is part of the on-disk format and must match between `encode`
    and `decode`.
    """

    @override
    def encode(self, ts_fmt: str, attrs: ZarrAttrs) -> ListAny:
        """Prepare store EAR data for the persistent store.

        This method mutates `attrs`.
        """
        # Positional format; data-index parameter paths are interned into
        # `attrs["parameter_paths"]` and datetimes serialised using `ts_fmt`.
        return [
            self.id_,
            self.elem_iter_ID,
            self.action_idx,
            [
                [ensure_in(dk, attrs["parameter_paths"]), dv]
                for dk, dv in self.data_idx.items()
            ],
            self.submission_idx,
            self.skip,
            self.success,
            self._encode_datetime(self.start_time, ts_fmt),
            self._encode_datetime(self.end_time, ts_fmt),
            self.snapshot_start,
            self.snapshot_end,
            self.exit_code,
            self.metadata,
            self.run_hostname,
            self.commands_idx,
            self.port_number,
            self.commands_file_ID,
        ]

    @override
    @classmethod
    def decode(cls, EAR_dat: ListAny, ts_fmt: str, attrs: ZarrAttrs) -> Self:
        """Initialise a `ZarrStoreEAR` from persistent EAR data"""
        # Inverse of `encode`: positions must match the list built there.
        obj_dat = {
            "id_": EAR_dat[0],
            "elem_iter_ID": EAR_dat[1],
            "action_idx": EAR_dat[2],
            "data_idx": {attrs["parameter_paths"][i[0]]: i[1] for i in EAR_dat[3]},
            "submission_idx": EAR_dat[4],
            "skip": EAR_dat[5],
            "success": EAR_dat[6],
            "start_time": cls._decode_datetime(EAR_dat[7], ts_fmt),
            "end_time": cls._decode_datetime(EAR_dat[8], ts_fmt),
            "snapshot_start": EAR_dat[9],
            "snapshot_end": EAR_dat[10],
            "exit_code": EAR_dat[11],
            "metadata": EAR_dat[12],
            "run_hostname": EAR_dat[13],
            "commands_idx": EAR_dat[14],
            "port_number": EAR_dat[15],
            "commands_file_ID": EAR_dat[16],
        }
        return cls(is_pending=False, **obj_dat)
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
@dataclass
@hydrate
class ZarrStoreParameter(StoreParameter):
    """
    Represents a parameter in a Zarr persistent store.

    Extends the base parameter with Zarr-specific encoders/decoders so that
    numpy arrays, masked arrays and raw bytes survive the msgpack round trip:
    arrays are stored as separate Zarr datasets and referenced via the
    type-lookup structure; bytes pass through unchanged.
    """

    # Maps a Python type to the encoder that serialises instances of it.
    _encoders: ClassVar[dict[type, Callable]] = {  # keys are types
        np.ndarray: _encode_numpy_array,
        MaskedArray: _encode_masked_array,
        bytes: _encode_bytes,
    }
    # Maps a type-lookup key (written by the encoders above) to the decoder
    # that restores the corresponding values.
    _decoders: ClassVar[dict[str, Callable]] = {  # keys are keys in type_lookup
        "arrays": _decode_numpy_arrays,
        "masked_arrays": _decode_masked_arrays,
    }
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
class ZarrPersistentStore(
    PersistentStore[
        ZarrStoreTask,
        ZarrStoreElement,
        ZarrStoreElementIter,
        ZarrStoreEAR,
        ZarrStoreParameter,
    ]
):
    """
    A persistent store implemented using Zarr.
    """

    # short name used to select this store implementation:
    _name: ClassVar[str] = "zarr"
    # capabilities supported by this store implementation:
    _features: ClassVar[PersistentStoreFeatures] = PersistentStoreFeatures(
        create=True,
        edit=True,
        jobscript_parallelism=True,
        EAR_parallelism=True,
        schedulers=True,
        submission=True,
    )

    @classmethod
    def _store_task_cls(cls) -> type[ZarrStoreTask]:
        # concrete task record class for this store
        return ZarrStoreTask

    @classmethod
    def _store_elem_cls(cls) -> type[ZarrStoreElement]:
        # concrete element record class for this store
        return ZarrStoreElement

    @classmethod
    def _store_iter_cls(cls) -> type[ZarrStoreElementIter]:
        # concrete element-iteration record class for this store
        return ZarrStoreElementIter

    @classmethod
    def _store_EAR_cls(cls) -> type[ZarrStoreEAR]:
        # concrete EAR (run) record class for this store
        return ZarrStoreEAR

    @classmethod
    def _store_param_cls(cls) -> type[ZarrStoreParameter]:
        # concrete parameter record class for this store
        return ZarrStoreParameter

    # names of the Zarr groups/arrays that make up the store layout:
    _param_grp_name: ClassVar[str] = "parameters"
    _param_base_arr_name: ClassVar[str] = "base"
    _param_sources_arr_name: ClassVar[str] = "sources"
    _param_user_arr_grp_name: ClassVar[str] = "arrays"
    # per-parameter sub-group name generator (bound like a method; `_` is `self`):
    _param_data_arr_grp_name: ClassVar = lambda _, param_idx: f"param_{param_idx}"
    _subs_md_group_name: ClassVar[str] = "submissions"
    _task_arr_name: ClassVar[str] = "tasks"
    _elem_arr_name: ClassVar[str] = "elements"
    _iter_arr_name: ClassVar[str] = "iters"
    _EAR_arr_name: ClassVar[str] = "runs"
    _run_dir_arr_name: ClassVar[str] = "run_dirs"
    _js_at_submit_md_arr_name: ClassVar[str] = "js_at_submit_md"
    _js_run_IDs_arr_name: ClassVar[str] = "js_run_IDs"
    _js_task_elems_arr_name: ClassVar[str] = "js_task_elems"
    _js_task_acts_arr_name: ClassVar[str] = "js_task_acts"
    _js_deps_arr_name: ClassVar[str] = "js_deps"
    _time_res: ClassVar[str] = "us"  # microseconds; must not be smaller than micro!

    # maps commit operations to the resources they require:
    _res_map: ClassVar[CommitResourceMap] = CommitResourceMap(
        commit_template_components=("attrs",)
    )
|
|
491
|
+
|
|
492
|
+
    def __init__(self, app, workflow, path: str | Path, fs: AbstractFileSystem) -> None:
        """Initialise the store and its in-memory caches, then delegate to the base
        `PersistentStore` initialiser.

        Parameters
        ----------
        app
            The application instance.
        workflow
            The workflow this store belongs to.
        path
            Path to the workflow directory (root of the Zarr store).
        fs
            Filesystem on which the store resides.
        """
        self._zarr_store = None  # assigned on first access to `zarr_store`
        # resource wrappers controlling read/update access to the root attributes:
        self._resources = {
            "attrs": ZarrAttrsStoreResource(
                app, name="attrs", open_call=self._get_root_group
            ),
        }
        self._jobscript_at_submit_metadata: dict[int, dict[str, Any]] = (
            {}
        )  # this is a cache

        # these are caches; keys are submission index and then tuples of
        # (jobscript index, jobscript-block index):
        self._jobscript_run_ID_arrays: dict[int, dict[tuple[int, int], NDArray]] = {}
        self._jobscript_task_element_maps: dict[
            int, dict[tuple[int, int], dict[int, list[int]]]
        ] = {}
        self._jobscript_task_actions_arrays: dict[int, dict[tuple[int, int], NDArray]] = (
            {}
        )
        self._jobscript_dependencies: dict[
            int,
            dict[
                tuple[int, int], dict[tuple[int, int], ResolvedJobscriptBlockDependencies]
            ],
        ] = {}

        super().__init__(app, workflow, path, fs)
|
|
520
|
+
|
|
521
|
+
@contextmanager
|
|
522
|
+
def cached_load(self) -> Iterator[None]:
|
|
523
|
+
"""Context manager to cache the root attributes."""
|
|
524
|
+
with self.using_resource("attrs", "read") as attrs:
|
|
525
|
+
yield
|
|
526
|
+
|
|
527
|
+
def remove_replaced_dir(self) -> None:
|
|
528
|
+
"""
|
|
529
|
+
Remove the directory containing replaced workflow details.
|
|
530
|
+
"""
|
|
531
|
+
with self.using_resource("attrs", "update") as md:
|
|
532
|
+
if "replaced_workflow" in md:
|
|
533
|
+
self.logger.debug("removing temporarily renamed pre-existing workflow.")
|
|
534
|
+
self.remove_path(md["replaced_workflow"])
|
|
535
|
+
del md["replaced_workflow"]
|
|
536
|
+
|
|
537
|
+
def reinstate_replaced_dir(self) -> None:
|
|
538
|
+
"""
|
|
539
|
+
Reinstate the directory containing replaced workflow details.
|
|
540
|
+
"""
|
|
541
|
+
with self.using_resource("attrs", "read") as md:
|
|
542
|
+
if "replaced_workflow" in md:
|
|
543
|
+
self.logger.debug(
|
|
544
|
+
"reinstating temporarily renamed pre-existing workflow."
|
|
545
|
+
)
|
|
546
|
+
self.rename_path(
|
|
547
|
+
md["replaced_workflow"],
|
|
548
|
+
self.path,
|
|
549
|
+
)
|
|
550
|
+
|
|
551
|
+
    @staticmethod
    def _get_zarr_store(path: str | Path, fs: AbstractFileSystem) -> Store:
        """Return an fsspec-backed Zarr store rooted at `path` on filesystem `fs`."""
        return FSStore(url=str(path), fs=fs)

    # codec used for the object arrays (elements/iterations/runs/parameters):
    _CODEC: ClassVar = MsgPack()
|
|
556
|
+
|
|
557
|
+
@classmethod
|
|
558
|
+
def write_empty_workflow(
|
|
559
|
+
cls,
|
|
560
|
+
app: BaseApp,
|
|
561
|
+
*,
|
|
562
|
+
template_js: TemplateMeta,
|
|
563
|
+
template_components_js: dict[str, Any],
|
|
564
|
+
wk_path: str,
|
|
565
|
+
fs: AbstractFileSystem,
|
|
566
|
+
name: str,
|
|
567
|
+
replaced_wk: str | None,
|
|
568
|
+
ts_fmt: str,
|
|
569
|
+
ts_name_fmt: str,
|
|
570
|
+
creation_info: StoreCreationInfo,
|
|
571
|
+
compressor: str | None = "blosc",
|
|
572
|
+
compressor_kwargs: dict[str, Any] | None = None,
|
|
573
|
+
) -> None:
|
|
574
|
+
"""
|
|
575
|
+
Write an empty persistent workflow.
|
|
576
|
+
"""
|
|
577
|
+
attrs: ZarrAttrsDict = {
|
|
578
|
+
"name": name,
|
|
579
|
+
"ts_fmt": ts_fmt,
|
|
580
|
+
"ts_name_fmt": ts_name_fmt,
|
|
581
|
+
"creation_info": creation_info,
|
|
582
|
+
"template": template_js,
|
|
583
|
+
"template_components": template_components_js,
|
|
584
|
+
"num_added_tasks": 0,
|
|
585
|
+
"tasks": [],
|
|
586
|
+
"loops": [],
|
|
587
|
+
"submissions": [],
|
|
588
|
+
}
|
|
589
|
+
if replaced_wk:
|
|
590
|
+
attrs["replaced_workflow"] = replaced_wk
|
|
591
|
+
|
|
592
|
+
store = cls._get_zarr_store(wk_path, fs)
|
|
593
|
+
root = zarr.group(store=store, overwrite=False)
|
|
594
|
+
root.attrs.update(attrs)
|
|
595
|
+
|
|
596
|
+
# use a nested directory store for the metadata group so the runs array
|
|
597
|
+
# can be stored as a 2D array in nested directories, thereby limiting the maximum
|
|
598
|
+
# number of files stored in a given directory:
|
|
599
|
+
md_store = zarr.NestedDirectoryStore(Path(root.store.path).joinpath("metadata"))
|
|
600
|
+
md = zarr.group(store=md_store)
|
|
601
|
+
|
|
602
|
+
compressor_lookup = {
|
|
603
|
+
"blosc": Blosc,
|
|
604
|
+
"zstd": Zstd,
|
|
605
|
+
}
|
|
606
|
+
if compressor:
|
|
607
|
+
cmp = compressor_lookup[compressor.lower()](**(compressor_kwargs or {}))
|
|
608
|
+
else:
|
|
609
|
+
cmp = None
|
|
610
|
+
|
|
611
|
+
tasks_arr = md.create_dataset(
|
|
612
|
+
name=cls._task_arr_name,
|
|
613
|
+
shape=0,
|
|
614
|
+
dtype=object,
|
|
615
|
+
object_codec=VLenArray(int),
|
|
616
|
+
compressor=cmp,
|
|
617
|
+
)
|
|
618
|
+
|
|
619
|
+
elems_arr = md.create_dataset(
|
|
620
|
+
name=cls._elem_arr_name,
|
|
621
|
+
shape=0,
|
|
622
|
+
dtype=object,
|
|
623
|
+
object_codec=cls._CODEC,
|
|
624
|
+
chunks=1000,
|
|
625
|
+
compressor=cmp,
|
|
626
|
+
)
|
|
627
|
+
elems_arr.attrs.update({"seq_idx": [], "src_idx": []})
|
|
628
|
+
|
|
629
|
+
elem_iters_arr = md.create_dataset(
|
|
630
|
+
name=cls._iter_arr_name,
|
|
631
|
+
shape=0,
|
|
632
|
+
dtype=object,
|
|
633
|
+
object_codec=cls._CODEC,
|
|
634
|
+
chunks=1000,
|
|
635
|
+
compressor=cmp,
|
|
636
|
+
)
|
|
637
|
+
elem_iters_arr.attrs.update(
|
|
638
|
+
{
|
|
639
|
+
"loops": [],
|
|
640
|
+
"schema_parameters": [],
|
|
641
|
+
"parameter_paths": [],
|
|
642
|
+
}
|
|
643
|
+
)
|
|
644
|
+
|
|
645
|
+
EARs_arr = md.create_dataset(
|
|
646
|
+
name=cls._EAR_arr_name,
|
|
647
|
+
shape=(0, 1000),
|
|
648
|
+
dtype=object,
|
|
649
|
+
object_codec=cls._CODEC,
|
|
650
|
+
chunks=1, # single-chunk rows for multiprocess writing
|
|
651
|
+
compressor=cmp,
|
|
652
|
+
dimension_separator="/",
|
|
653
|
+
)
|
|
654
|
+
EARs_arr.attrs.update({"parameter_paths": [], "num_runs": 0})
|
|
655
|
+
|
|
656
|
+
# array for storing indices that can be used to reproduce run directory paths:
|
|
657
|
+
run_dir_arr = md.create_dataset(
|
|
658
|
+
name=cls._run_dir_arr_name,
|
|
659
|
+
shape=0,
|
|
660
|
+
chunks=10_000,
|
|
661
|
+
dtype=RUN_DIR_ARR_DTYPE,
|
|
662
|
+
fill_value=RUN_DIR_ARR_FILL,
|
|
663
|
+
write_empty_chunks=False,
|
|
664
|
+
)
|
|
665
|
+
|
|
666
|
+
parameter_data = root.create_group(name=cls._param_grp_name)
|
|
667
|
+
parameter_data.create_dataset(
|
|
668
|
+
name=cls._param_base_arr_name,
|
|
669
|
+
shape=0,
|
|
670
|
+
dtype=object,
|
|
671
|
+
object_codec=cls._CODEC,
|
|
672
|
+
chunks=1,
|
|
673
|
+
compressor=cmp,
|
|
674
|
+
write_empty_chunks=False,
|
|
675
|
+
fill_value=PARAM_DATA_NOT_SET,
|
|
676
|
+
)
|
|
677
|
+
parameter_data.create_dataset(
|
|
678
|
+
name=cls._param_sources_arr_name,
|
|
679
|
+
shape=0,
|
|
680
|
+
dtype=object,
|
|
681
|
+
object_codec=cls._CODEC,
|
|
682
|
+
chunks=1000, # TODO: check this is a sensible size with many parameters
|
|
683
|
+
compressor=cmp,
|
|
684
|
+
)
|
|
685
|
+
parameter_data.create_group(name=cls._param_user_arr_grp_name)
|
|
686
|
+
|
|
687
|
+
# for storing submission metadata that should not be stored in the root group:
|
|
688
|
+
md.create_group(name=cls._subs_md_group_name)
|
|
689
|
+
|
|
690
|
+
    def _append_tasks(self, tasks: Iterable[ZarrStoreTask]):
        """Persist new tasks: insert their encoded records into the root attributes
        and append their element-ID lists to the ragged tasks array.

        Parameters
        ----------
        tasks
            Task records to persist; assumed to have sequentially increasing IDs.
        """
        elem_IDs_arr = self._get_tasks_arr(mode="r+")
        elem_IDs: list[int] = []
        with self.using_resource("attrs", "update") as attrs:
            for i_idx, i in enumerate(tasks):
                idx, wk_task_i, task_i = i.encode()
                # element IDs are stored in the ragged array, not in the attrs;
                # record the row index at which they will be appended:
                elem_IDs.append(wk_task_i.pop("element_IDs"))
                wk_task_i["element_IDs_idx"] = len(elem_IDs_arr) + i_idx

                attrs["tasks"].insert(idx, wk_task_i)
                attrs["template"]["tasks"].insert(idx, task_i)
                attrs["num_added_tasks"] += 1

        # tasks array rows correspond to task IDs, and we assume `tasks` have sequentially
        # increasing IDs.
        append_items_to_ragged_array(arr=elem_IDs_arr, items=elem_IDs)
|
|
706
|
+
|
|
707
|
+
def _append_loops(self, loops: dict[int, LoopDescriptor]):
|
|
708
|
+
with self.using_resource("attrs", action="update") as attrs:
|
|
709
|
+
for loop in loops.values():
|
|
710
|
+
attrs["loops"].append(
|
|
711
|
+
{
|
|
712
|
+
"num_added_iterations": loop["num_added_iterations"],
|
|
713
|
+
"iterable_parameters": loop["iterable_parameters"],
|
|
714
|
+
"output_parameters": loop["output_parameters"],
|
|
715
|
+
"parents": loop["parents"],
|
|
716
|
+
}
|
|
717
|
+
)
|
|
718
|
+
attrs["template"]["loops"].append(loop["loop_template"])
|
|
719
|
+
|
|
720
|
+
@staticmethod
|
|
721
|
+
def _extract_submission_run_IDs_array(
|
|
722
|
+
sub_js: Mapping[str, JSONed],
|
|
723
|
+
) -> tuple[np.ndarray, list[list[list[int]]]]:
|
|
724
|
+
"""For a JSON-like representation of a Submission object, remove and combine all
|
|
725
|
+
jobscript-block run ID lists into a single array with a fill value.
|
|
726
|
+
|
|
727
|
+
Notes
|
|
728
|
+
-----
|
|
729
|
+
This mutates `sub_js`, by setting `EAR_ID` jobscript-block keys to `None`.
|
|
730
|
+
|
|
731
|
+
Parameters
|
|
732
|
+
----------
|
|
733
|
+
sub_js
|
|
734
|
+
JSON-like representation of a `Submission` object.
|
|
735
|
+
|
|
736
|
+
Returns
|
|
737
|
+
-------
|
|
738
|
+
combined_run_IDs
|
|
739
|
+
Integer Numpy array that contains a concatenation of all 2D run ID arrays
|
|
740
|
+
from each jobscript-block. Technically a "jagged"/"ragged" array that is made
|
|
741
|
+
square with a large fill value.
|
|
742
|
+
block_shapes
|
|
743
|
+
List of length equal to the number of jobscripts in the submission. Each
|
|
744
|
+
sub-list contains a list of shapes (as a two-item list:
|
|
745
|
+
`[num_actions, num_elements]`) of the constituent blocks of that jobscript.
|
|
746
|
+
|
|
747
|
+
"""
|
|
748
|
+
arrs = []
|
|
749
|
+
max_acts, max_elems = 0, 0
|
|
750
|
+
|
|
751
|
+
# a list for each jobscript, containing shapes of run ID arrays in each block:
|
|
752
|
+
block_shapes = []
|
|
753
|
+
for js in cast("Sequence[Mapping[str, JSONed]]", sub_js["jobscripts"]):
|
|
754
|
+
block_shapes_js_i = []
|
|
755
|
+
for blk in cast("Sequence[MutableMapping[str, JSONed]]", js["blocks"]):
|
|
756
|
+
run_IDs_i = np.array(blk["EAR_ID"])
|
|
757
|
+
blk["EAR_ID"] = None # TODO: how to type?
|
|
758
|
+
block_shapes_js_i.append(list(run_IDs_i.shape))
|
|
759
|
+
if run_IDs_i.shape[0] > max_acts:
|
|
760
|
+
max_acts = run_IDs_i.shape[0]
|
|
761
|
+
if run_IDs_i.shape[1] > max_elems:
|
|
762
|
+
max_elems = run_IDs_i.shape[1]
|
|
763
|
+
arrs.append(run_IDs_i)
|
|
764
|
+
block_shapes.append(block_shapes_js_i)
|
|
765
|
+
|
|
766
|
+
combined_run_IDs = np.full(
|
|
767
|
+
(len(arrs), max_acts, max_elems),
|
|
768
|
+
dtype=np.int32,
|
|
769
|
+
fill_value=-1,
|
|
770
|
+
)
|
|
771
|
+
for arr_idx, arr in enumerate(arrs):
|
|
772
|
+
combined_run_IDs[arr_idx][: arr.shape[0], : arr.shape[1]] = arr
|
|
773
|
+
|
|
774
|
+
return combined_run_IDs, block_shapes
|
|
775
|
+
|
|
776
|
+
@staticmethod
|
|
777
|
+
def _extract_submission_task_elements_array(
|
|
778
|
+
sub_js: Mapping[str, JSONed],
|
|
779
|
+
) -> tuple[np.ndarray, list[list[list[int]]]]:
|
|
780
|
+
"""For a JSON-like representation of a Submission object, remove and combine all
|
|
781
|
+
jobscript-block task-element mappings into a single array with a fill value.
|
|
782
|
+
|
|
783
|
+
Notes
|
|
784
|
+
-----
|
|
785
|
+
This mutates `sub_js`, by setting `task_elements` jobscript-block keys to `None`.
|
|
786
|
+
|
|
787
|
+
Parameters
|
|
788
|
+
----------
|
|
789
|
+
sub_js
|
|
790
|
+
JSON-like representation of a `Submission` object.
|
|
791
|
+
|
|
792
|
+
Returns
|
|
793
|
+
-------
|
|
794
|
+
combined_task_elems
|
|
795
|
+
Integer Numpy array that contains a concatenation of each task-element,
|
|
796
|
+
mapping, where each mapping is expressed as a 2D array whose first column
|
|
797
|
+
corresponds to the keys of the mappings, and whose remaining columns
|
|
798
|
+
correspond to the values of the mappings. Technically a "jagged"/"ragged"
|
|
799
|
+
array that is made square with a large fill value.
|
|
800
|
+
block_shapes
|
|
801
|
+
List of length equal to the number of jobscripts in the submission. Each
|
|
802
|
+
sub-list contains a list of shapes (as a two-item list:
|
|
803
|
+
`[num_actions, num_elements]`) of the constituent blocks of that jobscript.
|
|
804
|
+
|
|
805
|
+
"""
|
|
806
|
+
arrs = []
|
|
807
|
+
max_x, max_y = 0, 0
|
|
808
|
+
|
|
809
|
+
# a list for each jobscript, containing shapes of run ID arrays in each block:
|
|
810
|
+
block_shapes = []
|
|
811
|
+
for js in cast("Sequence[Mapping[str, JSONed]]", sub_js["jobscripts"]):
|
|
812
|
+
block_shapes_js_i = []
|
|
813
|
+
for blk in cast("Sequence[MutableMapping[str, JSONed]]", js["blocks"]):
|
|
814
|
+
|
|
815
|
+
task_elems_lst = []
|
|
816
|
+
for k, v in cast("Mapping[int, list[int]]", blk["task_elements"]).items():
|
|
817
|
+
task_elems_lst.append([k] + v)
|
|
818
|
+
task_elems_i = np.array(task_elems_lst)
|
|
819
|
+
|
|
820
|
+
block_shape_j = [task_elems_i.shape[1] - 1, task_elems_i.shape[0]]
|
|
821
|
+
block_shapes_js_i.append(block_shape_j)
|
|
822
|
+
|
|
823
|
+
blk["task_elements"] = None # TODO: how to type?
|
|
824
|
+
if task_elems_i.shape[1] > max_x:
|
|
825
|
+
max_x = task_elems_i.shape[1]
|
|
826
|
+
if task_elems_i.shape[0] > max_y:
|
|
827
|
+
max_y = task_elems_i.shape[0]
|
|
828
|
+
arrs.append(task_elems_i)
|
|
829
|
+
block_shapes.append(block_shapes_js_i)
|
|
830
|
+
|
|
831
|
+
combined_task_elems = np.full(
|
|
832
|
+
(len(arrs), max_y, max_x),
|
|
833
|
+
dtype=np.uint32,
|
|
834
|
+
fill_value=np.iinfo(np.uint32).max,
|
|
835
|
+
)
|
|
836
|
+
for arr_idx, arr in enumerate(arrs):
|
|
837
|
+
combined_task_elems[arr_idx][: arr.shape[0], : arr.shape[1]] = arr
|
|
838
|
+
|
|
839
|
+
return combined_task_elems, block_shapes
|
|
840
|
+
|
|
841
|
+
@staticmethod
|
|
842
|
+
def _extract_submission_task_actions_array(
|
|
843
|
+
sub_js: Mapping[str, JSONed],
|
|
844
|
+
) -> tuple[np.ndarray, list[list[int]]]:
|
|
845
|
+
"""For a JSON-like representation of a Submission object, remove and concatenate
|
|
846
|
+
all jobscript-block task-action arrays into a single array.
|
|
847
|
+
|
|
848
|
+
Notes
|
|
849
|
+
-----
|
|
850
|
+
This mutates `sub_js`, by setting `task_actions` jobscript-block keys to `None`.
|
|
851
|
+
|
|
852
|
+
Parameters
|
|
853
|
+
----------
|
|
854
|
+
sub_js
|
|
855
|
+
JSON-like representation of a `Submission` object.
|
|
856
|
+
|
|
857
|
+
Returns
|
|
858
|
+
-------
|
|
859
|
+
combined_task_acts
|
|
860
|
+
Integer 2D Numpy array which is a concatenation along the first axis of
|
|
861
|
+
task-action actions from all jobscript blocks. The second dimension is of
|
|
862
|
+
length three.
|
|
863
|
+
block_num_acts
|
|
864
|
+
List of length equal to the number of jobscripts in the submission. Each
|
|
865
|
+
sub-list contains a list of `num_actions` of the constituent blocks of that
|
|
866
|
+
jobscript.
|
|
867
|
+
|
|
868
|
+
"""
|
|
869
|
+
arrs = []
|
|
870
|
+
|
|
871
|
+
# a list for each jobscript, containing shapes of run ID arrays in each block:
|
|
872
|
+
|
|
873
|
+
blk_num_acts = []
|
|
874
|
+
for js in cast("Sequence[Mapping[str, JSONed]]", sub_js["jobscripts"]):
|
|
875
|
+
|
|
876
|
+
blk_num_acts_js_i = []
|
|
877
|
+
for blk in cast("Sequence[MutableMapping[str, JSONed]]", js["blocks"]):
|
|
878
|
+
|
|
879
|
+
blk_acts = np.array(blk["task_actions"])
|
|
880
|
+
blk["task_actions"] = None # TODO: how to type?
|
|
881
|
+
blk_num_acts_js_i.append(blk_acts.shape[0])
|
|
882
|
+
arrs.append(blk_acts)
|
|
883
|
+
|
|
884
|
+
blk_num_acts.append(blk_num_acts_js_i)
|
|
885
|
+
|
|
886
|
+
combined_task_acts = np.vstack(arrs)
|
|
887
|
+
|
|
888
|
+
return combined_task_acts, blk_num_acts
|
|
889
|
+
|
|
890
|
+
    @staticmethod
    def _encode_jobscript_block_dependencies(sub_js: Mapping[str, JSONed]) -> np.ndarray:
        """For a JSON-like representation of a Submission object, remove jobscript-block
        dependencies for all jobscripts and transform to a single 1D integer array, that
        can be transformed back by `_decode_jobscript_block_dependencies`.

        Notes
        -----
        This mutates `sub_js`, by setting `dependencies` jobscript-block keys to `None`.
        The encoding is length-prefixed: each block record is
        `[record_len, js_idx, blk_idx, *dependency_records]`, and each dependency
        record is `[dep_len, dep_js_idx, dep_blk_idx, is_array, *element_groups]`
        where each element group is `[group_len, elem_idx, *dependent_elem_idxs]`.
        """

        # TODO: avoid this horrible mess of casts

        all_deps_arr = []
        assert sub_js["jobscripts"] is not None
        for js in cast("Sequence[Mapping[str, JSONed]]", sub_js["jobscripts"]):
            for blk in cast("Sequence[MutableMapping[str, JSONed]]", js["blocks"]):
                all_deps_i: list[int] = []
                assert blk["dependencies"] is not None
                blk_deps = cast(
                    "list[tuple[tuple[int, int], Mapping[str, JSONed]]]",
                    blk["dependencies"],
                )
                for (dep_js_idx, dep_blk_idx), dep in blk_deps:
                    deps_arr: list[int] = []
                    for elem_i, elements_j in cast(
                        "Mapping[int, Sequence[int]]", dep["js_element_mapping"]
                    ).items():
                        # length prefix counts the element index plus its dependents:
                        deps_arr.extend([len(elements_j) + 1, elem_i] + list(elements_j))
                    blk_arr = [
                        dep_js_idx,
                        dep_blk_idx,
                        int(cast("bool", dep["is_array"])),
                    ] + deps_arr
                    blk_arr = [len(blk_arr)] + blk_arr
                    all_deps_i.extend(blk_arr)
                all_deps_i = [
                    cast("int", js["index"]),
                    cast("int", blk["index"]),
                ] + all_deps_i
                blk["dependencies"] = None  # TODO: how to type?
                all_deps_arr.extend([len(all_deps_i)] + all_deps_i)

        return np.array(all_deps_arr)
|
|
934
|
+
|
|
935
|
+
    @staticmethod
    def _decode_jobscript_block_dependencies(
        arr: np.ndarray,
    ) -> dict[tuple[int, int], dict[tuple[int, int], ResolvedJobscriptBlockDependencies]]:
        """Re-generate jobscript-block dependencies that have been transformed by
        `_encode_jobscript_block_dependencies` into a single 1D integer array.

        Parameters
        ----------
        arr:
            The 1D integer array to transform back to a verbose jobscript-block dependency
            mapping.
        """
        # metadata is js/blk_idx for which the dependencies are stored:
        block_arrs = split_arr(arr, metadata_size=2)
        block_deps = {}
        for i in block_arrs:

            js_idx: int
            blk_idx: int
            dep_js_idx: int
            dep_blk_idx: int
            is_array: int

            js_idx, blk_idx = i[0]
            # metadata is js/blk_idx that this block depends on, plus whether the
            # dependency is an array dependency:
            deps_arrs = split_arr(i[1], metadata_size=3)
            all_deps_ij: dict[tuple[int, int], ResolvedJobscriptBlockDependencies] = {}
            for j in deps_arrs:
                dep_js_idx, dep_blk_idx, is_array = j[0]
                # no metadata:
                elem_deps = split_arr(j[1], metadata_size=0)
                all_deps_ij[(dep_js_idx, dep_blk_idx)] = {
                    "js_element_mapping": {},
                    "is_array": bool(is_array),
                }
                for k in elem_deps:
                    # first value of each group is the element index; the remainder
                    # are the element indices it depends on:
                    all_deps_ij[(dep_js_idx, dep_blk_idx)]["js_element_mapping"].update(
                        {k[1][0]: list(k[1][1:])}
                    )

            block_deps[(js_idx, blk_idx)] = all_deps_ij
        return block_deps
|
|
979
|
+
|
|
980
|
+
    def _append_submissions(self, subs: dict[int, Mapping[str, JSONed]]):
        """Persist new submissions.

        For each submission, a new per-submission metadata group is created
        containing: an at-submit metadata array (one slot per jobscript), packed
        run-ID, task-element and task-action arrays (extracted and removed from
        the JSON-like submission data), and an encoded jobscript-block
        dependencies array. The (mutated) submission records are then appended to
        the root attributes.

        Parameters
        ----------
        subs
            Mapping from submission index to JSON-like submission data.
        """

        for sub_idx, sub_i in subs.items():

            # add a new metadata group for this submission:
            sub_grp = self._get_all_submissions_metadata_group(mode="r+").create_group(
                sub_idx
            )

            # add a new at-submit metadata array for jobscripts of this submission:
            num_js = len(cast("list", sub_i["jobscripts"]))
            sub_grp.create_dataset(
                name=self._js_at_submit_md_arr_name,
                shape=num_js,
                dtype=object,
                object_codec=MsgPack(),
                chunks=1,
                write_empty_chunks=False,
            )

            # add a new array to store run IDs for each jobscript:
            combined_run_IDs, block_shapes = self._extract_submission_run_IDs_array(sub_i)
            run_IDs_arr = sub_grp.create_dataset(
                name=self._js_run_IDs_arr_name,
                data=combined_run_IDs,
                chunks=(None, None, None),  # single chunk for the whole array
            )
            run_IDs_arr.attrs["block_shapes"] = block_shapes

            # add a new array to store task-element map for each jobscript:
            (
                combined_task_elems,
                block_shapes,
            ) = self._extract_submission_task_elements_array(sub_i)
            task_elems_arr = sub_grp.create_dataset(
                name=self._js_task_elems_arr_name,
                data=combined_task_elems,
                chunks=(None, None, None),
            )
            task_elems_arr.attrs["block_shapes"] = block_shapes

            # add a new array to store task-actions for each jobscript:
            (
                combined_task_acts,
                block_num_acts,
            ) = self._extract_submission_task_actions_array(sub_i)
            task_acts_arr = sub_grp.create_dataset(
                name=self._js_task_acts_arr_name,
                data=combined_task_acts,
                chunks=(None, None),
            )
            task_acts_arr.attrs["block_num_acts"] = block_num_acts

            # add a new array to store jobscript-block dependencies for this submission:
            sub_grp.create_dataset(
                name=self._js_deps_arr_name,
                data=self._encode_jobscript_block_dependencies(sub_i),
                chunks=(None,),
            )

            # TODO: store block shapes in `grp.attrs` since it is defined at the
            # submission level

            # add attributes for at-submit-time submission metadata:
            grp = self._get_submission_metadata_group(sub_idx, mode="r+")
            grp.attrs["submission_parts"] = {}

        with self.using_resource("attrs", action="update") as attrs:
            attrs["submissions"].extend(subs.values())
|
|
1049
|
+
|
|
1050
|
+
def _append_task_element_IDs(self, task_ID: int, elem_IDs: list[int]):
|
|
1051
|
+
# I don't think there's a way to "append" to an existing array in a zarr ragged
|
|
1052
|
+
# array? So we have to build a new array from existing + new.
|
|
1053
|
+
arr = self._get_tasks_arr(mode="r+")
|
|
1054
|
+
elem_IDs_cur = arr[task_ID]
|
|
1055
|
+
elem_IDs_new = np.concatenate((elem_IDs_cur, elem_IDs))
|
|
1056
|
+
arr[task_ID] = elem_IDs_new
|
|
1057
|
+
|
|
1058
|
+
    @staticmethod
    def __as_dict(attrs: Attributes) -> ZarrAttrs:
        """
        Type thunk to work around incomplete typing in zarr.

        Converts a zarr `Attributes` object to a plain dict so it can be copied,
        mutated and compared independently of the store.
        """
        return cast("ZarrAttrs", attrs.asdict())
|
|
1064
|
+
|
|
1065
|
+
    @contextmanager
    def __mutate_attrs(self, arr: Array) -> Iterator[ZarrAttrs]:
        """Context manager yielding a deep copy of `arr`'s attributes.

        If the caller mutates the yielded copy, the modified attributes are
        written back to the array on exit; otherwise no write occurs.
        """
        attrs_orig = self.__as_dict(arr.attrs)
        attrs = copy.deepcopy(attrs_orig)
        yield attrs
        # only pay the write cost if the caller actually changed something:
        if attrs != attrs_orig:
            arr.attrs.put(attrs)
|
|
1072
|
+
|
|
1073
|
+
def _append_elements(self, elems: Sequence[ZarrStoreElement]):
|
|
1074
|
+
arr = self._get_elements_arr(mode="r+")
|
|
1075
|
+
with self.__mutate_attrs(arr) as attrs:
|
|
1076
|
+
arr_add = np.empty((len(elems)), dtype=object)
|
|
1077
|
+
arr_add[:] = [elem.encode(attrs) for elem in elems]
|
|
1078
|
+
arr.append(arr_add)
|
|
1079
|
+
|
|
1080
|
+
def _append_element_sets(self, task_id: int, es_js: Sequence[Mapping]):
|
|
1081
|
+
task_idx = task_idx = self._get_task_id_to_idx_map()[task_id]
|
|
1082
|
+
with self.using_resource("attrs", "update") as attrs:
|
|
1083
|
+
attrs["template"]["tasks"][task_idx]["element_sets"].extend(es_js)
|
|
1084
|
+
|
|
1085
|
+
def _append_elem_iter_IDs(self, elem_ID: int, iter_IDs: Iterable[int]):
|
|
1086
|
+
arr = self._get_elements_arr(mode="r+")
|
|
1087
|
+
attrs = self.__as_dict(arr.attrs)
|
|
1088
|
+
elem_dat = cast("list", arr[elem_ID])
|
|
1089
|
+
store_elem = ZarrStoreElement.decode(elem_dat, attrs)
|
|
1090
|
+
store_elem = store_elem.append_iteration_IDs(iter_IDs)
|
|
1091
|
+
arr[elem_ID] = store_elem.encode(attrs)
|
|
1092
|
+
# attrs shouldn't be mutated (TODO: test!)
|
|
1093
|
+
|
|
1094
|
+
def _append_elem_iters(self, iters: Sequence[ZarrStoreElementIter]):
|
|
1095
|
+
arr = self._get_iters_arr(mode="r+")
|
|
1096
|
+
with self.__mutate_attrs(arr) as attrs:
|
|
1097
|
+
arr_add = np.empty((len(iters)), dtype=object)
|
|
1098
|
+
arr_add[:] = [i.encode(attrs) for i in iters]
|
|
1099
|
+
arr.append(arr_add)
|
|
1100
|
+
|
|
1101
|
+
def _append_elem_iter_EAR_IDs(
|
|
1102
|
+
self, iter_ID: int, act_idx: int, EAR_IDs: Sequence[int]
|
|
1103
|
+
):
|
|
1104
|
+
arr = self._get_iters_arr(mode="r+")
|
|
1105
|
+
attrs = self.__as_dict(arr.attrs)
|
|
1106
|
+
iter_dat = cast("list", arr[iter_ID])
|
|
1107
|
+
store_iter = ZarrStoreElementIter.decode(iter_dat, attrs)
|
|
1108
|
+
store_iter = store_iter.append_EAR_IDs(pend_IDs={act_idx: EAR_IDs})
|
|
1109
|
+
arr[iter_ID] = store_iter.encode(attrs)
|
|
1110
|
+
# attrs shouldn't be mutated (TODO: test!)
|
|
1111
|
+
|
|
1112
|
+
def _update_elem_iter_EARs_initialised(self, iter_ID: int):
|
|
1113
|
+
arr = self._get_iters_arr(mode="r+")
|
|
1114
|
+
attrs = self.__as_dict(arr.attrs)
|
|
1115
|
+
iter_dat = cast("list", arr[iter_ID])
|
|
1116
|
+
store_iter = ZarrStoreElementIter.decode(iter_dat, attrs)
|
|
1117
|
+
store_iter = store_iter.set_EARs_initialised()
|
|
1118
|
+
arr[iter_ID] = store_iter.encode(attrs)
|
|
1119
|
+
# attrs shouldn't be mutated (TODO: test!)
|
|
1120
|
+
|
|
1121
|
+
def _update_at_submit_metadata(
|
|
1122
|
+
self,
|
|
1123
|
+
at_submit_metadata: dict[int, dict[str, Any]],
|
|
1124
|
+
):
|
|
1125
|
+
for sub_idx, metadata_i in at_submit_metadata.items():
|
|
1126
|
+
grp = self._get_submission_metadata_group(sub_idx, mode="r+")
|
|
1127
|
+
attrs = self.__as_dict(grp.attrs)
|
|
1128
|
+
attrs["submission_parts"].update(metadata_i["submission_parts"])
|
|
1129
|
+
grp.attrs.put(attrs)
|
|
1130
|
+
|
|
1131
|
+
def _update_loop_index(self, loop_indices: dict[int, dict[str, int]]):
|
|
1132
|
+
|
|
1133
|
+
arr = self._get_iters_arr(mode="r+")
|
|
1134
|
+
attrs = self.__as_dict(arr.attrs)
|
|
1135
|
+
iter_IDs = list(loop_indices.keys())
|
|
1136
|
+
iter_dat = arr.get_coordinate_selection(iter_IDs)
|
|
1137
|
+
store_iters = [ZarrStoreElementIter.decode(i, attrs) for i in iter_dat]
|
|
1138
|
+
|
|
1139
|
+
for idx, iter_ID_i in enumerate(iter_IDs):
|
|
1140
|
+
new_iter_i = store_iters[idx].update_loop_idx(loop_indices[iter_ID_i])
|
|
1141
|
+
# seems to be a Zarr bug that prevents `set_coordinate_selection` with an
|
|
1142
|
+
# object array, so set one-by-one:
|
|
1143
|
+
arr[iter_ID_i] = new_iter_i.encode(attrs)
|
|
1144
|
+
|
|
1145
|
+
    def _update_loop_num_iters(self, index: int, num_iters: list[list[list[int] | int]]):
        """Persist the number of added iterations for the loop at `index`."""
        with self.using_resource("attrs", action="update") as attrs:
            attrs["loops"][index]["num_added_iterations"] = num_iters
|
|
1148
|
+
|
|
1149
|
+
    def _update_loop_parents(self, index: int, parents: list[str]):
        """Persist the parent loop names for the loop at `index`."""
        with self.using_resource("attrs", action="update") as attrs:
            attrs["loops"][index]["parents"] = parents
|
|
1152
|
+
|
|
1153
|
+
def _update_iter_data_indices(self, iter_data_indices: dict[int, DataIndex]):
|
|
1154
|
+
|
|
1155
|
+
arr = self._get_iters_arr(mode="r+")
|
|
1156
|
+
attrs = self.__as_dict(arr.attrs)
|
|
1157
|
+
iter_IDs = list(iter_data_indices.keys())
|
|
1158
|
+
iter_dat = arr.get_coordinate_selection(iter_IDs)
|
|
1159
|
+
store_iters = [ZarrStoreElementIter.decode(i, attrs) for i in iter_dat]
|
|
1160
|
+
|
|
1161
|
+
for idx, iter_ID_i in enumerate(iter_IDs):
|
|
1162
|
+
new_iter_i = store_iters[idx].update_data_idx(iter_data_indices[iter_ID_i])
|
|
1163
|
+
# seems to be a Zarr bug that prevents `set_coordinate_selection` with an
|
|
1164
|
+
# object array, so set one-by-one:
|
|
1165
|
+
arr[iter_ID_i] = new_iter_i.encode(attrs)
|
|
1166
|
+
|
|
1167
|
+
    def _update_run_data_indices(self, run_data_indices: dict[int, DataIndex]):
        """Update the data indices of the specified runs (EARs)."""
        self._update_runs(
            updates={k: {"data_idx": v} for k, v in run_data_indices.items()}
        )
|
|
1171
|
+
|
|
1172
|
+
    def _append_EARs(self, EARs: Sequence[ZarrStoreEAR]):
        """Encode and append new EARs (runs) to the persistent 2D runs array,
        growing the array (and the run-directories array) as needed.

        Runs are stored in a fixed-width 2D array; a run's 1D ID is mapped to a
        (row, column) position via `get_2D_idx`.
        """
        arr = self._get_EARs_arr(mode="r+")
        with self.__mutate_attrs(arr) as attrs:
            num_existing = attrs["num_runs"]
            num_add = len(EARs)
            num_tot = num_existing + num_add
            arr_add = np.empty(num_add, dtype=object)
            arr_add[:] = [i.encode(self.ts_fmt, attrs) for i in EARs]

            # get new 1D indices:
            new_idx: NDArray = np.arange(num_existing, num_tot)

            # transform to 2D indices:
            r_idx, c_idx = get_2D_idx(new_idx, num_cols=arr.shape[1])

            # add rows to accommodate new runs:
            max_r_idx = np.max(r_idx)
            if max_r_idx + 1 > arr.shape[0]:
                arr.resize(max_r_idx + 1, arr.shape[1])

            # fill in new data:
            for arr_add_idx_i, (r_idx_i, c_idx_i) in enumerate(zip(r_idx, c_idx)):
                # seems to be a Zarr bug that prevents `set_coordinate_selection` with an
                # object array, so set one-by-one:
                arr[r_idx_i, c_idx_i] = arr_add[arr_add_idx_i]

            attrs["num_runs"] = num_tot

        # add more rows to run dirs array:
        dirs_arr = self._get_dirs_arr(mode="r+")
        dirs_arr.resize(num_tot)
|
|
1203
|
+
|
|
1204
|
+
    def _set_run_dirs(self, run_dir_arr: np.ndarray, run_idx: np.ndarray):
        """Write run-directory index records at the given run indices."""
        dirs_arr = self._get_dirs_arr(mode="r+")
        dirs_arr[run_idx] = run_dir_arr
|
|
1207
|
+
|
|
1208
|
+
@TimeIt.decorator
def _update_runs(self, updates: dict[int, dict[str, Any]]):
    """Update the provided EAR attribute values in the specified existing runs.

    Parameters
    ----------
    updates
        Keys are run (EAR) IDs; values map EAR attribute names to new values.
    """
    run_IDs = list(updates.keys())
    runs = self._get_persistent_EARs(run_IDs)

    arr = self._get_EARs_arr(mode="r+")
    with self.__mutate_attrs(arr) as attrs:
        # convert run IDs to 2D array indices; reuse `run_IDs` rather than
        # re-materialising `updates.keys()` a second time:
        r_idx, c_idx = get_2D_idx(np.array(run_IDs), num_cols=arr.shape[1])
        for ri, ci, (rID_i, upd_i) in zip(r_idx, c_idx, updates.items()):
            new_run_i = runs[rID_i].update(**upd_i)
            # seems to be a Zarr bug that prevents `set_coordinate_selection` with an
            # object array, so set one-by-one:
            arr[ri, ci] = new_run_i.encode(self.ts_fmt, attrs)
|
|
1227
|
+
|
|
1228
|
+
@TimeIt.decorator
def _update_EAR_submission_data(self, sub_data: Mapping[int, tuple[int, int | None]]):
    """Record the submission index and commands-file ID for the given runs.

    Parameters
    ----------
    sub_data
        Maps run ID to a `(submission_idx, commands_file_ID)` pair.
    """
    upd: dict[int, dict[str, Any]] = {}
    for run_id, (submission_idx, commands_file_ID) in sub_data.items():
        upd[run_id] = {
            "submission_idx": submission_idx,
            "commands_file_ID": commands_file_ID,
        }
    self._update_runs(updates=upd)
|
|
1236
|
+
|
|
1237
|
+
def _update_EAR_start(
    self,
    run_starts: dict[int, tuple[datetime, dict[str, Any] | None, str, int | None]],
):
    """Record start-time data for the given runs.

    Parameters
    ----------
    run_starts
        Maps run ID to `(start_time, snapshot_start, run_hostname, port_number)`.
    """
    upd: dict[int, dict[str, Any]] = {}
    for run_id, (start, snapshot, hostname, port) in run_starts.items():
        upd[run_id] = {
            "start_time": start,
            "snapshot_start": snapshot,
            "run_hostname": hostname,
            "port_number": port,
        }
    self._update_runs(updates=upd)
|
|
1252
|
+
|
|
1253
|
+
def _update_EAR_end(
    self, run_ends: dict[int, tuple[datetime, dict[str, Any] | None, int, bool]]
):
    """Record end-time data for the given runs.

    Parameters
    ----------
    run_ends
        Maps run ID to `(end_time, snapshot_end, exit_code, success)`.
    """
    upd: dict[int, dict[str, Any]] = {}
    for run_id, (end, snapshot, exit_code, success) in run_ends.items():
        upd[run_id] = {
            "end_time": end,
            "snapshot_end": snapshot,
            "exit_code": exit_code,
            "success": success,
        }
    self._update_runs(updates=upd)
|
|
1267
|
+
|
|
1268
|
+
def _update_EAR_skip(self, skips: dict[int, int]):
    """Set the `skip` attribute of each of the given runs."""
    upd = {run_id: {"skip": skip_val} for run_id, skip_val in skips.items()}
    self._update_runs(updates=upd)
|
|
1270
|
+
|
|
1271
|
+
def _update_js_metadata(self, js_meta: dict[int, dict[int, dict[str, Any]]]):
    """Update jobscript metadata, splitting items between the root-group attrs and
    the per-submission submit-time jobscript metadata array.

    Parameters
    ----------
    js_meta
        Nested mapping: submission index -> jobscript index -> metadata dict.
    """

    arr_keys = JOBSCRIPT_SUBMIT_TIME_KEYS  # these items go to the Zarr array

    # split into attributes to save to the root group metadata, and those to save to
    # the submit-time jobscript metadata array

    grp_dat = {}  # keys are tuples of (sub_idx, js_idx), values are metadata dicts

    for sub_idx, all_js_md in js_meta.items():
        js_arr = None  # lazily opened, at most once per submission
        for js_idx, js_meta_i in all_js_md.items():

            grp_dat_i = {k: v for k, v in js_meta_i.items() if k not in arr_keys}
            if grp_dat_i:
                grp_dat[(sub_idx, js_idx)] = grp_dat_i
            # values ordered as in JOBSCRIPT_SUBMIT_TIME_KEYS; missing keys -> None:
            arr_dat = [js_meta_i.get(k) for k in arr_keys]

            if any(arr_dat):
                # we are updating the at-submit metadata, so clear the cache:
                self.clear_jobscript_at_submit_metadata_cache()

                js_arr = js_arr or self._get_jobscripts_at_submit_metadata_arr(
                    mode="r+", sub_idx=sub_idx
                )
                self.logger.info(
                    f"updating submit-time jobscript metadata array: {arr_dat!r}."
                )
                js_arr[js_idx] = arr_dat

    if grp_dat:
        with self.using_resource("attrs", action="update") as attrs:
            for (sub_idx, js_idx), js_meta_i in grp_dat.items():
                self.logger.info(
                    f"updating jobscript metadata in the root group for "
                    f"(sub={sub_idx}, js={js_idx}): {js_meta_i!r}."
                )
                sub = cast(
                    "dict[str, list[dict[str, Any]]]", attrs["submissions"][sub_idx]
                )
                sub["jobscripts"][js_idx].update(js_meta_i)
|
|
1312
|
+
|
|
1313
|
+
def _append_parameters(self, params: Sequence[StoreParameter]):
    """Add new persistent parameters, appending to both the base-data and the
    sources arrays (one entry each per parameter)."""
    self._ensure_all_encoders()
    base_arr = self._get_parameter_base_array(mode="r+", write_empty_chunks=False)
    src_arr = self._get_parameter_sources_array(mode="r+")
    self.logger.debug(
        f"PersistentStore._append_parameters: adding {len(params)} parameters."
    )

    root_grp = self._get_parameter_user_array_group(mode="r+")
    encoded: list[dict[str, Any] | int] = []
    sources: list[dict] = []
    for param in params:
        encoded.append(
            param.encode(
                root_group=root_grp,
                arr_path=self._param_data_arr_grp_name(param.id_),
            )
        )
        # sources are stored with sorted keys:
        sources.append(dict(sorted(param.source.items())))

    base_arr.append(encoded)
    src_arr.append(sources)
    self.logger.debug(
        f"PersistentStore._append_parameters: finished adding {len(params)} parameters."
    )
|
|
1338
|
+
|
|
1339
|
+
def _set_parameter_values(self, set_parameters: dict[int, tuple[Any, bool]]):
    """Set multiple unset persistent parameters.

    Parameters
    ----------
    set_parameters
        Maps parameter ID to a `(value, is_file)` pair; file-valued parameters are
        set via `set_file`, others via `set_data`.
    """
    self._ensure_all_encoders()
    param_ids = list(set_parameters)
    # the `decode` call in `_get_persistent_parameters` should be quick:
    existing = self._get_persistent_parameters(param_ids)
    root_grp = self._get_parameter_user_array_group(mode="r+")

    encoded: list[dict[str, Any] | int] = []
    for pid, (value, is_file) in set_parameters.items():
        param = existing[pid]
        param = param.set_file(value) if is_file else param.set_data(value)
        encoded.append(
            param.encode(
                root_group=root_grp,
                arr_path=self._param_data_arr_grp_name(param.id_),
            )
        )

    # no need to update sources array:
    base_arr = self._get_parameter_base_array(mode="r+")
    base_arr.set_coordinate_selection(param_ids, encoded)
|
|
1364
|
+
|
|
1365
|
+
def _update_parameter_sources(self, sources: Mapping[int, ParamSource]):
    """Update the sources of multiple persistent parameters by merging the given
    source dicts into the stored ones."""

    param_ids = list(sources)
    src_arr = self._get_parameter_sources_array(mode="r+")
    current = src_arr.get_coordinate_selection(param_ids)
    merged = [
        update_param_source_dict(cast("ParamSource", cur_i), new_i)
        for cur_i, new_i in zip(current, sources.values())
    ]
    src_arr.set_coordinate_selection(param_ids, merged)
|
|
1376
|
+
|
|
1377
|
+
def _update_template_components(self, tc: dict[str, Any]):
    """Replace the persistent template components with `tc`."""
    with self.using_resource("attrs", "update") as md:
        md["template_components"] = tc
|
|
1380
|
+
|
|
1381
|
+
@TimeIt.decorator
def _get_num_persistent_tasks(self) -> int:
    """Get the number of persistent tasks, using the cache when enabled."""
    if self.use_cache and self.num_tasks_cache is not None:
        return self.num_tasks_cache
    num = len(self._get_tasks_arr())
    if self.use_cache and self.num_tasks_cache is None:
        self.num_tasks_cache = num
    return num
|
|
1391
|
+
|
|
1392
|
+
def _get_num_persistent_loops(self) -> int:
    """Return the count of persistent loops."""
    with self.using_resource("attrs", "read") as md:
        return len(md["loops"])

def _get_num_persistent_submissions(self) -> int:
    """Return the count of persistent submissions."""
    with self.using_resource("attrs", "read") as md:
        return len(md["submissions"])

def _get_num_persistent_elements(self) -> int:
    """Return the count of persistent elements."""
    return len(self._get_elements_arr())

def _get_num_persistent_elem_iters(self) -> int:
    """Return the count of persistent element iterations."""
    return len(self._get_iters_arr())
|
|
1409
|
+
|
|
1410
|
+
@TimeIt.decorator
def _get_num_persistent_EARs(self) -> int:
    """Get the number of persistent EARs (from the runs-array `num_runs`
    attribute), using the cache when enabled."""
    if self.use_cache and self.num_EARs_cache is not None:
        return self.num_EARs_cache
    num = self._get_EARs_arr().attrs["num_runs"]
    if self.use_cache and self.num_EARs_cache is None:
        self.num_EARs_cache = num
    return num
|
|
1420
|
+
|
|
1421
|
+
def _get_num_persistent_parameters(self):
    """Return the number of rows in the parameter base array."""
    return len(self._get_parameter_base_array())

def _get_num_persistent_added_tasks(self):
    """Return the persistent `num_added_tasks` attribute."""
    with self.using_resource("attrs", "read") as md:
        return md["num_added_tasks"]
|
|
1427
|
+
|
|
1428
|
+
@property
def zarr_store(self) -> Store:
    """The underlying store object (created lazily on first access)."""
    store = self._zarr_store
    if store is None:
        assert self.fs is not None
        store = self._get_zarr_store(self.path, self.fs)
        self._zarr_store = store
    return store
|
|
1437
|
+
|
|
1438
|
+
def _get_root_group(self, mode: str = "r", **kwargs) -> Group:
    """Open the root Zarr group of the store."""
    # TODO: investigate possible inefficiencies in how zarr groups/arrays are
    # retrieved: opening sub-groups sequentially may open the root group (and so
    # read its attrs file) multiple times. Defining a ZarrAttrsStoreResource per
    # zarr group/array (or at least per non-parameter group/array), with built-in
    # knowledge of the hierarchy (e.g. via a `path` attribute), could avoid
    # re-reading parent groups --- if that is happening currently.
    return zarr.open(self.zarr_store, mode=mode, **kwargs)
|
|
1447
|
+
|
|
1448
|
+
def _get_parameter_group(self, mode: str = "r", **kwargs) -> Group:
    """Open the parameters sub-group of the root group."""
    root = self._get_root_group(mode=mode, **kwargs)
    return root.get(self._param_grp_name)

def _get_parameter_base_array(self, mode: str = "r", **kwargs) -> Array:
    """Open the parameter base-data array directly from the store."""
    arr_path = f"{self._param_grp_name}/{self._param_base_arr_name}"
    return zarr.open(self.zarr_store, mode=mode, path=arr_path, **kwargs)

def _get_parameter_sources_array(self, mode: str = "r") -> Array:
    """Open the parameter-sources array."""
    grp = self._get_parameter_group(mode=mode)
    return grp.get(self._param_sources_arr_name)

def _get_parameter_user_array_group(self, mode: str = "r") -> Group:
    """Open the group that holds per-parameter user data arrays."""
    grp = self._get_parameter_group(mode=mode)
    return grp.get(self._param_user_arr_grp_name)
|
|
1460
|
+
|
|
1461
|
+
def _get_parameter_data_array_group(
    self,
    parameter_idx: int,
    mode: str = "r",
) -> Group:
    """Open the data-array group belonging to one parameter."""
    user_grp = self._get_parameter_user_array_group(mode=mode)
    return user_grp.get(self._param_data_arr_grp_name(parameter_idx))
|
|
1469
|
+
|
|
1470
|
+
def _get_array_group_and_dataset(
    self, mode: str, param_id: int, data_path: list[int]
):
    """Locate the Zarr group and dataset name that hold a parameter's array data.

    Raises
    ------
    ValueError
        If `data_path` does not appear in the parameter's `type_lookup` arrays.
    """
    base_entry = self._get_parameter_base_array(mode="r")[param_id]
    for candidate_path, arr_idx in base_entry["type_lookup"]["arrays"]:
        if candidate_path == data_path:
            break
    else:
        raise ValueError(
            f"Could not find array path {data_path} in the base data for parameter "
            f"ID {param_id}."
        )
    grp = self._get_parameter_user_array_group(mode=mode).get(
        f"{self._param_data_arr_grp_name(param_id)}"
    )
    return grp, f"arr_{arr_idx}"
|
|
1486
|
+
|
|
1487
|
+
def _get_metadata_group(self, mode: str = "r") -> Group:
    """Open the metadata group, falling back to opening it within the main store
    (e.g. for a zip store) when a nested directory store is not found."""
    try:
        md_path = Path(self.workflow.url).joinpath("metadata")
        md_store = zarr.NestedDirectoryStore(md_path)
        return zarr.open_group(store=md_store, mode=mode)
    except (FileNotFoundError, zarr.errors.GroupNotFoundError):
        # zip store?
        return zarr.open_group(self.zarr_store, path="metadata", mode=mode)
|
|
1495
|
+
|
|
1496
|
+
def _get_all_submissions_metadata_group(self, mode: str = "r") -> Group:
    """Open the group containing all per-submission metadata sub-groups."""
    return self._get_metadata_group(mode=mode).get(self._subs_md_group_name)

def _get_submission_metadata_group(self, sub_idx: int, mode: str = "r") -> Group:
    """Open the metadata group of a single submission."""
    all_subs = self._get_all_submissions_metadata_group(mode=mode)
    return all_subs.get(sub_idx)

def _get_submission_metadata_group_path(self, sub_idx: int) -> Path:
    """Return the filesystem path of a submission's metadata group."""
    grp = self._get_submission_metadata_group(sub_idx)
    return Path(grp.store.path).joinpath(grp.path)
|
|
1505
|
+
|
|
1506
|
+
def _get_jobscripts_at_submit_metadata_arr(
    self, sub_idx: int, mode: str = "r"
) -> Array:
    """Open the at-submit-time jobscript metadata array for a submission."""
    sub_grp = self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode)
    return sub_grp.get(self._js_at_submit_md_arr_name)

def _get_jobscripts_at_submit_metadata_arr_path(self, sub_idx: int) -> Path:
    """Return the filesystem path of the at-submit-time jobscript metadata array."""
    arr = self._get_jobscripts_at_submit_metadata_arr(sub_idx)
    return Path(arr.store.path).joinpath(arr.path)
|
|
1516
|
+
|
|
1517
|
+
@TimeIt.decorator
def _get_jobscripts_run_ID_arr(self, sub_idx: int, mode: str = "r") -> Array:
    """Open the jobscript run-IDs array of a submission."""
    sub_grp = self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode)
    return sub_grp.get(self._js_run_IDs_arr_name)

def _get_jobscripts_task_elements_arr(self, sub_idx: int, mode: str = "r") -> Array:
    """Open the jobscript task-elements array of a submission."""
    sub_grp = self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode)
    return sub_grp.get(self._js_task_elems_arr_name)

def _get_jobscripts_task_actions_arr(self, sub_idx: int, mode: str = "r") -> Array:
    """Open the jobscript task-actions array of a submission."""
    sub_grp = self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode)
    return sub_grp.get(self._js_task_acts_arr_name)

def _get_jobscripts_dependencies_arr(self, sub_idx: int, mode: str = "r") -> Array:
    """Open the jobscript dependencies array of a submission."""
    sub_grp = self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode)
    return sub_grp.get(self._js_deps_arr_name)
|
|
1537
|
+
|
|
1538
|
+
def _get_tasks_arr(self, mode: str = "r") -> Array:
    """Open the tasks metadata array."""
    return self._get_metadata_group(mode=mode).get(self._task_arr_name)

def _get_elements_arr(self, mode: str = "r") -> Array:
    """Open the elements metadata array."""
    return self._get_metadata_group(mode=mode).get(self._elem_arr_name)

def _get_iters_arr(self, mode: str = "r") -> Array:
    """Open the element-iterations metadata array."""
    return self._get_metadata_group(mode=mode).get(self._iter_arr_name)

def _get_EARs_arr(self, mode: str = "r") -> Array:
    """Open the EARs (runs) metadata array."""
    return self._get_metadata_group(mode=mode).get(self._EAR_arr_name)

def _get_dirs_arr(self, mode: str = "r") -> zarr.Array:
    """Open the run-directories metadata array."""
    return self._get_metadata_group(mode=mode).get(self._run_dir_arr_name)
|
|
1552
|
+
|
|
1553
|
+
@classmethod
def make_test_store_from_spec(
    cls,
    spec,
    dir=None,
    path="test_store",
    overwrite=False,
):
    """Generate a store for testing purposes.

    Creates the metadata group and its tasks/elements/iterations/EARs datasets,
    encodes the items produced by `prepare_test_store_from_spec`, appends them,
    and returns a new store instance rooted at `path`.
    """
    # NOTE: timestamp format is a placeholder here:
    ts_fmt = "FIXME"

    path = Path(dir or "", path)
    root = zarr.group(store=DirectoryStore(path), overwrite=overwrite)
    md = root.create_group("metadata")

    tasks_arr = md.create_dataset(
        name=cls._task_arr_name,
        shape=0,
        dtype=object,
        object_codec=VLenArray(int),
    )

    elems_arr = md.create_dataset(
        name=cls._elem_arr_name,
        shape=0,
        dtype=object,
        object_codec=cls._CODEC,
        chunks=1000,
    )
    elems_arr.attrs.update({"seq_idx": [], "src_idx": []})

    elem_iters_arr = md.create_dataset(
        name=cls._iter_arr_name,
        shape=0,
        dtype=object,
        object_codec=cls._CODEC,
        chunks=1000,
    )
    elem_iters_arr.attrs.update(
        {
            "loops": [],
            "schema_parameters": [],
            "parameter_paths": [],
        }
    )

    EARs_arr = md.create_dataset(
        name=cls._EAR_arr_name,
        shape=0,
        dtype=object,
        object_codec=cls._CODEC,
        chunks=1000,
    )
    EARs_arr.attrs["parameter_paths"] = []

    tasks, elems, elem_iters, EARs_ = super().prepare_test_store_from_spec(spec)

    path = Path(path).resolve()
    # encode each item against the (initially empty) per-array attribute dicts:
    tasks = [ZarrStoreTask(**i).encode() for i in tasks]
    elements = [ZarrStoreElement(**i).encode(elems_arr.attrs.asdict()) for i in elems]
    elem_iters = [
        ZarrStoreElementIter(**i).encode(elem_iters_arr.attrs.asdict())
        for i in elem_iters
    ]
    EARs = [ZarrStoreEAR(**i).encode(ts_fmt, EARs_arr.attrs.asdict()) for i in EARs_]

    append_items_to_ragged_array(tasks_arr, tasks)

    elems_arr.append(np.fromiter(elements, dtype=object))
    elem_iters_arr.append(np.fromiter(elem_iters, dtype=object))
    EARs_arr.append(np.fromiter(EARs, dtype=object))

    return cls(path)
|
|
1626
|
+
|
|
1627
|
+
def _get_persistent_template_components(self):
    """Return the stored template components."""
    with self.using_resource("attrs", "read") as md:
        return md["template_components"]

def _get_persistent_template(self) -> dict[str, JSONed]:
    """Return the stored workflow template."""
    with self.using_resource("attrs", "read") as md:
        return cast("dict[str, JSONed]", md["template"])
|
|
1634
|
+
|
|
1635
|
+
@TimeIt.decorator
def _get_persistent_tasks(self, id_lst: Iterable[int]) -> dict[int, ZarrStoreTask]:
    """Retrieve tasks by ID, consulting the cache before reading from the store.

    Element IDs are stored separately (in the tasks array, indexed via each task's
    `element_IDs_idx`) from the rest of the task data (in the root-group attrs).
    """
    tasks, id_lst = self._get_cached_persistent_tasks(id_lst)
    if id_lst:
        with self.using_resource("attrs", action="read") as attrs:
            task_dat: dict[int, dict[str, Any]] = {}
            # element-ID array indices for ALL tasks, in attrs order:
            elem_IDs: list[int] = []
            i: dict[str, Any]
            for idx, i in enumerate(attrs["tasks"]):
                i = copy.deepcopy(i)
                elem_IDs.append(i.pop("element_IDs_idx"))
                if id_lst is None or i["id_"] in id_lst:
                    task_dat[i["id_"]] = {**i, "index": idx}
        if task_dat:
            try:
                elem_IDs_arr_dat = self._get_tasks_arr().get_coordinate_selection(
                    elem_IDs
                )
            except BoundsCheckError:
                raise MissingStoreTaskError(
                    elem_IDs
                ) from None  # TODO: not an ID list

            # NOTE(review): `elem_IDs_arr_dat[id_]` indexes the selection by task
            # ID, which presumes task IDs match their position in attrs["tasks"]
            # --- confirm.
            new_tasks = {
                id_: ZarrStoreTask.decode({**i, "element_IDs": elem_IDs_arr_dat[id_]})
                for id_, i in task_dat.items()
            }
            self.task_cache.update(new_tasks)
            tasks.update(new_tasks)
    return tasks
|
|
1665
|
+
|
|
1666
|
+
@TimeIt.decorator
def _get_persistent_loops(
    self, id_lst: Iterable[int] | None = None
) -> dict[int, LoopDescriptor]:
    """Retrieve persistent loops, optionally filtered by index."""
    with self.using_resource("attrs", "read") as md:
        return {
            idx: cast("LoopDescriptor", loop_i)
            for idx, loop_i in enumerate(md["loops"])
            if id_lst is None or idx in id_lst
        }
|
|
1676
|
+
|
|
1677
|
+
@TimeIt.decorator
def _get_persistent_submissions(
    self, id_lst: Iterable[int] | None = None
) -> dict[int, Mapping[str, JSONed]]:
    """Retrieve persistent submissions, optionally filtered by index."""
    self.logger.debug("loading persistent submissions from the zarr store")
    ids = set(id_lst or ())
    with self.using_resource("attrs", "read") as md:
        selected = {
            idx: sub_i
            for idx, sub_i in enumerate(md["submissions"])
            if id_lst is None or idx in ids
        }
        # deep-copy the selected submission data before returning:
        return copy.deepcopy(selected)
|
|
1693
|
+
|
|
1694
|
+
@TimeIt.decorator
def _get_persistent_elements(
    self, id_lst: Iterable[int]
) -> dict[int, ZarrStoreElement]:
    """Retrieve elements by ID, consulting the cache before reading from disk.

    Raises
    ------
    MissingStoreElementError
        If any requested ID is out of bounds of the elements array.
    """
    elems, id_lst = self._get_cached_persistent_elements(id_lst)
    if id_lst:
        self.logger.debug(
            f"loading {len(id_lst)} persistent element(s) from disk: "
            f"{shorten_list_str(id_lst)}."
        )
        arr = self._get_elements_arr()
        attrs = arr.attrs.asdict()
        try:
            raw = arr.get_coordinate_selection(id_lst)
        except BoundsCheckError:
            raise MissingStoreElementError(id_lst) from None
        fetched = {
            id_i: ZarrStoreElement.decode(dat_i, attrs)
            for id_i, dat_i in zip(id_lst, raw)
        }
        self.element_cache.update(fetched)
        elems.update(fetched)
    return elems
|
|
1717
|
+
|
|
1718
|
+
@TimeIt.decorator
def _get_persistent_element_iters(
    self, id_lst: Iterable[int]
) -> dict[int, ZarrStoreElementIter]:
    """Retrieve element iterations by ID, consulting the cache before disk.

    Raises
    ------
    MissingStoreElementIterationError
        If any requested ID is out of bounds of the iterations array.
    """
    iters, id_lst = self._get_cached_persistent_element_iters(id_lst)
    if id_lst:
        self.logger.debug(
            f"loading {len(id_lst)} persistent element iteration(s) from disk: "
            f"{shorten_list_str(id_lst)}."
        )
        arr = self._get_iters_arr()
        attrs = arr.attrs.asdict()
        try:
            raw = arr.get_coordinate_selection(id_lst)
        except BoundsCheckError:
            raise MissingStoreElementIterationError(id_lst) from None
        fetched = {
            id_i: ZarrStoreElementIter.decode(dat_i, attrs)
            for id_i, dat_i in zip(id_lst, raw)
        }
        self.element_iter_cache.update(fetched)
        iters.update(fetched)
    return iters
|
|
1741
|
+
|
|
1742
|
+
@TimeIt.decorator
def _get_persistent_EARs(self, id_lst: Iterable[int]) -> dict[int, ZarrStoreEAR]:
    """Retrieve EARs (runs) by ID, consulting the cache before reading from disk.

    Supports both the current 2D runs array and the legacy 1D layout.
    """
    runs, id_lst = self._get_cached_persistent_EARs(id_lst)
    if id_lst:
        self.logger.debug(
            f"loading {len(id_lst)} persistent EAR(s) from disk: "
            f"{shorten_list_str(id_lst)}."
        )
        arr = self._get_EARs_arr()
        attrs = arr.attrs.asdict()
        sel: tuple[NDArray, NDArray] | list[int]
        try:
            # convert to 2D array indices:
            sel = get_2D_idx(np.array(id_lst), num_cols=arr.shape[1])
        except IndexError:
            # 1D runs array from before update to 2D in Feb 2025 refactor/jobscript:
            sel = id_lst
        try:
            EAR_arr_dat = _zarr_get_coord_selection(arr, sel, self.logger)
        except BoundsCheckError:
            raise MissingStoreEARError(id_lst) from None
        EAR_dat = dict(zip(id_lst, EAR_arr_dat))
        new_runs = {
            k: ZarrStoreEAR.decode(EAR_dat=v, ts_fmt=self.ts_fmt, attrs=attrs)
            for k, v in EAR_dat.items()
        }
        self.EAR_cache.update(new_runs)
        runs.update(new_runs)

    return runs
|
|
1772
|
+
|
|
1773
|
+
@TimeIt.decorator
def _get_persistent_parameters(
    self, id_lst: Iterable[int], *, dataset_copy: bool = False, **kwargs
) -> dict[int, ZarrStoreParameter]:
    """Retrieve parameters by ID, consulting the cache before reading from disk.

    Parameters
    ----------
    dataset_copy
        Passed through to `ZarrStoreParameter.decode`.

    Raises
    ------
    MissingParameterData
        If any requested ID is out of bounds of the parameter arrays.
    """
    self._ensure_all_decoders()
    params, id_lst = self._get_cached_persistent_parameters(id_lst)
    if id_lst:

        self.logger.debug(
            f"loading {len(id_lst)} persistent parameter(s) from disk: "
            f"{shorten_list_str(id_lst)}."
        )

        # TODO: implement the "parameter_metadata_cache" for zarr stores, which would
        # keep the base_arr and src_arr open
        base_arr = self._get_parameter_base_array(mode="r")
        src_arr = self._get_parameter_sources_array(mode="r")

        try:
            param_arr_dat = base_arr.get_coordinate_selection(list(id_lst))
            src_arr_dat = src_arr.get_coordinate_selection(list(id_lst))
        except BoundsCheckError:
            raise MissingParameterData(id_lst) from None

        param_dat = dict(zip(id_lst, param_arr_dat))
        src_dat = dict(zip(id_lst, src_arr_dat))

        new_params = {
            k: ZarrStoreParameter.decode(
                id_=k,
                data=v,
                source=src_dat[k],
                arr_group=self._get_parameter_data_array_group(k),
                dataset_copy=dataset_copy,
            )
            for k, v in param_dat.items()
        }
        self.parameter_cache.update(new_params)
        params.update(new_params)

    return params
|
|
1814
|
+
|
|
1815
|
+
@TimeIt.decorator
def _get_persistent_param_sources(
    self, id_lst: Iterable[int]
) -> dict[int, ParamSource]:
    """Retrieve parameter sources by ID, consulting the cache before disk.

    Raises
    ------
    MissingParameterData
        If any requested ID is out of bounds of the sources array.
    """
    sources, id_lst = self._get_cached_persistent_param_sources(id_lst)
    if id_lst:
        src_arr = self._get_parameter_sources_array(mode="r")
        try:
            raw = src_arr.get_coordinate_selection(list(id_lst))
        except BoundsCheckError:
            raise MissingParameterData(id_lst) from None
        fetched = dict(zip(id_lst, raw))
        self.param_sources_cache.update(fetched)
        sources.update(fetched)
    return sources
|
|
1830
|
+
|
|
1831
|
+
def _get_persistent_parameter_set_status(
    self, id_lst: Iterable[int]
) -> dict[int, bool]:
    """Map each parameter ID to whether its stored base-array row is non-None."""
    base_arr = self._get_parameter_base_array(mode="r")
    try:
        raw = base_arr.get_coordinate_selection(list(id_lst))
    except BoundsCheckError:
        raise MissingParameterData(id_lst) from None

    return {id_i: dat_i is not None for id_i, dat_i in zip(id_lst, raw)}
|
|
1841
|
+
|
|
1842
|
+
def _get_persistent_parameter_IDs(self) -> list[int]:
    """Return all parameter IDs currently in the store."""
    # we assume the row index is equivalent to ID, might need to revisit in future
    num_rows = len(self._get_parameter_base_array(mode="r"))
    return list(range(num_rows))
|
|
1846
|
+
|
|
1847
|
+
def get_submission_at_submit_metadata(
    self, sub_idx: int, metadata_attr: dict | None
) -> dict[str, Any]:
    """Retrieve the values of submission attributes that are stored at submit-time."""
    md = self._get_submission_metadata_group(sub_idx).attrs.asdict()
    return {key: md[key] for key in SUBMISSION_SUBMIT_TIME_KEYS}
|
|
1854
|
+
|
|
1855
|
+
def clear_jobscript_at_submit_metadata_cache(self):
    """Reset the in-memory cache of at-submit-time jobscript metadata."""
    self._jobscript_at_submit_metadata = {}
|
|
1858
|
+
|
|
1859
|
+
def get_jobscript_at_submit_metadata(
    self,
    sub_idx: int,
    js_idx: int,
    metadata_attr: dict | None,
) -> dict[str, Any]:
    """For the specified jobscript, retrieve the values of jobscript-submit-time
    attributes.

    Notes
    -----
    If the cache does not exist, this method will retrieve and cache metadata for
    all jobscripts for which metadata has been set. If the cache does exist, but not
    for the requested jobscript, then this method will retrieve and cache metadata for
    all non-cached jobscripts for which metadata has been set. If metadata has not
    yet been set for the specified jobscript, a dict with all `None` values will be
    returned.

    The cache can be cleared using the method
    `clear_jobscript_at_submit_metadata_cache`.

    """
    if self._jobscript_at_submit_metadata:
        # cache exists, but might not include data for the requested jobscript:
        if js_idx in self._jobscript_at_submit_metadata:
            return self._jobscript_at_submit_metadata[js_idx]

    arr = self._get_jobscripts_at_submit_metadata_arr(sub_idx)
    non_cached = set(range(len(arr))) - set(self._jobscript_at_submit_metadata.keys())

    # populate cache:
    arr_non_cached = arr.get_coordinate_selection((list(non_cached),))
    for js_idx_i, arr_item in zip(non_cached, arr_non_cached):
        try:
            # array items are ordered as in JOBSCRIPT_SUBMIT_TIME_KEYS:
            self._jobscript_at_submit_metadata[js_idx_i] = {
                i: arr_item[i_idx]
                for i_idx, i in enumerate(JOBSCRIPT_SUBMIT_TIME_KEYS)
            }
        except TypeError:
            # data for this jobscript is not set
            pass

    if js_idx not in self._jobscript_at_submit_metadata:
        return {i: None for i in JOBSCRIPT_SUBMIT_TIME_KEYS}

    return self._jobscript_at_submit_metadata[js_idx]
|
|
1905
|
+
|
|
1906
|
+
@TimeIt.decorator
def get_jobscript_block_run_ID_array(
    self,
    sub_idx: int,
    js_idx: int,
    blk_idx: int,
    run_ID_arr: NDArray | None,
) -> NDArray:
    """For the specified jobscript-block, retrieve the run ID array.

    Parameters
    ----------
    run_ID_arr
        If not None, the in-memory run-ID array, which is returned directly.
    """

    if run_ID_arr is not None:
        self.logger.debug("jobscript-block run IDs are still in memory.")
        # in the special case when the Submission object has just been created, the
        # run ID arrays will not yet be persistent.
        return np.asarray(run_ID_arr)

    # otherwise, `append_submissions` has been called, the run IDs have been
    # removed from the JSON-representation of the submission object, and have been
    # saved in separate zarr arrays:
    if sub_idx not in self._jobscript_run_ID_arrays:

        self.logger.debug(
            f"retrieving jobscript-block run IDs for submission {sub_idx} from disk,"
            f" and caching."
        )

        # for a given submission, run IDs are stored for all jobscript-blocks in the
        # same array (and chunk), so retrieve all of them and cache:

        arr = self._get_jobscripts_run_ID_arr(sub_idx)
        arr_dat = arr[:]
        block_shapes = arr.attrs["block_shapes"]

        self._jobscript_run_ID_arrays[sub_idx] = {}  # keyed by (js_idx, blk_idx)
        arr_idx = 0
        for js_idx_i, js_blk_shapes in enumerate(block_shapes):
            for blk_idx_j, blk_shape_j in enumerate(js_blk_shapes):
                # trim each stored row back to the block's true 2D shape:
                self._jobscript_run_ID_arrays[sub_idx][(js_idx_i, blk_idx_j)] = (
                    arr_dat[arr_idx, : blk_shape_j[0], : blk_shape_j[1]]
                )
                arr_idx += 1

    else:
        self.logger.debug(
            f"retrieving jobscript-block run IDs for submission {sub_idx} from cache."
        )

    return self._jobscript_run_ID_arrays[sub_idx][(js_idx, blk_idx)]
|
|
1954
|
+
|
|
1955
|
+
def get_jobscript_block_task_elements_map(
|
|
1956
|
+
self,
|
|
1957
|
+
sub_idx: int,
|
|
1958
|
+
js_idx: int,
|
|
1959
|
+
blk_idx: int,
|
|
1960
|
+
task_elems_map: dict[int, list[int]] | None,
|
|
1961
|
+
) -> dict[int, list[int]]:
|
|
1962
|
+
"""For the specified jobscript-block, retrieve the task-elements mapping."""
|
|
1963
|
+
|
|
1964
|
+
if task_elems_map is not None:
|
|
1965
|
+
self.logger.debug("jobscript-block task elements are still in memory.")
|
|
1966
|
+
# in the special case when the Submission object has just been created, the
|
|
1967
|
+
# task elements arrays will not yet be persistent.
|
|
1968
|
+
return task_elems_map
|
|
1969
|
+
|
|
1970
|
+
# otherwise, `append_submissions` has been called, the task elements have been
|
|
1971
|
+
# removed from the JSON-representation of the submission object, and have been
|
|
1972
|
+
# saved in separate zarr arrays:
|
|
1973
|
+
if sub_idx not in self._jobscript_task_element_maps:
|
|
1974
|
+
|
|
1975
|
+
self.logger.debug(
|
|
1976
|
+
f"retrieving jobscript-block task elements for submission {sub_idx} from "
|
|
1977
|
+
f"disk, and caching."
|
|
1978
|
+
)
|
|
1979
|
+
|
|
1980
|
+
# for a given submission, task elements are stored for all jobscript-blocks in
|
|
1981
|
+
# the same array (and chunk), so retrieve all of them and cache:
|
|
1982
|
+
|
|
1983
|
+
arr = self._get_jobscripts_task_elements_arr(sub_idx)
|
|
1984
|
+
arr_dat = arr[:]
|
|
1985
|
+
block_shapes = arr.attrs["block_shapes"]
|
|
1986
|
+
|
|
1987
|
+
self._jobscript_task_element_maps[sub_idx] = {} # keys: (js_idx, blk_idx)
|
|
1988
|
+
arr_idx = 0
|
|
1989
|
+
for js_idx_i, js_blk_shapes in enumerate(block_shapes):
|
|
1990
|
+
for blk_idx_j, blk_shape_j in enumerate(js_blk_shapes):
|
|
1991
|
+
arr_i = arr_dat[arr_idx, : blk_shape_j[1], : blk_shape_j[0] + 1]
|
|
1992
|
+
self._jobscript_task_element_maps[sub_idx][(js_idx_i, blk_idx_j)] = {
|
|
1993
|
+
k[0]: list(k[1:]) for k in arr_i
|
|
1994
|
+
}
|
|
1995
|
+
arr_idx += 1
|
|
1996
|
+
|
|
1997
|
+
else:
|
|
1998
|
+
self.logger.debug(
|
|
1999
|
+
f"retrieving jobscript-block task elements for submission {sub_idx} from "
|
|
2000
|
+
"cache."
|
|
2001
|
+
)
|
|
2002
|
+
|
|
2003
|
+
return self._jobscript_task_element_maps[sub_idx][(js_idx, blk_idx)]
|
|
2004
|
+
|
|
2005
|
+
@TimeIt.decorator
|
|
2006
|
+
def get_jobscript_block_task_actions_array(
|
|
2007
|
+
self,
|
|
2008
|
+
sub_idx: int,
|
|
2009
|
+
js_idx: int,
|
|
2010
|
+
blk_idx: int,
|
|
2011
|
+
task_actions_arr: NDArray | list[tuple[int, int, int]] | None,
|
|
2012
|
+
) -> NDArray:
|
|
2013
|
+
"""For the specified jobscript-block, retrieve the task-actions array."""
|
|
2014
|
+
|
|
2015
|
+
if task_actions_arr is not None:
|
|
2016
|
+
self.logger.debug("jobscript-block task actions are still in memory.")
|
|
2017
|
+
# in the special case when the Submission object has just been created, the
|
|
2018
|
+
# task actions arrays will not yet be persistent.
|
|
2019
|
+
return np.asarray(task_actions_arr)
|
|
2020
|
+
|
|
2021
|
+
# otherwise, `append_submissions` has been called, the task actions have been
|
|
2022
|
+
# removed from the JSON-representation of the submission object, and have been
|
|
2023
|
+
# saved in separate zarr arrays:
|
|
2024
|
+
if sub_idx not in self._jobscript_task_actions_arrays:
|
|
2025
|
+
|
|
2026
|
+
self.logger.debug(
|
|
2027
|
+
f"retrieving jobscript-block task actions for submission {sub_idx} from "
|
|
2028
|
+
f"disk, and caching."
|
|
2029
|
+
)
|
|
2030
|
+
|
|
2031
|
+
# for a given submission, task actions are stored for all jobscript-blocks in
|
|
2032
|
+
# the same array (and chunk), so retrieve all of them and cache:
|
|
2033
|
+
|
|
2034
|
+
arr = self._get_jobscripts_task_actions_arr(sub_idx)
|
|
2035
|
+
arr_dat = arr[:]
|
|
2036
|
+
block_num_acts = arr.attrs["block_num_acts"]
|
|
2037
|
+
|
|
2038
|
+
num_acts_count = 0
|
|
2039
|
+
self._jobscript_task_actions_arrays[sub_idx] = {} # keys: (js_idx, blk_idx)
|
|
2040
|
+
for js_idx_i, js_blk_num_acts in enumerate(block_num_acts):
|
|
2041
|
+
for blk_idx_j, blk_num_acts_j in enumerate(js_blk_num_acts):
|
|
2042
|
+
arr_i = arr_dat[num_acts_count : num_acts_count + blk_num_acts_j]
|
|
2043
|
+
num_acts_count += blk_num_acts_j
|
|
2044
|
+
self._jobscript_task_actions_arrays[sub_idx][
|
|
2045
|
+
(js_idx_i, blk_idx_j)
|
|
2046
|
+
] = arr_i
|
|
2047
|
+
|
|
2048
|
+
else:
|
|
2049
|
+
self.logger.debug(
|
|
2050
|
+
f"retrieving jobscript-block task actions for submission {sub_idx} from "
|
|
2051
|
+
"cache."
|
|
2052
|
+
)
|
|
2053
|
+
|
|
2054
|
+
return self._jobscript_task_actions_arrays[sub_idx][(js_idx, blk_idx)]
|
|
2055
|
+
|
|
2056
|
+
@TimeIt.decorator
|
|
2057
|
+
def get_jobscript_block_dependencies(
    self,
    sub_idx: int,
    js_idx: int,
    blk_idx: int,
    js_dependencies: dict[tuple[int, int], ResolvedJobscriptBlockDependencies] | None,
) -> dict[tuple[int, int], ResolvedJobscriptBlockDependencies]:
    """For the specified jobscript-block, retrieve the dependencies."""
    # A just-created Submission object still holds its dependencies in memory
    # (they have not yet been made persistent), so return them directly.
    if js_dependencies is not None:
        self.logger.debug("jobscript-block dependencies are still in memory.")
        return js_dependencies

    # Otherwise `append_submissions` has run: the dependencies were stripped
    # from the submission's JSON representation and saved in a zarr array.
    if sub_idx in self._jobscript_dependencies:
        self.logger.debug(
            f"retrieving jobscript-block dependencies for submission {sub_idx} from "
            "cache."
        )
    else:
        self.logger.debug(
            f"retrieving jobscript-block dependencies for submission {sub_idx} from "
            f"disk, and caching."
        )
        # One array holds the dependencies of every jobscript-block of this
        # submission, so decode and cache them all at once:
        dep_arr = self._get_jobscripts_dependencies_arr(sub_idx)
        self._jobscript_dependencies[sub_idx] = (
            self._decode_jobscript_block_dependencies(dep_arr)
        )

    return self._jobscript_dependencies[sub_idx][(js_idx, blk_idx)]
|
|
2093
|
+
|
|
2094
|
+
def get_ts_fmt(self):
    """Return the timestamp format string from the workflow's attributes."""
    with self.using_resource("attrs", action="read") as attrs:
        fmt = attrs["ts_fmt"]
    return fmt
|
|
2100
|
+
|
|
2101
|
+
def get_ts_name_fmt(self):
    """Return the timestamp format used in names, from the workflow's attributes."""
    with self.using_resource("attrs", action="read") as attrs:
        fmt = attrs["ts_name_fmt"]
    return fmt
|
|
2107
|
+
|
|
2108
|
+
def get_creation_info(self):
    """Return a deep copy of the workflow-creation metadata from the attributes.

    A deep copy is returned so callers cannot mutate the stored metadata.
    """
    with self.using_resource("attrs", action="read") as attrs:
        info = copy.deepcopy(attrs["creation_info"])
    return info
|
|
2114
|
+
|
|
2115
|
+
def get_name(self):
    """Return the workflow's name as recorded in the persistent attributes."""
    with self.using_resource("attrs", action="read") as attrs:
        name = attrs["name"]
    return name
|
|
2121
|
+
|
|
2122
|
+
def zip(
    self,
    path: str = ".",
    log: str | None = None,
    overwrite: bool = False,
    include_execute: bool = False,
    include_rechunk_backups: bool = False,
):
    """
    Convert the persistent store to zipped form.

    Parameters
    ----------
    path:
        Path at which to create the new zipped workflow. If this is an existing
        directory, the zip file will be created within this directory. Otherwise,
        this path is assumed to be the full file path to the new zip file.
    log:
        Passed through to `zarr.copy_store` as its copy log.
    overwrite:
        If True, replace an existing file at the destination path.
    include_execute:
        If True, also copy the `execute` data into the zip.
    include_rechunk_backups:
        If True, also copy the `runs.bak`/`base.bak` rechunk backups.
    """
    with Console().status(f"Zipping workflow {self.workflow.name!r}..."):
        # TODO: this won't work for remote file systems
        target = Path(path).resolve()
        if target.is_dir():
            # a directory was given; place the zip inside it, named after the
            # workflow:
            target = target.joinpath(self.workflow.name).with_suffix(".zip")

        if not overwrite and target.exists():
            raise FileExistsError(
                f"File at path already exists: {target!r}. Pass `overwrite=True` to "
                f"overwrite the existing file."
            )

        target_s = str(target)

        src_store = self.zarr_store
        # may prompt for a password on an authentication failure:
        zfs, _ = ask_pw_on_auth_exc(
            ZipFileSystem,
            fo=target_s,
            mode="w",
            target_options={},
            add_pw_to="target_options",
        )
        dst_store = FSStore(url="", fs=zfs)

        # skip transient/backup data unless explicitly requested:
        skip = []
        if not include_execute:
            skip.append("execute")
        if not include_rechunk_backups:
            skip.extend(("runs.bak", "base.bak"))

        zarr.copy_store(
            src_store,
            dst_store,
            excludes=skip or None,
            log=log,
        )
        del zfs  # ZipFileSystem remains open for instance lifetime
        return target_s
|
|
2178
|
+
|
|
2179
|
+
def unzip(self, path: str = ".", log: str | None = None):
|
|
2180
|
+
raise ValueError("Not a zip store!")
|
|
2181
|
+
|
|
2182
|
+
def _rechunk_arr(
    self,
    arr: Array,
    chunk_size: int | None = None,
    backup: bool = True,
    status: bool = True,
) -> Array:
    """Rewrite a zarr array with a new chunking scheme, in place on disk.

    The array is copied into a sibling ``.rechunked`` directory using the same
    store class, then the original directory is deleted and the new array is
    moved into its place. Timing information is printed on completion.

    Parameters
    ----------
    arr:
        The zarr array to rechunk.
    chunk_size:
        Chunk length for the new array. If `None`, the entire array becomes a
        single chunk.
    backup:
        If True, first copy the array directory to a ``.bak`` sibling (skipped
        when a backup directory already exists).
    status:
        If True, show a console status spinner with progress messages.
    """
    arr_path = Path(arr.store.path) / arr.path
    arr_name = arr.path.split("/")[-1]

    if status:
        s = Console().status("Rechunking...")
        s.start()
    backup_time = None

    if backup:
        if status:
            s.update("Backing up...")
        backup_path = arr_path.with_suffix(".bak")
        if backup_path.is_dir():
            # an earlier backup already exists; keep it rather than overwrite
            pass
        else:
            tic = time.perf_counter()
            shutil.copytree(arr_path, backup_path)
            toc = time.perf_counter()
            backup_time = toc - tic

    tic = time.perf_counter()
    arr_rc_path = arr_path.with_suffix(".rechunked")
    if status:
        s.update("Creating new array...")

    # use the same store:
    try:
        arr_rc_store = arr.store.__class__(path=arr_rc_path)
    except TypeError:
        # FSStore
        arr_rc_store = arr.store.__class__(url=str(arr_rc_path))

    arr_rc = zarr.create(
        store=arr_rc_store,
        shape=arr.shape,
        chunks=arr.shape if chunk_size is None else chunk_size,
        dtype=object,
        object_codec=self._CODEC,
    )

    if status:
        s.update("Copying data...")
    # copy element-wise so corrupt chunks can be skipped rather than aborting
    # the whole rechunk:
    data = np.empty(shape=arr.shape, dtype=object)
    bad_data = []
    for idx in range(len(arr)):
        try:
            data[idx] = arr[idx]
        except RuntimeError:
            # blosc decompression errors
            bad_data.append(idx)
    arr_rc[:] = data

    # carry over all user attributes to the new array:
    arr_rc.attrs.put(arr.attrs.asdict())

    if status:
        s.update("Deleting old array...")
    # NOTE(review): the old array is removed before the new one is moved into
    # place; recovery from a failure between these two steps relies on the
    # `.bak` copy made above (if `backup` was True) — confirm acceptable.
    shutil.rmtree(arr_path)

    if status:
        s.update("Moving new array into place...")
    shutil.move(arr_rc_path, arr_path)

    toc = time.perf_counter()
    rechunk_time = toc - tic

    if status:
        s.stop()

    if backup_time:
        print(f"Time to backup {arr_name}: {backup_time:.1f} s")

    print(f"Time to rechunk and move {arr_name}: {rechunk_time:.1f} s")

    if bad_data:
        print(f"Bad data at {arr_name} indices: {bad_data}.")

    return arr_rc
|
|
2266
|
+
|
|
2267
|
+
def rechunk_parameter_base(
    self,
    chunk_size: int | None = None,
    backup: bool = True,
    status: bool = True,
) -> Array:
    """
    Rechunk the parameter-base array to be stored more efficiently.

    Delegates to `_rechunk_arr`; see that method for parameter semantics.
    """
    return self._rechunk_arr(
        self._get_parameter_base_array(), chunk_size, backup, status
    )
|
|
2278
|
+
|
|
2279
|
+
def rechunk_runs(
    self,
    chunk_size: int | None = None,
    backup: bool = True,
    status: bool = True,
) -> Array:
    """
    Rechunk the runs (EARs) array to be stored more efficiently.

    Delegates to `_rechunk_arr`; see that method for parameter semantics.
    """
    return self._rechunk_arr(self._get_EARs_arr(), chunk_size, backup, status)
|
|
2290
|
+
|
|
2291
|
+
def get_dirs_array(self) -> NDArray:
    """
    Retrieve the full run-directories array.
    """
    dirs_arr = self._get_dirs_arr()
    return dirs_arr[:]
|
|
2296
|
+
|
|
2297
|
+
|
|
2298
|
+
class ZarrZipPersistentStore(ZarrPersistentStore):
    """A store designed mainly as an archive format that can be uploaded to data
    repositories such as Zenodo.

    Note
    ----
    Archive format persistent stores cannot be updated without being unzipped first.
    """

    # Store-format name used to identify this store type.
    _name: ClassVar[str] = "zip"
    # A zipped store is effectively read-only: no creation, editing,
    # parallelism, scheduling, or submission is supported.
    _features: ClassVar[PersistentStoreFeatures] = PersistentStoreFeatures(
        create=False,
        edit=False,
        jobscript_parallelism=False,
        EAR_parallelism=False,
        schedulers=False,
        submission=False,
    )

    # TODO: enforce read-only nature
|
|
2318
|
+
|
|
2319
|
+
def zip(
|
|
2320
|
+
self,
|
|
2321
|
+
path: str = ".",
|
|
2322
|
+
log: str | None = None,
|
|
2323
|
+
overwrite: bool = False,
|
|
2324
|
+
include_execute: bool = False,
|
|
2325
|
+
include_rechunk_backups: bool = False,
|
|
2326
|
+
):
|
|
2327
|
+
raise ValueError("Already a zip store!")
|
|
2328
|
+
|
|
2329
|
+
def unzip(self, path: str = ".", log: str | None = None) -> str:
    """
    Expand the persistent store.

    Parameters
    ----------
    path:
        Path at which to create the new unzipped workflow. If this is an existing
        directory, the new workflow directory will be created within this directory.
        Otherwise, this path will represent the new workflow directory path.
    log:
        Passed through to `zarr.copy_store` as its copy log.
    """
    with Console().status(f"Unzipping workflow {self.workflow.name!r}..."):
        # TODO: this won't work for remote file systems
        target = Path(path).resolve()
        if target.is_dir():
            # a directory was given; create the workflow directory inside it:
            target = target.joinpath(self.workflow.name)

        if target.exists():
            raise FileExistsError(f"Directory at path already exists: {target!r}.")

        target_s = str(target)
        # copy everything from the zip store into a fresh directory store:
        zarr.copy_store(self.zarr_store, FSStore(url=target_s), log=log)
        return target_s
|
|
2357
|
+
|
|
2358
|
+
def copy(self, path: PathLike = None) -> Path:
    """Copying a zip store is not currently supported; always raises."""
    # not sure how to do this.
    raise NotImplementedError()
|
|
2361
|
+
|
|
2362
|
+
def delete_no_confirm(self) -> None:
    """Deleting a zip store is unsupported; always raises.

    `ZipFileSystem.rm()` does not seem to be implemented.
    """
    raise NotImplementedError()
|
|
2365
|
+
|
|
2366
|
+
def _rechunk_arr(
|
|
2367
|
+
self,
|
|
2368
|
+
arr,
|
|
2369
|
+
chunk_size: int | None = None,
|
|
2370
|
+
backup: bool = True,
|
|
2371
|
+
status: bool = True,
|
|
2372
|
+
) -> Array:
|
|
2373
|
+
raise NotImplementedError
|
|
2374
|
+
|
|
2375
|
+
def get_text_file(self, path: str | Path) -> str:
|
|
2376
|
+
"""Retrieve the contents of a text file stored within the workflow."""
|
|
2377
|
+
path = Path(path)
|
|
2378
|
+
if path.is_absolute():
|
|
2379
|
+
path = path.relative_to(self.workflow.url)
|
|
2380
|
+
path = str(path.as_posix())
|
|
2381
|
+
assert self.fs
|
|
2382
|
+
try:
|
|
2383
|
+
with self.fs.open(path, mode="rt") as fp:
|
|
2384
|
+
return fp.read()
|
|
2385
|
+
except KeyError:
|
|
2386
|
+
raise FileNotFoundError(
|
|
2387
|
+
f"File within zip at location {path!r} does not exist."
|
|
2388
|
+
) from None
|