hpcflow 0.1.15__py3-none-any.whl → 0.2.0a271__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (275)
  1. hpcflow/__init__.py +2 -11
  2. hpcflow/__pyinstaller/__init__.py +5 -0
  3. hpcflow/__pyinstaller/hook-hpcflow.py +40 -0
  4. hpcflow/_version.py +1 -1
  5. hpcflow/app.py +43 -0
  6. hpcflow/cli.py +2 -461
  7. hpcflow/data/demo_data_manifest/__init__.py +3 -0
  8. hpcflow/data/demo_data_manifest/demo_data_manifest.json +6 -0
  9. hpcflow/data/jinja_templates/test/test_template.txt +8 -0
  10. hpcflow/data/programs/hello_world/README.md +1 -0
  11. hpcflow/data/programs/hello_world/hello_world.c +87 -0
  12. hpcflow/data/programs/hello_world/linux/hello_world +0 -0
  13. hpcflow/data/programs/hello_world/macos/hello_world +0 -0
  14. hpcflow/data/programs/hello_world/win/hello_world.exe +0 -0
  15. hpcflow/data/scripts/__init__.py +1 -0
  16. hpcflow/data/scripts/bad_script.py +2 -0
  17. hpcflow/data/scripts/demo_task_1_generate_t1_infile_1.py +8 -0
  18. hpcflow/data/scripts/demo_task_1_generate_t1_infile_2.py +8 -0
  19. hpcflow/data/scripts/demo_task_1_parse_p3.py +7 -0
  20. hpcflow/data/scripts/do_nothing.py +2 -0
  21. hpcflow/data/scripts/env_specifier_test/input_file_generator_pass_env_spec.py +4 -0
  22. hpcflow/data/scripts/env_specifier_test/main_script_test_pass_env_spec.py +8 -0
  23. hpcflow/data/scripts/env_specifier_test/output_file_parser_pass_env_spec.py +4 -0
  24. hpcflow/data/scripts/env_specifier_test/v1/input_file_generator_basic.py +4 -0
  25. hpcflow/data/scripts/env_specifier_test/v1/main_script_test_direct_in_direct_out.py +7 -0
  26. hpcflow/data/scripts/env_specifier_test/v1/output_file_parser_basic.py +4 -0
  27. hpcflow/data/scripts/env_specifier_test/v2/main_script_test_direct_in_direct_out.py +7 -0
  28. hpcflow/data/scripts/generate_t1_file_01.py +7 -0
  29. hpcflow/data/scripts/import_future_script.py +7 -0
  30. hpcflow/data/scripts/input_file_generator_basic.py +3 -0
  31. hpcflow/data/scripts/input_file_generator_basic_FAIL.py +3 -0
  32. hpcflow/data/scripts/input_file_generator_test_stdout_stderr.py +8 -0
  33. hpcflow/data/scripts/main_script_test_direct_in.py +3 -0
  34. hpcflow/data/scripts/main_script_test_direct_in_direct_out.py +6 -0
  35. hpcflow/data/scripts/main_script_test_direct_in_direct_out_2.py +6 -0
  36. hpcflow/data/scripts/main_script_test_direct_in_direct_out_2_fail_allowed.py +6 -0
  37. hpcflow/data/scripts/main_script_test_direct_in_direct_out_2_fail_allowed_group.py +7 -0
  38. hpcflow/data/scripts/main_script_test_direct_in_direct_out_3.py +6 -0
  39. hpcflow/data/scripts/main_script_test_direct_in_direct_out_all_iters_test.py +15 -0
  40. hpcflow/data/scripts/main_script_test_direct_in_direct_out_env_spec.py +7 -0
  41. hpcflow/data/scripts/main_script_test_direct_in_direct_out_labels.py +8 -0
  42. hpcflow/data/scripts/main_script_test_direct_in_group_direct_out_3.py +6 -0
  43. hpcflow/data/scripts/main_script_test_direct_in_group_one_fail_direct_out_3.py +6 -0
  44. hpcflow/data/scripts/main_script_test_direct_sub_param_in_direct_out.py +6 -0
  45. hpcflow/data/scripts/main_script_test_hdf5_in_obj.py +12 -0
  46. hpcflow/data/scripts/main_script_test_hdf5_in_obj_2.py +12 -0
  47. hpcflow/data/scripts/main_script_test_hdf5_in_obj_group.py +12 -0
  48. hpcflow/data/scripts/main_script_test_hdf5_out_obj.py +11 -0
  49. hpcflow/data/scripts/main_script_test_json_and_direct_in_json_out.py +14 -0
  50. hpcflow/data/scripts/main_script_test_json_in_json_and_direct_out.py +17 -0
  51. hpcflow/data/scripts/main_script_test_json_in_json_out.py +14 -0
  52. hpcflow/data/scripts/main_script_test_json_in_json_out_labels.py +16 -0
  53. hpcflow/data/scripts/main_script_test_json_in_obj.py +12 -0
  54. hpcflow/data/scripts/main_script_test_json_out_FAIL.py +3 -0
  55. hpcflow/data/scripts/main_script_test_json_out_obj.py +10 -0
  56. hpcflow/data/scripts/main_script_test_json_sub_param_in_json_out_labels.py +16 -0
  57. hpcflow/data/scripts/main_script_test_shell_env_vars.py +12 -0
  58. hpcflow/data/scripts/main_script_test_std_out_std_err.py +6 -0
  59. hpcflow/data/scripts/output_file_parser_basic.py +3 -0
  60. hpcflow/data/scripts/output_file_parser_basic_FAIL.py +7 -0
  61. hpcflow/data/scripts/output_file_parser_test_stdout_stderr.py +8 -0
  62. hpcflow/data/scripts/parse_t1_file_01.py +4 -0
  63. hpcflow/data/scripts/script_exit_test.py +5 -0
  64. hpcflow/data/template_components/__init__.py +1 -0
  65. hpcflow/data/template_components/command_files.yaml +26 -0
  66. hpcflow/data/template_components/environments.yaml +13 -0
  67. hpcflow/data/template_components/parameters.yaml +14 -0
  68. hpcflow/data/template_components/task_schemas.yaml +139 -0
  69. hpcflow/data/workflows/workflow_1.yaml +5 -0
  70. hpcflow/examples.ipynb +1037 -0
  71. hpcflow/sdk/__init__.py +149 -0
  72. hpcflow/sdk/app.py +4266 -0
  73. hpcflow/sdk/cli.py +1479 -0
  74. hpcflow/sdk/cli_common.py +385 -0
  75. hpcflow/sdk/config/__init__.py +5 -0
  76. hpcflow/sdk/config/callbacks.py +246 -0
  77. hpcflow/sdk/config/cli.py +388 -0
  78. hpcflow/sdk/config/config.py +1410 -0
  79. hpcflow/sdk/config/config_file.py +501 -0
  80. hpcflow/sdk/config/errors.py +272 -0
  81. hpcflow/sdk/config/types.py +150 -0
  82. hpcflow/sdk/core/__init__.py +38 -0
  83. hpcflow/sdk/core/actions.py +3857 -0
  84. hpcflow/sdk/core/app_aware.py +25 -0
  85. hpcflow/sdk/core/cache.py +224 -0
  86. hpcflow/sdk/core/command_files.py +814 -0
  87. hpcflow/sdk/core/commands.py +424 -0
  88. hpcflow/sdk/core/element.py +2071 -0
  89. hpcflow/sdk/core/enums.py +221 -0
  90. hpcflow/sdk/core/environment.py +256 -0
  91. hpcflow/sdk/core/errors.py +1043 -0
  92. hpcflow/sdk/core/execute.py +207 -0
  93. hpcflow/sdk/core/json_like.py +809 -0
  94. hpcflow/sdk/core/loop.py +1320 -0
  95. hpcflow/sdk/core/loop_cache.py +282 -0
  96. hpcflow/sdk/core/object_list.py +933 -0
  97. hpcflow/sdk/core/parameters.py +3371 -0
  98. hpcflow/sdk/core/rule.py +196 -0
  99. hpcflow/sdk/core/run_dir_files.py +57 -0
  100. hpcflow/sdk/core/skip_reason.py +7 -0
  101. hpcflow/sdk/core/task.py +3792 -0
  102. hpcflow/sdk/core/task_schema.py +993 -0
  103. hpcflow/sdk/core/test_utils.py +538 -0
  104. hpcflow/sdk/core/types.py +447 -0
  105. hpcflow/sdk/core/utils.py +1207 -0
  106. hpcflow/sdk/core/validation.py +87 -0
  107. hpcflow/sdk/core/values.py +477 -0
  108. hpcflow/sdk/core/workflow.py +4820 -0
  109. hpcflow/sdk/core/zarr_io.py +206 -0
  110. hpcflow/sdk/data/__init__.py +13 -0
  111. hpcflow/sdk/data/config_file_schema.yaml +34 -0
  112. hpcflow/sdk/data/config_schema.yaml +260 -0
  113. hpcflow/sdk/data/environments_spec_schema.yaml +21 -0
  114. hpcflow/sdk/data/files_spec_schema.yaml +5 -0
  115. hpcflow/sdk/data/parameters_spec_schema.yaml +7 -0
  116. hpcflow/sdk/data/task_schema_spec_schema.yaml +3 -0
  117. hpcflow/sdk/data/workflow_spec_schema.yaml +22 -0
  118. hpcflow/sdk/demo/__init__.py +3 -0
  119. hpcflow/sdk/demo/cli.py +242 -0
  120. hpcflow/sdk/helper/__init__.py +3 -0
  121. hpcflow/sdk/helper/cli.py +137 -0
  122. hpcflow/sdk/helper/helper.py +300 -0
  123. hpcflow/sdk/helper/watcher.py +192 -0
  124. hpcflow/sdk/log.py +288 -0
  125. hpcflow/sdk/persistence/__init__.py +18 -0
  126. hpcflow/sdk/persistence/base.py +2817 -0
  127. hpcflow/sdk/persistence/defaults.py +6 -0
  128. hpcflow/sdk/persistence/discovery.py +39 -0
  129. hpcflow/sdk/persistence/json.py +954 -0
  130. hpcflow/sdk/persistence/pending.py +948 -0
  131. hpcflow/sdk/persistence/store_resource.py +203 -0
  132. hpcflow/sdk/persistence/types.py +309 -0
  133. hpcflow/sdk/persistence/utils.py +73 -0
  134. hpcflow/sdk/persistence/zarr.py +2388 -0
  135. hpcflow/sdk/runtime.py +320 -0
  136. hpcflow/sdk/submission/__init__.py +3 -0
  137. hpcflow/sdk/submission/enums.py +70 -0
  138. hpcflow/sdk/submission/jobscript.py +2379 -0
  139. hpcflow/sdk/submission/schedulers/__init__.py +281 -0
  140. hpcflow/sdk/submission/schedulers/direct.py +233 -0
  141. hpcflow/sdk/submission/schedulers/sge.py +376 -0
  142. hpcflow/sdk/submission/schedulers/slurm.py +598 -0
  143. hpcflow/sdk/submission/schedulers/utils.py +25 -0
  144. hpcflow/sdk/submission/shells/__init__.py +52 -0
  145. hpcflow/sdk/submission/shells/base.py +229 -0
  146. hpcflow/sdk/submission/shells/bash.py +504 -0
  147. hpcflow/sdk/submission/shells/os_version.py +115 -0
  148. hpcflow/sdk/submission/shells/powershell.py +352 -0
  149. hpcflow/sdk/submission/submission.py +1402 -0
  150. hpcflow/sdk/submission/types.py +140 -0
  151. hpcflow/sdk/typing.py +194 -0
  152. hpcflow/sdk/utils/arrays.py +69 -0
  153. hpcflow/sdk/utils/deferred_file.py +55 -0
  154. hpcflow/sdk/utils/hashing.py +16 -0
  155. hpcflow/sdk/utils/patches.py +31 -0
  156. hpcflow/sdk/utils/strings.py +69 -0
  157. hpcflow/tests/api/test_api.py +32 -0
  158. hpcflow/tests/conftest.py +123 -0
  159. hpcflow/tests/data/__init__.py +0 -0
  160. hpcflow/tests/data/benchmark_N_elements.yaml +6 -0
  161. hpcflow/tests/data/benchmark_script_runner.yaml +26 -0
  162. hpcflow/tests/data/multi_path_sequences.yaml +29 -0
  163. hpcflow/tests/data/workflow_1.json +10 -0
  164. hpcflow/tests/data/workflow_1.yaml +5 -0
  165. hpcflow/tests/data/workflow_1_slurm.yaml +8 -0
  166. hpcflow/tests/data/workflow_1_wsl.yaml +8 -0
  167. hpcflow/tests/data/workflow_test_run_abort.yaml +42 -0
  168. hpcflow/tests/jinja_templates/test_jinja_templates.py +161 -0
  169. hpcflow/tests/programs/test_programs.py +180 -0
  170. hpcflow/tests/schedulers/direct_linux/test_direct_linux_submission.py +12 -0
  171. hpcflow/tests/schedulers/sge/test_sge_submission.py +36 -0
  172. hpcflow/tests/schedulers/slurm/test_slurm_submission.py +14 -0
  173. hpcflow/tests/scripts/test_input_file_generators.py +282 -0
  174. hpcflow/tests/scripts/test_main_scripts.py +1361 -0
  175. hpcflow/tests/scripts/test_non_snippet_script.py +46 -0
  176. hpcflow/tests/scripts/test_ouput_file_parsers.py +353 -0
  177. hpcflow/tests/shells/wsl/test_wsl_submission.py +14 -0
  178. hpcflow/tests/unit/test_action.py +1066 -0
  179. hpcflow/tests/unit/test_action_rule.py +24 -0
  180. hpcflow/tests/unit/test_app.py +132 -0
  181. hpcflow/tests/unit/test_cache.py +46 -0
  182. hpcflow/tests/unit/test_cli.py +172 -0
  183. hpcflow/tests/unit/test_command.py +377 -0
  184. hpcflow/tests/unit/test_config.py +195 -0
  185. hpcflow/tests/unit/test_config_file.py +162 -0
  186. hpcflow/tests/unit/test_element.py +666 -0
  187. hpcflow/tests/unit/test_element_iteration.py +88 -0
  188. hpcflow/tests/unit/test_element_set.py +158 -0
  189. hpcflow/tests/unit/test_group.py +115 -0
  190. hpcflow/tests/unit/test_input_source.py +1479 -0
  191. hpcflow/tests/unit/test_input_value.py +398 -0
  192. hpcflow/tests/unit/test_jobscript_unit.py +757 -0
  193. hpcflow/tests/unit/test_json_like.py +1247 -0
  194. hpcflow/tests/unit/test_loop.py +2674 -0
  195. hpcflow/tests/unit/test_meta_task.py +325 -0
  196. hpcflow/tests/unit/test_multi_path_sequences.py +259 -0
  197. hpcflow/tests/unit/test_object_list.py +116 -0
  198. hpcflow/tests/unit/test_parameter.py +243 -0
  199. hpcflow/tests/unit/test_persistence.py +664 -0
  200. hpcflow/tests/unit/test_resources.py +243 -0
  201. hpcflow/tests/unit/test_run.py +286 -0
  202. hpcflow/tests/unit/test_run_directories.py +29 -0
  203. hpcflow/tests/unit/test_runtime.py +9 -0
  204. hpcflow/tests/unit/test_schema_input.py +372 -0
  205. hpcflow/tests/unit/test_shell.py +129 -0
  206. hpcflow/tests/unit/test_slurm.py +39 -0
  207. hpcflow/tests/unit/test_submission.py +502 -0
  208. hpcflow/tests/unit/test_task.py +2560 -0
  209. hpcflow/tests/unit/test_task_schema.py +182 -0
  210. hpcflow/tests/unit/test_utils.py +616 -0
  211. hpcflow/tests/unit/test_value_sequence.py +549 -0
  212. hpcflow/tests/unit/test_values.py +91 -0
  213. hpcflow/tests/unit/test_workflow.py +827 -0
  214. hpcflow/tests/unit/test_workflow_template.py +186 -0
  215. hpcflow/tests/unit/utils/test_arrays.py +40 -0
  216. hpcflow/tests/unit/utils/test_deferred_file_writer.py +34 -0
  217. hpcflow/tests/unit/utils/test_hashing.py +65 -0
  218. hpcflow/tests/unit/utils/test_patches.py +5 -0
  219. hpcflow/tests/unit/utils/test_redirect_std.py +50 -0
  220. hpcflow/tests/unit/utils/test_strings.py +97 -0
  221. hpcflow/tests/workflows/__init__.py +0 -0
  222. hpcflow/tests/workflows/test_directory_structure.py +31 -0
  223. hpcflow/tests/workflows/test_jobscript.py +355 -0
  224. hpcflow/tests/workflows/test_run_status.py +198 -0
  225. hpcflow/tests/workflows/test_skip_downstream.py +696 -0
  226. hpcflow/tests/workflows/test_submission.py +140 -0
  227. hpcflow/tests/workflows/test_workflows.py +564 -0
  228. hpcflow/tests/workflows/test_zip.py +18 -0
  229. hpcflow/viz_demo.ipynb +6794 -0
  230. hpcflow-0.2.0a271.dist-info/LICENSE +375 -0
  231. hpcflow-0.2.0a271.dist-info/METADATA +65 -0
  232. hpcflow-0.2.0a271.dist-info/RECORD +237 -0
  233. {hpcflow-0.1.15.dist-info → hpcflow-0.2.0a271.dist-info}/WHEEL +4 -5
  234. hpcflow-0.2.0a271.dist-info/entry_points.txt +6 -0
  235. hpcflow/api.py +0 -490
  236. hpcflow/archive/archive.py +0 -307
  237. hpcflow/archive/cloud/cloud.py +0 -45
  238. hpcflow/archive/cloud/errors.py +0 -9
  239. hpcflow/archive/cloud/providers/dropbox.py +0 -427
  240. hpcflow/archive/errors.py +0 -5
  241. hpcflow/base_db.py +0 -4
  242. hpcflow/config.py +0 -233
  243. hpcflow/copytree.py +0 -66
  244. hpcflow/data/examples/_config.yml +0 -14
  245. hpcflow/data/examples/damask/demo/1.run.yml +0 -4
  246. hpcflow/data/examples/damask/demo/2.process.yml +0 -29
  247. hpcflow/data/examples/damask/demo/geom.geom +0 -2052
  248. hpcflow/data/examples/damask/demo/load.load +0 -1
  249. hpcflow/data/examples/damask/demo/material.config +0 -185
  250. hpcflow/data/examples/damask/inputs/geom.geom +0 -2052
  251. hpcflow/data/examples/damask/inputs/load.load +0 -1
  252. hpcflow/data/examples/damask/inputs/material.config +0 -185
  253. hpcflow/data/examples/damask/profiles/_variable_lookup.yml +0 -21
  254. hpcflow/data/examples/damask/profiles/damask.yml +0 -4
  255. hpcflow/data/examples/damask/profiles/damask_process.yml +0 -8
  256. hpcflow/data/examples/damask/profiles/damask_run.yml +0 -5
  257. hpcflow/data/examples/damask/profiles/default.yml +0 -6
  258. hpcflow/data/examples/thinking.yml +0 -177
  259. hpcflow/errors.py +0 -2
  260. hpcflow/init_db.py +0 -37
  261. hpcflow/models.py +0 -2595
  262. hpcflow/nesting.py +0 -9
  263. hpcflow/profiles.py +0 -455
  264. hpcflow/project.py +0 -81
  265. hpcflow/scheduler.py +0 -322
  266. hpcflow/utils.py +0 -103
  267. hpcflow/validation.py +0 -166
  268. hpcflow/variables.py +0 -543
  269. hpcflow-0.1.15.dist-info/METADATA +0 -168
  270. hpcflow-0.1.15.dist-info/RECORD +0 -45
  271. hpcflow-0.1.15.dist-info/entry_points.txt +0 -8
  272. hpcflow-0.1.15.dist-info/top_level.txt +0 -1
  273. /hpcflow/{archive → data/jinja_templates}/__init__.py +0 -0
  274. /hpcflow/{archive/cloud → data/programs}/__init__.py +0 -0
  275. /hpcflow/{archive/cloud/providers → data/workflows}/__init__.py +0 -0
@@ -0,0 +1,2388 @@
1
+ """
2
+ Persistence model based on writing Zarr arrays.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import copy
8
+ from contextlib import contextmanager
9
+ from collections import defaultdict
10
+ from dataclasses import dataclass
11
+ from pathlib import Path
12
+ from typing import Any, cast, TYPE_CHECKING
13
+ from typing_extensions import override
14
+ import shutil
15
+ import time
16
+
17
+ import numpy as np
18
+ from numpy.ma.core import MaskedArray
19
+ import zarr # type: ignore
20
+ from zarr.errors import BoundsCheckError # type: ignore
21
+ from zarr.storage import DirectoryStore, FSStore # type: ignore
22
+ from zarr.util import guess_chunks # type: ignore
23
+ from fsspec.implementations.zip import ZipFileSystem # type: ignore
24
+ from rich.console import Console
25
+ from numcodecs import MsgPack, VLenArray, blosc, Blosc, Zstd # type: ignore
26
+ from reretry import retry # type: ignore
27
+
28
+ from hpcflow.sdk.typing import hydrate
29
+ from hpcflow.sdk.core import RUN_DIR_ARR_DTYPE, RUN_DIR_ARR_FILL
30
+ from hpcflow.sdk.core.errors import (
31
+ MissingParameterData,
32
+ MissingStoreEARError,
33
+ MissingStoreElementError,
34
+ MissingStoreElementIterationError,
35
+ MissingStoreTaskError,
36
+ )
37
+ from hpcflow.sdk.core.utils import (
38
+ ensure_in,
39
+ get_relative_path,
40
+ set_in_container,
41
+ get_in_container,
42
+ )
43
+ from hpcflow.sdk.persistence.base import (
44
+ PARAM_DATA_NOT_SET,
45
+ PersistentStoreFeatures,
46
+ PersistentStore,
47
+ StoreEAR,
48
+ StoreElement,
49
+ StoreElementIter,
50
+ StoreParameter,
51
+ StoreTask,
52
+ )
53
+ from hpcflow.sdk.persistence.types import (
54
+ LoopDescriptor,
55
+ StoreCreationInfo,
56
+ TemplateMeta,
57
+ ZarrAttrsDict,
58
+ )
59
+ from hpcflow.sdk.persistence.store_resource import ZarrAttrsStoreResource
60
+ from hpcflow.sdk.persistence.utils import ask_pw_on_auth_exc
61
+ from hpcflow.sdk.persistence.pending import CommitResourceMap
62
+ from hpcflow.sdk.persistence.base import update_param_source_dict
63
+ from hpcflow.sdk.log import TimeIt
64
+ from hpcflow.sdk.submission.submission import (
65
+ JOBSCRIPT_SUBMIT_TIME_KEYS,
66
+ SUBMISSION_SUBMIT_TIME_KEYS,
67
+ )
68
+ from hpcflow.sdk.utils.arrays import get_2D_idx, split_arr
69
+ from hpcflow.sdk.utils.patches import override_module_attrs
70
+ from hpcflow.sdk.utils.strings import shorten_list_str
71
+
72
+ if TYPE_CHECKING:
73
+ from collections.abc import (
74
+ Callable,
75
+ Iterable,
76
+ Iterator,
77
+ Mapping,
78
+ MutableMapping,
79
+ Sequence,
80
+ )
81
+ from datetime import datetime
82
+ from fsspec import AbstractFileSystem # type: ignore
83
+ from logging import Logger
84
+ from typing import ClassVar
85
+ from typing_extensions import Self, TypeAlias
86
+ from numpy.typing import NDArray
87
+ from zarr import Array, Group # type: ignore
88
+ from zarr.attrs import Attributes # type: ignore
89
+ from zarr.storage import Store # type: ignore
90
+ from ..submission.types import ResolvedJobscriptBlockDependencies
91
+ from .types import TypeLookup
92
+ from ..app import BaseApp
93
+ from ..core.json_like import JSONed, JSONDocument
94
+ from ..typing import ParamSource, PathLike, DataIndex
95
+
96
+ #: List of any (Zarr-serializable) value.
97
+ ListAny: TypeAlias = "list[Any]"
98
+ #: Zarr attribute mapping context.
99
+ ZarrAttrs: TypeAlias = "dict[str, Any]"
100
+ #: Soft lower limit for the number of bytes in an array chunk
101
+ _ARRAY_CHUNK_MIN: int = 500 * 1024 * 1024 # 500 MiB
102
+ #: Hard upper limit for the number of bytes in an array chunk. Should be lower than the
103
+ #: maximum buffer size of the blosc encoder, if we're using it (2 GiB)
104
+ _ARRAY_CHUNK_MAX: int = 1024 * 1024 * 1024 # 1 GiB
105
+ _JS: TypeAlias = "dict[str, list[dict[str, dict]]]"
106
+
107
+
108
+ blosc.use_threads = False # hpcflow is a multiprocess program in general
109
+
110
+
111
+ @TimeIt.decorator
112
+ def _zarr_get_coord_selection(arr: Array, selection: Any, logger: Logger):
113
+ @retry(
114
+ RuntimeError,
115
+ tries=10,
116
+ delay=1,
117
+ backoff=1.5,
118
+ jitter=(0, 5),
119
+ logger=logger,
120
+ )
121
+ @TimeIt.decorator
122
+ def _inner(arr: Array, selection: Any):
123
+ return arr.get_coordinate_selection(selection)
124
+
125
+ return _inner(arr, selection)
126
+
127
+
128
+ def _encode_numpy_array(
129
+ obj: NDArray,
130
+ type_lookup: TypeLookup,
131
+ path: list[int],
132
+ root_group: Group,
133
+ arr_path: list[int],
134
+ root_encoder: Callable,
135
+ ) -> int:
136
+ # Might need to generate new group:
137
+ param_arr_group = root_group.require_group(arr_path)
138
+ new_idx = (
139
+ max((int(i.removeprefix("arr_")) for i in param_arr_group.keys()), default=-1) + 1
140
+ )
141
+ with override_module_attrs(
142
+ "zarr.util", {"CHUNK_MIN": _ARRAY_CHUNK_MIN, "CHUNK_MAX": _ARRAY_CHUNK_MAX}
143
+ ):
144
+ # `guess_chunks` also ensures chunk shape is at least 1 in each dimension:
145
+ chunk_shape = guess_chunks(obj.shape, obj.dtype.itemsize)
146
+
147
+ param_arr_group.create_dataset(name=f"arr_{new_idx}", data=obj, chunks=chunk_shape)
148
+ type_lookup["arrays"].append([path, new_idx])
149
+
150
+ return len(type_lookup["arrays"]) - 1
151
+
152
+
153
+ def _decode_numpy_arrays(
154
+ obj: dict | None,
155
+ type_lookup: TypeLookup,
156
+ path: list[int],
157
+ arr_group: Group,
158
+ dataset_copy: bool,
159
+ ):
160
+ # Yuck! Type lies! Zarr's internal types are not modern Python types.
161
+ arrays = cast("Iterable[tuple[list[int], int]]", type_lookup.get("arrays", []))
162
+ obj_: dict | NDArray | None = obj
163
+ for arr_path, arr_idx in arrays:
164
+ try:
165
+ rel_path = get_relative_path(arr_path, path)
166
+ except ValueError:
167
+ continue
168
+
169
+ dataset: NDArray = arr_group.get(f"arr_{arr_idx}")
170
+ if dataset_copy:
171
+ dataset = dataset[:]
172
+
173
+ if rel_path:
174
+ set_in_container(obj_, rel_path, dataset)
175
+ else:
176
+ obj_ = dataset
177
+
178
+ return obj_
179
+
180
+
181
+ def _encode_masked_array(
182
+ obj: MaskedArray,
183
+ type_lookup: TypeLookup,
184
+ path: list[int],
185
+ root_group: Group,
186
+ arr_path: list[int],
187
+ root_encoder: Callable,
188
+ ):
189
+ """Encode a masked array as two normal arrays, and return the fill value."""
190
+ # no need to add "array" entries to the type lookup, so pass an empty `type_lookup`:
191
+ type_lookup_: TypeLookup = defaultdict(list)
192
+ data_idx = _encode_numpy_array(
193
+ obj.data, type_lookup_, path, root_group, arr_path, root_encoder
194
+ )
195
+ mask_idx = _encode_numpy_array(
196
+ cast("NDArray", obj.mask), type_lookup_, path, root_group, arr_path, root_encoder
197
+ )
198
+ type_lookup["masked_arrays"].append([path, [data_idx, mask_idx]])
199
+ return obj.fill_value.item()
200
+
201
+
202
+ def _decode_masked_arrays(
203
+ obj: dict,
204
+ type_lookup: TypeLookup,
205
+ path: list[int],
206
+ arr_group: Group,
207
+ dataset_copy: bool,
208
+ ):
209
+ # Yuck! Type lies! Zarr's internal types are not modern Python types.
210
+ masked_arrays = cast(
211
+ "Iterable[tuple[list[int], tuple[int, int]]]",
212
+ type_lookup.get("masked_arrays", []),
213
+ )
214
+ obj_: dict | MaskedArray = obj
215
+ for arr_path, (data_idx, mask_idx) in masked_arrays:
216
+ try:
217
+ rel_path = get_relative_path(arr_path, path)
218
+ except ValueError:
219
+ continue
220
+
221
+ fill_value = get_in_container(obj_, rel_path)
222
+ data = arr_group.get(f"arr_{data_idx}")
223
+ mask = arr_group.get(f"arr_{mask_idx}")
224
+ dataset: MaskedArray = MaskedArray(data=data, mask=mask, fill_value=fill_value)
225
+
226
+ if rel_path:
227
+ set_in_container(obj_, rel_path, dataset)
228
+ else:
229
+ obj_ = dataset
230
+ return obj_
231
+
232
+
233
+ def _encode_bytes(obj: dict, **kwargs):
234
+ return obj # msgpack can handle bytes
235
+
236
+
237
+ def append_items_to_ragged_array(arr: Array, items: Sequence[int]):
238
+ """Append an array to a Zarr ragged array.
239
+
240
+ I think `arr.append([item])` should work, but does not for some reason, so we do it
241
+ here by resizing and assignment."""
242
+ num = len(items)
243
+ arr.resize((len(arr) + num))
244
+ for idx, i in enumerate(items):
245
+ arr[-(num - idx)] = i
246
+
247
+
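The helper above can be exercised directly. A minimal sketch (using an in-memory Zarr v2 store and illustrative element IDs; in practice the items are the per-task element-ID arrays appended by `_append_tasks`):

    import numpy as np
    import zarr
    from numcodecs import VLenArray

    root = zarr.group(store=zarr.MemoryStore())
    arr = root.create_dataset(
        "tasks_demo", shape=0, dtype=object, object_codec=VLenArray(int)
    )
    # append two ragged rows (element-ID lists for two tasks):
    append_items_to_ragged_array(arr, [np.array([0, 1, 2]), np.array([3])])
    print(arr[0], arr[1])  # -> [0 1 2] [3]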
248
+ @dataclass
249
+ class ZarrStoreTask(StoreTask[dict]):
250
+ """
251
+ Represents a task in a Zarr persistent store.
252
+ """
253
+
254
+ @override
255
+ def encode(self) -> tuple[int, dict, dict[str, Any]]:
256
+ """Prepare store task data for the persistent store."""
257
+ wk_task = {"id_": self.id_, "element_IDs": np.array(self.element_IDs)}
258
+ task = {"id_": self.id_, **(self.task_template or {})}
259
+ return self.index, wk_task, task
260
+
261
+ @override
262
+ @classmethod
263
+ def decode(cls, task_dat: dict) -> Self:
264
+ """Initialise a `StoreTask` from persistent task data"""
265
+ task_dat["element_IDs"] = task_dat["element_IDs"].tolist()
266
+ return cls(is_pending=False, **task_dat)
267
+
268
+
269
+ @dataclass
270
+ class ZarrStoreElement(StoreElement[ListAny, ZarrAttrs]):
271
+ """
272
+ Represents an element in a Zarr persistent store.
273
+ """
274
+
275
+ @override
276
+ def encode(self, attrs: ZarrAttrs) -> ListAny:
277
+ """Prepare store elements data for the persistent store.
278
+
279
+ This method mutates `attrs`.
280
+ """
281
+ return [
282
+ self.id_,
283
+ self.index,
284
+ self.es_idx,
285
+ [[ensure_in(k, attrs["seq_idx"]), v] for k, v in self.seq_idx.items()],
286
+ [[ensure_in(k, attrs["src_idx"]), v] for k, v in self.src_idx.items()],
287
+ self.task_ID,
288
+ self.iteration_IDs,
289
+ ]
290
+
291
+ @override
292
+ @classmethod
293
+ def decode(cls, elem_dat: ListAny, attrs: ZarrAttrs) -> Self:
294
+ """Initialise a `StoreElement` from persistent element data"""
295
+ obj_dat = {
296
+ "id_": elem_dat[0],
297
+ "index": elem_dat[1],
298
+ "es_idx": elem_dat[2],
299
+ "seq_idx": {attrs["seq_idx"][k]: v for (k, v) in elem_dat[3]},
300
+ "src_idx": {attrs["src_idx"][k]: v for (k, v) in elem_dat[4]},
301
+ "task_ID": elem_dat[5],
302
+ "iteration_IDs": elem_dat[6],
303
+ }
304
+ return cls(is_pending=False, **obj_dat)
305
+
306
+
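The `encode`/`decode` pair treats the element array's attributes as an interning table: `ensure_in` (imported from `hpcflow.sdk.core.utils`) is assumed here to return the index of a key in the given list, appending it first if absent, so repeated keys are stored once in the attributes and each element row stores only integer indices. A minimal sketch of that assumed behaviour, with illustrative sequence-index keys:

    def ensure_in_sketch(item, lst):
        # assumed behaviour of hpcflow.sdk.core.utils.ensure_in
        if item not in lst:
            lst.append(item)
        return lst.index(item)

    attrs = {"seq_idx": []}
    seq_idx = {"inputs.p1": 0, "inputs.p2": 3}
    encoded = [[ensure_in_sketch(k, attrs["seq_idx"]), v] for k, v in seq_idx.items()]
    # encoded == [[0, 0], [1, 3]]; attrs["seq_idx"] == ["inputs.p1", "inputs.p2"]
    decoded = {attrs["seq_idx"][k]: v for k, v in encoded}
    # decoded == {"inputs.p1": 0, "inputs.p2": 3}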
307
+ @dataclass
308
+ class ZarrStoreElementIter(StoreElementIter[ListAny, ZarrAttrs]):
309
+ """
310
+ Represents an element iteration in a Zarr persistent store.
311
+ """
312
+
313
+ @override
314
+ def encode(self, attrs: ZarrAttrs) -> ListAny:
315
+ """Prepare store element iteration data for the persistent store.
316
+
317
+ This method mutates `attrs`.
318
+ """
319
+ return [
320
+ self.id_,
321
+ self.element_ID,
322
+ int(self.EARs_initialised),
323
+ [[ek, ev] for ek, ev in self.EAR_IDs.items()] if self.EAR_IDs else None,
324
+ [
325
+ [ensure_in(dk, attrs["parameter_paths"]), dv]
326
+ for dk, dv in self.data_idx.items()
327
+ ],
328
+ [ensure_in(i, attrs["schema_parameters"]) for i in self.schema_parameters],
329
+ [[ensure_in(dk, attrs["loops"]), dv] for dk, dv in self.loop_idx.items()],
330
+ ]
331
+
332
+ @override
333
+ @classmethod
334
+ def decode(cls, iter_dat: ListAny, attrs: ZarrAttrs) -> Self:
335
+ """Initialise a `ZarrStoreElementIter` from persistent element iteration data"""
336
+ obj_dat = {
337
+ "id_": iter_dat[0],
338
+ "element_ID": iter_dat[1],
339
+ "EARs_initialised": bool(iter_dat[2]),
340
+ "EAR_IDs": {i[0]: i[1] for i in iter_dat[3]} if iter_dat[3] else None,
341
+ "data_idx": {attrs["parameter_paths"][i[0]]: i[1] for i in iter_dat[4]},
342
+ "schema_parameters": [attrs["schema_parameters"][i] for i in iter_dat[5]],
343
+ "loop_idx": {attrs["loops"][i[0]]: i[1] for i in iter_dat[6]},
344
+ }
345
+ return cls(is_pending=False, **obj_dat)
346
+
347
+
348
+ @dataclass
349
+ class ZarrStoreEAR(StoreEAR[ListAny, ZarrAttrs]):
350
+ """
351
+ Represents an element action run in a Zarr persistent store.
352
+ """
353
+
354
+ @override
355
+ def encode(self, ts_fmt: str, attrs: ZarrAttrs) -> ListAny:
356
+ """Prepare store EAR data for the persistent store.
357
+
358
+ This method mutates `attrs`.
359
+ """
360
+ return [
361
+ self.id_,
362
+ self.elem_iter_ID,
363
+ self.action_idx,
364
+ [
365
+ [ensure_in(dk, attrs["parameter_paths"]), dv]
366
+ for dk, dv in self.data_idx.items()
367
+ ],
368
+ self.submission_idx,
369
+ self.skip,
370
+ self.success,
371
+ self._encode_datetime(self.start_time, ts_fmt),
372
+ self._encode_datetime(self.end_time, ts_fmt),
373
+ self.snapshot_start,
374
+ self.snapshot_end,
375
+ self.exit_code,
376
+ self.metadata,
377
+ self.run_hostname,
378
+ self.commands_idx,
379
+ self.port_number,
380
+ self.commands_file_ID,
381
+ ]
382
+
383
+ @override
384
+ @classmethod
385
+ def decode(cls, EAR_dat: ListAny, ts_fmt: str, attrs: ZarrAttrs) -> Self:
386
+ """Initialise a `ZarrStoreEAR` from persistent EAR data"""
387
+ obj_dat = {
388
+ "id_": EAR_dat[0],
389
+ "elem_iter_ID": EAR_dat[1],
390
+ "action_idx": EAR_dat[2],
391
+ "data_idx": {attrs["parameter_paths"][i[0]]: i[1] for i in EAR_dat[3]},
392
+ "submission_idx": EAR_dat[4],
393
+ "skip": EAR_dat[5],
394
+ "success": EAR_dat[6],
395
+ "start_time": cls._decode_datetime(EAR_dat[7], ts_fmt),
396
+ "end_time": cls._decode_datetime(EAR_dat[8], ts_fmt),
397
+ "snapshot_start": EAR_dat[9],
398
+ "snapshot_end": EAR_dat[10],
399
+ "exit_code": EAR_dat[11],
400
+ "metadata": EAR_dat[12],
401
+ "run_hostname": EAR_dat[13],
402
+ "commands_idx": EAR_dat[14],
403
+ "port_number": EAR_dat[15],
404
+ "commands_file_ID": EAR_dat[16],
405
+ }
406
+ return cls(is_pending=False, **obj_dat)
407
+
408
+
409
+ @dataclass
410
+ @hydrate
411
+ class ZarrStoreParameter(StoreParameter):
412
+ """
413
+ Represents a parameter in a Zarr persistent store.
414
+ """
415
+
416
+ _encoders: ClassVar[dict[type, Callable]] = { # keys are types
417
+ np.ndarray: _encode_numpy_array,
418
+ MaskedArray: _encode_masked_array,
419
+ bytes: _encode_bytes,
420
+ }
421
+ _decoders: ClassVar[dict[str, Callable]] = { # keys are keys in type_lookup
422
+ "arrays": _decode_numpy_arrays,
423
+ "masked_arrays": _decode_masked_arrays,
424
+ }
425
+
426
+
427
+ class ZarrPersistentStore(
428
+ PersistentStore[
429
+ ZarrStoreTask,
430
+ ZarrStoreElement,
431
+ ZarrStoreElementIter,
432
+ ZarrStoreEAR,
433
+ ZarrStoreParameter,
434
+ ]
435
+ ):
436
+ """
437
+ A persistent store implemented using Zarr.
438
+ """
439
+
440
+ _name: ClassVar[str] = "zarr"
441
+ _features: ClassVar[PersistentStoreFeatures] = PersistentStoreFeatures(
442
+ create=True,
443
+ edit=True,
444
+ jobscript_parallelism=True,
445
+ EAR_parallelism=True,
446
+ schedulers=True,
447
+ submission=True,
448
+ )
449
+
450
+ @classmethod
451
+ def _store_task_cls(cls) -> type[ZarrStoreTask]:
452
+ return ZarrStoreTask
453
+
454
+ @classmethod
455
+ def _store_elem_cls(cls) -> type[ZarrStoreElement]:
456
+ return ZarrStoreElement
457
+
458
+ @classmethod
459
+ def _store_iter_cls(cls) -> type[ZarrStoreElementIter]:
460
+ return ZarrStoreElementIter
461
+
462
+ @classmethod
463
+ def _store_EAR_cls(cls) -> type[ZarrStoreEAR]:
464
+ return ZarrStoreEAR
465
+
466
+ @classmethod
467
+ def _store_param_cls(cls) -> type[ZarrStoreParameter]:
468
+ return ZarrStoreParameter
469
+
470
+ _param_grp_name: ClassVar[str] = "parameters"
471
+ _param_base_arr_name: ClassVar[str] = "base"
472
+ _param_sources_arr_name: ClassVar[str] = "sources"
473
+ _param_user_arr_grp_name: ClassVar[str] = "arrays"
474
+ _param_data_arr_grp_name: ClassVar = lambda _, param_idx: f"param_{param_idx}"
475
+ _subs_md_group_name: ClassVar[str] = "submissions"
476
+ _task_arr_name: ClassVar[str] = "tasks"
477
+ _elem_arr_name: ClassVar[str] = "elements"
478
+ _iter_arr_name: ClassVar[str] = "iters"
479
+ _EAR_arr_name: ClassVar[str] = "runs"
480
+ _run_dir_arr_name: ClassVar[str] = "run_dirs"
481
+ _js_at_submit_md_arr_name: ClassVar[str] = "js_at_submit_md"
482
+ _js_run_IDs_arr_name: ClassVar[str] = "js_run_IDs"
483
+ _js_task_elems_arr_name: ClassVar[str] = "js_task_elems"
484
+ _js_task_acts_arr_name: ClassVar[str] = "js_task_acts"
485
+ _js_deps_arr_name: ClassVar[str] = "js_deps"
486
+ _time_res: ClassVar[str] = "us" # microseconds; must not be smaller than micro!
487
+
488
+ _res_map: ClassVar[CommitResourceMap] = CommitResourceMap(
489
+ commit_template_components=("attrs",)
490
+ )
491
+
492
+ def __init__(self, app, workflow, path: str | Path, fs: AbstractFileSystem) -> None:
493
+ self._zarr_store = None # assigned on first access to `zarr_store`
494
+ self._resources = {
495
+ "attrs": ZarrAttrsStoreResource(
496
+ app, name="attrs", open_call=self._get_root_group
497
+ ),
498
+ }
499
+ self._jobscript_at_submit_metadata: dict[int, dict[str, Any]] = (
500
+ {}
501
+ ) # this is a cache
502
+
503
+ # these are caches; keys are submission index and then tuples of
504
+ # (jobscript index, jobscript-block index):
505
+ self._jobscript_run_ID_arrays: dict[int, dict[tuple[int, int], NDArray]] = {}
506
+ self._jobscript_task_element_maps: dict[
507
+ int, dict[tuple[int, int], dict[int, list[int]]]
508
+ ] = {}
509
+ self._jobscript_task_actions_arrays: dict[int, dict[tuple[int, int], NDArray]] = (
510
+ {}
511
+ )
512
+ self._jobscript_dependencies: dict[
513
+ int,
514
+ dict[
515
+ tuple[int, int], dict[tuple[int, int], ResolvedJobscriptBlockDependencies]
516
+ ],
517
+ ] = {}
518
+
519
+ super().__init__(app, workflow, path, fs)
520
+
521
+ @contextmanager
522
+ def cached_load(self) -> Iterator[None]:
523
+ """Context manager to cache the root attributes."""
524
+ with self.using_resource("attrs", "read") as attrs:
525
+ yield
526
+
527
+ def remove_replaced_dir(self) -> None:
528
+ """
529
+ Remove the directory containing replaced workflow details.
530
+ """
531
+ with self.using_resource("attrs", "update") as md:
532
+ if "replaced_workflow" in md:
533
+ self.logger.debug("removing temporarily renamed pre-existing workflow.")
534
+ self.remove_path(md["replaced_workflow"])
535
+ del md["replaced_workflow"]
536
+
537
+ def reinstate_replaced_dir(self) -> None:
538
+ """
539
+ Reinstate the directory containing replaced workflow details.
540
+ """
541
+ with self.using_resource("attrs", "read") as md:
542
+ if "replaced_workflow" in md:
543
+ self.logger.debug(
544
+ "reinstating temporarily renamed pre-existing workflow."
545
+ )
546
+ self.rename_path(
547
+ md["replaced_workflow"],
548
+ self.path,
549
+ )
550
+
551
+ @staticmethod
552
+ def _get_zarr_store(path: str | Path, fs: AbstractFileSystem) -> Store:
553
+ return FSStore(url=str(path), fs=fs)
554
+
555
+ _CODEC: ClassVar = MsgPack()
556
+
557
+ @classmethod
558
+ def write_empty_workflow(
559
+ cls,
560
+ app: BaseApp,
561
+ *,
562
+ template_js: TemplateMeta,
563
+ template_components_js: dict[str, Any],
564
+ wk_path: str,
565
+ fs: AbstractFileSystem,
566
+ name: str,
567
+ replaced_wk: str | None,
568
+ ts_fmt: str,
569
+ ts_name_fmt: str,
570
+ creation_info: StoreCreationInfo,
571
+ compressor: str | None = "blosc",
572
+ compressor_kwargs: dict[str, Any] | None = None,
573
+ ) -> None:
574
+ """
575
+ Write an empty persistent workflow.
576
+ """
577
+ attrs: ZarrAttrsDict = {
578
+ "name": name,
579
+ "ts_fmt": ts_fmt,
580
+ "ts_name_fmt": ts_name_fmt,
581
+ "creation_info": creation_info,
582
+ "template": template_js,
583
+ "template_components": template_components_js,
584
+ "num_added_tasks": 0,
585
+ "tasks": [],
586
+ "loops": [],
587
+ "submissions": [],
588
+ }
589
+ if replaced_wk:
590
+ attrs["replaced_workflow"] = replaced_wk
591
+
592
+ store = cls._get_zarr_store(wk_path, fs)
593
+ root = zarr.group(store=store, overwrite=False)
594
+ root.attrs.update(attrs)
595
+
596
+ # use a nested directory store for the metadata group so the runs array
597
+ # can be stored as a 2D array in nested directories, thereby limiting the maximum
598
+ # number of files stored in a given directory:
599
+ md_store = zarr.NestedDirectoryStore(Path(root.store.path).joinpath("metadata"))
600
+ md = zarr.group(store=md_store)
601
+
602
+ compressor_lookup = {
603
+ "blosc": Blosc,
604
+ "zstd": Zstd,
605
+ }
606
+ if compressor:
607
+ cmp = compressor_lookup[compressor.lower()](**(compressor_kwargs or {}))
608
+ else:
609
+ cmp = None
610
+
611
+ tasks_arr = md.create_dataset(
612
+ name=cls._task_arr_name,
613
+ shape=0,
614
+ dtype=object,
615
+ object_codec=VLenArray(int),
616
+ compressor=cmp,
617
+ )
618
+
619
+ elems_arr = md.create_dataset(
620
+ name=cls._elem_arr_name,
621
+ shape=0,
622
+ dtype=object,
623
+ object_codec=cls._CODEC,
624
+ chunks=1000,
625
+ compressor=cmp,
626
+ )
627
+ elems_arr.attrs.update({"seq_idx": [], "src_idx": []})
628
+
629
+ elem_iters_arr = md.create_dataset(
630
+ name=cls._iter_arr_name,
631
+ shape=0,
632
+ dtype=object,
633
+ object_codec=cls._CODEC,
634
+ chunks=1000,
635
+ compressor=cmp,
636
+ )
637
+ elem_iters_arr.attrs.update(
638
+ {
639
+ "loops": [],
640
+ "schema_parameters": [],
641
+ "parameter_paths": [],
642
+ }
643
+ )
644
+
645
+ EARs_arr = md.create_dataset(
646
+ name=cls._EAR_arr_name,
647
+ shape=(0, 1000),
648
+ dtype=object,
649
+ object_codec=cls._CODEC,
650
+ chunks=1, # single-chunk rows for multiprocess writing
651
+ compressor=cmp,
652
+ dimension_separator="/",
653
+ )
654
+ EARs_arr.attrs.update({"parameter_paths": [], "num_runs": 0})
655
+
656
+ # array for storing indices that can be used to reproduce run directory paths:
657
+ run_dir_arr = md.create_dataset(
658
+ name=cls._run_dir_arr_name,
659
+ shape=0,
660
+ chunks=10_000,
661
+ dtype=RUN_DIR_ARR_DTYPE,
662
+ fill_value=RUN_DIR_ARR_FILL,
663
+ write_empty_chunks=False,
664
+ )
665
+
666
+ parameter_data = root.create_group(name=cls._param_grp_name)
667
+ parameter_data.create_dataset(
668
+ name=cls._param_base_arr_name,
669
+ shape=0,
670
+ dtype=object,
671
+ object_codec=cls._CODEC,
672
+ chunks=1,
673
+ compressor=cmp,
674
+ write_empty_chunks=False,
675
+ fill_value=PARAM_DATA_NOT_SET,
676
+ )
677
+ parameter_data.create_dataset(
678
+ name=cls._param_sources_arr_name,
679
+ shape=0,
680
+ dtype=object,
681
+ object_codec=cls._CODEC,
682
+ chunks=1000, # TODO: check this is a sensible size with many parameters
683
+ compressor=cmp,
684
+ )
685
+ parameter_data.create_group(name=cls._param_user_arr_grp_name)
686
+
687
+ # for storing submission metadata that should not be stored in the root group:
688
+ md.create_group(name=cls._subs_md_group_name)
689
+
690
+ def _append_tasks(self, tasks: Iterable[ZarrStoreTask]):
691
+ elem_IDs_arr = self._get_tasks_arr(mode="r+")
692
+ elem_IDs: list[int] = []
693
+ with self.using_resource("attrs", "update") as attrs:
694
+ for i_idx, i in enumerate(tasks):
695
+ idx, wk_task_i, task_i = i.encode()
696
+ elem_IDs.append(wk_task_i.pop("element_IDs"))
697
+ wk_task_i["element_IDs_idx"] = len(elem_IDs_arr) + i_idx
698
+
699
+ attrs["tasks"].insert(idx, wk_task_i)
700
+ attrs["template"]["tasks"].insert(idx, task_i)
701
+ attrs["num_added_tasks"] += 1
702
+
703
+ # tasks array rows correspond to task IDs, and we assume `tasks` have sequentially
704
+ # increasing IDs.
705
+ append_items_to_ragged_array(arr=elem_IDs_arr, items=elem_IDs)
706
+
707
+ def _append_loops(self, loops: dict[int, LoopDescriptor]):
708
+ with self.using_resource("attrs", action="update") as attrs:
709
+ for loop in loops.values():
710
+ attrs["loops"].append(
711
+ {
712
+ "num_added_iterations": loop["num_added_iterations"],
713
+ "iterable_parameters": loop["iterable_parameters"],
714
+ "output_parameters": loop["output_parameters"],
715
+ "parents": loop["parents"],
716
+ }
717
+ )
718
+ attrs["template"]["loops"].append(loop["loop_template"])
719
+
720
+ @staticmethod
721
+ def _extract_submission_run_IDs_array(
722
+ sub_js: Mapping[str, JSONed],
723
+ ) -> tuple[np.ndarray, list[list[list[int]]]]:
724
+ """For a JSON-like representation of a Submission object, remove and combine all
725
+ jobscript-block run ID lists into a single array with a fill value.
726
+
727
+ Notes
728
+ -----
729
+ This mutates `sub_js`, by setting `EAR_ID` jobscript-block keys to `None`.
730
+
731
+ Parameters
732
+ ----------
733
+ sub_js
734
+ JSON-like representation of a `Submission` object.
735
+
736
+ Returns
737
+ -------
738
+ combined_run_IDs
739
+ Integer Numpy array that contains a concatenation of all 2D run ID arrays
740
+ from each jobscript-block. Technically a "jagged"/"ragged" array that is made
741
+ square with a large fill value.
742
+ block_shapes
743
+ List of length equal to the number of jobscripts in the submission. Each
744
+ sub-list contains a list of shapes (as a two-item list:
745
+ `[num_actions, num_elements]`) of the constituent blocks of that jobscript.
746
+
747
+ """
748
+ arrs = []
749
+ max_acts, max_elems = 0, 0
750
+
751
+ # a list for each jobscript, containing shapes of run ID arrays in each block:
752
+ block_shapes = []
753
+ for js in cast("Sequence[Mapping[str, JSONed]]", sub_js["jobscripts"]):
754
+ block_shapes_js_i = []
755
+ for blk in cast("Sequence[MutableMapping[str, JSONed]]", js["blocks"]):
756
+ run_IDs_i = np.array(blk["EAR_ID"])
757
+ blk["EAR_ID"] = None # TODO: how to type?
758
+ block_shapes_js_i.append(list(run_IDs_i.shape))
759
+ if run_IDs_i.shape[0] > max_acts:
760
+ max_acts = run_IDs_i.shape[0]
761
+ if run_IDs_i.shape[1] > max_elems:
762
+ max_elems = run_IDs_i.shape[1]
763
+ arrs.append(run_IDs_i)
764
+ block_shapes.append(block_shapes_js_i)
765
+
766
+ combined_run_IDs = np.full(
767
+ (len(arrs), max_acts, max_elems),
768
+ dtype=np.int32,
769
+ fill_value=-1,
770
+ )
771
+ for arr_idx, arr in enumerate(arrs):
772
+ combined_run_IDs[arr_idx][: arr.shape[0], : arr.shape[1]] = arr
773
+
774
+ return combined_run_IDs, block_shapes
775
+
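Hand-tracing the method above with illustrative run IDs: two jobscripts whose single blocks carry 2x2 and 1x3 `EAR_ID` arrays are combined into one (2, 2, 3) int32 array padded with -1:

    sub_js = {
        "jobscripts": [
            {"blocks": [{"EAR_ID": [[0, 1], [2, 3]]}]},
            {"blocks": [{"EAR_ID": [[4, 5, 6]]}]},
        ]
    }
    run_IDs, shapes = ZarrPersistentStore._extract_submission_run_IDs_array(sub_js)
    # shapes == [[[2, 2]], [[1, 3]]]
    # run_IDs[0] == [[0, 1, -1], [2, 3, -1]]
    # run_IDs[1] == [[4, 5, 6], [-1, -1, -1]]
    # each block's "EAR_ID" entry has been replaced with None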
776
+ @staticmethod
777
+ def _extract_submission_task_elements_array(
778
+ sub_js: Mapping[str, JSONed],
779
+ ) -> tuple[np.ndarray, list[list[list[int]]]]:
780
+ """For a JSON-like representation of a Submission object, remove and combine all
781
+ jobscript-block task-element mappings into a single array with a fill value.
782
+
783
+ Notes
784
+ -----
785
+ This mutates `sub_js`, by setting `task_elements` jobscript-block keys to `None`.
786
+
787
+ Parameters
788
+ ----------
789
+ sub_js
790
+ JSON-like representation of a `Submission` object.
791
+
792
+ Returns
793
+ -------
794
+ combined_task_elems
795
+ Integer Numpy array that contains a concatenation of each task-element
796
+ mapping, where each mapping is expressed as a 2D array whose first column
797
+ corresponds to the keys of the mappings, and whose remaining columns
798
+ correspond to the values of the mappings. Technically a "jagged"/"ragged"
799
+ array that is made square with a large fill value.
800
+ block_shapes
801
+ List of length equal to the number of jobscripts in the submission. Each
802
+ sub-list contains a list of shapes (as a two-item list:
803
+ `[num_actions, num_elements]`) of the constituent blocks of that jobscript.
804
+
805
+ """
806
+ arrs = []
807
+ max_x, max_y = 0, 0
808
+
809
+ # a list for each jobscript, containing shapes of task-element arrays in each block:
810
+ block_shapes = []
811
+ for js in cast("Sequence[Mapping[str, JSONed]]", sub_js["jobscripts"]):
812
+ block_shapes_js_i = []
813
+ for blk in cast("Sequence[MutableMapping[str, JSONed]]", js["blocks"]):
814
+
815
+ task_elems_lst = []
816
+ for k, v in cast("Mapping[int, list[int]]", blk["task_elements"]).items():
817
+ task_elems_lst.append([k] + v)
818
+ task_elems_i = np.array(task_elems_lst)
819
+
820
+ block_shape_j = [task_elems_i.shape[1] - 1, task_elems_i.shape[0]]
821
+ block_shapes_js_i.append(block_shape_j)
822
+
823
+ blk["task_elements"] = None # TODO: how to type?
824
+ if task_elems_i.shape[1] > max_x:
825
+ max_x = task_elems_i.shape[1]
826
+ if task_elems_i.shape[0] > max_y:
827
+ max_y = task_elems_i.shape[0]
828
+ arrs.append(task_elems_i)
829
+ block_shapes.append(block_shapes_js_i)
830
+
831
+ combined_task_elems = np.full(
832
+ (len(arrs), max_y, max_x),
833
+ dtype=np.uint32,
834
+ fill_value=np.iinfo(np.uint32).max,
835
+ )
836
+ for arr_idx, arr in enumerate(arrs):
837
+ combined_task_elems[arr_idx][: arr.shape[0], : arr.shape[1]] = arr
838
+
839
+ return combined_task_elems, block_shapes
840
+
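Similarly, hand-tracing with an illustrative mapping: a single block whose `task_elements` is {3: [10, 11], 4: [12, 13]} becomes a 2D block whose first column holds the mapping keys (padding with the uint32 maximum is only needed when blocks differ in shape):

    sub_js = {
        "jobscripts": [
            {"blocks": [{"task_elements": {3: [10, 11], 4: [12, 13]}}]}
        ]
    }
    task_elems, shapes = ZarrPersistentStore._extract_submission_task_elements_array(sub_js)
    # task_elems[0] == [[3, 10, 11], [4, 12, 13]]  (dtype uint32)
    # shapes == [[[2, 2]]]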
841
+ @staticmethod
842
+ def _extract_submission_task_actions_array(
843
+ sub_js: Mapping[str, JSONed],
844
+ ) -> tuple[np.ndarray, list[list[int]]]:
845
+ """For a JSON-like representation of a Submission object, remove and concatenate
846
+ all jobscript-block task-action arrays into a single array.
847
+
848
+ Notes
849
+ -----
850
+ This mutates `sub_js`, by setting `task_actions` jobscript-block keys to `None`.
851
+
852
+ Parameters
853
+ ----------
854
+ sub_js
855
+ JSON-like representation of a `Submission` object.
856
+
857
+ Returns
858
+ -------
859
+ combined_task_acts
860
+ Integer 2D Numpy array which is a concatenation along the first axis of
861
+ task-action arrays from all jobscript blocks. The second dimension is of
862
+ length three.
863
+ block_num_acts
864
+ List of length equal to the number of jobscripts in the submission. Each
865
+ sub-list contains a list of `num_actions` of the constituent blocks of that
866
+ jobscript.
867
+
868
+ """
869
+ arrs = []
870
+
871
+ # a list for each jobscript, containing the number of actions in each block:
872
+
873
+ blk_num_acts = []
874
+ for js in cast("Sequence[Mapping[str, JSONed]]", sub_js["jobscripts"]):
875
+
876
+ blk_num_acts_js_i = []
877
+ for blk in cast("Sequence[MutableMapping[str, JSONed]]", js["blocks"]):
878
+
879
+ blk_acts = np.array(blk["task_actions"])
880
+ blk["task_actions"] = None # TODO: how to type?
881
+ blk_num_acts_js_i.append(blk_acts.shape[0])
882
+ arrs.append(blk_acts)
883
+
884
+ blk_num_acts.append(blk_num_acts_js_i)
885
+
886
+ combined_task_acts = np.vstack(arrs)
887
+
888
+ return combined_task_acts, blk_num_acts
889
+
890
+ @staticmethod
891
+ def _encode_jobscript_block_dependencies(sub_js: Mapping[str, JSONed]) -> np.ndarray:
892
+ """For a JSON-like representation of a Submission object, remove jobscript-block
893
+ dependencies for all jobscripts and transform to a single 1D integer array, that
894
+ can be transformed back by `_decode_jobscript_block_dependencies`.
895
+
896
+ Notes
897
+ -----
898
+ This mutates `sub_js`, by setting `dependencies` jobscript-block keys to `None`.
899
+ """
900
+
901
+ # TODO: avoid this horrible mess of casts
902
+
903
+ all_deps_arr = []
904
+ assert sub_js["jobscripts"] is not None
905
+ for js in cast("Sequence[Mapping[str, JSONed]]", sub_js["jobscripts"]):
906
+ for blk in cast("Sequence[MutableMapping[str, JSONed]]", js["blocks"]):
907
+ all_deps_i: list[int] = []
908
+ assert blk["dependencies"] is not None
909
+ blk_deps = cast(
910
+ "list[tuple[tuple[int, int], Mapping[str, JSONed]]]",
911
+ blk["dependencies"],
912
+ )
913
+ for (dep_js_idx, dep_blk_idx), dep in blk_deps:
914
+ deps_arr: list[int] = []
915
+ for elem_i, elements_j in cast(
916
+ "Mapping[int, Sequence[int]]", dep["js_element_mapping"]
917
+ ).items():
918
+ deps_arr.extend([len(elements_j) + 1, elem_i] + list(elements_j))
919
+ blk_arr = [
920
+ dep_js_idx,
921
+ dep_blk_idx,
922
+ int(cast("bool", dep["is_array"])),
923
+ ] + deps_arr
924
+ blk_arr = [len(blk_arr)] + blk_arr
925
+ all_deps_i.extend(blk_arr)
926
+ all_deps_i = [
927
+ cast("int", js["index"]),
928
+ cast("int", blk["index"]),
929
+ ] + all_deps_i
930
+ blk["dependencies"] = None # TODO: how to type?
931
+ all_deps_arr.extend([len(all_deps_i)] + all_deps_i)
932
+
933
+ return np.array(all_deps_arr)
934
+
935
+ @staticmethod
936
+ def _decode_jobscript_block_dependencies(
937
+ arr: np.ndarray,
938
+ ) -> dict[tuple[int, int], dict[tuple[int, int], ResolvedJobscriptBlockDependencies]]:
939
+ """Re-generate jobscript-block dependencies that have been transformed by
940
+ `_encode_jobscript_block_dependencies` into a single 1D integer array.
941
+
942
+ Parameters
943
+ ----------
944
+ arr:
945
+ The 1D integer array to transform back to a verbose jobscript-block dependency
946
+ mapping.
947
+ """
948
+ # metadata is js/blk_idx for which the dependencies are stored:
949
+ block_arrs = split_arr(arr, metadata_size=2)
950
+ block_deps = {}
951
+ for i in block_arrs:
952
+
953
+ js_idx: int
954
+ blk_idx: int
955
+ dep_js_idx: int
956
+ dep_blk_idx: int
957
+ is_array: int
958
+
959
+ js_idx, blk_idx = i[0]
960
+ # metadata is js/blk_idx that this block depends on, plus whether the
961
+ # dependency is an array dependency:
962
+ deps_arrs = split_arr(i[1], metadata_size=3)
963
+ all_deps_ij: dict[tuple[int, int], ResolvedJobscriptBlockDependencies] = {}
964
+ for j in deps_arrs:
965
+ dep_js_idx, dep_blk_idx, is_array = j[0]
966
+ # no metadata:
967
+ elem_deps = split_arr(j[1], metadata_size=0)
968
+ all_deps_ij[(dep_js_idx, dep_blk_idx)] = {
969
+ "js_element_mapping": {},
970
+ "is_array": bool(is_array),
971
+ }
972
+ for k in elem_deps:
973
+ all_deps_ij[(dep_js_idx, dep_blk_idx)]["js_element_mapping"].update(
974
+ {k[1][0]: list(k[1][1:])}
975
+ )
976
+
977
+ block_deps[(js_idx, blk_idx)] = all_deps_ij
978
+ return block_deps
979
+
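A worked example, hand-traced from the encoder above with illustrative indices (note that encoding nulls each block's "dependencies" key): a single block (jobscript 1, block 0) that depends on block (0, 0) via the element mapping {0: [0, 1]} encodes to one flat integer array, which the decoder is expected to round-trip:

    sub_js = {
        "jobscripts": [
            {
                "index": 1,
                "blocks": [
                    {
                        "index": 0,
                        "dependencies": [
                            [[0, 0], {"js_element_mapping": {0: [0, 1]}, "is_array": False}]
                        ],
                    }
                ],
            }
        ]
    }
    enc = ZarrPersistentStore._encode_jobscript_block_dependencies(sub_js)
    # enc == array([10, 1, 0, 7, 0, 0, 0, 3, 0, 0, 1])
    #   10         -> length of this block's record
    #   1, 0       -> (js_idx, blk_idx) owning the dependencies
    #   7          -> length of the single dependency record
    #   0, 0, 0    -> dep_js_idx, dep_blk_idx, is_array (False)
    #   3, 0, 0, 1 -> mapping entry: length + 1, key 0, elements [0, 1]
    dec = ZarrPersistentStore._decode_jobscript_block_dependencies(enc)
    # expected: {(1, 0): {(0, 0): {"js_element_mapping": {0: [0, 1]}, "is_array": False}}}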
980
+ def _append_submissions(self, subs: dict[int, Mapping[str, JSONed]]):
981
+
982
+ for sub_idx, sub_i in subs.items():
983
+
984
+ # add a new metadata group for this submission:
985
+ sub_grp = self._get_all_submissions_metadata_group(mode="r+").create_group(
986
+ sub_idx
987
+ )
988
+
989
+ # add a new at-submit metadata array for jobscripts of this submission:
990
+ num_js = len(cast("list", sub_i["jobscripts"]))
991
+ sub_grp.create_dataset(
992
+ name=self._js_at_submit_md_arr_name,
993
+ shape=num_js,
994
+ dtype=object,
995
+ object_codec=MsgPack(),
996
+ chunks=1,
997
+ write_empty_chunks=False,
998
+ )
999
+
1000
+ # add a new array to store run IDs for each jobscript:
1001
+ combined_run_IDs, block_shapes = self._extract_submission_run_IDs_array(sub_i)
1002
+ run_IDs_arr = sub_grp.create_dataset(
1003
+ name=self._js_run_IDs_arr_name,
1004
+ data=combined_run_IDs,
1005
+ chunks=(None, None, None), # single chunk for the whole array
1006
+ )
1007
+ run_IDs_arr.attrs["block_shapes"] = block_shapes
1008
+
1009
+ # add a new array to store task-element map for each jobscript:
1010
+ (
1011
+ combined_task_elems,
1012
+ block_shapes,
1013
+ ) = self._extract_submission_task_elements_array(sub_i)
1014
+ task_elems_arr = sub_grp.create_dataset(
1015
+ name=self._js_task_elems_arr_name,
1016
+ data=combined_task_elems,
1017
+ chunks=(None, None, None),
1018
+ )
1019
+ task_elems_arr.attrs["block_shapes"] = block_shapes
1020
+
1021
+ # add a new array to store task-actions for each jobscript:
1022
+ (
1023
+ combined_task_acts,
1024
+ block_num_acts,
1025
+ ) = self._extract_submission_task_actions_array(sub_i)
1026
+ task_acts_arr = sub_grp.create_dataset(
1027
+ name=self._js_task_acts_arr_name,
1028
+ data=combined_task_acts,
1029
+ chunks=(None, None),
1030
+ )
1031
+ task_acts_arr.attrs["block_num_acts"] = block_num_acts
1032
+
1033
+ # add a new array to store jobscript-block dependencies for this submission:
1034
+ sub_grp.create_dataset(
1035
+ name=self._js_deps_arr_name,
1036
+ data=self._encode_jobscript_block_dependencies(sub_i),
1037
+ chunks=(None,),
1038
+ )
1039
+
1040
+ # TODO: store block shapes in `grp.attrs` since it is defined at the
1041
+ # submission level
1042
+
1043
+ # add attributes for at-submit-time submission metadata:
1044
+ grp = self._get_submission_metadata_group(sub_idx, mode="r+")
1045
+ grp.attrs["submission_parts"] = {}
1046
+
1047
+ with self.using_resource("attrs", action="update") as attrs:
1048
+ attrs["submissions"].extend(subs.values())
1049
+
1050
+ def _append_task_element_IDs(self, task_ID: int, elem_IDs: list[int]):
1051
+ # I don't think there's a way to "append" to an existing array in a zarr ragged
1052
+ # array? So we have to build a new array from existing + new.
1053
+ arr = self._get_tasks_arr(mode="r+")
1054
+ elem_IDs_cur = arr[task_ID]
1055
+ elem_IDs_new = np.concatenate((elem_IDs_cur, elem_IDs))
1056
+ arr[task_ID] = elem_IDs_new
1057
+
1058
+ @staticmethod
1059
+ def __as_dict(attrs: Attributes) -> ZarrAttrs:
1060
+ """
1061
+ Type thunk to work around incomplete typing in zarr.
1062
+ """
1063
+ return cast("ZarrAttrs", attrs.asdict())
1064
+
1065
+ @contextmanager
1066
+ def __mutate_attrs(self, arr: Array) -> Iterator[ZarrAttrs]:
1067
+ attrs_orig = self.__as_dict(arr.attrs)
1068
+ attrs = copy.deepcopy(attrs_orig)
1069
+ yield attrs
1070
+ if attrs != attrs_orig:
1071
+ arr.attrs.put(attrs)
1072
+
1073
+ def _append_elements(self, elems: Sequence[ZarrStoreElement]):
1074
+ arr = self._get_elements_arr(mode="r+")
1075
+ with self.__mutate_attrs(arr) as attrs:
1076
+ arr_add = np.empty((len(elems)), dtype=object)
1077
+ arr_add[:] = [elem.encode(attrs) for elem in elems]
1078
+ arr.append(arr_add)
1079
+
1080
+ def _append_element_sets(self, task_id: int, es_js: Sequence[Mapping]):
1081
+ task_idx = self._get_task_id_to_idx_map()[task_id]
1082
+ with self.using_resource("attrs", "update") as attrs:
1083
+ attrs["template"]["tasks"][task_idx]["element_sets"].extend(es_js)
1084
+
1085
+ def _append_elem_iter_IDs(self, elem_ID: int, iter_IDs: Iterable[int]):
1086
+ arr = self._get_elements_arr(mode="r+")
1087
+ attrs = self.__as_dict(arr.attrs)
1088
+ elem_dat = cast("list", arr[elem_ID])
1089
+ store_elem = ZarrStoreElement.decode(elem_dat, attrs)
1090
+ store_elem = store_elem.append_iteration_IDs(iter_IDs)
1091
+ arr[elem_ID] = store_elem.encode(attrs)
1092
+ # attrs shouldn't be mutated (TODO: test!)
1093
+
1094
+ def _append_elem_iters(self, iters: Sequence[ZarrStoreElementIter]):
1095
+ arr = self._get_iters_arr(mode="r+")
1096
+ with self.__mutate_attrs(arr) as attrs:
1097
+ arr_add = np.empty((len(iters)), dtype=object)
1098
+ arr_add[:] = [i.encode(attrs) for i in iters]
1099
+ arr.append(arr_add)
1100
+
1101
+ def _append_elem_iter_EAR_IDs(
1102
+ self, iter_ID: int, act_idx: int, EAR_IDs: Sequence[int]
1103
+ ):
1104
+ arr = self._get_iters_arr(mode="r+")
1105
+ attrs = self.__as_dict(arr.attrs)
1106
+ iter_dat = cast("list", arr[iter_ID])
1107
+ store_iter = ZarrStoreElementIter.decode(iter_dat, attrs)
1108
+ store_iter = store_iter.append_EAR_IDs(pend_IDs={act_idx: EAR_IDs})
1109
+ arr[iter_ID] = store_iter.encode(attrs)
1110
+ # attrs shouldn't be mutated (TODO: test!)
1111
+
1112
+ def _update_elem_iter_EARs_initialised(self, iter_ID: int):
1113
+ arr = self._get_iters_arr(mode="r+")
1114
+ attrs = self.__as_dict(arr.attrs)
1115
+ iter_dat = cast("list", arr[iter_ID])
1116
+ store_iter = ZarrStoreElementIter.decode(iter_dat, attrs)
1117
+ store_iter = store_iter.set_EARs_initialised()
1118
+ arr[iter_ID] = store_iter.encode(attrs)
1119
+ # attrs shouldn't be mutated (TODO: test!)
1120
+
1121
+ def _update_at_submit_metadata(
1122
+ self,
1123
+ at_submit_metadata: dict[int, dict[str, Any]],
1124
+ ):
1125
+ for sub_idx, metadata_i in at_submit_metadata.items():
1126
+ grp = self._get_submission_metadata_group(sub_idx, mode="r+")
1127
+ attrs = self.__as_dict(grp.attrs)
1128
+ attrs["submission_parts"].update(metadata_i["submission_parts"])
1129
+ grp.attrs.put(attrs)
1130
+
1131
+ def _update_loop_index(self, loop_indices: dict[int, dict[str, int]]):
1132
+
1133
+ arr = self._get_iters_arr(mode="r+")
1134
+ attrs = self.__as_dict(arr.attrs)
1135
+ iter_IDs = list(loop_indices.keys())
1136
+ iter_dat = arr.get_coordinate_selection(iter_IDs)
1137
+ store_iters = [ZarrStoreElementIter.decode(i, attrs) for i in iter_dat]
1138
+
1139
+ for idx, iter_ID_i in enumerate(iter_IDs):
1140
+ new_iter_i = store_iters[idx].update_loop_idx(loop_indices[iter_ID_i])
1141
+ # seems to be a Zarr bug that prevents `set_coordinate_selection` with an
1142
+ # object array, so set one-by-one:
1143
+ arr[iter_ID_i] = new_iter_i.encode(attrs)
1144
+
1145
+ def _update_loop_num_iters(self, index: int, num_iters: list[list[list[int] | int]]):
1146
+ with self.using_resource("attrs", action="update") as attrs:
1147
+ attrs["loops"][index]["num_added_iterations"] = num_iters
1148
+
1149
+ def _update_loop_parents(self, index: int, parents: list[str]):
1150
+ with self.using_resource("attrs", action="update") as attrs:
1151
+ attrs["loops"][index]["parents"] = parents
1152
+
1153
+ def _update_iter_data_indices(self, iter_data_indices: dict[int, DataIndex]):
1154
+
1155
+ arr = self._get_iters_arr(mode="r+")
1156
+ attrs = self.__as_dict(arr.attrs)
1157
+ iter_IDs = list(iter_data_indices.keys())
1158
+ iter_dat = arr.get_coordinate_selection(iter_IDs)
1159
+ store_iters = [ZarrStoreElementIter.decode(i, attrs) for i in iter_dat]
1160
+
1161
+ for idx, iter_ID_i in enumerate(iter_IDs):
1162
+ new_iter_i = store_iters[idx].update_data_idx(iter_data_indices[iter_ID_i])
1163
+ # seems to be a Zarr bug that prevents `set_coordinate_selection` with an
1164
+ # object array, so set one-by-one:
1165
+ arr[iter_ID_i] = new_iter_i.encode(attrs)
1166
+
1167
+ def _update_run_data_indices(self, run_data_indices: dict[int, DataIndex]):
1168
+ self._update_runs(
1169
+ updates={k: {"data_idx": v} for k, v in run_data_indices.items()}
1170
+ )
1171
+
1172
+ def _append_EARs(self, EARs: Sequence[ZarrStoreEAR]):
1173
+ arr = self._get_EARs_arr(mode="r+")
1174
+ with self.__mutate_attrs(arr) as attrs:
1175
+ num_existing = attrs["num_runs"]
1176
+ num_add = len(EARs)
1177
+ num_tot = num_existing + num_add
1178
+ arr_add = np.empty(num_add, dtype=object)
1179
+ arr_add[:] = [i.encode(self.ts_fmt, attrs) for i in EARs]
1180
+
1181
+ # get new 1D indices:
1182
+ new_idx: NDArray = np.arange(num_existing, num_tot)
1183
+
1184
+ # transform to 2D indices:
1185
+ r_idx, c_idx = get_2D_idx(new_idx, num_cols=arr.shape[1])
1186
+
1187
+ # add rows to accommodate new runs:
1188
+ max_r_idx = np.max(r_idx)
1189
+ if max_r_idx + 1 > arr.shape[0]:
1190
+ arr.resize(max_r_idx + 1, arr.shape[1])
1191
+
1192
+ # fill in new data:
1193
+ for arr_add_idx_i, (r_idx_i, c_idx_i) in enumerate(zip(r_idx, c_idx)):
1194
+ # seems to be a Zarr bug that prevents `set_coordinate_selection` with an
1195
+ # object array, so set one-by-one:
1196
+ arr[r_idx_i, c_idx_i] = arr_add[arr_add_idx_i]
1197
+
1198
+ attrs["num_runs"] = num_tot
1199
+
1200
+ # add more rows to run dirs array:
1201
+ dirs_arr = self._get_dirs_arr(mode="r+")
1202
+ dirs_arr.resize(num_tot)
1203
+
1204
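# Illustrative sketch (not part of the diff): `_append_EARs` above stores runs in a
# fixed-width 2D object array, so flat run IDs must be mapped to (row, column)
# indices. Assuming the mapping is a plain divmod by the column count (the real
# `get_2D_idx` helper may differ in detail):
import numpy as np

def to_2d_idx(flat_ids: np.ndarray, num_cols: int) -> tuple[np.ndarray, np.ndarray]:
    # row = integer quotient, column = remainder
    return np.divmod(flat_ids, num_cols)

r_idx, c_idx = to_2d_idx(np.arange(10, 14), num_cols=4)
# r_idx -> array([2, 2, 3, 3]); c_idx -> array([2, 3, 0, 1]); a maximum row index of 3
# means the runs array must be resized to at least 4 rows before writing.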
+ def _set_run_dirs(self, run_dir_arr: np.ndarray, run_idx: np.ndarray):
1205
+ dirs_arr = self._get_dirs_arr(mode="r+")
1206
+ dirs_arr[run_idx] = run_dir_arr
1207
+
1208
+ @TimeIt.decorator
1209
+ def _update_runs(self, updates: dict[int, dict[str, Any]]):
1210
+ """Update the provided EAR attribute values in the specified existing runs."""
1211
+ run_IDs = list(updates.keys())
1212
+ runs = self._get_persistent_EARs(run_IDs)
1213
+
1214
+ arr = self._get_EARs_arr(mode="r+")
1215
+ with self.__mutate_attrs(arr) as attrs:
1216
+ # convert to 2D array indices:
1217
+ r_idx, c_idx = get_2D_idx(
1218
+ np.array(list(updates.keys())), num_cols=arr.shape[1]
1219
+ )
1220
+ for ri, ci, rID_i, upd_i in zip(
1221
+ r_idx, c_idx, updates.keys(), updates.values()
1222
+ ):
1223
+ new_run_i = runs[rID_i].update(**upd_i)
1224
+ # seems to be a Zarr bug that prevents `set_coordinate_selection` with an
1225
+ # object array, so set one-by-one:
1226
+ arr[ri, ci] = new_run_i.encode(self.ts_fmt, attrs)
1227
+
1228
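# Illustrative sketch (not part of the diff) of the read-modify-write pattern used by
# `_update_runs`: cells of an object array are read together but written back one at a
# time, per the `set_coordinate_selection` workaround noted in the comments above.
# The array, codec, and update payloads here are hypothetical (zarr v2 API assumed).
import numpy as np
import zarr
from numcodecs import JSON

arr = zarr.empty((4, 3), chunks=(2, 3), dtype=object, object_codec=JSON())
updates = {0: {"skip": 1}, 4: {"skip": 2}}  # flat run ID -> fields to change
r_idx, c_idx = np.divmod(np.array(list(updates)), arr.shape[1])
for ri, ci, upd in zip(r_idx, c_idx, updates.values()):
    cell = arr[ri, ci] or {}  # unset cells read back as the (None) fill value
    cell.update(upd)
    arr[ri, ci] = cell  # write one cell at a time rather than set_coordinate_selection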
+ @TimeIt.decorator
1229
+ def _update_EAR_submission_data(self, sub_data: Mapping[int, tuple[int, int | None]]):
1230
+ self._update_runs(
1231
+ updates={
1232
+ k: {"submission_idx": v[0], "commands_file_ID": v[1]}
1233
+ for k, v in sub_data.items()
1234
+ }
1235
+ )
1236
+
1237
+ def _update_EAR_start(
1238
+ self,
1239
+ run_starts: dict[int, tuple[datetime, dict[str, Any] | None, str, int | None]],
1240
+ ):
1241
+ self._update_runs(
1242
+ updates={
1243
+ k: {
1244
+ "start_time": v[0],
1245
+ "snapshot_start": v[1],
1246
+ "run_hostname": v[2],
1247
+ "port_number": v[3],
1248
+ }
1249
+ for k, v in run_starts.items()
1250
+ }
1251
+ )
1252
+
1253
+ def _update_EAR_end(
1254
+ self, run_ends: dict[int, tuple[datetime, dict[str, Any] | None, int, bool]]
1255
+ ):
1256
+ self._update_runs(
1257
+ updates={
1258
+ k: {
1259
+ "end_time": v[0],
1260
+ "snapshot_end": v[1],
1261
+ "exit_code": v[2],
1262
+ "success": v[3],
1263
+ }
1264
+ for k, v in run_ends.items()
1265
+ }
1266
+ )
1267
+
1268
+ def _update_EAR_skip(self, skips: dict[int, int]):
1269
+ self._update_runs(updates={k: {"skip": v} for k, v in skips.items()})
1270
+
1271
+ def _update_js_metadata(self, js_meta: dict[int, dict[int, dict[str, Any]]]):
1272
+
1273
+ arr_keys = JOBSCRIPT_SUBMIT_TIME_KEYS # these items go to the Zarr array
1274
+
1275
+ # split into attributes to save to the root group metadata, and those to save to
1276
+ # the submit-time jobscript metadata array
1277
+
1278
+ grp_dat = {} # keys are tuples of (sub_idx, js_idx), values are metadata dicts
1279
+
1280
+ for sub_idx, all_js_md in js_meta.items():
1281
+ js_arr = None
1282
+ for js_idx, js_meta_i in all_js_md.items():
1283
+
1284
+ grp_dat_i = {k: v for k, v in js_meta_i.items() if k not in arr_keys}
1285
+ if grp_dat_i:
1286
+ grp_dat[(sub_idx, js_idx)] = grp_dat_i
1287
+ arr_dat = [js_meta_i.get(k) for k in arr_keys]
1288
+
1289
+ if any(arr_dat):
1290
+ # we are updating the at-submit metadata, so clear the cache:
1291
+ self.clear_jobscript_at_submit_metadata_cache()
1292
+
1293
+ js_arr = js_arr or self._get_jobscripts_at_submit_metadata_arr(
1294
+ mode="r+", sub_idx=sub_idx
1295
+ )
1296
+ self.logger.info(
1297
+ f"updating submit-time jobscript metadata array: {arr_dat!r}."
1298
+ )
1299
+ js_arr[js_idx] = arr_dat
1300
+
1301
+ if grp_dat:
1302
+ with self.using_resource("attrs", action="update") as attrs:
1303
+ for (sub_idx, js_idx), js_meta_i in grp_dat.items():
1304
+ self.logger.info(
1305
+ f"updating jobscript metadata in the root group for "
1306
+ f"(sub={sub_idx}, js={js_idx}): {js_meta_i!r}."
1307
+ )
1308
+ sub = cast(
1309
+ "dict[str, list[dict[str, Any]]]", attrs["submissions"][sub_idx]
1310
+ )
1311
+ sub["jobscripts"][js_idx].update(js_meta_i)
1312
+
1313
+ def _append_parameters(self, params: Sequence[StoreParameter]):
1314
+ """Add new persistent parameters."""
1315
+ self._ensure_all_encoders()
1316
+ base_arr = self._get_parameter_base_array(mode="r+", write_empty_chunks=False)
1317
+ src_arr = self._get_parameter_sources_array(mode="r+")
1318
+ self.logger.debug(
1319
+ f"PersistentStore._append_parameters: adding {len(params)} parameters."
1320
+ )
1321
+
1322
+ param_encode_root_group = self._get_parameter_user_array_group(mode="r+")
1323
+ param_enc: list[dict[str, Any] | int] = []
1324
+ src_enc: list[dict] = []
1325
+ for param_i in params:
1326
+ dat_i = param_i.encode(
1327
+ root_group=param_encode_root_group,
1328
+ arr_path=self._param_data_arr_grp_name(param_i.id_),
1329
+ )
1330
+ param_enc.append(dat_i)
1331
+ src_enc.append(dict(sorted(param_i.source.items())))
1332
+
1333
+ base_arr.append(param_enc)
1334
+ src_arr.append(src_enc)
1335
+ self.logger.debug(
1336
+ f"PersistentStore._append_parameters: finished adding {len(params)} parameters."
1337
+ )
1338
+
1339
+ def _set_parameter_values(self, set_parameters: dict[int, tuple[Any, bool]]):
1340
+ """Set multiple unset persistent parameters."""
1341
+ self._ensure_all_encoders()
1342
+ param_ids = list(set_parameters)
1343
+ # the `decode` call in `_get_persistent_parameters` should be quick:
1344
+ params = self._get_persistent_parameters(param_ids)
1345
+ new_data: list[dict[str, Any] | int] = []
1346
+ param_encode_root_group = self._get_parameter_user_array_group(mode="r+")
1347
+ for param_id, (value, is_file) in set_parameters.items():
1348
+ param_i = params[param_id]
1349
+ if is_file:
1350
+ param_i = param_i.set_file(value)
1351
+ else:
1352
+ param_i = param_i.set_data(value)
1353
+
1354
+ new_data.append(
1355
+ param_i.encode(
1356
+ root_group=param_encode_root_group,
1357
+ arr_path=self._param_data_arr_grp_name(param_i.id_),
1358
+ )
1359
+ )
1360
+
1361
+ # no need to update sources array:
1362
+ base_arr = self._get_parameter_base_array(mode="r+")
1363
+ base_arr.set_coordinate_selection(param_ids, new_data)
1364
+
1365
+ def _update_parameter_sources(self, sources: Mapping[int, ParamSource]):
1366
+ """Update the sources of multiple persistent parameters."""
1367
+
1368
+ param_ids = list(sources)
1369
+ src_arr = self._get_parameter_sources_array(mode="r+")
1370
+ existing_sources = src_arr.get_coordinate_selection(param_ids)
1371
+ new_sources = [
1372
+ update_param_source_dict(cast("ParamSource", existing_sources[idx]), source_i)
1373
+ for idx, source_i in enumerate(sources.values())
1374
+ ]
1375
+ src_arr.set_coordinate_selection(param_ids, new_sources)
1376
+
1377
+ def _update_template_components(self, tc: dict[str, Any]):
1378
+ with self.using_resource("attrs", "update") as md:
1379
+ md["template_components"] = tc
1380
+
1381
+ @TimeIt.decorator
1382
+ def _get_num_persistent_tasks(self) -> int:
1383
+ """Get the number of persistent tasks."""
1384
+ if self.use_cache and self.num_tasks_cache is not None:
1385
+ num = self.num_tasks_cache
1386
+ else:
1387
+ num = len(self._get_tasks_arr())
1388
+ if self.use_cache and self.num_tasks_cache is None:
1389
+ self.num_tasks_cache = num
1390
+ return num
1391
+
1392
+ def _get_num_persistent_loops(self) -> int:
1393
+ """Get the number of persistent loops."""
1394
+ with self.using_resource("attrs", "read") as attrs:
1395
+ return len(attrs["loops"])
1396
+
1397
+ def _get_num_persistent_submissions(self) -> int:
1398
+ """Get the number of persistent submissions."""
1399
+ with self.using_resource("attrs", "read") as attrs:
1400
+ return len(attrs["submissions"])
1401
+
1402
+ def _get_num_persistent_elements(self) -> int:
1403
+ """Get the number of persistent elements."""
1404
+ return len(self._get_elements_arr())
1405
+
1406
+ def _get_num_persistent_elem_iters(self) -> int:
1407
+ """Get the number of persistent element iterations."""
1408
+ return len(self._get_iters_arr())
1409
+
1410
+ @TimeIt.decorator
1411
+ def _get_num_persistent_EARs(self) -> int:
1412
+ """Get the number of persistent EARs."""
1413
+ if self.use_cache and self.num_EARs_cache is not None:
1414
+ num = self.num_EARs_cache
1415
+ else:
1416
+ num = self._get_EARs_arr().attrs["num_runs"]
1417
+ if self.use_cache and self.num_EARs_cache is None:
1418
+ self.num_EARs_cache = num
1419
+ return num
1420
+
1421
+ def _get_num_persistent_parameters(self):
1422
+ return len(self._get_parameter_base_array())
1423
+
1424
+ def _get_num_persistent_added_tasks(self):
1425
+ with self.using_resource("attrs", "read") as attrs:
1426
+ return attrs["num_added_tasks"]
1427
+
1428
+ @property
1429
+ def zarr_store(self) -> Store:
1430
+ """
1431
+ The underlying store object.
1432
+ """
1433
+ if self._zarr_store is None:
1434
+ assert self.fs is not None
1435
+ self._zarr_store = self._get_zarr_store(self.path, self.fs)
1436
+ return self._zarr_store
1437
+
1438
+ def _get_root_group(self, mode: str = "r", **kwargs) -> Group:
1439
+ # TODO: investigate if there are inefficiencies in how we retrieve zarr groups
1440
+ # and arrays, e.g. opening sub groups sequentially would open the root group
1441
+ # multiple times, and so read the root group attrs file multiple times?
1442
+ # it might make sense to define a ZarrAttrsStoreResource for each zarr group and
1443
+ # array (or at least non-parameter groups/arrays?), there could be some built-in
1444
+ # understanding of the hierarchy (e.g. via a `path` attribute) which would then
1445
+ # avoid reading parent groups multiple times --- if that is happening currently.
1446
+ return zarr.open(self.zarr_store, mode=mode, **kwargs)
1447
+
1448
+ def _get_parameter_group(self, mode: str = "r", **kwargs) -> Group:
1449
+ return self._get_root_group(mode=mode, **kwargs).get(self._param_grp_name)
1450
+
1451
+ def _get_parameter_base_array(self, mode: str = "r", **kwargs) -> Array:
1452
+ path = f"{self._param_grp_name}/{self._param_base_arr_name}"
1453
+ return zarr.open(self.zarr_store, mode=mode, path=path, **kwargs)
1454
+
1455
+ def _get_parameter_sources_array(self, mode: str = "r") -> Array:
1456
+ return self._get_parameter_group(mode=mode).get(self._param_sources_arr_name)
1457
+
1458
+ def _get_parameter_user_array_group(self, mode: str = "r") -> Group:
1459
+ return self._get_parameter_group(mode=mode).get(self._param_user_arr_grp_name)
1460
+
1461
+ def _get_parameter_data_array_group(
1462
+ self,
1463
+ parameter_idx: int,
1464
+ mode: str = "r",
1465
+ ) -> Group:
1466
+ return self._get_parameter_user_array_group(mode=mode).get(
1467
+ self._param_data_arr_grp_name(parameter_idx)
1468
+ )
1469
+
1470
+ def _get_array_group_and_dataset(
1471
+ self, mode: str, param_id: int, data_path: list[int]
1472
+ ):
1473
+ base_dat = self._get_parameter_base_array(mode="r")[param_id]
1474
+ for arr_dat_path, arr_idx in base_dat["type_lookup"]["arrays"]:
1475
+ if arr_dat_path == data_path:
1476
+ break
1477
+ else:
1478
+ raise ValueError(
1479
+ f"Could not find array path {data_path} in the base data for parameter "
1480
+ f"ID {param_id}."
1481
+ )
1482
+ group = self._get_parameter_user_array_group(mode=mode).get(
1483
+ f"{self._param_data_arr_grp_name(param_id)}"
1484
+ )
1485
+ return group, f"arr_{arr_idx}"
1486
+
1487
+ def _get_metadata_group(self, mode: str = "r") -> Group:
1488
+ try:
1489
+ path = Path(self.workflow.url).joinpath("metadata")
1490
+ md_store = zarr.NestedDirectoryStore(path)
1491
+ return zarr.open_group(store=md_store, mode=mode)
1492
+ except (FileNotFoundError, zarr.errors.GroupNotFoundError):
1493
+ # zip store?
1494
+ return zarr.open_group(self.zarr_store, path="metadata", mode=mode)
1495
+
1496
+ def _get_all_submissions_metadata_group(self, mode: str = "r") -> Group:
1497
+ return self._get_metadata_group(mode=mode).get(self._subs_md_group_name)
1498
+
1499
+ def _get_submission_metadata_group(self, sub_idx: int, mode: str = "r") -> Group:
1500
+ return self._get_all_submissions_metadata_group(mode=mode).get(sub_idx)
1501
+
1502
+ def _get_submission_metadata_group_path(self, sub_idx: int) -> Path:
1503
+ grp = self._get_submission_metadata_group(sub_idx)
1504
+ return Path(grp.store.path).joinpath(grp.path)
1505
+
1506
+ def _get_jobscripts_at_submit_metadata_arr(
1507
+ self, sub_idx: int, mode: str = "r"
1508
+ ) -> Array:
1509
+ return self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode).get(
1510
+ self._js_at_submit_md_arr_name
1511
+ )
1512
+
1513
+ def _get_jobscripts_at_submit_metadata_arr_path(self, sub_idx: int) -> Path:
1514
+ arr = self._get_jobscripts_at_submit_metadata_arr(sub_idx)
1515
+ return Path(arr.store.path).joinpath(arr.path)
1516
+
1517
+ @TimeIt.decorator
1518
+ def _get_jobscripts_run_ID_arr(self, sub_idx: int, mode: str = "r") -> Array:
1519
+ return self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode).get(
1520
+ self._js_run_IDs_arr_name
1521
+ )
1522
+
1523
+ def _get_jobscripts_task_elements_arr(self, sub_idx: int, mode: str = "r") -> Array:
1524
+ return self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode).get(
1525
+ self._js_task_elems_arr_name
1526
+ )
1527
+
1528
+ def _get_jobscripts_task_actions_arr(self, sub_idx: int, mode: str = "r") -> Array:
1529
+ return self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode).get(
1530
+ self._js_task_acts_arr_name
1531
+ )
1532
+
1533
+ def _get_jobscripts_dependencies_arr(self, sub_idx: int, mode: str = "r") -> Array:
1534
+ return self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode).get(
1535
+ self._js_deps_arr_name
1536
+ )
1537
+
1538
+ def _get_tasks_arr(self, mode: str = "r") -> Array:
1539
+ return self._get_metadata_group(mode=mode).get(self._task_arr_name)
1540
+
1541
+ def _get_elements_arr(self, mode: str = "r") -> Array:
1542
+ return self._get_metadata_group(mode=mode).get(self._elem_arr_name)
1543
+
1544
+ def _get_iters_arr(self, mode: str = "r") -> Array:
1545
+ return self._get_metadata_group(mode=mode).get(self._iter_arr_name)
1546
+
1547
+ def _get_EARs_arr(self, mode: str = "r") -> Array:
1548
+ return self._get_metadata_group(mode=mode).get(self._EAR_arr_name)
1549
+
1550
+ def _get_dirs_arr(self, mode: str = "r") -> zarr.Array:
1551
+ return self._get_metadata_group(mode=mode).get(self._run_dir_arr_name)
1552
+
1553
+ @classmethod
1554
+ def make_test_store_from_spec(
1555
+ cls,
1556
+ spec,
1557
+ dir=None,
1558
+ path="test_store",
1559
+ overwrite=False,
1560
+ ):
1561
+ """Generate an store for testing purposes."""
1562
+ ts_fmt = "FIXME"
1563
+
1564
+ path = Path(dir or "", path)
1565
+ root = zarr.group(store=DirectoryStore(path), overwrite=overwrite)
1566
+ md = root.create_group("metadata")
1567
+
1568
+ tasks_arr = md.create_dataset(
1569
+ name=cls._task_arr_name,
1570
+ shape=0,
1571
+ dtype=object,
1572
+ object_codec=VLenArray(int),
1573
+ )
1574
+
1575
+ elems_arr = md.create_dataset(
1576
+ name=cls._elem_arr_name,
1577
+ shape=0,
1578
+ dtype=object,
1579
+ object_codec=cls._CODEC,
1580
+ chunks=1000,
1581
+ )
1582
+ elems_arr.attrs.update({"seq_idx": [], "src_idx": []})
1583
+
1584
+ elem_iters_arr = md.create_dataset(
1585
+ name=cls._iter_arr_name,
1586
+ shape=0,
1587
+ dtype=object,
1588
+ object_codec=cls._CODEC,
1589
+ chunks=1000,
1590
+ )
1591
+ elem_iters_arr.attrs.update(
1592
+ {
1593
+ "loops": [],
1594
+ "schema_parameters": [],
1595
+ "parameter_paths": [],
1596
+ }
1597
+ )
1598
+
1599
+ EARs_arr = md.create_dataset(
1600
+ name=cls._EAR_arr_name,
1601
+ shape=0,
1602
+ dtype=object,
1603
+ object_codec=cls._CODEC,
1604
+ chunks=1000,
1605
+ )
1606
+ EARs_arr.attrs["parameter_paths"] = []
1607
+
1608
+ tasks, elems, elem_iters, EARs_ = super().prepare_test_store_from_spec(spec)
1609
+
1610
+ path = Path(path).resolve()
1611
+ tasks = [ZarrStoreTask(**i).encode() for i in tasks]
1612
+ elements = [ZarrStoreElement(**i).encode(elems_arr.attrs.asdict()) for i in elems]
1613
+ elem_iters = [
1614
+ ZarrStoreElementIter(**i).encode(elem_iters_arr.attrs.asdict())
1615
+ for i in elem_iters
1616
+ ]
1617
+ EARs = [ZarrStoreEAR(**i).encode(ts_fmt, EARs_arr.attrs.asdict()) for i in EARs_]
1618
+
1619
+ append_items_to_ragged_array(tasks_arr, tasks)
1620
+
1621
+ elems_arr.append(np.fromiter(elements, dtype=object))
1622
+ elem_iters_arr.append(np.fromiter(elem_iters, dtype=object))
1623
+ EARs_arr.append(np.fromiter(EARs, dtype=object))
1624
+
1625
+ return cls(path)
1626
+
1627
+ def _get_persistent_template_components(self):
1628
+ with self.using_resource("attrs", "read") as attrs:
1629
+ return attrs["template_components"]
1630
+
1631
+ def _get_persistent_template(self) -> dict[str, JSONed]:
1632
+ with self.using_resource("attrs", "read") as attrs:
1633
+ return cast("dict[str, JSONed]", attrs["template"])
1634
+
1635
+ @TimeIt.decorator
1636
+ def _get_persistent_tasks(self, id_lst: Iterable[int]) -> dict[int, ZarrStoreTask]:
1637
+ tasks, id_lst = self._get_cached_persistent_tasks(id_lst)
1638
+ if id_lst:
1639
+ with self.using_resource("attrs", action="read") as attrs:
1640
+ task_dat: dict[int, dict[str, Any]] = {}
1641
+ elem_IDs: list[int] = []
1642
+ i: dict[str, Any]
1643
+ for idx, i in enumerate(attrs["tasks"]):
1644
+ i = copy.deepcopy(i)
1645
+ elem_IDs.append(i.pop("element_IDs_idx"))
1646
+ if id_lst is None or i["id_"] in id_lst:
1647
+ task_dat[i["id_"]] = {**i, "index": idx}
1648
+ if task_dat:
1649
+ try:
1650
+ elem_IDs_arr_dat = self._get_tasks_arr().get_coordinate_selection(
1651
+ elem_IDs
1652
+ )
1653
+ except BoundsCheckError:
1654
+ raise MissingStoreTaskError(
1655
+ elem_IDs
1656
+ ) from None # TODO: not an ID list
1657
+
1658
+ new_tasks = {
1659
+ id_: ZarrStoreTask.decode({**i, "element_IDs": elem_IDs_arr_dat[id_]})
1660
+ for id_, i in task_dat.items()
1661
+ }
1662
+ self.task_cache.update(new_tasks)
1663
+ tasks.update(new_tasks)
1664
+ return tasks
1665
+
1666
+ @TimeIt.decorator
1667
+ def _get_persistent_loops(
1668
+ self, id_lst: Iterable[int] | None = None
1669
+ ) -> dict[int, LoopDescriptor]:
1670
+ with self.using_resource("attrs", "read") as attrs:
1671
+ return {
1672
+ idx: cast("LoopDescriptor", i)
1673
+ for idx, i in enumerate(attrs["loops"])
1674
+ if id_lst is None or idx in id_lst
1675
+ }
1676
+
1677
+ @TimeIt.decorator
1678
+ def _get_persistent_submissions(
1679
+ self, id_lst: Iterable[int] | None = None
1680
+ ) -> dict[int, Mapping[str, JSONed]]:
1681
+ self.logger.debug("loading persistent submissions from the zarr store")
1682
+ ids = set(id_lst or ())
1683
+ with self.using_resource("attrs", "read") as attrs:
1684
+ subs_dat = copy.deepcopy(
1685
+ {
1686
+ idx: i
1687
+ for idx, i in enumerate(attrs["submissions"])
1688
+ if id_lst is None or idx in ids
1689
+ }
1690
+ )
1691
+
1692
+ return subs_dat
1693
+
1694
+ @TimeIt.decorator
1695
+ def _get_persistent_elements(
1696
+ self, id_lst: Iterable[int]
1697
+ ) -> dict[int, ZarrStoreElement]:
1698
+ elems, id_lst = self._get_cached_persistent_elements(id_lst)
1699
+ if id_lst:
1700
+ self.logger.debug(
1701
+ f"loading {len(id_lst)} persistent element(s) from disk: "
1702
+ f"{shorten_list_str(id_lst)}."
1703
+ )
1704
+ arr = self._get_elements_arr()
1705
+ attrs = arr.attrs.asdict()
1706
+ try:
1707
+ elem_arr_dat = arr.get_coordinate_selection(id_lst)
1708
+ except BoundsCheckError:
1709
+ raise MissingStoreElementError(id_lst) from None
1710
+ elem_dat = dict(zip(id_lst, elem_arr_dat))
1711
+ new_elems = {
1712
+ k: ZarrStoreElement.decode(v, attrs) for k, v in elem_dat.items()
1713
+ }
1714
+ self.element_cache.update(new_elems)
1715
+ elems.update(new_elems)
1716
+ return elems
1717
+
1718
+ @TimeIt.decorator
1719
+ def _get_persistent_element_iters(
1720
+ self, id_lst: Iterable[int]
1721
+ ) -> dict[int, ZarrStoreElementIter]:
1722
+ iters, id_lst = self._get_cached_persistent_element_iters(id_lst)
1723
+ if id_lst:
1724
+ self.logger.debug(
1725
+ f"loading {len(id_lst)} persistent element iteration(s) from disk: "
1726
+ f"{shorten_list_str(id_lst)}."
1727
+ )
1728
+ arr = self._get_iters_arr()
1729
+ attrs = arr.attrs.asdict()
1730
+ try:
1731
+ iter_arr_dat = arr.get_coordinate_selection(id_lst)
1732
+ except BoundsCheckError:
1733
+ raise MissingStoreElementIterationError(id_lst) from None
1734
+ iter_dat = dict(zip(id_lst, iter_arr_dat))
1735
+ new_iters = {
1736
+ k: ZarrStoreElementIter.decode(v, attrs) for k, v in iter_dat.items()
1737
+ }
1738
+ self.element_iter_cache.update(new_iters)
1739
+ iters.update(new_iters)
1740
+ return iters
1741
+
1742
+ @TimeIt.decorator
1743
+ def _get_persistent_EARs(self, id_lst: Iterable[int]) -> dict[int, ZarrStoreEAR]:
1744
+ runs, id_lst = self._get_cached_persistent_EARs(id_lst)
1745
+ if id_lst:
1746
+ self.logger.debug(
1747
+ f"loading {len(id_lst)} persistent EAR(s) from disk: "
1748
+ f"{shorten_list_str(id_lst)}."
1749
+ )
1750
+ arr = self._get_EARs_arr()
1751
+ attrs = arr.attrs.asdict()
1752
+ sel: tuple[NDArray, NDArray] | list[int]
1753
+ try:
1754
+ # convert to 2D array indices:
1755
+ sel = get_2D_idx(np.array(id_lst), num_cols=arr.shape[1])
1756
+ except IndexError:
1757
+ # 1D runs array from before update to 2D in Feb 2025 refactor/jobscript:
1758
+ sel = id_lst
1759
+ try:
1760
+ EAR_arr_dat = _zarr_get_coord_selection(arr, sel, self.logger)
1761
+ except BoundsCheckError:
1762
+ raise MissingStoreEARError(id_lst) from None
1763
+ EAR_dat = dict(zip(id_lst, EAR_arr_dat))
1764
+ new_runs = {
1765
+ k: ZarrStoreEAR.decode(EAR_dat=v, ts_fmt=self.ts_fmt, attrs=attrs)
1766
+ for k, v in EAR_dat.items()
1767
+ }
1768
+ self.EAR_cache.update(new_runs)
1769
+ runs.update(new_runs)
1770
+
1771
+ return runs
1772
+
1773
+ @TimeIt.decorator
1774
+ def _get_persistent_parameters(
1775
+ self, id_lst: Iterable[int], *, dataset_copy: bool = False, **kwargs
1776
+ ) -> dict[int, ZarrStoreParameter]:
1777
+ self._ensure_all_decoders()
1778
+ params, id_lst = self._get_cached_persistent_parameters(id_lst)
1779
+ if id_lst:
1780
+
1781
+ self.logger.debug(
1782
+ f"loading {len(id_lst)} persistent parameter(s) from disk: "
1783
+ f"{shorten_list_str(id_lst)}."
1784
+ )
1785
+
1786
+ # TODO: implement the "parameter_metadata_cache" for zarr stores, which would
1787
+ # keep the base_arr and src_arr open
1788
+ base_arr = self._get_parameter_base_array(mode="r")
1789
+ src_arr = self._get_parameter_sources_array(mode="r")
1790
+
1791
+ try:
1792
+ param_arr_dat = base_arr.get_coordinate_selection(list(id_lst))
1793
+ src_arr_dat = src_arr.get_coordinate_selection(list(id_lst))
1794
+ except BoundsCheckError:
1795
+ raise MissingParameterData(id_lst) from None
1796
+
1797
+ param_dat = dict(zip(id_lst, param_arr_dat))
1798
+ src_dat = dict(zip(id_lst, src_arr_dat))
1799
+
1800
+ new_params = {
1801
+ k: ZarrStoreParameter.decode(
1802
+ id_=k,
1803
+ data=v,
1804
+ source=src_dat[k],
1805
+ arr_group=self._get_parameter_data_array_group(k),
1806
+ dataset_copy=dataset_copy,
1807
+ )
1808
+ for k, v in param_dat.items()
1809
+ }
1810
+ self.parameter_cache.update(new_params)
1811
+ params.update(new_params)
1812
+
1813
+ return params
1814
+
1815
+ @TimeIt.decorator
1816
+ def _get_persistent_param_sources(
1817
+ self, id_lst: Iterable[int]
1818
+ ) -> dict[int, ParamSource]:
1819
+ sources, id_lst = self._get_cached_persistent_param_sources(id_lst)
1820
+ if id_lst:
1821
+ src_arr = self._get_parameter_sources_array(mode="r")
1822
+ try:
1823
+ src_arr_dat = src_arr.get_coordinate_selection(list(id_lst))
1824
+ except BoundsCheckError:
1825
+ raise MissingParameterData(id_lst) from None
1826
+ new_sources = dict(zip(id_lst, src_arr_dat))
1827
+ self.param_sources_cache.update(new_sources)
1828
+ sources.update(new_sources)
1829
+ return sources
1830
+
1831
+ def _get_persistent_parameter_set_status(
1832
+ self, id_lst: Iterable[int]
1833
+ ) -> dict[int, bool]:
1834
+ base_arr = self._get_parameter_base_array(mode="r")
1835
+ try:
1836
+ param_arr_dat = base_arr.get_coordinate_selection(list(id_lst))
1837
+ except BoundsCheckError:
1838
+ raise MissingParameterData(id_lst) from None
1839
+
1840
+ return dict(zip(id_lst, [i is not None for i in param_arr_dat]))
1841
+
1842
+ def _get_persistent_parameter_IDs(self) -> list[int]:
1843
+ # we assume the row index is equivalent to the ID; this might need revisiting in future
1844
+ base_arr = self._get_parameter_base_array(mode="r")
1845
+ return list(range(len(base_arr)))
1846
+
1847
+ def get_submission_at_submit_metadata(
1848
+ self, sub_idx: int, metadata_attr: dict | None
1849
+ ) -> dict[str, Any]:
1850
+ """Retrieve the values of submission attributes that are stored at submit-time."""
1851
+ grp = self._get_submission_metadata_group(sub_idx)
1852
+ attrs = grp.attrs.asdict()
1853
+ return {k: attrs[k] for k in SUBMISSION_SUBMIT_TIME_KEYS}
1854
+
1855
+ def clear_jobscript_at_submit_metadata_cache(self):
1856
+ """Clear the cache of at-submit-time jobscript metadata."""
1857
+ self._jobscript_at_submit_metadata = {}
1858
+
1859
+ def get_jobscript_at_submit_metadata(
1860
+ self,
1861
+ sub_idx: int,
1862
+ js_idx: int,
1863
+ metadata_attr: dict | None,
1864
+ ) -> dict[str, Any]:
1865
+ """For the specified jobscript, retrieve the values of jobscript-submit-time
1866
+ attributes.
1867
+
1868
+ Notes
1869
+ -----
1870
+ If the cache does not exist, this method will retrieve and cache metadata for
1871
+ all jobscripts for which metadata has been set. If the cache does exist, but not
1872
+ for the requested jobscript, then this method will retrieve and cache metadata for
1873
+ all non-cached jobscripts for which metadata has been set. If metadata has not
1874
+ yet been set for the specified jobscript, a dict with all `None` values will be
1875
+ returned.
1876
+
1877
+ The cache can be cleared using the method
1878
+ `clear_jobscript_at_submit_metadata_cache`.
1879
+
1880
+ """
1881
+ if self._jobscript_at_submit_metadata:
1882
+ # cache exists, but might not include data for the requested jobscript:
1883
+ if js_idx in self._jobscript_at_submit_metadata:
1884
+ return self._jobscript_at_submit_metadata[js_idx]
1885
+
1886
+ arr = self._get_jobscripts_at_submit_metadata_arr(sub_idx)
1887
+ non_cached = set(range(len(arr))) - set(self._jobscript_at_submit_metadata.keys())
1888
+
1889
+ # populate cache:
1890
+ arr_non_cached = arr.get_coordinate_selection((list(non_cached),))
1891
+ for js_idx_i, arr_item in zip(non_cached, arr_non_cached):
1892
+ try:
1893
+ self._jobscript_at_submit_metadata[js_idx_i] = {
1894
+ i: arr_item[i_idx]
1895
+ for i_idx, i in enumerate(JOBSCRIPT_SUBMIT_TIME_KEYS)
1896
+ }
1897
+ except TypeError:
1898
+ # data for this jobscript is not set
1899
+ pass
1900
+
1901
+ if js_idx not in self._jobscript_at_submit_metadata:
1902
+ return {i: None for i in JOBSCRIPT_SUBMIT_TIME_KEYS}
1903
+
1904
+ return self._jobscript_at_submit_metadata[js_idx]
1905
+
1906
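# Illustrative usage sketch (not part of the diff): `store` is a hypothetical
# ZarrPersistentStore instance. The first call populates the at-submit metadata cache
# for all jobscripts that have metadata set; a jobscript without metadata yields a
# dict of None values. Clearing the cache forces the next call to re-read from disk.
meta = store.get_jobscript_at_submit_metadata(sub_idx=0, js_idx=2, metadata_attr=None)
if all(v is None for v in meta.values()):
    pass  # metadata not yet recorded for this jobscript
store.clear_jobscript_at_submit_metadata_cache()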
+ @TimeIt.decorator
1907
+ def get_jobscript_block_run_ID_array(
1908
+ self,
1909
+ sub_idx: int,
1910
+ js_idx: int,
1911
+ blk_idx: int,
1912
+ run_ID_arr: NDArray | None,
1913
+ ) -> NDArray:
1914
+ """For the specified jobscript-block, retrieve the run ID array."""
1915
+
1916
+ if run_ID_arr is not None:
1917
+ self.logger.debug("jobscript-block run IDs are still in memory.")
1918
+ # in the special case when the Submission object has just been created, the
1919
+ # run ID arrays will not yet be persistent.
1920
+ return np.asarray(run_ID_arr)
1921
+
1922
+ # otherwise, `append_submissions` has been called, the run IDs have been
1923
+ # removed from the JSON-representation of the submission object, and have been
1924
+ # saved in separate zarr arrays:
1925
+ if sub_idx not in self._jobscript_run_ID_arrays:
1926
+
1927
+ self.logger.debug(
1928
+ f"retrieving jobscript-block run IDs for submission {sub_idx} from disk,"
1929
+ f" and caching."
1930
+ )
1931
+
1932
+ # for a given submission, run IDs are stored for all jobscript-blocks in the
1933
+ # same array (and chunk), so retrieve all of them and cache:
1934
+
1935
+ arr = self._get_jobscripts_run_ID_arr(sub_idx)
1936
+ arr_dat = arr[:]
1937
+ block_shapes = arr.attrs["block_shapes"]
1938
+
1939
+ self._jobscript_run_ID_arrays[sub_idx] = {} # keyed by (js_idx, blk_idx)
1940
+ arr_idx = 0
1941
+ for js_idx_i, js_blk_shapes in enumerate(block_shapes):
1942
+ for blk_idx_j, blk_shape_j in enumerate(js_blk_shapes):
1943
+ self._jobscript_run_ID_arrays[sub_idx][(js_idx_i, blk_idx_j)] = (
1944
+ arr_dat[arr_idx, : blk_shape_j[0], : blk_shape_j[1]]
1945
+ )
1946
+ arr_idx += 1
1947
+
1948
+ else:
1949
+ self.logger.debug(
1950
+ f"retrieving jobscript-block run IDs for submission {sub_idx} from cache."
1951
+ )
1952
+
1953
+ return self._jobscript_run_ID_arrays[sub_idx][(js_idx, blk_idx)]
1954
+
1955
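# Illustrative sketch (not part of the diff): unpacking per-block run-ID arrays from
# the single padded per-submission array, as done above. Shapes and data are made up;
# `block_shapes[js][blk]` holds the unpadded shape of each jobscript-block.
import numpy as np

arr_dat = np.zeros((3, 2, 4), dtype=int)      # all blocks, padded to a common shape
block_shapes = [[(2, 3)], [(1, 4), (2, 2)]]   # per jobscript: list of block shapes
per_block = {}
arr_idx = 0
for js_idx_i, js_blk_shapes in enumerate(block_shapes):
    for blk_idx_j, (n_rows, n_cols) in enumerate(js_blk_shapes):
        per_block[(js_idx_i, blk_idx_j)] = arr_dat[arr_idx, :n_rows, :n_cols]
        arr_idx += 1
# per_block[(1, 0)].shape -> (1, 4)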
+ def get_jobscript_block_task_elements_map(
1956
+ self,
1957
+ sub_idx: int,
1958
+ js_idx: int,
1959
+ blk_idx: int,
1960
+ task_elems_map: dict[int, list[int]] | None,
1961
+ ) -> dict[int, list[int]]:
1962
+ """For the specified jobscript-block, retrieve the task-elements mapping."""
1963
+
1964
+ if task_elems_map is not None:
1965
+ self.logger.debug("jobscript-block task elements are still in memory.")
1966
+ # in the special case when the Submission object has just been created, the
1967
+ # task elements arrays will not yet be persistent.
1968
+ return task_elems_map
1969
+
1970
+ # otherwise, `append_submissions` has been called, the task elements have been
1971
+ # removed from the JSON-representation of the submission object, and have been
1972
+ # saved in separate zarr arrays:
1973
+ if sub_idx not in self._jobscript_task_element_maps:
1974
+
1975
+ self.logger.debug(
1976
+ f"retrieving jobscript-block task elements for submission {sub_idx} from "
1977
+ f"disk, and caching."
1978
+ )
1979
+
1980
+ # for a given submission, task elements are stored for all jobscript-blocks in
1981
+ # the same array (and chunk), so retrieve all of them and cache:
1982
+
1983
+ arr = self._get_jobscripts_task_elements_arr(sub_idx)
1984
+ arr_dat = arr[:]
1985
+ block_shapes = arr.attrs["block_shapes"]
1986
+
1987
+ self._jobscript_task_element_maps[sub_idx] = {} # keys: (js_idx, blk_idx)
1988
+ arr_idx = 0
1989
+ for js_idx_i, js_blk_shapes in enumerate(block_shapes):
1990
+ for blk_idx_j, blk_shape_j in enumerate(js_blk_shapes):
1991
+ arr_i = arr_dat[arr_idx, : blk_shape_j[1], : blk_shape_j[0] + 1]
1992
+ self._jobscript_task_element_maps[sub_idx][(js_idx_i, blk_idx_j)] = {
1993
+ k[0]: list(k[1:]) for k in arr_i
1994
+ }
1995
+ arr_idx += 1
1996
+
1997
+ else:
1998
+ self.logger.debug(
1999
+ f"retrieving jobscript-block task elements for submission {sub_idx} from "
2000
+ "cache."
2001
+ )
2002
+
2003
+ return self._jobscript_task_element_maps[sub_idx][(js_idx, blk_idx)]
2004
+
2005
+ @TimeIt.decorator
2006
+ def get_jobscript_block_task_actions_array(
2007
+ self,
2008
+ sub_idx: int,
2009
+ js_idx: int,
2010
+ blk_idx: int,
2011
+ task_actions_arr: NDArray | list[tuple[int, int, int]] | None,
2012
+ ) -> NDArray:
2013
+ """For the specified jobscript-block, retrieve the task-actions array."""
2014
+
2015
+ if task_actions_arr is not None:
2016
+ self.logger.debug("jobscript-block task actions are still in memory.")
2017
+ # in the special case when the Submission object has just been created, the
2018
+ # task actions arrays will not yet be persistent.
2019
+ return np.asarray(task_actions_arr)
2020
+
2021
+ # otherwise, `append_submissions` has been called, the task actions have been
2022
+ # removed from the JSON-representation of the submission object, and have been
2023
+ # saved in separate zarr arrays:
2024
+ if sub_idx not in self._jobscript_task_actions_arrays:
2025
+
2026
+ self.logger.debug(
2027
+ f"retrieving jobscript-block task actions for submission {sub_idx} from "
2028
+ f"disk, and caching."
2029
+ )
2030
+
2031
+ # for a given submission, task actions are stored for all jobscript-blocks in
2032
+ # the same array (and chunk), so retrieve all of them and cache:
2033
+
2034
+ arr = self._get_jobscripts_task_actions_arr(sub_idx)
2035
+ arr_dat = arr[:]
2036
+ block_num_acts = arr.attrs["block_num_acts"]
2037
+
2038
+ num_acts_count = 0
2039
+ self._jobscript_task_actions_arrays[sub_idx] = {} # keys: (js_idx, blk_idx)
2040
+ for js_idx_i, js_blk_num_acts in enumerate(block_num_acts):
2041
+ for blk_idx_j, blk_num_acts_j in enumerate(js_blk_num_acts):
2042
+ arr_i = arr_dat[num_acts_count : num_acts_count + blk_num_acts_j]
2043
+ num_acts_count += blk_num_acts_j
2044
+ self._jobscript_task_actions_arrays[sub_idx][
2045
+ (js_idx_i, blk_idx_j)
2046
+ ] = arr_i
2047
+
2048
+ else:
2049
+ self.logger.debug(
2050
+ f"retrieving jobscript-block task actions for submission {sub_idx} from "
2051
+ "cache."
2052
+ )
2053
+
2054
+ return self._jobscript_task_actions_arrays[sub_idx][(js_idx, blk_idx)]
2055
+
2056
+ @TimeIt.decorator
2057
+ def get_jobscript_block_dependencies(
2058
+ self,
2059
+ sub_idx: int,
2060
+ js_idx: int,
2061
+ blk_idx: int,
2062
+ js_dependencies: dict[tuple[int, int], ResolvedJobscriptBlockDependencies] | None,
2063
+ ) -> dict[tuple[int, int], ResolvedJobscriptBlockDependencies]:
2064
+ """For the specified jobscript-block, retrieve the dependencies."""
2065
+
2066
+ if js_dependencies is not None:
2067
+ self.logger.debug("jobscript-block dependencies are still in memory.")
2068
+ # in the special case when the Submission object has just been created, the
2069
+ # dependencies will not yet be persistent.
2070
+ return js_dependencies
2071
+
2072
+ # otherwise, `append_submissions` has been called, the dependencies have been
2073
+ # removed from the JSON-representation of the submission object, and have been
2074
+ # saved in separate zarr arrays:
2075
+ if sub_idx not in self._jobscript_dependencies:
2076
+ self.logger.debug(
2077
+ f"retrieving jobscript-block dependencies for submission {sub_idx} from "
2078
+ f"disk, and caching."
2079
+ )
2080
+ # for a given submission, dependencies are stored for all jobscript-blocks in
2081
+ # the same array (and chunk), so retrieve all of them and cache:
2082
+ arr = self._get_jobscripts_dependencies_arr(sub_idx)
2083
+ self._jobscript_dependencies[sub_idx] = (
2084
+ self._decode_jobscript_block_dependencies(arr)
2085
+ )
2086
+ else:
2087
+ self.logger.debug(
2088
+ f"retrieving jobscript-block dependencies for submission {sub_idx} from "
2089
+ "cache."
2090
+ )
2091
+
2092
+ return self._jobscript_dependencies[sub_idx][(js_idx, blk_idx)]
2093
+
2094
+ def get_ts_fmt(self):
2095
+ """
2096
+ Get the format for timestamps.
2097
+ """
2098
+ with self.using_resource("attrs", action="read") as attrs:
2099
+ return attrs["ts_fmt"]
2100
+
2101
+ def get_ts_name_fmt(self):
2102
+ """
2103
+ Get the format for timestamps to use in names.
2104
+ """
2105
+ with self.using_resource("attrs", action="read") as attrs:
2106
+ return attrs["ts_name_fmt"]
2107
+
2108
+ def get_creation_info(self):
2109
+ """
2110
+ Get information about the creation of the workflow.
2111
+ """
2112
+ with self.using_resource("attrs", action="read") as attrs:
2113
+ return copy.deepcopy(attrs["creation_info"])
2114
+
2115
+ def get_name(self):
2116
+ """
2117
+ Get the name of the workflow.
2118
+ """
2119
+ with self.using_resource("attrs", action="read") as attrs:
2120
+ return attrs["name"]
2121
+
2122
+ def zip(
2123
+ self,
2124
+ path: str = ".",
2125
+ log: str | None = None,
2126
+ overwrite: bool = False,
2127
+ include_execute: bool = False,
2128
+ include_rechunk_backups: bool = False,
2129
+ ):
2130
+ """
2131
+ Convert the persistent store to zipped form.
2132
+
2133
+ Parameters
2134
+ ----------
2135
+ path:
2136
+ Path at which to create the new zipped workflow. If this is an existing
2137
+ directory, the zip file will be created within this directory. Otherwise,
2138
+ this path is assumed to be the full file path to the new zip file.
2139
+ """
2140
+ with Console().status(f"Zipping workflow {self.workflow.name!r}..."):
2141
+ # TODO: this won't work for remote file systems
2142
+ dst_path = Path(path).resolve()
2143
+ if dst_path.is_dir():
2144
+ dst_path = dst_path.joinpath(self.workflow.name).with_suffix(".zip")
2145
+
2146
+ if not overwrite and dst_path.exists():
2147
+ raise FileExistsError(
2148
+ f"File at path already exists: {dst_path!r}. Pass `overwrite=True` to "
2149
+ f"overwrite the existing file."
2150
+ )
2151
+
2152
+ dst_path_s = str(dst_path)
2153
+
2154
+ src_zarr_store = self.zarr_store
2155
+ zfs, _ = ask_pw_on_auth_exc(
2156
+ ZipFileSystem,
2157
+ fo=dst_path_s,
2158
+ mode="w",
2159
+ target_options={},
2160
+ add_pw_to="target_options",
2161
+ )
2162
+ dst_zarr_store = FSStore(url="", fs=zfs)
2163
+ excludes = []
2164
+ if not include_execute:
2165
+ excludes.append("execute")
2166
+ if not include_rechunk_backups:
2167
+ excludes.append("runs.bak")
2168
+ excludes.append("base.bak")
2169
+
2170
+ zarr.copy_store(
2171
+ src_zarr_store,
2172
+ dst_zarr_store,
2173
+ excludes=excludes or None,
2174
+ log=log,
2175
+ )
2176
+ del zfs # otherwise the ZipFileSystem remains open for the instance lifetime
2177
+ return dst_path_s
2178
+
2179
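# Illustrative usage sketch (not part of the diff): `store` is a hypothetical
# ZarrPersistentStore for an existing on-disk workflow. Zipping copies the Zarr store
# into a `<workflow name>.zip` archive within the given directory; the execute
# directory and rechunk backups are excluded by default.
zip_path = store.zip(path=".", overwrite=True)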
+ def unzip(self, path: str = ".", log: str | None = None):
2180
+ raise ValueError("Not a zip store!")
2181
+
2182
+ def _rechunk_arr(
2183
+ self,
2184
+ arr: Array,
2185
+ chunk_size: int | None = None,
2186
+ backup: bool = True,
2187
+ status: bool = True,
2188
+ ) -> Array:
2189
+ arr_path = Path(arr.store.path) / arr.path
2190
+ arr_name = arr.path.split("/")[-1]
2191
+
2192
+ if status:
2193
+ s = Console().status("Rechunking...")
2194
+ s.start()
2195
+ backup_time = None
2196
+
2197
+ if backup:
2198
+ if status:
2199
+ s.update("Backing up...")
2200
+ backup_path = arr_path.with_suffix(".bak")
2201
+ if backup_path.is_dir():
2202
+ pass
2203
+ else:
2204
+ tic = time.perf_counter()
2205
+ shutil.copytree(arr_path, backup_path)
2206
+ toc = time.perf_counter()
2207
+ backup_time = toc - tic
2208
+
2209
+ tic = time.perf_counter()
2210
+ arr_rc_path = arr_path.with_suffix(".rechunked")
2211
+ if status:
2212
+ s.update("Creating new array...")
2213
+
2214
+ # use the same store:
2215
+ try:
2216
+ arr_rc_store = arr.store.__class__(path=arr_rc_path)
2217
+ except TypeError:
2218
+ # FSStore
2219
+ arr_rc_store = arr.store.__class__(url=str(arr_rc_path))
2220
+
2221
+ arr_rc = zarr.create(
2222
+ store=arr_rc_store,
2223
+ shape=arr.shape,
2224
+ chunks=arr.shape if chunk_size is None else chunk_size,
2225
+ dtype=object,
2226
+ object_codec=self._CODEC,
2227
+ )
2228
+
2229
+ if status:
2230
+ s.update("Copying data...")
2231
+ data = np.empty(shape=arr.shape, dtype=object)
2232
+ bad_data = []
2233
+ for idx in range(len(arr)):
2234
+ try:
2235
+ data[idx] = arr[idx]
2236
+ except RuntimeError:
2237
+ # blosc decompression errors
2238
+ bad_data.append(idx)
2239
+ arr_rc[:] = data
2240
+
2241
+ arr_rc.attrs.put(arr.attrs.asdict())
2242
+
2243
+ if status:
2244
+ s.update("Deleting old array...")
2245
+ shutil.rmtree(arr_path)
2246
+
2247
+ if status:
2248
+ s.update("Moving new array into place...")
2249
+ shutil.move(arr_rc_path, arr_path)
2250
+
2251
+ toc = time.perf_counter()
2252
+ rechunk_time = toc - tic
2253
+
2254
+ if status:
2255
+ s.stop()
2256
+
2257
+ if backup_time:
2258
+ print(f"Time to backup {arr_name}: {backup_time:.1f} s")
2259
+
2260
+ print(f"Time to rechunk and move {arr_name}: {rechunk_time:.1f} s")
2261
+
2262
+ if bad_data:
2263
+ print(f"Bad data at {arr_name} indices: {bad_data}.")
2264
+
2265
+ return arr_rc
2266
+
2267
+ def rechunk_parameter_base(
2268
+ self,
2269
+ chunk_size: int | None = None,
2270
+ backup: bool = True,
2271
+ status: bool = True,
2272
+ ) -> Array:
2273
+ """
2274
+ Rechunk the parameter data to be stored more efficiently.
2275
+ """
2276
+ arr = self._get_parameter_base_array()
2277
+ return self._rechunk_arr(arr, chunk_size, backup, status)
2278
+
2279
+ def rechunk_runs(
2280
+ self,
2281
+ chunk_size: int | None = None,
2282
+ backup: bool = True,
2283
+ status: bool = True,
2284
+ ) -> Array:
2285
+ """
2286
+ Rechunk the run data to be stored more efficiently.
2287
+ """
2288
+ arr = self._get_EARs_arr()
2289
+ return self._rechunk_arr(arr, chunk_size, backup, status)
2290
+
2291
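# Illustrative usage sketch (not part of the diff): `store` is a hypothetical
# ZarrPersistentStore. With `chunk_size=None` (the default), `_rechunk_arr` above uses
# the full array shape as the chunk shape, i.e. one chunk spanning the whole array.
store.rechunk_runs(chunk_size=1000, backup=True, status=True)
store.rechunk_parameter_base()  # consolidate into a single chunk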
+ def get_dirs_array(self) -> NDArray:
2292
+ """
2293
+ Retrieve the run directories array.
2294
+ """
2295
+ return self._get_dirs_arr()[:]
2296
+
2297
+
2298
+ class ZarrZipPersistentStore(ZarrPersistentStore):
2299
+ """A store designed mainly as an archive format that can be uploaded to data
2300
+ repositories such as Zenodo.
2301
+
2302
+ Note
2303
+ ----
2304
+ Archive format persistent stores cannot be updated without being unzipped first.
2305
+ """
2306
+
2307
+ _name: ClassVar[str] = "zip"
2308
+ _features: ClassVar[PersistentStoreFeatures] = PersistentStoreFeatures(
2309
+ create=False,
2310
+ edit=False,
2311
+ jobscript_parallelism=False,
2312
+ EAR_parallelism=False,
2313
+ schedulers=False,
2314
+ submission=False,
2315
+ )
2316
+
2317
+ # TODO: enforce read-only nature
2318
+
2319
+ def zip(
2320
+ self,
2321
+ path: str = ".",
2322
+ log: str | None = None,
2323
+ overwrite: bool = False,
2324
+ include_execute: bool = False,
2325
+ include_rechunk_backups: bool = False,
2326
+ ):
2327
+ raise ValueError("Already a zip store!")
2328
+
2329
+ def unzip(self, path: str = ".", log: str | None = None) -> str:
2330
+ """
2331
+ Expand the persistent store.
2332
+
2333
+ Parameters
2334
+ ----------
2335
+ path:
2336
+ Path at which to create the new unzipped workflow. If this is an existing
2337
+ directory, the new workflow directory will be created within this directory.
2338
+ Otherwise, this path will represent the new workflow directory path.
2339
+
2340
+ """
2341
+
2342
+ with Console().status(f"Unzipping workflow {self.workflow.name!r}..."):
2343
+ # TODO: this won't work for remote file systems
2344
+ dst_path = Path(path).resolve()
2345
+ if dst_path.is_dir():
2346
+ dst_path = dst_path.joinpath(self.workflow.name)
2347
+
2348
+ if dst_path.exists():
2349
+ raise FileExistsError(f"Directory at path already exists: {dst_path!r}.")
2350
+
2351
+ dst_path_s = str(dst_path)
2352
+
2353
+ src_zarr_store = self.zarr_store
2354
+ dst_zarr_store = FSStore(url=dst_path_s)
2355
+ zarr.copy_store(src_zarr_store, dst_zarr_store, log=log)
2356
+ return dst_path_s
2357
+
2358
+ def copy(self, path: PathLike = None) -> Path:
2359
+ # not sure how to do this.
2360
+ raise NotImplementedError()
2361
+
2362
+ def delete_no_confirm(self) -> None:
2363
+ # `ZipFileSystem.rm()` does not seem to be implemented.
2364
+ raise NotImplementedError()
2365
+
2366
+ def _rechunk_arr(
2367
+ self,
2368
+ arr,
2369
+ chunk_size: int | None = None,
2370
+ backup: bool = True,
2371
+ status: bool = True,
2372
+ ) -> Array:
2373
+ raise NotImplementedError
2374
+
2375
+ def get_text_file(self, path: str | Path) -> str:
2376
+ """Retrieve the contents of a text file stored within the workflow."""
2377
+ path = Path(path)
2378
+ if path.is_absolute():
2379
+ path = path.relative_to(self.workflow.url)
2380
+ path = str(path.as_posix())
2381
+ assert self.fs
2382
+ try:
2383
+ with self.fs.open(path, mode="rt") as fp:
2384
+ return fp.read()
2385
+ except KeyError:
2386
+ raise FileNotFoundError(
2387
+ f"File within zip at location {path!r} does not exist."
2388
+ ) from None
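# Illustrative usage sketch (not part of the diff): `zip_store` is a hypothetical
# ZarrZipPersistentStore; the file path is made up. Paths are resolved relative to the
# workflow URL and read from inside the zip archive via the fsspec ZipFileSystem.
submit_script = zip_store.get_text_file("some/file/inside/the/archive.txt")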